1
2
3
4
5 """Dictionary like indexing of sequence files (PRIVATE).
6
7 You are not expected to access this module, or any of its code, directly. This
8 is all handled internally by the Bio.SeqIO.index(...) function which is the
9 public interface for this functionality.
10
11 The basic idea is that we scan over a sequence file, looking for new record
12 markers. We then try and extract the string that Bio.SeqIO.parse/read would
13 use as the record id, ideally without actually parsing the full record. We
14 then use a subclassed Python dictionary to record the file offset for the
15 record start against the record id.
16
17 Note that this means full parsing is on demand, so any invalid or problem
18 record may not trigger an exception until it is accessed. This is by design.
19
20 This means our dictionary like objects have in memory ALL the keys (all the
21 record identifiers), which shouldn't be a problem even with second generation
22 sequencing. If this is an issue later on, storing the keys and offsets in a
23 temp lookup file might be one idea (e.g. using SQLite or an OBDA style index).
24 """
25
26 import re
27 from Bio import SeqIO
28
30 """Read only dictionary interface to a sequential sequence file.
31
32 Keeps the keys in memory, reads the file to access entries as
33 SeqRecord objects using Bio.SeqIO for parsing them. This approach
34 is memory limited, but will work even with millions of sequences.
35
36 Note - as with the Bio.SeqIO.to_dict() function, duplicate keys
37 (record identifiers by default) are not allowed. If this happens,
38 a ValueError exception is raised.
39
40 By default the SeqRecord's id string is used as the dictionary
41 key. This can be changed by suppling an optional key_function,
42 a callback function which will be given the record id and must
43 return the desired key. For example, this allows you to parse
44 NCBI style FASTA identifiers, and extract the GI number to use
45 as the dictionary key.
46
47 Note that this dictionary is essentially read only. You cannot
48 add or change values, pop values, nor clear the dictionary.
49 """
50 - def __init__(self, filename, alphabet, key_function, mode="rU") :
57
58
60 return "SeqIO.index(%s, %s, mode=%s, key_function=%s)" \
61 % (self._handle.name, self._format,
62 self._alphabet, self._key_function)
63
65 if self :
66 return "{%s : SeqRecord(...), ...}" % repr(self.keys()[0])
67 else :
68 return "{}"
69
71 """Used by subclasses to record file offsets for identifiers (PRIVATE).
72
73 This will apply the key_function (if given) to map the record id
74 string to the desired key.
75
76 This will raise a ValueError if a key (record id string) occurs
77 more than once.
78 """
79 if self._key_function :
80 key = self._key_function(identifier)
81 else :
82 key = identifier
83 if key in self :
84 raise ValueError("Duplicate key '%s'" % key)
85 else :
86 dict.__setitem__(self, key, seek_position)
87
89 """Would be a list of the SeqRecord objects, but not implemented.
90
91 In general you can be indexing very very large files, with millions
92 of sequences. Loading all these into memory at once as SeqRecord
93 objects would (probably) use up all the RAM. Therefore we simply
94 don't support this dictionary method.
95 """
96 raise NotImplementedError("Due to memory concerns, when indexing a "
97 "sequence file you cannot access all the "
98 "records at once.")
99
101 """Would be a list of the (key, SeqRecord) tuples, but not implemented.
102
103 In general you can be indexing very very large files, with millions
104 of sequences. Loading all these into memory at once as SeqRecord
105 objects would (probably) use up all the RAM. Therefore we simply
106 don't support this dictionary method.
107 """
108 raise NotImplementedError("Due to memory concerns, when indexing a "
109 "sequence file you cannot access all the "
110 "records at once.")
111
116
118 """x.__getitem__(y) <==> x[y]"""
119
120 handle = self._handle
121 handle.seek(dict.__getitem__(self, key))
122 record = SeqIO.parse(handle, self._format, self._alphabet).next()
123 if self._key_function :
124 assert self._key_function(record.id) == key, \
125 "Requested key %s, found record.id %s which has key %s" \
126 % (repr(key), repr(record.id),
127 repr(self._key_function(record.id)))
128 else :
129 assert record.id == key, \
130 "Requested key %s, found record.id %s" \
131 % (repr(key), repr(record.id))
132 return record
133
134 - def get(self, k, d=None) :
135 """D.get(k[,d]) -> D[k] if k in D, else d. d defaults to None."""
136 try :
137 return self.__getitem__(k)
138 except KeyError :
139 return d
140
142 """Would allow setting or replacing records, but not implemented."""
143 raise NotImplementedError("An indexed a sequence file is read only.")
144
146 """Would allow adding more values, but not implemented."""
147 raise NotImplementedError("An indexed a sequence file is read only.")
148
149
150 - def pop(self, key, default=None) :
151 """Would remove specified record, but not implemented."""
152 raise NotImplementedError("An indexed a sequence file is read only.")
153
155 """Would remove and return a SeqRecord, but not implemented."""
156 raise NotImplementedError("An indexed a sequence file is read only.")
157
158
160 """Would clear dictionary, but not implemented."""
161 raise NotImplementedError("An indexed a sequence file is read only.")
162
164 """A dictionary method which we don't implement."""
165 raise NotImplementedError("An indexed a sequence file doesn't "
166 "support this.")
167
169 """A dictionary method which we don't implement."""
170 raise NotImplementedError("An indexed a sequence file doesn't "
171 "support this.")
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
189 """Subclass for easy cases (PRIVATE)."""
190 - def __init__(self, filename, alphabet, key_function, format, marker) :
204
206 """Indexed dictionary like access to a FASTA file."""
207 - def __init__(self, filename, alphabet, key_function) :
210
211 -class QualDict(_SequentialSeqFileDict) :
212 """Indexed dictionary like access to a QUAL file."""
213 - def __init__(self, filename, alphabet, key_function) :
216
217 -class PirDict(_SequentialSeqFileDict) :
218 """Indexed dictionary like access to a PIR/NBRF file."""
219 - def __init__(self, filename, alphabet, key_function) :
222
223 -class PhdDict(_SequentialSeqFileDict) :
224 """Indexed dictionary like access to a PHD (PHRED) file."""
225 - def __init__(self, filename, alphabet, key_function) :
228
229 -class AceDict(_SequentialSeqFileDict) :
230 """Indexed dictionary like access to an ACE file."""
231 - def __init__(self, filename, alphabet, key_function) :
234
235
236
237
238
239
241 """Indexed dictionary like access to a GenBank file."""
242 - def __init__(self, filename, alphabet, key_function) :
277
279 """Indexed dictionary like access to an EMBL file."""
280 - def __init__(self, filename, alphabet, key_function) :
311
313 """Indexed dictionary like access to a SwissProt file."""
314 - def __init__(self, filename, alphabet, key_function) :
330
332 """Indexed dictionary like access to a IntelliGenetics file."""
333 - def __init__(self, filename, alphabet, key_function) :
352
353 -class TabDict(_IndexedSeqFileDict) :
354 """Indexed dictionary like access to a simple tabbed file."""
355 - def __init__(self, filename, alphabet, key_function) :
373
374
375
376
377
379 """Subclass for easy cases (PRIVATE).
380
381 With FASTQ the records all start with a "@" line, but so too can some
382 quality lines. Note this will cope with line-wrapped FASTQ files.
383 """
384 - def __init__(self, filename, alphabet, key_function, fastq_format) :
385 _IndexedSeqFileDict.__init__(self, filename, alphabet, key_function)
386 self._format = fastq_format
387 handle = self._handle
388 pos = handle.tell()
389 line = handle.readline()
390 if not line :
391
392 return
393 if line[0] != "@" :
394 raise ValueError("Problem with FASTQ @ line:\n%s" % repr(line))
395 while line :
396
397
398 self._record_key(line[1:].rstrip().split(None,1)[0],pos)
399
400 seq_len = 0
401 while line :
402 line = handle.readline()
403 if line.startswith("+") : break
404 seq_len += len(line.strip())
405 if not line :
406 raise ValueError("Premature end of file in seq section")
407
408
409 qual_len = 0
410 while line :
411 if seq_len == qual_len :
412
413 pos = handle.tell()
414 line = handle.readline()
415 if line and line[0]!="@" :
416 ValueError("Problem with line %s" % repr(line))
417 break
418 else :
419 line = handle.readline()
420 qual_len += len(line.strip())
421 if seq_len != qual_len :
422 raise ValueError("Problem with quality section")
423
424
426 """Indexed dictionary like access to a standard Sanger FASTQ file."""
427 - def __init__(self, filename, alphabet, key_function) :
430
432 """Indexed dictionary like access to a Solexa (or early Illumina) FASTQ file."""
433 - def __init__(self, filename, alphabet, key_function) :
436
438 """Indexed dictionary like access to a Illumina 1.3+ FASTQ file."""
439 - def __init__(self, filename, alphabet, key_function) :
442
443
444
445 _FormatToIndexedDict = {"ace" : AceDict,
446 "embl" : EmblDict,
447 "fasta" : FastaDict,
448 "fastq" : FastqSangerDict,
449 "fastq-sanger" : FastqSangerDict,
450 "fastq-solexa" : FastqSolexaDict,
451 "fastq-illumina" : FastqIlluminaDict,
452 "genbank" : GenBankDict,
453 "gb" : GenBankDict,
454 "ig" : IntelliGeneticsDict,
455 "phd" : PhdDict,
456 "pir" : PirDict,
457 "swiss" : SwissDict,
458 "tab" : TabDict,
459 "qual" : QualDict
460 }
461