1
2
3
4
5
6
7 """Code to work with GenBank formatted files.
8
9 Rather than using Bio.GenBank, you are now encouraged to use Bio.SeqIO with
10 the "genbank" or "embl" format names to parse GenBank or EMBL files into
11 SeqRecord and SeqFeature objects (see the Biopython tutorial for details).
12
13 Also, rather than using Bio.GenBank to search or download files from the NCBI,
14 you are now encouraged to use Bio.Entrez instead (again, see the Biopython
15 tutorial for details).
16
17 Currently the ONLY reason to use Bio.GenBank directly is for the RecordParser
18 which turns a GenBank file into GenBank-specific Record objects. This is a
19 much closer representation to the raw file contents than the SeqRecord
20 alternative from the FeatureParser (used in Bio.SeqIO).
21
22 Classes:
23 Iterator Iterate through a file of GenBank entries
24 ErrorFeatureParser Catch errors caused during parsing.
25 FeatureParser Parse GenBank data in SeqRecord and SeqFeature objects.
26 RecordParser Parse GenBank data into a Record object.
27 NCBIDictionary Access GenBank using a dictionary interface (DEPRECATED).
28
29 _BaseGenBankConsumer A base class for GenBank consumer that implements
30 some helpful functions that are in common between
31 consumers.
32 _FeatureConsumer Create SeqFeature objects from info generated by
33 the Scanner
34 _RecordConsumer Create a GenBank record object from Scanner info.
35 _PrintingConsumer A debugging consumer.
36
37 ParserFailureError Exception indicating a failure in the parser (ie.
38 scanner or consumer)
39 LocationParserError Exception indicating a problem with the spark based
40 location parser.
41
42 Functions:
43 search_for Do a query against GenBank (DEPRECATED).
44 download_many Download many GenBank records (DEPRECATED).
45
46 17-MAR-2009: added wgs, wgs_scafld for GenBank whole genome shotgun master records.
47 These are GenBank files that summarize the content of a project, and provide lists of
48 scaffold and contig files in the project. These will be in annotations['wgs'] and
49 annotations['wgs_scafld']. These GenBank files do not have sequences. See
50 http://groups.google.com/group/bionet.molbio.genbank/browse_thread/thread/51fb88bf39e7dc36
51
52 http://is.gd/nNgk
53 for more details of this format, and an example.
54 Added by Ying Huang & Iddo Friedberg
55 """
56 import cStringIO
57
58
59 from Bio import SeqFeature
60 from Bio.ParserSupport import AbstractConsumer
61 from Bio import Entrez
62
63
64 import LocationParser
65 from utils import FeatureValueCleaner
66 from Scanner import GenBankScanner
67
68
# Indentation widths (in characters) and the matching blank-space strings.
# NOTE(review): the names suggest these describe the column layout of GenBank
# header lines and feature-table lines; their use is not visible in this
# chunk -- confirm before changing the values.
GENBANK_INDENT = 12
GENBANK_SPACER = " " * GENBANK_INDENT

FEATURE_KEY_INDENT = 5
FEATURE_QUALIFIER_INDENT = 21
FEATURE_KEY_SPACER = " " * FEATURE_KEY_INDENT
FEATURE_QUALIFIER_SPACER = " " * FEATURE_QUALIFIER_INDENT
77
class Iterator:
    """Iterator interface to move over a file of GenBank entries one at a time.
    """

    def __init__(self, handle, parser=None):
        """Initialize the iterator.

        Arguments:
        o handle - A handle with GenBank entries to iterate through.
        o parser - An optional parser to pass the entries through before
        returning them. If None, then the raw entry will be returned.
        """
        self.handle = handle
        self._parser = parser

    def next(self):
        """Return the next GenBank record from the handle.

        Without a parser, the raw text of one entry is returned: every line
        up to and including the "//" terminator line.  With a parser, the
        result of parser.parse(handle) is returned.

        Will return None if we ran out of records.
        """
        if self._parser is None:
            lines = []
            while True:
                line = self.handle.readline()
                if not line:
                    # End of file reached before a "//" terminator; any
                    # partial entry collected so far is dropped.
                    return None
                lines.append(line)
                if line.rstrip() == "//":
                    break
            return "".join(lines)
        try:
            return self._parser.parse(self.handle)
        except StopIteration:
            # The parser signals "no more records" with StopIteration.
            return None

    def __iter__(self):
        """Iterate over the records, stopping when next() returns None."""
        return iter(self.next, None)
112
# NOTE(review): the class header was missing from the extracted source; the
# name comes from the module docstring and the Exception base is assumed.
class ParserFailureError(Exception):
    """Failure caused by some kind of problem in the parser.
    """
    pass
117
# NOTE(review): the class header was missing from the extracted source; the
# name matches the ``raise LocationParserError(...)`` used by the feature
# consumer, and the Exception base is assumed.
class LocationParserError(Exception):
    """Could not properly parse out a location from a GenBank file.
    """
    pass
122
class FeatureParser:
    """Parse GenBank files into Seq + Feature objects.
    """

    # NOTE(review): this def line was missing from the extracted source.  The
    # parameter names come from the body; the defaults (0, 1 and a
    # FeatureValueCleaner instance) are reconstructed from the docstring --
    # confirm against the original file.
    def __init__(self, debug_level=0, use_fuzziness=1,
                 feature_cleaner=FeatureValueCleaner()):
        """Initialize a GenBank parser and Feature consumer.

        Arguments:
        o debug_level - An optional argument that specifies the amount of
        debugging information the parser should spit out. By default we have
        no debugging info (the fastest way to do things), but if you want
        you can set this as high as two and see exactly where a parse fails.
        o use_fuzziness - Specify whether or not to use fuzzy representations.
        The default is 1 (use fuzziness).
        o feature_cleaner - A class which will be used to clean out the
        values of features. This class must implement the function
        clean_value. GenBank.utils has a "standard" cleaner class, which
        is used by default.
        """
        self._scanner = GenBankScanner(debug_level)
        self.use_fuzziness = use_fuzziness
        self._cleaner = feature_cleaner

    def parse(self, handle):
        """Parse the specified handle.

        A fresh _FeatureConsumer is created for every call, fed by the
        scanner, and its ``data`` attribute is returned.
        """
        self._consumer = _FeatureConsumer(self.use_fuzziness,
                                          self._cleaner)
        self._scanner.feed(handle, self._consumer)
        return self._consumer.data
153
class RecordParser:
    """Parse GenBank files into Record objects
    """

    # NOTE(review): this def line was missing from the extracted source; the
    # default of 0 is reconstructed from the docstring ("by default we have
    # no debugging info") -- confirm against the original file.
    def __init__(self, debug_level=0):
        """Initialize the parser.

        Arguments:
        o debug_level - An optional argument that specifies the amount of
        debugging information the parser should spit out. By default we have
        no debugging info (the fastest way to do things), but if you want
        you can set this as high as two and see exactly where a parse fails.
        """
        self._scanner = GenBankScanner(debug_level)

    def parse(self, handle):
        """Parse the specified handle into a GenBank record.

        A fresh _RecordConsumer is created for every call, fed by the
        scanner, and its ``data`` attribute is returned.
        """
        self._consumer = _RecordConsumer()
        self._scanner.feed(handle, self._consumer)
        return self._consumer.data
174
176 """Abstract GenBank consumer providing useful general functions.
177
178 This just helps to eliminate some duplication in things that most
179 GenBank consumers want to do.
180 """
181
182
183
184
185 remove_space_keys = ["translation"]
186
189
# NOTE(review): the def line was missing from the extracted source; the name
# is reconstructed by analogy with _split_accessions -- confirm.
def _split_keywords(self, keyword_string):
    """Split a string of keywords into a nice clean list.

    One trailing '.' is dropped, the rest is split on ';' and every piece is
    stripped of surrounding whitespace.  An empty string or a lone "." gives
    [''] (a single empty keyword), not an empty list.
    """
    if keyword_string == "" or keyword_string == ".":
        keywords = ""
    elif keyword_string[-1] == '.':
        keywords = keyword_string[:-1]
    else:
        keywords = keyword_string
    return [x.strip() for x in keywords.split(';')]
203
def _split_accessions(self, accession_string):
    """Split a string of accession numbers into a list.

    Newlines and semicolons are treated as separators, like any other
    whitespace.  Empty entries never appear in the result.
    """
    accession = accession_string.replace("\n", " ").replace(";", " ")
    # str.split() without arguments already drops empty pieces and
    # surrounding whitespace, so no extra strip/filter pass is needed.
    return accession.split()
212
# NOTE(review): the def line was missing from the extracted source; the name
# is reconstructed by analogy with _split_accessions -- confirm.
def _split_taxonomy(self, taxonomy_string):
    """Split a string with taxonomy info into a list.

    One trailing '.' is dropped, the rest is split on ';' and on newlines,
    and every piece is stripped.  Pieces that are empty after stripping are
    left out.  An empty string or a lone "." gives [].
    """
    if not taxonomy_string or taxonomy_string == ".":
        return []

    if taxonomy_string[-1] == '.':
        tax_info = taxonomy_string[:-1]
    else:
        tax_info = taxonomy_string

    clean_tax_list = []
    for tax_item in tax_info.split(';'):
        for piece in tax_item.split("\n"):
            # Strip BEFORE testing for emptiness: a whitespace-only piece
            # (e.g. the blank left by "; \n" at a line wrap) must not end up
            # in the result as an empty string.
            piece = piece.strip()
            if piece:
                clean_tax_list.append(piece)
    return clean_tax_list
234
def _clean_location(self, location_string):
    """Clean whitespace out of a location string.

    The location parser isn't a fan of whitespace, so we clean it out
    before feeding it into the parser.  Every kind of whitespace (spaces,
    tabs, newlines) is removed.
    """
    # split() with no argument breaks on any whitespace run, so joining the
    # pieces back together drops all of it in one pass.
    return ''.join(location_string.split())
245
# NOTE(review): the def line was missing from the extracted source; the name
# is reconstructed from the docstring -- confirm.
def _remove_newlines(self, text):
    """Remove any newlines in the passed text, returning the new string.

    Both "\\n" and "\\r" characters are deleted (not replaced by spaces).
    """
    for newline in ("\n", "\r"):
        text = text.replace(newline, "")
    return text
255
# NOTE(review): the def line was missing from the extracted source; the name
# is reconstructed from the docstring -- confirm.
def _normalize_spaces(self, text):
    """Replace multiple spaces in the passed text with single spaces.

    Leading and trailing spaces are removed as a side effect.  Only the
    space character is treated as a separator; tabs and newlines are kept.
    """
    # Splitting on a literal " " yields empty strings for repeated spaces;
    # dropping those and re-joining collapses each run to a single space.
    text_parts = [part for part in text.split(" ") if part]
    return ' '.join(text_parts)
263
# NOTE(review): the def line was missing from the extracted source; the name
# is reconstructed from the docstring -- confirm.
def _remove_spaces(self, text):
    """Remove all spaces from the passed text.

    Only the space character is removed; tabs and newlines are kept.
    """
    return text.replace(" ", "")
268
def _convert_to_python_numbers(self, start, end):
    """Convert a start and end range to python notation.

    In GenBank, starts and ends are defined in "biological" coordinates,
    where 1 is the first base and [i, j] means to include both i and j.

    In python, 0 is the first base and [i, j] means to include i, but
    not j.

    So, to convert "biological" to python coordinates, we subtract 1 from
    the start and leave the end unchanged.

    Returns a (new_start, new_end) tuple.
    """
    return start - 1, end
286
288 """Create a SeqRecord object with Features to return.
289
290 Attributes:
291 o use_fuzziness - specify whether or not to parse with fuzziness in
292 feature locations.
293 o feature_cleaner - a class that will be used to provide specialized
294 cleaning-up of feature values.
295 """
def __init__(self, use_fuzziness, feature_cleaner=None):
    """Set up an empty SeqRecord and the parsing state for one record.

    Arguments:
    o use_fuzziness - specify whether or not to parse with fuzziness in
    feature locations.
    o feature_cleaner - an object providing clean_value(key, value), used
    to clean up qualifier values; None means no cleaning.
    """
    # Imported here rather than at module level, as in the original code.
    from Bio.SeqRecord import SeqRecord
    _BaseGenBankConsumer.__init__(self)
    self.data = SeqRecord(None, id=None)
    # Force the id to None; later code tests ``self.data.id is None`` to see
    # whether an accession has been assigned yet.
    self.data.id = None
    self.data.description = ""

    self._use_fuzziness = use_fuzziness
    self._feature_cleaner = feature_cleaner

    self._seq_type = ''
    self._seq_data = []          # sequence chunks, joined at record end
    self._current_ref = None     # reference currently being built
    self._cur_feature = None     # feature currently being built
    self._cur_qualifier_key = None
    self._cur_qualifier_value = None
    self._expected_size = None   # length declared for the record
313
def locus(self, locus_name):
    """Set the locus name as the name of the sequence record.
    """
    self.data.name = locus_name
318
def size(self, content):
    """Record the sequence length.

    content is converted with int(), so a non-numeric value raises
    ValueError.
    """
    self._expected_size = int(content)
322
324 """Record the sequence type so we can choose an appropriate alphabet.
325 """
326 self._seq_type = type
327
330
331 - def date(self, submit_date):
333
343
345 """Set the accession number as the id of the sequence.
346
347 If we have multiple accession numbers, the first one passed is
348 used.
349 """
350 new_acc_nums = self._split_accessions(acc_num)
351
352
353 try :
354
355 for acc in new_acc_nums :
356
357 if acc not in self.data.annotations['accessions'] :
358 self.data.annotations['accessions'].append(acc)
359 except KeyError :
360 self.data.annotations['accessions'] = new_acc_nums
361
362
363 if self.data.id is None:
364 if len(new_acc_nums) > 0:
365
366
367 self.data.id = self.data.annotations['accessions'][0]
368
369 - def wgs(self, content):
371
374
375 - def nid(self, content):
377
378 - def pid(self, content):
380
393
395 """Handle the information from the PROJECT line as a list of projects.
396
397 e.g.
398 PROJECT GenomeProject:28471
399
400 or:
401 PROJECT GenomeProject:13543 GenomeProject:99999
402
403 This is stored as dbxrefs in the SeqRecord to be consistent with the
404 projected switch of this line to DBLINK in future GenBank versions.
405 Note the NCBI plan to replace "GenomeProject:28471" with the shorter
406 "Project:28471" as part of this transition.
407 """
408 content = content.replace("GenomeProject:", "Project:")
409 self.data.dbxrefs.extend([p for p in content.split() if p])
410
412 """Store DBLINK cross references as dbxrefs in our record object.
413
414 This line type is expected to replace the PROJECT line in 2009. e.g.
415
416 During transition:
417
418 PROJECT GenomeProject:28471
419 DBLINK Project:28471
420 Trace Assembly Archive:123456
421
422 Once the project line is dropped:
423
424 DBLINK Project:28471
425 Trace Assembly Archive:123456
426
427 Note GenomeProject -> Project.
428
429 We'll have to see some real examples to be sure, but based on the
430 above example we can expect one reference per line.
431 """
432
433
434 if content.strip() not in self.data.dbxrefs :
435 self.data.dbxrefs.append(content.strip())
436
438 """Set the version to overwrite the id.
439
440 Since the verison provides the same information as the accession
441 number, plus some extra info, we set this as the id if we have
442 a version.
443 """
444
445
446
447
448
449
450
451
452
453
454 assert version.isdigit()
455 self.data.annotations['sequence_version'] = int(version)
456
459
460 - def gi(self, content):
462
465
468
470
471
472 if content == "" :
473 source_info = ""
474 elif content[-1] == '.':
475 source_info = content[:-1]
476 else:
477 source_info = content
478 self.data.annotations['source'] = source_info
479
482
491
493 """Signal the beginning of a new reference object.
494 """
495
496
497 if self._current_ref is not None:
498 self.data.annotations['references'].append(self._current_ref)
499 else:
500 self.data.annotations['references'] = []
501
502 self._current_ref = SeqFeature.Reference()
503
505 """Attempt to determine the sequence region the reference entails.
506
507 Possible types of information we may have to deal with:
508
509 (bases 1 to 86436)
510 (sites)
511 (bases 1 to 105654; 110423 to 111122)
512 1 (residues 1 to 182)
513 """
514
515 ref_base_info = content[1:-1]
516
517 all_locations = []
518
519 if ref_base_info.find('bases') != -1 and \
520 ref_base_info.find('to') != -1:
521
522 ref_base_info = ref_base_info[5:]
523 locations = self._split_reference_locations(ref_base_info)
524 all_locations.extend(locations)
525 elif (ref_base_info.find("residues") >= 0 and
526 ref_base_info.find("to") >= 0):
527 residues_start = ref_base_info.find("residues")
528
529 ref_base_info = ref_base_info[(residues_start + len("residues ")):]
530 locations = self._split_reference_locations(ref_base_info)
531 all_locations.extend(locations)
532
533
534
535 elif (ref_base_info == 'sites' or
536 ref_base_info.strip() == 'bases'):
537 pass
538
539 else:
540 raise ValueError("Could not parse base info %s in record %s" %
541 (ref_base_info, self.data.id))
542
543 self._current_ref.location = all_locations
544
def _split_reference_locations(self, location_string):
    """Get reference locations out of a string of reference information

    The passed string should be of the form:

    1 to 20; 20 to 100

    This splits the information out and returns a list of
    SeqFeature.FeatureLocation objects, one per ';'-separated range, with
    the coordinates converted to python counting.

    A piece that is not exactly "<int> to <int>" raises ValueError (from the
    tuple unpacking or from int()).
    """
    new_locations = []
    for base_info in location_string.split(';'):
        start, end = base_info.split('to')
        new_start, new_end = \
            self._convert_to_python_numbers(int(start.strip()),
                                            int(end.strip()))
        new_locations.append(SeqFeature.FeatureLocation(new_start, new_end))
    return new_locations
567
569 if self._current_ref.authors :
570 self._current_ref.authors += ' ' + content
571 else :
572 self._current_ref.authors = content
573
575 if self._current_ref.consrtm :
576 self._current_ref.consrtm += ' ' + content
577 else :
578 self._current_ref.consrtm = content
579
def title(self, content):
    """Add title text to the current reference.

    Repeated calls for the same reference are joined with a single space.
    """
    if self._current_ref.title:
        self._current_ref.title += ' ' + content
    else:
        self._current_ref.title = content
585
587 if self._current_ref.journal :
588 self._current_ref.journal += ' ' + content
589 else :
590 self._current_ref.journal = content
591
594
597
599 """Deal with a reference comment."""
600 if self._current_ref.comment :
601 self._current_ref.comment += ' ' + content
602 else :
603 self._current_ref.comment = content
604
610
612 """Get ready for the feature table when we reach the FEATURE line.
613 """
614 self.start_feature_table()
615
def start_feature_table(self):
    """Indicate we've got to the start of the feature table.

    The reference being built (if any) is complete at this point, so it is
    stored in the record's 'references' annotation and cleared.
    """
    if self._current_ref is not None:
        self.data.annotations['references'].append(self._current_ref)
        self._current_ref = None
623
def _add_feature(self):
    """Utility function to add a feature to the SeqRecord.

    This does all of the appropriate checking to make sure we haven't
    left any info behind, and that we are only adding info if it
    exists.
    """
    if self._cur_feature:
        # Flush the qualifier still being collected before the feature is
        # stored, then reset the qualifier state.
        self._add_qualifier()

        self._cur_qualifier_key = ''
        self._cur_qualifier_value = ''
        self.data.features.append(self._cur_feature)
639
653
def location(self, content):
    """Parse out location information from the location string.

    This uses a comprehensive but slow spark based parser to do the
    parsing, and then translates the results of the parse into appropriate
    Location objects on the current feature.

    Raises LocationParserError if the location parser exits on the string.
    """
    location_line = self._clean_location(content)

    # For "replace(<location>,<text>)" keep only the location part: 8 is
    # len("replace("), which assumes the string starts with that prefix.
    if location_line.find('replace') != -1:
        comma_pos = location_line.find(',')
        location_line = location_line[8:comma_pos]

    try:
        parse_info = \
            LocationParser.parse(LocationParser.scan(location_line))
    # NOTE(review): SystemExit is presumably how the spark based parser
    # reports input it cannot parse -- confirm against LocationParser.
    except SystemExit:
        raise LocationParserError(location_line)

    self._set_location_info(parse_info, self._cur_feature)
690
def _set_function(self, function, cur_feature):
    """Set the location information based on a function.

    This handles all of the location functions like 'join', 'complement'
    and 'order'.

    Arguments:
    o function - A LocationParser.Function object specifying the
    function we are acting on.
    o cur_feature - The feature to add information to.

    Raises ValueError for a function name that is not handled.
    """
    assert isinstance(function, LocationParser.Function), \
        "Expected a Function object, got %s" % function

    if function.name == "complement":
        # Reverse strand: set the strand first so that it is already in
        # place when the wrapped location(s) are processed.
        cur_feature.strand = -1
        for inner_info in function.args:
            self._set_location_info(inner_info, cur_feature)
    elif function.name in ("join", "order", "one-of", "bond"):
        self._set_ordering_info(function, cur_feature)
    elif function.name == "gap":
        assert len(function.args) == 1, \
            "Unexpected number of arguments in gap %s" % function.args
        position = self._get_position(function.args[0].local_location)
        cur_feature.location = SeqFeature.PositionGap(position)
    else:
        raise ValueError("Unexpected function name: %s" % function.name)
729
def _set_ordering_info(self, function, cur_feature):
    """Parse a join or order and all of the information in it.

    This deals with functions that order a bunch of locations,
    specifically 'join' and 'order'. The inner locations are
    added as subfeatures of the top level feature.

    Arguments:
    o function - The location function whose args are the inner locations.
    o cur_feature - The feature receiving the sub-features, overall
    location and strand.
    """
    cur_feature.location_operator = function.name
    for inner_element in function.args:
        # Each sub-feature starts as a copy of the parent's type, operator,
        # reference and strand before its own location is filled in.
        new_sub_feature = SeqFeature.SeqFeature()
        new_sub_feature.type = cur_feature.type
        new_sub_feature.location_operator = function.name
        new_sub_feature.ref = cur_feature.ref
        new_sub_feature.ref_db = cur_feature.ref_db
        new_sub_feature.strand = cur_feature.strand

        self._set_location_info(inner_element, new_sub_feature)
        cur_feature.sub_features.append(new_sub_feature)

    # The parent spans from the start of the first sub-feature to the end of
    # the last one, in the order given (no sorting is done).
    feature_start = cur_feature.sub_features[0].location.start
    feature_end = cur_feature.sub_features[-1].location.end
    cur_feature.location = SeqFeature.FeatureLocation(feature_start,
                                                      feature_end)

    # The parent only has a strand if all its sub-features agree on one.
    strands = set(sf.strand for sf in cur_feature.sub_features)
    if len(strands) == 1:
        cur_feature.strand = cur_feature.sub_features[0].strand
    else:
        cur_feature.strand = None
779
def _set_location_info(self, parse_info, cur_feature):
    """Set the location information for a feature from the parse info.

    Arguments:
    o parse_info - The classes generated by the LocationParser.
    o cur_feature - The feature to add the information to.

    None is silently ignored; an AbsoluteLocation or Function is dispatched
    to the matching helper; anything else raises ValueError.
    """
    if parse_info is None:
        return

    if isinstance(parse_info, LocationParser.AbsoluteLocation):
        self._set_location(parse_info, cur_feature)
    elif isinstance(parse_info, LocationParser.Function):
        self._set_function(parse_info, cur_feature)
    else:
        raise ValueError("Could not parse location info: %s"
                         % parse_info)
802
def _set_location(self, location, cur_feature):
    """Set the location information for a feature.

    Arguments:
    o location - An AbsoluteLocation object specifying the info
    about the location.
    o cur_feature - The feature to add the information to.
    """
    # A location with a path refers to another record: remember which
    # accession and database it points at.
    if location.path is not None:
        cur_feature.ref = location.path.accession
        cur_feature.ref_db = location.path.database

    cur_feature.location = self._get_location(location.local_location)
818
def _get_location(self, range_info):
    """Return a (possibly fuzzy) location from a Range object.

    Arguments:
    o range_info - A location range (ie. something like 67..100). This
    may also be a single position (ie 27).

    This returns a FeatureLocation object.
    If parser.use_fuzziness is set at one, the positions for the
    end points will possibly be fuzzy.
    """
    if isinstance(range_info, LocationParser.Between) \
    and range_info.low.val + 1 == range_info.high.val:
        # A "between" on two adjacent bases: a zero-length location sitting
        # at the lower base's position.
        pos = self._get_position(range_info.low)
        return SeqFeature.FeatureLocation(pos, pos)

    elif not isinstance(range_info, LocationParser.Range):
        # A single position: build two separate position objects so the
        # start can be shifted to python counting without touching the end.
        s_pos = self._get_position(range_info)
        s_pos.position = s_pos.position - 1
        e_pos = self._get_position(range_info)
        return SeqFeature.FeatureLocation(s_pos, e_pos)

    else:
        # A real range: convert both end points, then shift to python
        # counting.
        start_pos = self._get_position(range_info.low)
        end_pos = self._get_position(range_info.high)

        start_pos.position, end_pos.position = \
            self._convert_to_python_numbers(start_pos.position,
                                            end_pos.position)

        # For a one-of start, every alternative needs the same shift.
        if isinstance(start_pos, SeqFeature.OneOfPosition):
            for p in start_pos.position_choices:
                p.position -= 1

        return SeqFeature.FeatureLocation(start_pos, end_pos)
864
def _get_position(self, position):
    """Return a (possibly fuzzy) position for a single coordinate.

    Arguments:
    o position - This is a LocationParser.* object that specifies
    a single coordinate. We will examine the object to determine
    the fuzziness of the position.

    This is used with _get_location to parse out a location of any
    end_point of arbitrary fuzziness.

    Raises ValueError for a LocationParser object that is not handled.
    """
    if isinstance(position, LocationParser.Integer):
        final_pos = SeqFeature.ExactPosition(position.val)

    elif isinstance(position, LocationParser.LowBound):
        final_pos = SeqFeature.AfterPosition(position.base.val)

    elif isinstance(position, LocationParser.HighBound):
        final_pos = SeqFeature.BeforePosition(position.base.val)

    elif isinstance(position, LocationParser.Between):
        # Second argument is the extension: the distance from low to high.
        final_pos = SeqFeature.BetweenPosition(
            position.low.val, position.high.val - position.low.val)

    elif isinstance(position, LocationParser.TwoBound):
        final_pos = SeqFeature.WithinPosition(
            position.low.val, position.high.val - position.low.val)

    elif isinstance(position, LocationParser.Function) and \
    position.name == "one-of":
        position_choices = []
        for arg in position.args:
            assert isinstance(arg, LocationParser.AbsoluteLocation), \
                "Unhandled Location type %r" % arg
            assert arg.path is None, "Unhandled path in location"
            # A separate local name is used so the ``position`` argument
            # being iterated is not rebound inside the loop.
            choice = self._get_position(arg.local_location)
            position_choices.append(choice)
        final_pos = SeqFeature.OneOfPosition(position_choices)

    else:
        raise ValueError("Unexpected LocationParser object %r" %
                         position)

    if self._use_fuzziness:
        return final_pos

    # Fuzziness disabled: collapse to an exact position.  Position objects
    # expose their coordinate as ``.position`` (the attribute _get_location
    # reads and writes); the previous ``.location`` lookup is not an
    # attribute used anywhere else on these objects.
    return SeqFeature.ExactPosition(final_pos.position)
925
def _add_qualifier(self):
    """Add the pending qualifier to the current feature without loss of info.

    Qualifier values are kept in a list per key, so a key that occurs
    several times in one feature keeps every value, in order of appearance.
    Nothing happens if no qualifier key is pending.
    """
    if self._cur_qualifier_key:
        key = self._cur_qualifier_key
        # The value was collected as a list of pieces; join them first.
        value = "".join(self._cur_qualifier_value)
        if self._feature_cleaner is not None:
            value = self._feature_cleaner.clean_value(key, value)

        self._cur_feature.qualifiers.setdefault(key, []).append(value)
946
948 """When we get a qualifier key, use it as a dictionary key.
949
950 We receive a list of keys, since you can have valueless keys such as
951 /pseudo which would be passed in with the next key (since no other
952 tags separate them in the file)
953 """
954 for content in content_list:
955
956 self._add_qualifier()
957
958
959 qual_key = content.replace('/', '')
960 qual_key = qual_key.replace('=', '')
961 qual_key = qual_key.strip()
962
963 self._cur_qualifier_key = qual_key
964 self._cur_qualifier_value = []
965
967
968 qual_value = content.replace('"', '')
969
970 self._cur_qualifier_value.append(qual_value)
971
973 """Deal with CONTIG information.
974
975 Most CONTIG descriptions use a join of other externally referenced
976 sequences. Currently this code tries to use the location parser,
977 and represent this as a SeqFeature with sub-features.
978 """
979
980
981 self._add_feature()
982
983 self._cur_feature = SeqFeature.SeqFeature()
984 self._cur_feature.type = "contig"
985
986
987 self.location(content)
988
989
990 self.data.annotations["contig"] = self._cur_feature
991 self._cur_feature = None
992
995
998
1001
1003 """Add up sequence information as we get it.
1004
1005 To try and make things speedier, this puts all of the strings
1006 into a list of strings, and then uses string.join later to put
1007 them together. Supposedly, this is a big time savings
1008 """
1009 new_seq = content.replace(' ', '')
1010 new_seq = new_seq.upper()
1011
1012 self._seq_data.append(new_seq)
1013
# NOTE(review): the def line was missing from the extracted source; the name
# ``record_end`` and the unused ``content`` parameter are reconstructed --
# confirm against the scanner's callback names.
def record_end(self, content):
    """Clean up when we've finished the record.

    Finalizes the record id, stores the last pending feature, checks the
    declared length against the parsed sequence and sets self.data.seq with
    an alphabet chosen from the residue type.

    Raises ValueError if the declared and actual sequence lengths differ,
    or if no alphabet can be chosen for the residue type.
    """
    from Bio import Alphabet
    from Bio.Alphabet import IUPAC
    from Bio.Seq import Seq, UnknownSeq

    # Settle the id: fall back to the name when no accession was seen,
    # otherwise append the sequence version if the id has none yet.
    if self.data.id is None:
        assert 'accessions' not in self.data.annotations, \
            self.data.annotations['accessions']
        self.data.id = self.data.name
    elif self.data.id.count('.') == 0:
        try:
            self.data.id += '.%i' % self.data.annotations['sequence_version']
        except KeyError:
            pass

    # Store the feature that was still being built.
    self._add_feature()

    seq_alphabet = Alphabet.generic_alphabet

    sequence = "".join(self._seq_data)

    # An empty sequence is allowed to disagree with the declared size; it is
    # handled below as an unknown sequence.
    if self._expected_size is not None \
    and len(sequence) != 0 \
    and self._expected_size != len(sequence):
        raise ValueError("Expected sequence length %i, found %i." \
                         % (self._expected_size, len(sequence)))

    if self._seq_type:
        if self._seq_type.find('DNA') != -1 or \
        self._seq_type.find('mRNA') != -1:
            seq_alphabet = IUPAC.ambiguous_dna
        elif self._seq_type.find('RNA') != -1:
            # RNA records written with T and no U are given a DNA alphabet.
            if "T" in sequence and "U" not in sequence:
                seq_alphabet = IUPAC.ambiguous_dna
            else:
                seq_alphabet = IUPAC.ambiguous_rna
        elif self._seq_type.find('PROTEIN') != -1:
            seq_alphabet = IUPAC.protein
        elif self._seq_type in ["circular", "linear"]:
            # Topology only, no molecule type: keep the generic alphabet.
            pass
        else:
            raise ValueError("Could not determine alphabet for seq_type %s"
                             % self._seq_type)

    # Use the single-underscore attribute set by __init__/size().  The
    # previous double-underscore spelling is name-mangled inside a class and
    # refers to an attribute that is never assigned.
    if not sequence and self._expected_size:
        self.data.seq = UnknownSeq(self._expected_size, seq_alphabet)
    else:
        self.data.seq = Seq(sequence, seq_alphabet)
1078
1080 """Create a GenBank Record object from scanner generated information.
1081 """
1091
1092 - def wgs(self, content):
1094
1097
1098 - def locus(self, content):
1100
1101 - def size(self, content):
1103
1106
1109
1110 - def date(self, content):
1112
1115
1120
1121 - def nid(self, content):
1123
1124 - def pid(self, content):
1126
1129
1132
1133 - def gi(self, content):
1135
1138
1141
1144
1147
1150
1153
1156
1158 """Grab the reference number and signal the start of a new reference.
1159 """
1160
1161 if self._cur_reference is not None:
1162 self.data.references.append(self._cur_reference)
1163
1164 self._cur_reference = Record.Reference()
1165 self._cur_reference.number = content
1166
1168 self._cur_reference.bases = content
1169
1171 self._cur_reference.authors = content
1172
1174 self._cur_reference.consrtm = content
1175
def title(self, content):
    """Store the title of the current reference (replacing any earlier one).
    """
    self._cur_reference.title = content
1178
1180 self._cur_reference.journal = content
1181
1184
1187
1189 self._cur_reference.remark = content
1190
1193
1197
1200
1202 """Get ready for the feature table when we reach the FEATURE line.
1203 """
1204 self.start_feature_table()
1205
def start_feature_table(self):
    """Signal the start of the feature table.

    The reference being built (if any) is complete at this point, so it is
    appended to the record's references.
    """
    if self._cur_reference is not None:
        self.data.references.append(self._cur_reference)
1212
1214 """Grab the key of the feature and signal the start of a new feature.
1215 """
1216
1217 self._add_feature()
1218
1219 self._cur_feature = Record.Feature()
1220 self._cur_feature.key = content
1221
def _add_feature(self):
    """Utility function to add a feature to the Record.

    This does all of the appropriate checking to make sure we haven't
    left any info behind, and that we are only adding info if it
    exists.
    """
    if self._cur_feature is not None:
        # Attach the qualifier still being built before storing the feature.
        if self._cur_qualifier is not None:
            self._cur_feature.qualifiers.append(self._cur_qualifier)

        self._cur_qualifier = None
        self.data.features.append(self._cur_feature)
1237
1240
1242 """Deal with qualifier names
1243
1244 We receive a list of keys, since you can have valueless keys such as
1245 /pseudo which would be passed in with the next key (since no other
1246 tags separate them in the file)
1247 """
1248 for content in content_list:
1249
1250 if content.find("/") != 0:
1251 content = "/%s" % content
1252
1253 if self._cur_qualifier is not None:
1254 self._cur_feature.qualifiers.append(self._cur_qualifier)
1255
1256 self._cur_qualifier = Record.Qualifier()
1257 self._cur_qualifier.key = content
1258
1270
1272 self.data.base_counts = content
1273
1275 self.data.origin = content
1276
1278 """Signal that we have contig information to add to the record.
1279 """
1280 self.data.contig = self._clean_location(content)
1281
1283 """Add sequence information to a list of sequence strings.
1284
1285 This removes spaces in the data and uppercases the sequence, and
1286 then adds it to a list of sequences. Later on we'll join this
1287 list together to make the final sequence. This is faster than
1288 adding on the new string every time.
1289 """
1290 new_seq = content.replace(' ', '')
1291 self._seq_data.append(new_seq.upper())
1292
1294 """Signal the end of the record and do any necessary clean-up.
1295 """
1296
1297
1298 self.data.sequence = "".join(self._seq_data)
1299
1300 self._add_feature()
1301
1302
1304 """Access GenBank using a read-only dictionary interface (DEPRECATED).
1305
1306 This object is deprecated and will be removed in a future release of
1307 Biopython. Please use Bio.Entrez instead as described in the tutorial.
1308 """
1309 VALID_DATABASES = ['nucleotide', 'protein', 'genome']
1310 VALID_FORMATS = ['genbank', 'fasta']
def __init__(self, database, format, parser=None):
    """Initialize an NCBI dictionary to retrieve sequences.

    Create a new Dictionary to access GenBank.

    Arguments:
    o database - One of the class's VALID_DATABASES ('nucleotide',
    'protein' or 'genome').
    o format - One of the class's VALID_FORMATS: 'genbank' (for nucleotide
    genbank and protein genpept) or 'fasta'.
    o parser - An optional parser object to change the results into
    another form. If unspecified, then the raw contents of the file will
    be returned.

    Emits a DeprecationWarning.  Raises ValueError for an invalid database
    or format.
    """
    import warnings
    warnings.warn("Bio.GenBank.NCBIDictionary has been deprecated, and will be"\
                  " removed in a future release of Biopython. Please use"\
                  " Bio.Entrez instead which is described in the tutorial.",
                  DeprecationWarning)

    self.parser = parser
    if database not in self.__class__.VALID_DATABASES:
        raise ValueError("Invalid database %s, should be one of %s" %
                         (database, self.__class__.VALID_DATABASES))
    if format not in self.__class__.VALID_FORMATS:
        raise ValueError("Invalid format %s, should be one of %s" %
                         (format, self.__class__.VALID_FORMATS))

    # 'genbank' is stored as "gb", the value later passed as rettype.
    if format == "genbank":
        format = "gb"
    self.db = database
    self.format = format
1341
1343 raise NotImplementedError("GenBank contains lots of entries")
1345 raise NotImplementedError("This is a read-only dictionary")
1347 raise NotImplementedError("This is a read-only dictionary")
1349 raise NotImplementedError("This is a read-only dictionary")
1351 raise NotImplementedError("You don't need to do this...")
1353 raise NotImplementedError("You don't really want to do this...")
1355 raise NotImplementedError("You don't really want to do this...")
1357 raise NotImplementedError("You don't really want to do this...")
1358
def has_key(self, id):
    """S.has_key(id) -> bool

    True if looking up self[id] succeeds, False if it raises KeyError.
    """
    try:
        self[id]
    except KeyError:
        return False
    return True
1366
def get(self, id, failobj=None):
    """Return self[id], or failobj if the lookup raises KeyError."""
    try:
        return self[id]
    except KeyError:
        return failobj
1372
def __getitem__(self, id):
    """Return the GenBank entry specified by the GenBank ID.

    The entry is fetched with Entrez.efetch using this object's database
    and format.  With a parser set, the parsed result is returned;
    otherwise the raw text read from the handle.

    NOTE(review): this used to be documented as raising KeyError on error,
    and has_key()/get() rely on that, but nothing here converts a failed
    fetch into KeyError -- confirm what Entrez.efetch raises.
    """
    handle = Entrez.efetch(db=self.db, id=id, rettype=self.format)

    if self.parser is not None:
        return self.parser.parse(handle)
    return handle.read()
1383
def search_for(search, database='nucleotide',
               reldate=None, mindate=None, maxdate=None,
               start_id=0, max_ids=50000000):
    """Do an online search at the NCBI, returns a list of IDs (DEPRECATED).

    This function is deprecated and will be removed in a future release of
    Biopython. Please use Bio.Entrez instead as described in the tutorial.

    Search GenBank and return a list of the GenBank identifiers (gi's)
    that match the criteria. search is the search string used to
    search the database. Valid values for database are
    'nucleotide', 'protein', 'popset' and 'genome'. reldate is
    the number of dates prior to the current date to restrict the
    search. mindate and maxdate are the dates to restrict the search,
    e.g. 2002/12/20. start_id is the number to begin retrieval on.
    max_ids specifies the maximum number of id's to retrieve.

    Raises TypeError if mindate or maxdate is not in YYYY[/MM[/DD]] format.
    """
    import warnings
    warnings.warn("Bio.GenBank.search_for has been deprecated, and will be"\
                  " removed in a future release of Biopython. Please use"\
                  " Bio.Entrez instead which is described in the tutorial.",
                  DeprecationWarning)

    # Validate the dates locally, before any network request is made.
    import re
    _date_re_match = re.compile(r"\d{4}(/\d\d(/\d\d)?)?$").match
    errinfo = None
    if mindate is not None and _date_re_match(mindate) is None:
        errinfo = ("mindate", mindate)
    elif maxdate is not None and _date_re_match(maxdate) is None:
        errinfo = ("maxdate", maxdate)
    if errinfo:
        raise TypeError(
            "%s is not in YYYY/MM/DD format (month and "
            "day are optional): %r" % errinfo)

    handle = Entrez.esearch(database, search, retmode="xml",
                            retstart=start_id, retmax=max_ids,
                            mindate=mindate, maxdate=maxdate,
                            reldate=reldate)
    return Entrez.read(handle)["IdList"]
1430
# NOTE(review): the def line was missing from the extracted source.  The name
# comes from the deprecation message and the module docstring; the parameter
# names come from the body, and the 'nucleotide' default is assumed (it
# mirrors search_for) -- confirm against the original file.
def download_many(ids, database='nucleotide'):
    """Download multiple NCBI GenBank records, returned as a handle (DEPRECATED).

    This function is deprecated and will be removed in a future release of
    Biopython. Please use Bio.Entrez instead as described in the tutorial.

    Download many records from GenBank. ids is a list of gis or
    accessions.  database must be 'nucleotide' or 'protein'; anything else
    raises ValueError.  The whole reply is read into memory and returned
    wrapped in a cStringIO handle.
    """
    import warnings
    warnings.warn("Bio.GenBank.download_many has been deprecated, and will be"\
                  " removed in a future release of Biopython. Please use"\
                  " Bio.Entrez instead which is described in the tutorial.",
                  DeprecationWarning)

    # Pick the Entrez return type matching the database.
    if database in ['nucleotide']:
        format = 'gb'
    elif database in ['protein']:
        format = 'gp'
    else:
        raise ValueError("Unexpected database: %s" % database)

    result_handle = Entrez.efetch(database,
                                  id=",".join(ids),
                                  retmode="text",
                                  rettype=format)
    return cStringIO.StringIO(result_handle.read())
1458