Package Bio :: Package GenBank
[hide private]
[frames] | [no frames]

Source Code for Package Bio.GenBank

   1  # Copyright 2000 by Jeffrey Chang, Brad Chapman.  All rights reserved. 
   2  # Copyright 2006-2008 by Peter Cock.  All rights reserved. 
   3  # This code is part of the Biopython distribution and governed by its 
   4  # license.  Please see the LICENSE file that should have been included 
   5  # as part of this package. 
   6   
   7  """Code to work with GenBank formatted files. 
   8   
   9  Rather than using Bio.GenBank, you are now encouraged to use Bio.SeqIO with 
  10  the "genbank" or "embl" format names to parse GenBank or EMBL files into 
  11  SeqRecord and SeqFeature objects (see the Biopython tutorial for details). 
  12   
  13  Also, rather than using Bio.GenBank to search or download files from the NCBI, 
  14  you are now encouraged to use Bio.Entrez instead (again, see the Biopython 
  15  tutorial for details). 
  16   
  17  Currently the ONLY reason to use Bio.GenBank directly is for the RecordParser 
  18  which turns a GenBank file into GenBank-specific Record objects.  This is a 
  19  much closer representation to the raw file contents that the SeqRecord 
  20  alternative from the FeatureParser (used in Bio.SeqIO). 
  21   
  22  Classes: 
  23  Iterator              Iterate through a file of GenBank entries 
  24  ErrorFeatureParser    Catch errors caused during parsing. 
  25  FeatureParser         Parse GenBank data in SeqRecord and SeqFeature objects. 
  26  RecordParser          Parse GenBank data into a Record object. 
  27  NCBIDictionary        Access GenBank using a dictionary interface (DEPRECATED). 
  28   
  29  _BaseGenBankConsumer  A base class for GenBank consumer that implements 
  30                        some helpful functions that are in common between 
  31                        consumers. 
  32  _FeatureConsumer      Create SeqFeature objects from info generated by 
  33                        the Scanner 
  34  _RecordConsumer       Create a GenBank record object from Scanner info. 
  35  _PrintingConsumer     A debugging consumer. 
  36   
  37  ParserFailureError    Exception indicating a failure in the parser (ie. 
  38                        scanner or consumer) 
  39  LocationParserError   Exception indicating a problem with the spark based 
  40                        location parser. 
  41   
  42  Functions: 
  43  search_for            Do a query against GenBank (DEPRECATED). 
  44  download_many         Download many GenBank records (DEPRECATED). 
  45   
  46  17-MAR-2009: added wgs, wgs_scafld for GenBank whole genome shotgun master records. 
  47  These are GenBank files that summarize the content of a project, and provide lists of 
  48  scaffold and contig files in the project. These will be in annotations['wgs'] and 
  49  annotations['wgs_scafld']. These GenBank files do not have sequences. See 
  50  http://groups.google.com/group/bionet.molbio.genbank/browse_thread/thread/51fb88bf39e7dc36 
  51   
  52  http://is.gd/nNgk 
  53  for more details of this format, and an example. 
  54  Added by Ying Huang & Iddo Friedberg 
  55  """ 
  56  import cStringIO 
  57   
  58  # other Biopython stuff 
  59  from Bio import SeqFeature 
  60  from Bio.ParserSupport import AbstractConsumer 
  61  from Bio import Entrez 
  62   
  63  # other Bio.GenBank stuff 
  64  import LocationParser 
  65  from utils import FeatureValueCleaner 
  66  from Scanner import GenBankScanner 
  67   
# Constants used to parse GenBank header lines.
# Header keywords (LOCUS, DEFINITION, ...) sit in a fixed-width field of
# GENBANK_INDENT characters; GENBANK_SPACER is the matching padding used
# to recognise continuation lines.
GENBANK_INDENT = 12
GENBANK_SPACER = " " * GENBANK_INDENT

# Constants for parsing GenBank feature table lines.
# Feature keys are indented FEATURE_KEY_INDENT columns and their
# qualifiers FEATURE_QUALIFIER_INDENT columns; the spacers are the
# corresponding runs of spaces used to classify each line.
FEATURE_KEY_INDENT = 5
FEATURE_QUALIFIER_INDENT = 21
FEATURE_KEY_SPACER = " " * FEATURE_KEY_INDENT
FEATURE_QUALIFIER_SPACER = " " * FEATURE_QUALIFIER_INDENT
class Iterator:
    """Iterate over a file of GenBank entries, returning one at a time.
    """
    def __init__(self, handle, parser = None):
        """Set up iteration over the given handle.

        Arguments:
        o handle - A handle with GenBank entries to iterate through.
        o parser - An optional parser to pass the entries through before
        returning them.  If None, then the raw entry text is returned.
        """
        self.handle = handle
        self._parser = parser

    def next(self):
        """Return the next GenBank record from the handle.

        Will return None if we ran out of records.
        """
        if self._parser is not None:
            try:
                return self._parser.parse(self.handle)
            except StopIteration:
                return None
        # Raw mode: gather lines up to and including the "//" terminator.
        raw_lines = []
        while True:
            line = self.handle.readline()
            if not line:
                return None  # Premature end of file?
            raw_lines.append(line)
            if line.rstrip() == "//":
                return "".join(raw_lines)

    def __iter__(self):
        # iter(callable, sentinel) stops as soon as next() returns None.
        return iter(self.next, None)
class ParserFailureError(Exception):
    """Failure caused by some kind of problem in the parser (scanner or consumer)."""
class LocationParserError(Exception):
    """Could not properly parse out a location from a GenBank file."""
class FeatureParser:
    """Parse GenBank files into Seq + Feature objects.
    """
    # NOTE(review): the default cleaner below is created once at class
    # definition time and shared by every parser instance - presumably
    # FeatureValueCleaner is stateless; confirm before relying on that.
    def __init__(self, debug_level = 0, use_fuzziness = 1,
                 feature_cleaner = FeatureValueCleaner()):
        """Set up the scanner and remember the parse options.

        Arguments:
        o debug_level - An optional argument that specifies the amount of
        debugging information the scanner should spit out (0 = none, the
        fastest; up to 2 to see exactly where a parse fails).
        o use_fuzziness - Specify whether or not to use fuzzy
        representations.  The default is 1 (use fuzziness).
        o feature_cleaner - A class which will be used to clean out the
        values of features.  This class must implement the function
        clean_value.  GenBank.utils has a "standard" cleaner class, which
        is used by default.
        """
        self._scanner = GenBankScanner(debug_level)
        self.use_fuzziness = use_fuzziness
        self._cleaner = feature_cleaner

    def parse(self, handle):
        """Parse the specified handle, returning the consumer's SeqRecord."""
        # A fresh consumer per parse so no state leaks between records.
        self._consumer = _FeatureConsumer(self.use_fuzziness, self._cleaner)
        self._scanner.feed(handle, self._consumer)
        return self._consumer.data
class RecordParser:
    """Parse GenBank files into GenBank-specific Record objects.
    """
    def __init__(self, debug_level = 0):
        """Set up the scanner used for parsing.

        Arguments:
        o debug_level - An optional argument that specifies the amount of
        debugging information the scanner should spit out (0 = none, the
        fastest; up to 2 to see exactly where a parse fails).
        """
        self._scanner = GenBankScanner(debug_level)

    def parse(self, handle):
        """Parse the specified handle into a GenBank record."""
        # A fresh consumer per parse so no state leaks between records.
        self._consumer = _RecordConsumer()
        self._scanner.feed(handle, self._consumer)
        return self._consumer.data
class _BaseGenBankConsumer(AbstractConsumer):
    """Abstract GenBank consumer providing useful general functions.

    This just helps to eliminate some duplication in things that most
    GenBank consumers want to do.
    """
    # Special keys in GenBank records that we should remove spaces from.
    # For instance, translation keys have values which are proteins and
    # should have spaces and newlines removed from them. This class
    # attribute gives us more control over specific formatting problems.
    remove_space_keys = ["translation"]

    def __init__(self):
        pass

    def _split_keywords(self, keyword_string):
        """Split a semicolon separated keyword string into a clean list."""
        # A lone "." (or nothing at all) means no keywords.
        if keyword_string in ("", "."):
            keywords = ""
        elif keyword_string.endswith('.'):
            keywords = keyword_string[:-1]
        else:
            keywords = keyword_string
        return [word.strip() for word in keywords.split(';')]

    def _split_accessions(self, accession_string):
        """Split a string of accession numbers into a list."""
        # Newlines and EMBL-style ';' separators all become spaces
        # before splitting on whitespace.
        cleaned = accession_string.replace("\n", " ").replace(";", " ")
        return [acc.strip() for acc in cleaned.split() if acc.strip()]

    def _split_taxonomy(self, taxonomy_string):
        """Split a string with taxonomy info into a list."""
        if not taxonomy_string or taxonomy_string == ".":
            # Missing data, no taxonomy
            return []

        if taxonomy_string.endswith('.'):
            tax_info = taxonomy_string[:-1]
        else:
            tax_info = taxonomy_string

        # Entries are ';' separated but may also contain embedded newlines.
        pieces = []
        for chunk in tax_info.split(';'):
            pieces.extend(chunk.split("\n"))
        # Drop empty entries, then strip surrounding whitespace.
        pieces = [item for item in pieces if item != '']
        return [item.strip() for item in pieces]

    def _clean_location(self, location_string):
        """Clean whitespace out of a location string.

        The location parser isn't a fan of whitespace, so we clean it out
        before feeding it into the parser.  Splitting on whitespace and
        rejoining avoids importing string (see Bug 2684).
        """
        return ''.join(location_string.split())

    def _remove_newlines(self, text):
        """Remove any newlines in the passed text, returning the new string."""
        return text.replace("\n", "").replace("\r", "")

    def _normalize_spaces(self, text):
        """Replace multiple spaces in the passed text with single spaces."""
        return ' '.join(piece for piece in text.split(" ") if piece)

    def _remove_spaces(self, text):
        """Remove all spaces from the passed text."""
        return text.replace(" ", "")

    def _convert_to_python_numbers(self, start, end):
        """Convert a start and end range to python notation.

        In GenBank, starts and ends are defined in "biological" coordinates,
        where 1 is the first base and [i, j] means to include both i and j.

        In python, 0 is the first base and [i, j] means to include i, but
        not j.

        So only the start needs shifting down by one.
        """
        return start - 1, end
class _FeatureConsumer(_BaseGenBankConsumer):
    """Create a SeqRecord object with Features to return.

    Attributes:
    o use_fuzziness - specify whether or not to parse with fuzziness in
    feature locations.
    o feature_cleaner - a class that will be used to provide specialized
    cleaning-up of feature values.
    """
    def __init__(self, use_fuzziness, feature_cleaner = None):
        """Set up an empty SeqRecord plus the scratch state for parsing.

        Arguments:
        o use_fuzziness - whether fuzzy position objects should be built.
        o feature_cleaner - optional object with a clean_value method
        applied to qualifier values (None means no cleaning).
        """
        # Local import avoids a circular import at module load time.
        from Bio.SeqRecord import SeqRecord
        _BaseGenBankConsumer.__init__(self)
        self.data = SeqRecord(None, id = None)
        self.data.id = None
        self.data.description = ""

        self._use_fuzziness = use_fuzziness
        self._feature_cleaner = feature_cleaner

        # Per-record parsing state, filled in as scanner events arrive.
        self._seq_type = ''            # residue type text (e.g. containing "DNA")
        self._seq_data = []            # accumulated raw sequence chunks
        self._current_ref = None       # Reference currently being assembled
        self._cur_feature = None       # SeqFeature currently being assembled
        self._cur_qualifier_key = None
        self._cur_qualifier_value = None
        self._expected_size = None     # declared sequence length, see size()
    def locus(self, locus_name):
        """Use the locus name as the name of the SeqRecord."""
        self.data.name = locus_name
    def size(self, content):
        """Record the declared sequence length."""
        self._expected_size = int(content)
    def residue_type(self, type):
        """Record the sequence type so we can choose an appropriate alphabet.

        NOTE: the parameter shadows the builtin "type"; renaming it would
        break any caller passing it by keyword, so it is left as-is.
        """
        self._seq_type = type
    def data_file_division(self, division):
        # Store the data file division code verbatim as an annotation.
        self.data.annotations['data_file_division'] = division
    def date(self, submit_date):
        # Store the record's date string verbatim as an annotation.
        self.data.annotations['date'] = submit_date
334 - def definition(self, definition):
335 """Set the definition as the description of the sequence. 336 """ 337 if self.data.description : 338 #Append to any existing description 339 #e.g. EMBL files with two DE lines. 340 self.data.description += " " + definition 341 else : 342 self.data.description = definition
343
344 - def accession(self, acc_num):
345 """Set the accession number as the id of the sequence. 346 347 If we have multiple accession numbers, the first one passed is 348 used. 349 """ 350 new_acc_nums = self._split_accessions(acc_num) 351 352 #Also record them ALL in the annotations 353 try : 354 #On the off chance there was more than one accession line: 355 for acc in new_acc_nums : 356 #Prevent repeat entries 357 if acc not in self.data.annotations['accessions'] : 358 self.data.annotations['accessions'].append(acc) 359 except KeyError : 360 self.data.annotations['accessions'] = new_acc_nums 361 362 # if we haven't set the id information yet, add the first acc num 363 if self.data.id is None: 364 if len(new_acc_nums) > 0: 365 #self.data.id = new_acc_nums[0] 366 #Use the FIRST accession as the ID, not the first on this line! 367 self.data.id = self.data.annotations['accessions'][0]
368
    def wgs(self, content):
        # WGS master record line (see module docstring): store the content
        # split on "-" as the 'wgs' annotation.
        self.data.annotations['wgs'] = content.split('-')
    def add_wgs_scafld(self, content):
        # Each WGS_SCAFLD line is split on "-" and appended, so the
        # annotation ends up as a list of lists.
        self.data.annotations.setdefault('wgs_scafld',[]).append(content.split('-'))
    def nid(self, content):
        # Store the NID content verbatim as an annotation.
        self.data.annotations['nid'] = content
    def pid(self, content):
        # Store the PID content verbatim as an annotation.
        self.data.annotations['pid'] = content
381 - def version(self, version_id):
382 #Want to use the versioned accession as the record.id 383 #This comes from the VERSION line in GenBank files, or the 384 #obsolete SV line in EMBL. For the new EMBL files we need 385 #both the version suffix from the ID line and the accession 386 #from the AC line. 387 if version_id.count(".")==1 and version_id.split(".")[1].isdigit() : 388 self.accession(version_id.split(".")[0]) 389 self.version_suffix(version_id.split(".")[1]) 390 else : 391 #For backwards compatibility... 392 self.data.id = version_id
393
    def project(self, content):
        """Handle the information from the PROJECT line as a list of projects.

        e.g.
        PROJECT     GenomeProject:28471

        or:
        PROJECT     GenomeProject:13543  GenomeProject:99999

        This is stored as dbxrefs in the SeqRecord to be consistent with the
        projected switch of this line to DBLINK in future GenBank versions.
        Note the NCBI plan to replace "GenomeProject:28471" with the shorter
        "Project:28471" as part of this transition.
        """
        content = content.replace("GenomeProject:", "Project:")
        self.data.dbxrefs.extend([p for p in content.split() if p])
    def version_suffix(self, version):
        """Record the numeric version suffix as the sequence_version.

        e.g. GenBank line:
        VERSION     U49845.1  GI:1293613
        or the obsolete EMBL line:
        SV   U49845.1
        Scanner calls consumer.version("U49845.1")
        which then calls consumer.version_suffix(1)

        e.g. EMBL new line:
        ID   X56734; SV 1; linear; mRNA; STD; PLN; 1859 BP.
        Scanner calls consumer.version_suffix(1)
        """
        # NOTE(review): assert is stripped under python -O; a malformed
        # suffix would then raise ValueError from int() instead.
        assert version.isdigit()
        self.data.annotations['sequence_version'] = int(version)
    def db_source(self, content):
        # Store the DBSOURCE content, minus any trailing whitespace.
        self.data.annotations['db_source'] = content.rstrip()
    def gi(self, content):
        # Store the GI number verbatim as an annotation.
        self.data.annotations['gi'] = content
    def keywords(self, content):
        # Split the keyword string into a clean list (see base class).
        self.data.annotations['keywords'] = self._split_keywords(content)
    def segment(self, content):
        # Store the SEGMENT content verbatim as an annotation.
        self.data.annotations['segment'] = content
469 - def source(self, content):
470 #Note that some software (e.g. VectorNTI) may produce an empty 471 #source (rather than using a dot/period as might be expected). 472 if content == "" : 473 source_info = "" 474 elif content[-1] == '.': 475 source_info = content[:-1] 476 else: 477 source_info = content 478 self.data.annotations['source'] = source_info
479
    def organism(self, content):
        # Store the organism name verbatim as an annotation.
        self.data.annotations['organism'] = content
483 - def taxonomy(self, content):
484 """Records (another line of) the taxonomy lineage. 485 """ 486 lineage = self._split_taxonomy(content) 487 try : 488 self.data.annotations['taxonomy'].extend(lineage) 489 except KeyError : 490 self.data.annotations['taxonomy'] = lineage
491
492 - def reference_num(self, content):
493 """Signal the beginning of a new reference object. 494 """ 495 # if we have a current reference that hasn't been added to 496 # the list of references, add it. 497 if self._current_ref is not None: 498 self.data.annotations['references'].append(self._current_ref) 499 else: 500 self.data.annotations['references'] = [] 501 502 self._current_ref = SeqFeature.Reference()
503
    def reference_bases(self, content):
        """Attempt to determine the sequence region the reference entails.

        Possible types of information we may have to deal with:

        (bases 1 to 86436)
        (sites)
        (bases 1 to 105654; 110423 to 111122)
        1  (residues 1 to 182)
        """
        # first remove the parentheses or other junk
        ref_base_info = content[1:-1]

        all_locations = []
        # parse if we've got 'bases' and 'to'
        if ref_base_info.find('bases') != -1 and \
            ref_base_info.find('to') != -1:
            # get rid of the beginning 'bases' (5 characters)
            ref_base_info = ref_base_info[5:]
            locations = self._split_reference_locations(ref_base_info)
            all_locations.extend(locations)
        elif (ref_base_info.find("residues") >= 0 and
              ref_base_info.find("to") >= 0):
            residues_start = ref_base_info.find("residues")
            # get only the information after "residues"
            ref_base_info = ref_base_info[(residues_start + len("residues ")):]
            locations = self._split_reference_locations(ref_base_info)
            all_locations.extend(locations)

        # make sure if we are not finding information then we have
        # the string 'sites' or the string 'bases'
        elif (ref_base_info == 'sites' or
              ref_base_info.strip() == 'bases'):
            pass
        # otherwise raise an error
        else:
            raise ValueError("Could not parse base info %s in record %s" %
                             (ref_base_info, self.data.id))

        self._current_ref.location = all_locations
545 - def _split_reference_locations(self, location_string):
546 """Get reference locations out of a string of reference information 547 548 The passed string should be of the form: 549 550 1 to 20; 20 to 100 551 552 This splits the information out and returns a list of location objects 553 based on the reference locations. 554 """ 555 # split possibly multiple locations using the ';' 556 all_base_info = location_string.split(';') 557 558 new_locations = [] 559 for base_info in all_base_info: 560 start, end = base_info.split('to') 561 new_start, new_end = \ 562 self._convert_to_python_numbers(int(start.strip()), 563 int(end.strip())) 564 this_location = SeqFeature.FeatureLocation(new_start, new_end) 565 new_locations.append(this_location) 566 return new_locations
567
568 - def authors(self, content):
569 if self._current_ref.authors : 570 self._current_ref.authors += ' ' + content 571 else : 572 self._current_ref.authors = content
573
574 - def consrtm(self, content):
575 if self._current_ref.consrtm : 576 self._current_ref.consrtm += ' ' + content 577 else : 578 self._current_ref.consrtm = content
579
580 - def title(self, content):
581 if self._current_ref.title : 582 self._current_ref.title += ' ' + content 583 else : 584 self._current_ref.title = content
585
586 - def journal(self, content):
587 if self._current_ref.journal : 588 self._current_ref.journal += ' ' + content 589 else : 590 self._current_ref.journal = content
591
    def medline_id(self, content):
        # Record the Medline id on the current reference.
        self._current_ref.medline_id = content
    def pubmed_id(self, content):
        # Record the PubMed id on the current reference.
        self._current_ref.pubmed_id = content
598 - def remark(self, content):
599 """Deal with a reference comment.""" 600 if self._current_ref.comment : 601 self._current_ref.comment += ' ' + content 602 else : 603 self._current_ref.comment = content
604
605 - def comment(self, content):
606 try : 607 self.data.annotations['comment'] += "\n" + "\n".join(content) 608 except KeyError : 609 self.data.annotations['comment'] = "\n".join(content)
610
    def features_line(self, content):
        """Get ready for the feature table when we reach the FEATURE line.

        The line content itself is ignored; this is purely a signal.
        """
        self.start_feature_table()
    def start_feature_table(self):
        """Indicate we've got to the start of the feature table."""
        # make sure we've added on our last reference object
        if self._current_ref is not None:
            self.data.annotations['references'].append(self._current_ref)
            self._current_ref = None
    def _add_feature(self):
        """Utility function to add a feature to the SeqRecord.

        This does all of the appropriate checking to make sure we haven't
        left any info behind, and that we are only adding info if it
        exists.
        """
        if self._cur_feature:
            # if we have a left over qualifier, add it to the qualifiers
            # on the current feature before flushing it
            self._add_qualifier()

            self._cur_qualifier_key = ''
            self._cur_qualifier_value = ''
            self.data.features.append(self._cur_feature)
640 - def feature_key(self, content):
641 # if we already have a feature, add it on 642 self._add_feature() 643 644 # start a new feature 645 self._cur_feature = SeqFeature.SeqFeature() 646 self._cur_feature.type = content 647 648 # assume positive strand to start with if we have DNA or cDNA 649 # (labelled as mRNA). The complement in the location will 650 # change this later if something is on the reverse strand 651 if self._seq_type.find("DNA") >= 0 or self._seq_type.find("mRNA") >= 0: 652 self._cur_feature.strand = 1
653
    def location(self, content):
        """Parse out location information from the location string.

        This uses a comprehensive but slow spark based parser to do the
        parsing, and then translates the results of the parse into
        appropriate Location objects.
        """
        # --- first preprocess the location for the spark parser

        # we need to clean up newlines and other whitespace inside
        # the location before feeding it to the parser.
        # locations should have no whitespace whatsoever based on the
        # grammar
        location_line = self._clean_location(content)

        # Older records have junk like replace(266,"c") in the
        # location line. Newer records just replace this with
        # the number 266 and have the information in a more reasonable
        # place. So we'll just grab out the number and feed this to the
        # parser. We shouldn't really be losing any info this way.
        if location_line.find('replace') != -1:
            comma_pos = location_line.find(',')
            # slice from 8 skips the leading 'replace(' prefix
            location_line = location_line[8:comma_pos]

        # feed everything into the scanner and parser
        try:
            parse_info = \
                      LocationParser.parse(LocationParser.scan(location_line))
        # spark raises SystemExit errors when parsing fails
        except SystemExit:
            raise LocationParserError(location_line)

        # add the parser information to the current feature
        self._set_location_info(parse_info, self._cur_feature)
    def _set_function(self, function, cur_feature):
        """Set the location information based on a function.

        This handles all of the location functions like 'join', 'complement'
        and 'order'.

        Arguments:
        o function - A LocationParser.Function object specifying the
        function we are acting on.
        o cur_feature - The feature to add information to.

        Raises ValueError for any unrecognised function name.
        """
        assert isinstance(function, LocationParser.Function), \
               "Expected a Function object, got %s" % function

        if function.name == "complement":
            # mark the current feature as being on the opposite strand
            cur_feature.strand = -1
            # recursively deal with whatever is left inside the complement
            for inner_info in function.args:
                self._set_location_info(inner_info, cur_feature)
        # deal with functions that have multiple internal segments that
        # are connected somehow.
        # join and order are current documented functions.
        # one-of is something I ran across in old files. Treating it
        # as a sub sequence feature seems appropriate to me.
        # bond is some piece of junk I found in RefSeq files. I have
        # no idea how to interpret it, so I jam it in here
        elif (function.name == "join" or function.name == "order" or
              function.name == "one-of" or function.name == "bond"):
            self._set_ordering_info(function, cur_feature)
        elif (function.name == "gap"):
            assert len(function.args) == 1, \
              "Unexpected number of arguments in gap %s" % function.args
            # make the cur information location a gap object
            position = self._get_position(function.args[0].local_location)
            cur_feature.location = SeqFeature.PositionGap(position)
        else:
            raise ValueError("Unexpected function name: %s" % function.name)
    def _set_ordering_info(self, function, cur_feature):
        """Parse a join or order and all of the information in it.

        This deals with functions that order a bunch of locations,
        specifically 'join' and 'order'. The inner locations are
        added as subfeatures of the top level feature.
        """
        # for each inner element, create a sub SeqFeature within the
        # current feature, then get the information for this feature
        cur_feature.location_operator = function.name
        for inner_element in function.args:
            new_sub_feature = SeqFeature.SeqFeature()
            # inherit the type from the parent
            new_sub_feature.type = cur_feature.type
            # add the join or order info to the location_operator
            new_sub_feature.location_operator = function.name
            # inherit references and strand from the parent feature
            new_sub_feature.ref = cur_feature.ref
            new_sub_feature.ref_db = cur_feature.ref_db
            new_sub_feature.strand = cur_feature.strand

            # set the information for the inner element
            self._set_location_info(inner_element, new_sub_feature)

            # now add the feature to the sub_features
            cur_feature.sub_features.append(new_sub_feature)

        # set the location of the top -- this should be a combination of
        # the start position of the first sub_feature and the end position
        # of the last sub_feature

        # these positions are already converted to python coordinates
        # (when the sub_features were added) so they don't need to
        # be converted again
        feature_start = cur_feature.sub_features[0].location.start
        feature_end = cur_feature.sub_features[-1].location.end
        cur_feature.location = SeqFeature.FeatureLocation(feature_start,
                                                          feature_end)
        # Historically a join on the reverse strand has been represented
        # in Biopython with both the parent SeqFeature and its children
        # (the exons for a CDS) all given a strand of -1.  Likewise, for
        # a join feature on the forward strand they all have strand +1.
        # However, we must also consider evil mixed strand examples like
        # this, join(complement(69611..69724),139856..140087,140625..140650)
        strands = set(sf.strand for sf in cur_feature.sub_features)
        if len(strands)==1 :
            cur_feature.strand = cur_feature.sub_features[0].strand
        else :
            cur_feature.strand = None # i.e. mixed strands
    def _set_location_info(self, parse_info, cur_feature):
        """Set the location information for a feature from the parse info.

        Arguments:
        o parse_info - The classes generated by the LocationParser.
        o cur_feature - The feature to add the information to.

        Raises ValueError if parse_info is of an unrecognised type.
        """
        # base case -- we are out of information
        if parse_info is None:
            return
        # parse a location -- this is another base_case -- we assume
        # we have no information after a single location
        elif isinstance(parse_info, LocationParser.AbsoluteLocation):
            self._set_location(parse_info, cur_feature)
            return
        # parse any of the functions (join, complement, etc)
        elif isinstance(parse_info, LocationParser.Function):
            self._set_function(parse_info, cur_feature)
        # otherwise we are stuck and should raise an error
        else:
            raise ValueError("Could not parse location info: %s"
                             % parse_info)
    def _set_location(self, location, cur_feature):
        """Set the location information for a feature.

        Arguments:
        o location - An AbsoluteLocation object specifying the info
        about the location.
        o cur_feature - The feature to add the information to.
        """
        # check to see if we have a cross reference to another accession
        # ie. U05344.1:514..741
        if location.path is not None:
            cur_feature.ref = location.path.accession
            cur_feature.ref_db = location.path.database
        # now get the actual location information
        cur_feature.location = self._get_location(location.local_location)
    def _get_location(self, range_info):
        """Return a (possibly fuzzy) location from a Range object.

        Arguments:
        o range_info - A location range (ie. something like 67..100). This
        may also be a single position (ie 27).

        This returns a FeatureLocation object.
        If parser.use_fuzziness is set at one, the positions for the
        end points will possibly be fuzzy.
        """
        if isinstance(range_info, LocationParser.Between) \
        and range_info.low.val+1 == range_info.high.val:
            # A between location like "67^68" (one based counting) is a
            # special case (note it has zero length). In python slice
            # notation this is 67:67, a zero length slice.  See Bug 2622
            pos = self._get_position(range_info.low)
            return SeqFeature.FeatureLocation(pos, pos)
            # NOTE - We can imagine between locations like "2^4", but this
            # is just "3".  Similarly, "2^5" is just "3..4"
        # check if we just have a single base
        elif not(isinstance(range_info, LocationParser.Range)):
            # A single base like "785" becomes [784:785] in python
            s_pos = self._get_position(range_info)
            # move the single position back one to be consistent with how
            # python indexes numbers (starting at 0)
            s_pos.position = s_pos.position - 1
            e_pos = self._get_position(range_info)
            return SeqFeature.FeatureLocation(s_pos, e_pos)
        # otherwise we need to get both sides of the range
        else:
            # get *Position objects for the start and end
            start_pos = self._get_position(range_info.low)
            end_pos = self._get_position(range_info.high)

            start_pos.position, end_pos.position = \
                self._convert_to_python_numbers(start_pos.position,
                                                end_pos.position)
            # If the start location is a one-of position, we also need to
            # adjust their positions to use python counting.
            if isinstance(start_pos, SeqFeature.OneOfPosition) :
                for p in start_pos.position_choices :
                    p.position -= 1

            return SeqFeature.FeatureLocation(start_pos, end_pos)
865 - def _get_position(self, position):
866 """Return a (possibly fuzzy) position for a single coordinate. 867 868 Arguments: 869 o position - This is a LocationParser.* object that specifies 870 a single coordinate. We will examine the object to determine 871 the fuzziness of the position. 872 873 This is used with _get_location to parse out a location of any 874 end_point of arbitrary fuzziness. 875 """ 876 # case 1 -- just a normal number 877 if (isinstance(position, LocationParser.Integer)): 878 final_pos = SeqFeature.ExactPosition(position.val) 879 # case 2 -- we've got a > sign 880 elif isinstance(position, LocationParser.LowBound): 881 final_pos = SeqFeature.AfterPosition(position.base.val) 882 # case 3 -- we've got a < sign 883 elif isinstance(position, LocationParser.HighBound): 884 final_pos = SeqFeature.BeforePosition(position.base.val) 885 # case 4 -- we've got 100^101 886 # Is the extension is zero in this example? 887 elif isinstance(position, LocationParser.Between): 888 #NOTE - We don't *expect* this code to get called! 889 #We only except between locations like 3^4 (consecutive) 890 #which are handled in _get_location. We don't expect 891 #non consecutive variants like "2^5" as this is just "3..4". 892 #Similarly there is no reason to expect composite locations 893 #like "(3^4)..6" which should just be "4..6". 894 final_pos = SeqFeature.BetweenPosition(position.low.val, 895 position.high.val-position.low.val) 896 # case 5 -- we've got (100.101) 897 elif isinstance(position, LocationParser.TwoBound): 898 final_pos = SeqFeature.WithinPosition(position.low.val, 899 position.high.val-position.low.val) 900 # case 6 -- we've got a one-of(100, 110) location 901 elif isinstance(position, LocationParser.Function) and \ 902 position.name == "one-of": 903 # first convert all of the arguments to positions 904 position_choices = [] 905 for arg in position.args: 906 # we only handle AbsoluteLocations with no path 907 # right now. 
Not sure if other cases will pop up 908 assert isinstance(arg, LocationParser.AbsoluteLocation), \ 909 "Unhandled Location type %r" % arg 910 assert arg.path is None, "Unhandled path in location" 911 position = self._get_position(arg.local_location) 912 position_choices.append(position) 913 final_pos = SeqFeature.OneOfPosition(position_choices) 914 # if it is none of these cases we've got a problem! 915 else: 916 raise ValueError("Unexpected LocationParser object %r" % 917 position) 918 919 # if we are using fuzziness return what we've got 920 if self._use_fuzziness: 921 return final_pos 922 # otherwise return an ExactPosition equivalent 923 else: 924 return SeqFeature.ExactPosition(final_pos.location)
925
926 - def _add_qualifier(self):
927 """Add a qualifier to the current feature without loss of info. 928 929 If there are multiple qualifier keys with the same name we 930 would lose some info in the dictionary, so we append a unique 931 number to the end of the name in case of conflicts. 932 """ 933 # if we've got a key from before, add it to the dictionary of 934 # qualifiers 935 if self._cur_qualifier_key: 936 key = self._cur_qualifier_key 937 value = "".join(self._cur_qualifier_value) 938 if self._feature_cleaner is not None: 939 value = self._feature_cleaner.clean_value(key, value) 940 # if the qualifier name exists, append the value 941 if key in self._cur_feature.qualifiers: 942 self._cur_feature.qualifiers[key].append(value) 943 # otherwise start a new list of the key with its values 944 else: 945 self._cur_feature.qualifiers[key] = [value]
946
947 - def feature_qualifier_name(self, content_list):
948 """When we get a qualifier key, use it as a dictionary key. 949 950 We receive a list of keys, since you can have valueless keys such as 951 /pseudo which would be passed in with the next key (since no other 952 tags separate them in the file) 953 """ 954 for content in content_list: 955 # add a qualifier if we've got one 956 self._add_qualifier() 957 958 # remove the / and = from the qualifier if they're present 959 qual_key = content.replace('/', '') 960 qual_key = qual_key.replace('=', '') 961 qual_key = qual_key.strip() 962 963 self._cur_qualifier_key = qual_key 964 self._cur_qualifier_value = []
965
966 - def feature_qualifier_description(self, content):
967 # get rid of the quotes surrounding the qualifier if we've got 'em 968 qual_value = content.replace('"', '') 969 970 self._cur_qualifier_value.append(qual_value)
971
972 - def contig_location(self, content):
973 """Deal with CONTIG information. 974 975 Most CONTIG descriptions use a join of other externally referenced 976 sequences. Currently this code tries to use the location parser, 977 and represent this as a SeqFeature with sub-features. 978 """ 979 # add a last feature if is hasn't been added, 980 # so that we don't overwrite it 981 self._add_feature() 982 # make a feature to add the information to 983 self._cur_feature = SeqFeature.SeqFeature() 984 self._cur_feature.type = "contig" 985 # now set the location on the feature using the standard 986 # location handler 987 self.location(content) 988 # add the contig information to the annotations and get rid 989 # of the feature to prevent it from being added to the feature table 990 self.data.annotations["contig"] = self._cur_feature 991 self._cur_feature = None
992
    def origin_name(self, content):
        # the ORIGIN line name is deliberately ignored by this consumer
        pass

    def base_count(self, content):
        # BASE COUNT information is deliberately ignored by this consumer
        pass

    def base_number(self, content):
        # per-line base numbers in the sequence block are deliberately ignored
        pass
1001
1002 - def sequence(self, content):
1003 """Add up sequence information as we get it. 1004 1005 To try and make things speedier, this puts all of the strings 1006 into a list of strings, and then uses string.join later to put 1007 them together. Supposedly, this is a big time savings 1008 """ 1009 new_seq = content.replace(' ', '') 1010 new_seq = new_seq.upper() 1011 1012 self._seq_data.append(new_seq)
1013
1014 - def record_end(self, content):
1015 """Clean up when we've finished the record. 1016 """ 1017 from Bio import Alphabet 1018 from Bio.Alphabet import IUPAC 1019 from Bio.Seq import Seq, UnknownSeq 1020 1021 #Try and append the version number to the accession for the full id 1022 if self.data.id is None : 1023 assert 'accessions' not in self.data.annotations, \ 1024 self.data.annotations['accessions'] 1025 self.data.id = self.data.name #Good fall back? 1026 elif self.data.id.count('.') == 0 : 1027 try : 1028 self.data.id+='.%i' % self.data.annotations['sequence_version'] 1029 except KeyError : 1030 pass 1031 1032 # add the last feature in the table which hasn't been added yet 1033 self._add_feature() 1034 1035 # add the sequence information 1036 # first, determine the alphabet 1037 # we default to an generic alphabet if we don't have a 1038 # seq type or have strange sequence information. 1039 seq_alphabet = Alphabet.generic_alphabet 1040 1041 # now set the sequence 1042 sequence = "".join(self._seq_data) 1043 1044 if self._expected_size is not None \ 1045 and len(sequence) != 0 \ 1046 and self._expected_size != len(sequence) : 1047 raise ValueError("Expected sequence length %i, found %i." \ 1048 % (self._expected_size, len(sequence))) 1049 1050 if self._seq_type: 1051 # mRNA is really also DNA, since it is actually cDNA 1052 if self._seq_type.find('DNA') != -1 or \ 1053 self._seq_type.find('mRNA') != -1: 1054 seq_alphabet = IUPAC.ambiguous_dna 1055 # are there ever really RNA sequences in GenBank? 1056 elif self._seq_type.find('RNA') != -1: 1057 #Even for data which was from RNA, the sequence string 1058 #is usually given as DNA (T not U). Bug 2408 1059 if "T" in sequence and "U" not in sequence: 1060 seq_alphabet = IUPAC.ambiguous_dna 1061 else : 1062 seq_alphabet = IUPAC.ambiguous_rna 1063 elif self._seq_type.find('PROTEIN') != -1 : 1064 seq_alphabet = IUPAC.protein # or extended protein? 
1065 # work around ugly GenBank records which have circular or 1066 # linear but no indication of sequence type 1067 elif self._seq_type in ["circular", "linear"]: 1068 pass 1069 # we have a bug if we get here 1070 else: 1071 raise ValueError("Could not determine alphabet for seq_type %s" 1072 % self._seq_type) 1073 1074 if not sequence and self.__expected_size : 1075 self.data.seq = UnknownSeq(self._expected_size, seq_alphabet) 1076 else : 1077 self.data.seq = Seq(sequence, seq_alphabet)
1078
class _RecordConsumer(_BaseGenBankConsumer):
    """Create a GenBank Record object from scanner generated information.
    """
    def __init__(self):
        _BaseGenBankConsumer.__init__(self)
        # implicit relative import of Bio.GenBank.Record (Python 2 style)
        import Record
        self.data = Record.Record()

        # accumulated sequence chunks; joined once in record_end
        self._seq_data = []
        # partially-built items, flushed when the next one starts
        self._cur_reference = None
        self._cur_feature = None
        self._cur_qualifier = None
1091
    # Simple line handlers -- most of these just store the content of a
    # GenBank line verbatim on the Record object.

    def wgs(self, content):
        # WGS content is stored split on "-"
        self.data.wgs = content.split('-')

    def add_wgs_scafld(self, content):
        self.data.wgs_scafld.append(content.split('-'))

    def locus(self, content):
        self.data.locus = content

    def size(self, content):
        self.data.size = content

    def residue_type(self, content):
        self.data.residue_type = content

    def data_file_division(self, content):
        self.data.data_file_division = content

    def date(self, content):
        self.data.date = content

    def definition(self, content):
        self.data.definition = content

    def accession(self, content):
        # keep each accession only once, preserving first-seen order
        for acc in self._split_accessions(content) :
            if acc not in self.data.accession :
                self.data.accession.append(acc)

    def nid(self, content):
        self.data.nid = content

    def pid(self, content):
        self.data.pid = content

    def version(self, content):
        self.data.version = content

    def db_source(self, content):
        # trailing whitespace is stripped from DBSOURCE lines
        self.data.db_source = content.rstrip()

    def gi(self, content):
        self.data.gi = content

    def keywords(self, content):
        # split the keyword string into a list via the shared helper
        self.data.keywords = self._split_keywords(content)

    def project(self, content):
        # whitespace-separated project identifiers, empty tokens dropped
        self.data.projects.extend([p for p in content.split() if p])
1141 1144
    def segment(self, content):
        self.data.segment = content

    def source(self, content):
        self.data.source = content

    def organism(self, content):
        self.data.organism = content

    def taxonomy(self, content):
        # split the taxonomy string into a list via the shared helper
        self.data.taxonomy = self._split_taxonomy(content)
    def reference_num(self, content):
        """Grab the reference number and signal the start of a new reference.
        """
        # check if we have a reference to add
        if self._cur_reference is not None:
            self.data.references.append(self._cur_reference)

        # NOTE(review): Record is imported locally in __init__ only; a
        # module-level import is presumably also in scope here -- verify.
        self._cur_reference = Record.Reference()
        self._cur_reference.number = content

    # The handlers below fill in fields of the reference currently being
    # built (started by reference_num above).

    def reference_bases(self, content):
        self._cur_reference.bases = content

    def authors(self, content):
        self._cur_reference.authors = content

    def consrtm(self, content):
        self._cur_reference.consrtm = content

    def title(self, content):
        self._cur_reference.title = content

    def journal(self, content):
        self._cur_reference.journal = content

    def medline_id(self, content):
        self._cur_reference.medline_id = content

    def pubmed_id(self, content):
        self._cur_reference.pubmed_id = content

    def remark(self, content):
        self._cur_reference.remark = content
1190
    def comment(self, content):
        # NOTE(review): this assumes content is a list of lines (joined
        # with newlines); if a plain string were ever passed, join would
        # interleave "\n" between characters -- confirm against the scanner.
        self.data.comment += "\n".join(content)
1193
    def primary_ref_line(self,content):
        """Data for the PRIMARY line"""
        self.data.primary.append(content)

    def primary(self,content):
        # the PRIMARY header line itself carries no data worth storing
        pass
1200
    def features_line(self, content):
        """Get ready for the feature table when we reach the FEATURE line.
        """
        self.start_feature_table()

    def start_feature_table(self):
        """Signal the start of the feature table.
        """
        # we need to add on the last reference, since no further
        # REFERENCE lines can follow once the feature table starts
        if self._cur_reference is not None:
            self.data.references.append(self._cur_reference)
1212
    def feature_key(self, content):
        """Grab the key of the feature and signal the start of a new feature.
        """
        # first add on feature information if we've got any
        self._add_feature()

        # NOTE(review): Record is imported locally in __init__ only; a
        # module-level import is presumably also in scope here -- verify.
        self._cur_feature = Record.Feature()
        self._cur_feature.key = content
1221
1222 - def _add_feature(self):
1223 """Utility function to add a feature to the Record. 1224 1225 This does all of the appropriate checking to make sure we haven't 1226 left any info behind, and that we are only adding info if it 1227 exists. 1228 """ 1229 if self._cur_feature is not None: 1230 # if we have a left over qualifier, add it to the qualifiers 1231 # on the current feature 1232 if self._cur_qualifier is not None: 1233 self._cur_feature.qualifiers.append(self._cur_qualifier) 1234 1235 self._cur_qualifier = None 1236 self.data.features.append(self._cur_feature)
1237
    def location(self, content):
        # store the location string, tidied by the shared cleaner helper
        self._cur_feature.location = self._clean_location(content)
1240
1241 - def feature_qualifier_name(self, content_list):
1242 """Deal with qualifier names 1243 1244 We receive a list of keys, since you can have valueless keys such as 1245 /pseudo which would be passed in with the next key (since no other 1246 tags separate them in the file) 1247 """ 1248 for content in content_list: 1249 # the record parser keeps the /s -- add them if we don't have 'em 1250 if content.find("/") != 0: 1251 content = "/%s" % content 1252 # add on a qualifier if we've got one 1253 if self._cur_qualifier is not None: 1254 self._cur_feature.qualifiers.append(self._cur_qualifier) 1255 1256 self._cur_qualifier = Record.Qualifier() 1257 self._cur_qualifier.key = content
1258
1259 - def feature_qualifier_description(self, content):
1260 # if we have info then the qualifier key should have a ='s 1261 if self._cur_qualifier.key.find("=") == -1: 1262 self._cur_qualifier.key = "%s=" % self._cur_qualifier.key 1263 cur_content = self._remove_newlines(content) 1264 # remove all spaces from the value if it is a type where spaces 1265 # are not important 1266 for remove_space_key in self.__class__.remove_space_keys: 1267 if self._cur_qualifier.key.find(remove_space_key) >= 0: 1268 cur_content = self._remove_spaces(cur_content) 1269 self._cur_qualifier.value = self._normalize_spaces(cur_content)
1270
    def base_count(self, content):
        # BASE COUNT content is stored verbatim
        self.data.base_counts = content

    def origin_name(self, content):
        # ORIGIN line name is stored verbatim
        self.data.origin = content
1276
    def contig_location(self, content):
        """Signal that we have contig information to add to the record.
        """
        # store the cleaned CONTIG location string on the record
        self.data.contig = self._clean_location(content)
1281
1282 - def sequence(self, content):
1283 """Add sequence information to a list of sequence strings. 1284 1285 This removes spaces in the data and uppercases the sequence, and 1286 then adds it to a list of sequences. Later on we'll join this 1287 list together to make the final sequence. This is faster than 1288 adding on the new string every time. 1289 """ 1290 new_seq = content.replace(' ', '') 1291 self._seq_data.append(new_seq.upper())
1292
1293 - def record_end(self, content):
1294 """Signal the end of the record and do any necessary clean-up. 1295 """ 1296 # add together all of the sequence parts to create the 1297 # final sequence string 1298 self.data.sequence = "".join(self._seq_data) 1299 # add on the last feature 1300 self._add_feature()
1301 1302
class NCBIDictionary:
    """Access GenBank using a read-only dictionary interface (DEPRECATED).

    This object is deprecated and will be removed in a future release of
    Biopython. Please use Bio.Entrez instead as described in the tutorial.
    """
    VALID_DATABASES = ['nucleotide', 'protein', 'genome']
    VALID_FORMATS = ['genbank', 'fasta']

    def __init__(self, database, format, parser = None):
        """Initialize an NCBI dictionary to retrieve sequences.

        Create a new Dictionary to access GenBank.  Valid values for
        database are 'nucleotide', 'protein' and 'genome'.
        Valid values for format are 'genbank' (for nucleotide genbank and
        protein genpept) and 'fasta'.
        parser is an optional parser object to change the results into
        another form. If unspecified, then the raw contents of the file
        will be returned.

        Raises ValueError if database or format is not one of the
        supported values.
        """
        import warnings
        warnings.warn("Bio.GenBank.NCBIDictionary has been deprecated, and will be"\
                      " removed in a future release of Biopython. Please use"\
                      " Bio.Entrez instead which is described in the tutorial.",
                      DeprecationWarning)

        self.parser = parser
        if database not in self.VALID_DATABASES:
            raise ValueError("Invalid database %s, should be one of %s" %
                             (database, self.VALID_DATABASES))
        if format not in self.VALID_FORMATS:
            raise ValueError("Invalid format %s, should be one of %s" %
                             (format, self.VALID_FORMATS))

        # Entrez.efetch uses "gb" as the name of the GenBank format
        if format == "genbank":
            format = "gb"
        self.db = database
        self.format = format

    def __len__(self):
        raise NotImplementedError("GenBank contains lots of entries")

    def clear(self):
        raise NotImplementedError("This is a read-only dictionary")

    def __setitem__(self, key, item):
        raise NotImplementedError("This is a read-only dictionary")

    def update(self):
        raise NotImplementedError("This is a read-only dictionary")

    def copy(self):
        raise NotImplementedError("You don't need to do this...")

    def keys(self):
        raise NotImplementedError("You don't really want to do this...")

    def items(self):
        raise NotImplementedError("You don't really want to do this...")

    def values(self):
        raise NotImplementedError("You don't really want to do this...")

    def has_key(self, id):
        """S.has_key(id) -> bool"""
        # Return real booleans as documented (previously returned the
        # ints 0/1; bool subclasses int so this is backward compatible).
        try:
            self[id]
        except KeyError:
            return False
        return True

    def get(self, id, failobj=None):
        """Return the entry for id, or failobj if there is no such entry."""
        try:
            return self[id]
        except KeyError:
            return failobj

    def __getitem__(self, id):
        """Return the GenBank entry specified by the GenBank ID.

        Raises a KeyError if there's an error.
        """
        handle = Entrez.efetch(db = self.db, id = id, rettype = self.format)
        # Parse the record if a parser was passed in.
        if self.parser is not None:
            return self.parser.parse(handle)
        return handle.read()
1383
def search_for(search, database='nucleotide',
               reldate=None, mindate=None, maxdate=None,
               start_id = 0, max_ids = 50000000):
    """Do an online search at the NCBI, returns a list of IDs (DEPRECATED).

    This function is deprecated and will be removed in a future release of
    Biopython. Please use Bio.Entrez instead as described in the tutorial.

    Search GenBank and return a list of the GenBank identifiers (gi's)
    matching the criteria.

    Arguments:
    o search - the query string used to search the database.
    o database - 'nucleotide', 'protein', 'popset' or 'genome'.
    o reldate - restrict the search to this many days before today.
    o mindate, maxdate - date range in YYYY/MM/DD format (month and day
    are optional); both should be supplied, or neither.
    o start_id - the number to begin retrieval on.
    o max_ids - the maximum number of ids to retrieve.
    """
    import warnings
    warnings.warn("Bio.GenBank.search_for has been deprecated, and will be"\
                  " removed in a future release of Biopython. Please use"\
                  " Bio.Entrez instead which is described in the tutorial.",
                  DeprecationWarning)

    import re
    # NCBI expects dates as "YYYY/MM/DD" with month and day optional;
    # this check follows Bio/EUtils/Datatypes.py.
    date_ok = re.compile(r"\d{4}(/\d\d(/\d\d)?)?$").match
    for field_name, field_value in (("mindate", mindate),
                                    ("maxdate", maxdate)):
        if field_value is not None and date_ok(field_value) is None:
            raise TypeError(
                "%s is not in YYYY/MM/DD format (month and "
                "day are optional): %r" % (field_name, field_value))

    # Bio.Entrez can now ignore None arguments automatically
    handle = Entrez.esearch(database, search, retmode="xml",
                            retstart=start_id, retmax=max_ids,
                            mindate=mindate, maxdate=maxdate,
                            reldate=reldate)
    return Entrez.read(handle)["IdList"]
1430
def download_many(ids, database = 'nucleotide'):
    """Download multiple NCBI GenBank records, returned as a handle (DEPRECATED).

    This function is deprecated and will be removed in a future release of
    Biopython. Please use Bio.Entrez instead as described in the tutorial.

    Arguments:
    o ids - a list of gis or accessions to download.
    o database - 'nucleotide' or 'protein'.

    Raises ValueError for any other database.
    """
    import warnings
    warnings.warn("Bio.GenBank.download_many has been deprecated, and will be"\
                  " removed in a future release of Biopython. Please use"\
                  " Bio.Entrez instead which is described in the tutorial.",
                  DeprecationWarning)

    # pick the Entrez return type matching the requested database
    if database == 'nucleotide':
        format = 'gb'
    elif database == 'protein':
        format = 'gp'
    else:
        raise ValueError("Unexpected database: %s" % database)

    #TODO - Batch the queries?
    result_handle = Entrez.efetch(database,
                                  id=",".join(ids),
                                  retmode="text",
                                  rettype=format)
    return cStringIO.StringIO(result_handle.read())
1458