Package Bio :: Package GenBank
[hide private]
[frames] | [no frames]

Source Code for Package Bio.GenBank

   1  # Copyright 2000 by Jeffrey Chang, Brad Chapman.  All rights reserved. 
   2  # Copyright 2006, 2007 by Peter Cock.  All rights reserved. 
   3  # This code is part of the Biopython distribution and governed by its 
   4  # license.  Please see the LICENSE file that should have been included 
   5  # as part of this package. 
   6   
   7  """Code to work with GenBank formatted files. 
   8   
   9  Classes: 
  10  Iterator              Iterate through a file of GenBank entries 
  11  Dictionary            Access a GenBank file using a dictionary interface. 
  12  ErrorFeatureParser    Catch errors caused during parsing. 
  13  FeatureParser         Parse GenBank data in Seq and SeqFeature objects. 
  14  RecordParser          Parse GenBank data into a Record object. 
  15  NCBIDictionary        Access GenBank using a dictionary interface. 
  16   
  17  _BaseGenBankConsumer  A base class for GenBank consumer that implements 
  18                        some helpful functions that are in common between 
  19                        consumers. 
  20  _FeatureConsumer      Create SeqFeature objects from info generated by 
  21                        the Scanner 
  22  _RecordConsumer       Create a GenBank record object from Scanner info. 
  23  _PrintingConsumer     A debugging consumer. 
  24   
  25  ParserFailureError    Exception indicating a failure in the parser (ie. 
  26                        scanner or consumer) 
   27  LocationParserError   Exception indicating a problem with the spark based 
  28                        location parser. 
  29   
  30  Functions: 
  31  search_for            Do a query against GenBank. 
  32  download_many         Download many GenBank records. 
  33   
  34  """ 
  35  import cStringIO 
  36   
  37  # other Biopython stuff 
  38  from Bio.ParserSupport import AbstractConsumer 
  39  from utils import FeatureValueCleaner 
  40  from Bio import Entrez 
  41   
  42  #There used to be a (GenBank only) class _Scanner in 
  43  #this file.  Now use a more generic system which we import: 
  44  from Scanner import GenBankScanner 
  45   
  46  #These are used for downloading files from GenBank 
  47  from Bio import EUtils 
  48  from Bio.EUtils import DBIds, DBIdsClient 
  49   
#Constants used to parse GenBank header lines
# (continuation text on header lines starts at column 12)
GENBANK_INDENT = 12
GENBANK_SPACER = " " * GENBANK_INDENT

#Constants for parsing GenBank feature lines
# (feature keys start at column 5, qualifiers at column 21)
FEATURE_KEY_INDENT = 5
FEATURE_QUALIFIER_INDENT = 21
FEATURE_KEY_SPACER = " " * FEATURE_KEY_INDENT
FEATURE_QUALIFIER_SPACER = " " * FEATURE_QUALIFIER_INDENT
  59           
class Iterator:
    """Iterate over a file of GenBank entries, one record at a time.

    If a parser is supplied, each entry is run through it before being
    returned; otherwise the raw record text (up to and including the
    "//" terminator line) is returned unchanged.
    """
    def __init__(self, handle, parser = None):
        """Initialize the iterator.

        Arguments:
        o handle - A handle with GenBank entries to iterate through.
        o parser - An optional parser to pass the entries through before
        returning them. If None, then the raw entry will be returned.
        """
        self.handle = handle
        self._parser = parser

    def next(self):
        """Return the next GenBank record from the handle.

        Will return None if we ran out of records.
        """
        if self._parser is not None:
            # Delegate to the parser; it signals exhaustion with
            # StopIteration, which we translate into None.
            try:
                return self._parser.parse(self.handle)
            except StopIteration:
                return None
        # No parser: accumulate raw lines until the record terminator.
        raw_lines = []
        while True:
            raw_line = self.handle.readline()
            if not raw_line:
                # Premature end of file? Discard any partial record.
                return None
            raw_lines.append(raw_line)
            if raw_line.rstrip() == "//":
                break
        return "".join(raw_lines)

    def __iter__(self):
        # iter(callable, sentinel) keeps calling next() until it
        # returns None.
        return iter(self.next, None)
94
class ParserFailureError(Exception):
    """Failure caused by some kind of problem in the parser (scanner or consumer)."""
99
class LocationParserError(Exception):
    """Could not properly parse out a location from a GenBank file."""
104
class FeatureParser:
    """Parse GenBank files into Seq + Feature objects (a SeqRecord).

    Each call to parse() creates a fresh _FeatureConsumer, feeds it the
    handle via the GenBank scanner, and returns the resulting record.
    """

    # Sentinel so "no cleaner supplied" can be told apart from an explicit
    # feature_cleaner=None (which disables cleaning).  The original
    # signature used feature_cleaner=FeatureValueCleaner(), a single
    # instance created at class-definition time and silently shared by
    # every parser; building a fresh cleaner per parser avoids any
    # cross-instance state sharing.
    _DEFAULT_CLEANER = object()

    def __init__(self, debug_level = 0, use_fuzziness = 1,
                 feature_cleaner = _DEFAULT_CLEANER):
        """Initialize a GenBank parser and Feature consumer.

        Arguments:
        o debug_level - An optional argument that specifies the amount of
        debugging information the parser should spit out. By default we have
        no debugging info (the fastest way to do things), but if you want
        you can set this as high as two and see exactly where a parse fails.
        o use_fuzziness - Specify whether or not to use fuzzy representations.
        The default is 1 (use fuzziness).
        o feature_cleaner - A class which will be used to clean out the
        values of features. This class must implement the function
        clean_value. GenBank.utils has a "standard" cleaner class, which
        is used by default. Pass None to disable value cleaning.
        """
        if feature_cleaner is FeatureParser._DEFAULT_CLEANER:
            feature_cleaner = FeatureValueCleaner()
        self._scanner = GenBankScanner(debug_level)
        self.use_fuzziness = use_fuzziness
        self._cleaner = feature_cleaner

    def parse(self, handle):
        """Parse the specified handle, returning the consumed SeqRecord."""
        self._consumer = _FeatureConsumer(self.use_fuzziness,
                                          self._cleaner)
        self._scanner.feed(handle, self._consumer)
        return self._consumer.data
135
class RecordParser:
    """Parse GenBank files into Record objects."""

    def __init__(self, debug_level = 0):
        """Initialize the parser.

        Arguments:
        o debug_level - An optional argument that specifies the amount of
        debugging information the parser should spit out. By default we have
        no debugging info (the fastest way to do things), but if you want
        you can set this as high as two and see exactly where a parse fails.
        """
        self._scanner = GenBankScanner(debug_level)

    def parse(self, handle):
        """Parse the specified handle into a GenBank record."""
        consumer = _RecordConsumer()
        self._consumer = consumer
        self._scanner.feed(handle, consumer)
        return consumer.data
156
class _BaseGenBankConsumer(AbstractConsumer):
    """Abstract GenBank consumer providing useful general functions.

    This just helps to eliminate some duplication in things that most
    GenBank consumers want to do.
    """
    # Special keys in GenBank records that we should remove spaces from
    # For instance, \translation keys have values which are proteins and
    # should have spaces and newlines removed from them. This class
    # attribute gives us more control over specific formatting problems.
    remove_space_keys = ["translation"]

    def __init__(self):
        pass

    def _split_keywords(self, keyword_string):
        """Split a semicolon separated KEYWORDS string into a clean list.

        Missing data ("" or ".") yields an empty list; a single trailing
        period is stripped before splitting.
        """
        if keyword_string == "" or keyword_string == ".":
            # Missing data.  The old code returned [''] here, forcing
            # callers to special-case a bogus empty-string keyword.
            return []
        if keyword_string[-1] == '.':
            keywords = keyword_string[:-1]
        else:
            keywords = keyword_string
        return [x.strip() for x in keywords.split(';')]

    def _split_accessions(self, accession_string):
        """Split a string of accession numbers into a list.

        Handles both GenBank style (whitespace separated) and EMBL style
        (semicolon separated) accession lines.
        """
        # Normalize newlines and EMBL ';' separators to spaces, then let
        # split() discard empty fragments and surrounding whitespace.
        accession = accession_string.replace("\n", " ").replace(";", " ")
        return accession.split()

    def _split_taxonomy(self, taxonomy_string):
        """Split a string with taxonomy info into a list of taxa."""
        if not taxonomy_string or taxonomy_string == ".":
            # Missing data, no taxonomy
            return []

        if taxonomy_string[-1] == '.':
            tax_info = taxonomy_string[:-1]
        else:
            tax_info = taxonomy_string
        # Split on ';', discard embedded line breaks, and drop empty or
        # whitespace-only fragments in one pass (the old while-remove
        # loop was quadratic and let whitespace-only entries through
        # as '' after stripping).
        return [item.strip()
                for chunk in tax_info.split(';')
                for item in chunk.split("\n")
                if item.strip()]

    def _clean_location(self, location_string):
        """Clean whitespace out of a location string.

        The location parser isn't a fan of whitespace, so we clean it out
        before feeding it into the parser.
        """
        # split() with no argument breaks on any whitespace run, so the
        # join removes every whitespace character in a single C-level pass
        # (the old code looped over string.whitespace calling replace()).
        return ''.join(location_string.split())

    def _remove_newlines(self, text):
        """Remove any newlines in the passed text, returning the new string."""
        return text.replace("\n", "").replace("\r", "")

    def _normalize_spaces(self, text):
        """Replace multiple spaces in the passed text with single spaces."""
        return ' '.join(part for part in text.split(' ') if part)

    def _remove_spaces(self, text):
        """Remove all spaces from the passed text."""
        return text.replace(" ", "")

    def _convert_to_python_numbers(self, start, end):
        """Convert a 1-based inclusive range to python 0-based half-open.

        In GenBank, starts and ends are defined in "biological"
        coordinates, where 1 is the first base and [i, j] means to
        include both i and j.  In python, 0 is the first base and [i, j]
        includes i but not j -- so only the start needs shifting.
        """
        return start - 1, end
270
class _FeatureConsumer(_BaseGenBankConsumer):
    """Create a SeqRecord object with Features to return.

    Attributes:
    o use_fuzziness - specify whether or not to parse with fuzziness in
    feature locations.
    o feature_cleaner - a class that will be used to provide specialized
    cleaning-up of feature values.
    """
    def __init__(self, use_fuzziness, feature_cleaner = None):
        # Function-scope import -- presumably to avoid a circular import
        # at module load time (TODO confirm).
        from Bio.SeqRecord import SeqRecord
        _BaseGenBankConsumer.__init__(self)
        # The record under construction; id is filled in later from the
        # ACCESSION/VERSION events.
        self.data = SeqRecord(None, id = None)
        self.data.id = None
        self.data.description = ""

        self._use_fuzziness = use_fuzziness
        self._feature_cleaner = feature_cleaner

        # Parser state, updated as each section of the record is consumed:
        self._seq_type = ''              # e.g. "DNA", "mRNA", "PROTEIN"
        self._seq_data = []              # sequence chunks, joined at record_end
        self._current_ref = None         # Reference currently being populated
        self._cur_feature = None         # SeqFeature currently being populated
        self._cur_qualifier_key = None
        self._cur_qualifier_value = None
296
    def locus(self, locus_name):
        """Use the LOCUS name as the name of the sequence record."""
        self.data.name = locus_name
301
    def size(self, content):
        # The declared length from the LOCUS line is ignored; the actual
        # length comes from the sequence data collected later.
        pass
304
    def residue_type(self, type):
        """Record the sequence type so we can choose an appropriate alphabet."""
        self._seq_type = type

    def data_file_division(self, division):
        # GenBank division code from the LOCUS line (e.g. PLN, PRI).
        self.data.annotations['data_file_division'] = division
312
    def date(self, submit_date):
        # Record the date from the LOCUS line verbatim.
        self.data.annotations['date'] = submit_date

    def definition(self, definition):
        """Set the definition as the description of the sequence."""
        if self.data.description:
            # Append to any existing description
            # e.g. EMBL files with two DE lines.
            self.data.description += " " + definition
        else:
            self.data.description = definition
325
326 - def accession(self, acc_num):
327 """Set the accession number as the id of the sequence. 328 329 If we have multiple accession numbers, the first one passed is 330 used. 331 """ 332 new_acc_nums = self._split_accessions(acc_num) 333 334 #Also record them ALL in the annotations 335 try : 336 #On the off chance there was more than one accession line: 337 for acc in new_acc_nums : 338 #Prevent repeat entries 339 if acc not in self.data.annotations['accessions'] : 340 self.data.annotations['accessions'].append(acc) 341 except KeyError : 342 self.data.annotations['accessions'] = new_acc_nums 343 344 # if we haven't set the id information yet, add the first acc num 345 if self.data.id is None: 346 if len(new_acc_nums) > 0: 347 #self.data.id = new_acc_nums[0] 348 #Use the FIRST accession as the ID, not the first on this line! 349 self.data.id = self.data.annotations['accessions'][0]
350 351
    def nid(self, content):
        # Store the NID line content verbatim.
        self.data.annotations['nid'] = content

    def pid(self, content):
        # Store the PID line content verbatim.
        self.data.annotations['pid'] = content
357
358 - def version(self, version_id):
359 #Want to use the versioned accension as the record.id 360 #This comes from the VERSION line in GenBank files, or the 361 #obsolete SV line in EMBL. For the new EMBL files we need 362 #both the version suffix from the ID line and the accession 363 #from the AC line. 364 if version_id.count(".")==1 and version_id.split(".")[1].isdigit() : 365 self.accession(version_id.split(".")[0]) 366 self.version_suffix(version_id.split(".")[1]) 367 else : 368 #For backwards compatibility... 369 self.data.id = version_id
370
    def version_suffix(self, version):
        """Set the version to overwrite the id.

        Since the version provides the same information as the accession
        number, plus some extra info, we set this as the id if we have
        a version.
        """
        #e.g. GenBank line:
        #VERSION     U49845.1  GI:1293613
        #or the obsolete EMBL line:
        #SV   U49845.1
        #Scanner calls consumer.version("U49845.1")
        #which then calls consumer.version_suffix(1)
        #
        #e.g. EMBL new line:
        #ID   X56734; SV 1; linear; mRNA; STD; PLN; 1859 BP.
        #Scanner calls consumer.version_suffix(1)
        # NOTE(review): assert is stripped under "python -O", so a
        # non-numeric version would then slip through silently.
        assert version.isdigit()
        self.data.annotations['sequence_version'] = int(version)
390
    def db_source(self, content):
        # Strip the trailing newline/whitespace from the DBSOURCE line.
        self.data.annotations['db_source'] = content.rstrip()

    def gi(self, content):
        # GI number from the VERSION line, stored verbatim.
        self.data.annotations['gi'] = content

    def keywords(self, content):
        # Parse the semicolon separated KEYWORDS line via the shared helper.
        self.data.annotations['keywords'] = self._split_keywords(content)

    def segment(self, content):
        # SEGMENT line (e.g. "1 of 6"), stored verbatim.
        self.data.annotations['segment'] = content
402
403 - def source(self, content):
404 if content[-1] == '.': 405 source_info = content[:-1] 406 else: 407 source_info = content 408 self.data.annotations['source'] = source_info
409
    def organism(self, content):
        # Formal organism name from the ORGANISM line, stored verbatim.
        self.data.annotations['organism'] = content

    def taxonomy(self, content):
        """Record (another line of) the taxonomy lineage."""
        lineage = self._split_taxonomy(content)
        try:
            # Extend if an earlier line already started the lineage.
            self.data.annotations['taxonomy'].extend(lineage)
        except KeyError:
            # First taxonomy line seen for this record.
            self.data.annotations['taxonomy'] = lineage
421
    def reference_num(self, content):
        """Signal the beginning of a new reference object."""
        from Bio.SeqFeature import Reference
        # if we have a current reference that hasn't been added to
        # the list of references, add it.
        if self._current_ref is not None:
            self.data.annotations['references'].append(self._current_ref)
        else:
            # First reference of the record: start the list.
            self.data.annotations['references'] = []

        self._current_ref = Reference()
434
    def reference_bases(self, content):
        """Attempt to determine the sequence region the reference entails.

        Possible types of information we may have to deal with:

        (bases 1 to 86436)
        (sites)
        (bases 1 to 105654; 110423 to 111122)
        1  (residues 1 to 182)
        """
        # first remove the parentheses or other junk
        ref_base_info = content[1:-1]

        all_locations = []
        # parse if we've got 'bases' and 'to'
        if ref_base_info.find('bases') != -1 and \
           ref_base_info.find('to') != -1:
            # get rid of the beginning 'bases'
            ref_base_info = ref_base_info[5:]
            locations = self._split_reference_locations(ref_base_info)
            all_locations.extend(locations)
        elif (ref_base_info.find("residues") >= 0 and
              ref_base_info.find("to") >= 0):
            residues_start = ref_base_info.find("residues")
            # get only the information after "residues"
            ref_base_info = ref_base_info[(residues_start + len("residues ")):]
            locations = self._split_reference_locations(ref_base_info)
            all_locations.extend(locations)

        # make sure if we are not finding information then we have
        # the string 'sites' or the string 'bases'
        elif (ref_base_info == 'sites' or
              ref_base_info.strip() == 'bases'):
            pass
        # otherwise raise an error
        else:
            raise ValueError("Could not parse base info %s in record %s" %
                             (ref_base_info, self.data.id))

        self._current_ref.location = all_locations
475
476 - def _split_reference_locations(self, location_string):
477 """Get reference locations out of a string of reference information 478 479 The passed string should be of the form: 480 481 1 to 20; 20 to 100 482 483 This splits the information out and returns a list of location objects 484 based on the reference locations. 485 """ 486 from Bio import SeqFeature 487 # split possibly multiple locations using the ';' 488 all_base_info = location_string.split(';') 489 490 new_locations = [] 491 for base_info in all_base_info: 492 start, end = base_info.split('to') 493 new_start, new_end = \ 494 self._convert_to_python_numbers(int(start.strip()), 495 int(end.strip())) 496 this_location = SeqFeature.FeatureLocation(new_start, new_end) 497 new_locations.append(this_location) 498 return new_locations
499
    # The following methods simply copy single REFERENCE sub-line values
    # onto the Reference object currently being built (self._current_ref).

    def authors(self, content):
        self._current_ref.authors = content

    def consrtm(self, content):
        self._current_ref.consrtm = content

    def title(self, content):
        self._current_ref.title = content

    def journal(self, content):
        self._current_ref.journal = content

    def medline_id(self, content):
        self._current_ref.medline_id = content

    def pubmed_id(self, content):
        self._current_ref.pubmed_id = content

    def remark(self, content):
        # REMARK lines become the reference's comment.
        self._current_ref.comment = content

    def comment(self, content):
        """Store the COMMENT section (a list of lines) as one newline-joined string."""
        try:
            # Append to an existing comment annotation, newline separated.
            self.data.annotations['comment'] += "\n" + "\n".join(content)
        except KeyError:
            self.data.annotations['comment'] = "\n".join(content)
526
    def features_line(self, content):
        """Get ready for the feature table when we reach the FEATURE line."""
        self.start_feature_table()

    def start_feature_table(self):
        """Indicate we've got to the start of the feature table."""
        # make sure we've added on our last reference object
        if self._current_ref is not None:
            self.data.annotations['references'].append(self._current_ref)
            self._current_ref = None
539
    def _add_feature(self):
        """Add the in-progress feature (if any) to the SeqRecord.

        Flushes any pending qualifier onto the feature first, so no
        information is left behind; does nothing when there is no
        feature in progress.
        """
        if self._cur_feature:
            # if we have a left over qualifier, add it to the qualifiers
            # on the current feature
            self._add_qualifier()

            self._cur_qualifier_key = ''
            self._cur_qualifier_value = ''
            self.data.features.append(self._cur_feature)
555
556 - def feature_key(self, content):
557 from Bio import SeqFeature 558 # if we already have a feature, add it on 559 self._add_feature() 560 561 # start a new feature 562 self._cur_feature = SeqFeature.SeqFeature() 563 self._cur_feature.type = content 564 565 # assume positive strand to start with if we have DNA or cDNA 566 # (labelled as mRNA). The complement in the location will 567 # change this later if something is on the reverse strand 568 if self._seq_type.find("DNA") >= 0 or self._seq_type.find("mRNA") >= 0: 569 self._cur_feature.strand = 1
570
    def location(self, content):
        """Parse out location information from the location string.

        This uses Andrew's nice spark based parser to do the parsing
        for us, and translates the results of the parse into appropriate
        Location objects.
        """
        from Bio.GenBank import LocationParser
        # --- first preprocess the location for the spark parser

        # we need to clean up newlines and other whitespace inside
        # the location before feeding it to the parser.
        # locations should have no whitespace whatsoever based on the
        # grammar
        location_line = self._clean_location(content)

        # Older records have junk like replace(266,"c") in the
        # location line. Newer records just replace this with
        # the number 266 and have the information in a more reasonable
        # place. So we'll just grab out the number and feed this to the
        # parser. We shouldn't really be losing any info this way.
        if location_line.find('replace') != -1:
            comma_pos = location_line.find(',')
            location_line = location_line[8:comma_pos]

        # feed everything into the scanner and parser
        try:
            parse_info = \
                LocationParser.parse(LocationParser.scan(location_line))
        # spark raises SystemExit errors when parsing fails
        except SystemExit:
            raise LocationParserError(location_line)

        # add the parser information to the current feature
        self._set_location_info(parse_info, self._cur_feature)
608
    def _set_function(self, function, cur_feature):
        """Set the location information based on a function.

        This handles all of the location functions like 'join', 'complement'
        and 'order'.

        Arguments:
        o function - A LocationParser.Function object specifying the
        function we are acting on.
        o cur_feature - The feature to add information to.
        """
        from Bio import SeqFeature
        from Bio.GenBank import LocationParser
        assert isinstance(function, LocationParser.Function), \
            "Expected a Function object, got %s" % function

        if function.name == "complement":
            # mark the current feature as being on the opposite strand
            cur_feature.strand = -1
            # recursively deal with whatever is left inside the complement
            for inner_info in function.args:
                self._set_location_info(inner_info, cur_feature)
        # deal with functions that have multiple internal segments that
        # are connected somehow.
        # join and order are currently documented functions.
        # one-of is something I ran across in old files. Treating it
        # as a sub sequence feature seems appropriate to me.
        # bond is some piece of junk I found in RefSeq files. I have
        # no idea how to interpret it, so I jam it in here
        elif (function.name == "join" or function.name == "order" or
              function.name == "one-of" or function.name == "bond"):
            self._set_ordering_info(function, cur_feature)
        elif (function.name == "gap"):
            assert len(function.args) == 1, \
                "Unexpected number of arguments in gap %s" % function.args
            # make the cur information location a gap object
            position = self._get_position(function.args[0].local_location)
            cur_feature.location = SeqFeature.PositionGap(position)
        else:
            raise ValueError("Unexpected function name: %s" % function.name)
649
    def _set_ordering_info(self, function, cur_feature):
        """Parse a join or order and all of the information in it.

        This deals with functions that order a bunch of locations,
        specifically 'join' and 'order'. The inner locations are
        added as subfeatures of the top level feature.
        """
        from Bio import SeqFeature
        # for each inner element, create a sub SeqFeature within the
        # current feature, then get the information for this feature
        for inner_element in function.args:
            new_sub_feature = SeqFeature.SeqFeature()
            # inherit the type from the parent
            new_sub_feature.type = cur_feature.type
            # add the join or order info to the location_operator
            cur_feature.location_operator = function.name
            new_sub_feature.location_operator = function.name
            # inherit references and strand from the parent feature
            new_sub_feature.ref = cur_feature.ref
            new_sub_feature.ref_db = cur_feature.ref_db
            new_sub_feature.strand = cur_feature.strand

            # set the information for the inner element
            self._set_location_info(inner_element, new_sub_feature)

            # now add the feature to the sub_features
            cur_feature.sub_features.append(new_sub_feature)

        # set the location of the top -- this should be a combination of
        # the start position of the first sub_feature and the end position
        # of the last sub_feature

        # these positions are already converted to python coordinates
        # (when the sub_features were added) so they don't need to
        # be converted again
        feature_start = cur_feature.sub_features[0].location.start
        feature_end = cur_feature.sub_features[-1].location.end
        cur_feature.location = SeqFeature.FeatureLocation(feature_start,
                                                          feature_end)
689
    def _set_location_info(self, parse_info, cur_feature):
        """Set the location information for a feature from the parse info.

        Arguments:
        o parse_info - The classes generated by the LocationParser.
        o cur_feature - The feature to add the information to.
        """
        from Bio.GenBank import LocationParser
        # base case -- we are out of information
        if parse_info is None:
            return
        # parse a location -- this is another base_case -- we assume
        # we have no information after a single location
        elif isinstance(parse_info, LocationParser.AbsoluteLocation):
            self._set_location(parse_info, cur_feature)
            return
        # parse any of the functions (join, complement, etc)
        elif isinstance(parse_info, LocationParser.Function):
            self._set_function(parse_info, cur_feature)
        # otherwise we are stuck and should raise an error
        else:
            raise ValueError("Could not parse location info: %s"
                             % parse_info)
713
    def _set_location(self, location, cur_feature):
        """Set the location information for a feature.

        Arguments:
        o location - An AbsoluteLocation object specifying the info
        about the location.
        o cur_feature - The feature to add the information to.
        """
        # check to see if we have a cross reference to another accession
        # ie. U05344.1:514..741
        if location.path is not None:
            cur_feature.ref = location.path.accession
            cur_feature.ref_db = location.path.database
        # now get the actual location information
        cur_feature.location = self._get_location(location.local_location)
729
    def _get_location(self, range_info):
        """Return a (possibly fuzzy) location from a Range object.

        Arguments:
        o range_info - A location range (ie. something like 67..100). This
        may also be a single position (ie 27).

        This returns a FeatureLocation object.
        If parser.use_fuzziness is set at one, the positions for the
        end points will possibly be fuzzy.
        """
        from Bio import SeqFeature
        from Bio.GenBank import LocationParser
        # check if we just have a single base
        if not(isinstance(range_info, LocationParser.Range)):
            pos = self._get_position(range_info)
            # move the single position back one to be consistent with how
            # python indexes numbers (starting at 0)
            pos.position = pos.position - 1
            return SeqFeature.FeatureLocation(pos, pos)
        # otherwise we need to get both sides of the range
        else:
            # get *Position objects for the start and end
            start_pos = self._get_position(range_info.low)
            end_pos = self._get_position(range_info.high)

            start_pos.position, end_pos.position = \
                self._convert_to_python_numbers(start_pos.position,
                                                end_pos.position)

            return SeqFeature.FeatureLocation(start_pos, end_pos)
761
762 - def _get_position(self, position):
763 """Return a (possibly fuzzy) position for a single coordinate. 764 765 Arguments: 766 o position - This is a LocationParser.* object that specifies 767 a single coordinate. We will examine the object to determine 768 the fuzziness of the position. 769 770 This is used with _get_location to parse out a location of any 771 end_point of arbitrary fuzziness. 772 """ 773 from Bio import SeqFeature 774 from Bio.GenBank import LocationParser 775 # case 1 -- just a normal number 776 if (isinstance(position, LocationParser.Integer)): 777 final_pos = SeqFeature.ExactPosition(position.val) 778 # case 2 -- we've got a > sign 779 elif isinstance(position, LocationParser.LowBound): 780 final_pos = SeqFeature.AfterPosition(position.base.val) 781 # case 3 -- we've got a < sign 782 elif isinstance(position, LocationParser.HighBound): 783 final_pos = SeqFeature.BeforePosition(position.base.val) 784 # case 4 -- we've got 100^101 785 elif isinstance(position, LocationParser.Between): 786 final_pos = SeqFeature.BetweenPosition(position.low.val, 787 position.high.val) 788 # case 5 -- we've got (100.101) 789 elif isinstance(position, LocationParser.TwoBound): 790 final_pos = SeqFeature.WithinPosition(position.low.val, 791 position.high.val) 792 # case 6 -- we've got a one-of(100, 110) location 793 elif isinstance(position, LocationParser.Function) and \ 794 position.name == "one-of": 795 # first convert all of the arguments to positions 796 position_choices = [] 797 for arg in position.args: 798 # we only handle AbsoluteLocations with no path 799 # right now. Not sure if other cases will pop up 800 assert isinstance(arg, LocationParser.AbsoluteLocation), \ 801 "Unhandled Location type %r" % arg 802 assert arg.path is None, "Unhandled path in location" 803 position = self._get_position(arg.local_location) 804 position_choices.append(position) 805 final_pos = SeqFeature.OneOfPosition(position_choices) 806 # if it is none of these cases we've got a problem! 
807 else: 808 raise ValueError("Unexpected LocationParser object %r" % 809 position) 810 811 # if we are using fuzziness return what we've got 812 if self._use_fuzziness: 813 return final_pos 814 # otherwise return an ExactPosition equivalent 815 else: 816 return SeqFeature.ExactPosition(final_pos.location)
817
    def _add_qualifier(self):
        """Add the pending qualifier (if any) to the current feature.

        Multiple qualifiers with the same key are collected into a list
        under that key, so no information is lost.
        """
        # if we've got a key from before, add it to the dictionary of
        # qualifiers
        if self._cur_qualifier_key:
            key = self._cur_qualifier_key
            value = "".join(self._cur_qualifier_value)
            if self._feature_cleaner is not None:
                value = self._feature_cleaner.clean_value(key, value)
            # if the qualifier name exists, append the value
            if key in self._cur_feature.qualifiers:
                self._cur_feature.qualifiers[key].append(value)
            # otherwise start a new list of the key with its values
            else:
                self._cur_feature.qualifiers[key] = [value]
838
839 - def feature_qualifier_name(self, content_list):
840 """When we get a qualifier key, use it as a dictionary key. 841 842 We receive a list of keys, since you can have valueless keys such as 843 /pseudo which would be passed in with the next key (since no other 844 tags separate them in the file) 845 """ 846 for content in content_list: 847 # add a qualifier if we've got one 848 self._add_qualifier() 849 850 # remove the / and = from the qualifier if they're present 851 qual_key = content.replace('/', '') 852 qual_key = qual_key.replace('=', '') 853 qual_key = qual_key.strip() 854 855 self._cur_qualifier_key = qual_key 856 self._cur_qualifier_value = []
857
858 - def feature_qualifier_description(self, content):
859 # get rid of the quotes surrounding the qualifier if we've got 'em 860 qual_value = content.replace('"', '') 861 862 self._cur_qualifier_value.append(qual_value)
863
    def contig_location(self, content):
        """Deal with a location of CONTIG information."""
        from Bio import SeqFeature
        # add a last feature if it hasn't been added,
        # so that we don't overwrite it
        self._add_feature()

        # make a feature to add the information to
        self._cur_feature = SeqFeature.SeqFeature()
        self._cur_feature.type = "contig"

        # now set the location on the feature using the standard
        # location handler
        self.location(content)
        # add the contig information to the annotations and get rid
        # of the feature to prevent it from being added to the feature table
        self.data.annotations["contig"] = self._cur_feature
        self._cur_feature = None
883
    def origin_name(self, content):
        """Ignore the ORIGIN line content (not stored by this consumer)."""
        pass
886
    def base_count(self, content):
        """Ignore BASE COUNT information (not stored by this consumer)."""
        pass
889
    def base_number(self, content):
        """Ignore the base numbers on sequence lines (not stored here)."""
        pass
892
893 - def sequence(self, content):
894 """Add up sequence information as we get it. 895 896 To try and make things speedier, this puts all of the strings 897 into a list of strings, and then uses string.join later to put 898 them together. Supposedly, this is a big time savings 899 """ 900 new_seq = content.replace(' ', '') 901 new_seq = new_seq.upper() 902 903 self._seq_data.append(new_seq)
904
905 - def record_end(self, content):
906 """Clean up when we've finished the record. 907 """ 908 from Bio import Alphabet 909 from Bio.Alphabet import IUPAC 910 from Bio.Seq import Seq 911 912 #Try and append the version number to the accession for the full id 913 if self.data.id.count('.') == 0 : 914 try : 915 self.data.id+='.%i' % self.data.annotations['sequence_version'] 916 except KeyError : 917 pass 918 919 # add the last feature in the table which hasn't been added yet 920 self._add_feature() 921 922 # add the sequence information 923 # first, determine the alphabet 924 # we default to an generic alphabet if we don't have a 925 # seq type or have strange sequence information. 926 seq_alphabet = Alphabet.generic_alphabet 927 928 # now set the sequence 929 sequence = "".join(self._seq_data) 930 931 if self._seq_type: 932 # mRNA is really also DNA, since it is actually cDNA 933 if self._seq_type.find('DNA') != -1 or \ 934 self._seq_type.find('mRNA') != -1: 935 seq_alphabet = IUPAC.ambiguous_dna 936 # are there ever really RNA sequences in GenBank? 937 elif self._seq_type.find('RNA') != -1: 938 #Even for data which was from RNA, the sequence string 939 #is usually given as DNA (T not U). Bug 2408 940 if "T" in sequence and "U" not in sequence: 941 seq_alphabet = IUPAC.ambiguous_dna 942 else : 943 seq_alphabet = IUPAC.ambiguous_rna 944 elif self._seq_type.find('PROTEIN') != -1 : 945 seq_alphabet = IUPAC.protein # or extended protein? 946 # work around ugly GenBank records which have circular or 947 # linear but no indication of sequence type 948 elif self._seq_type in ["circular", "linear"]: 949 pass 950 # we have a bug if we get here 951 else: 952 raise ValueError("Could not determine alphabet for seq_type %s" 953 % self._seq_type) 954 955 self.data.seq = Seq(sequence, seq_alphabet)
956
class _RecordConsumer(_BaseGenBankConsumer):
    """Create a GenBank Record object from scanner generated information.
    """
    def __init__(self):
        _BaseGenBankConsumer.__init__(self)
        # Python 2 implicit relative import of the Record module from this
        # package.
        # NOTE(review): this binds Record only locally inside __init__;
        # reference_num, feature_key and feature_qualifier_name below also
        # use the name Record, so presumably a module-level import exists
        # elsewhere in the file -- confirm.
        import Record
        self.data = Record.Record()

        # per-record parsing state
        self._seq_data = []          # raw sequence chunks, joined at record_end
        self._cur_reference = None   # reference currently being filled in
        self._cur_feature = None     # feature currently being filled in
        self._cur_qualifier = None   # qualifier currently being filled in

    # The handlers below simply copy scanner content onto the matching
    # attribute of the Record being built.

    def locus(self, content):
        self.data.locus = content

    def size(self, content):
        self.data.size = content

    def residue_type(self, content):
        self.data.residue_type = content

    def data_file_division(self, content):
        self.data.data_file_division = content

    def date(self, content):
        self.data.date = content

    def definition(self, content):
        self.data.definition = content

    def accession(self, content):
        # a record can list several accessions; keep them all, without
        # duplicates, in the order first seen
        for acc in self._split_accessions(content) :
            if acc not in self.data.accession :
                self.data.accession.append(acc)

    def nid(self, content):
        self.data.nid = content

    def pid(self, content):
        self.data.pid = content

    def version(self, content):
        self.data.version = content

    def db_source(self, content):
        self.data.db_source = content.rstrip()

    def gi(self, content):
        self.data.gi = content

    def keywords(self, content):
        self.data.keywords = self._split_keywords(content)

    def segment(self, content):
        self.data.segment = content

    def source(self, content):
        self.data.source = content

    def organism(self, content):
        self.data.organism = content

    def taxonomy(self, content):
        self.data.taxonomy = self._split_taxonomy(content)

    def reference_num(self, content):
        """Grab the reference number and signal the start of a new reference.
        """
        # check if we have a reference to add
        if self._cur_reference is not None:
            self.data.references.append(self._cur_reference)

        self._cur_reference = Record.Reference()
        self._cur_reference.number = content

    # The handlers below fill in the reference started by reference_num.

    def reference_bases(self, content):
        self._cur_reference.bases = content

    def authors(self, content):
        self._cur_reference.authors = content

    def consrtm(self, content):
        self._cur_reference.consrtm = content

    def title(self, content):
        self._cur_reference.title = content

    def journal(self, content):
        self._cur_reference.journal = content

    def medline_id(self, content):
        self._cur_reference.medline_id = content

    def pubmed_id(self, content):
        self._cur_reference.pubmed_id = content

    def remark(self, content):
        self._cur_reference.remark = content

    def comment(self, content):
        # NOTE(review): joining content with newlines assumes the scanner
        # hands us a list of comment lines -- confirm against the scanner;
        # if content were a plain string this would insert a newline
        # between every character.
        self.data.comment += "\n".join(content)

    def primary_ref_line(self,content):
        """Data for the PRIMARY line"""
        self.data.primary.append(content)

    def primary(self,content):
        # PRIMARY header line itself carries no data we keep
        pass

    def features_line(self, content):
        """Get ready for the feature table when we reach the FEATURE line.
        """
        self.start_feature_table()

    def start_feature_table(self):
        """Signal the start of the feature table.
        """
        # we need to add on the last reference
        if self._cur_reference is not None:
            self.data.references.append(self._cur_reference)

    def feature_key(self, content):
        """Grab the key of the feature and signal the start of a new feature.
        """
        # first add on feature information if we've got any
        self._add_feature()

        self._cur_feature = Record.Feature()
        self._cur_feature.key = content

    def _add_feature(self):
        """Utility function to add a feature to the Record.

        This does all of the appropriate checking to make sure we haven't
        left any info behind, and that we are only adding info if it
        exists.
        """
        if self._cur_feature is not None:
            # if we have a left over qualifier, add it to the qualifiers
            # on the current feature
            if self._cur_qualifier is not None:
                self._cur_feature.qualifiers.append(self._cur_qualifier)

            self._cur_qualifier = None
            self.data.features.append(self._cur_feature)

    def location(self, content):
        # store the cleaned-up location string on the current feature
        self._cur_feature.location = self._clean_location(content)

    def feature_qualifier_name(self, content_list):
        """Deal with qualifier names

        We receive a list of keys, since you can have valueless keys such as
        /pseudo which would be passed in with the next key (since no other
        tags separate them in the file)
        """
        for content in content_list:
            # the record parser keeps the /s -- add them if we don't have 'em
            if content.find("/") != 0:
                content = "/%s" % content
            # add on a qualifier if we've got one
            if self._cur_qualifier is not None:
                self._cur_feature.qualifiers.append(self._cur_qualifier)

            self._cur_qualifier = Record.Qualifier()
            self._cur_qualifier.key = content

    def feature_qualifier_description(self, content):
        """Set the value of the current qualifier from the description text."""
        # if we have info then the qualifier key should have a ='s
        if self._cur_qualifier.key.find("=") == -1:
            self._cur_qualifier.key = "%s=" % self._cur_qualifier.key
        cur_content = self._remove_newlines(content)
        # remove all spaces from the value if it is a type where spaces
        # are not important
        # (remove_space_keys and the _remove_*/_normalize_* helpers are
        # expected to come from _BaseGenBankConsumer -- not defined here)
        for remove_space_key in self.__class__.remove_space_keys:
            if self._cur_qualifier.key.find(remove_space_key) >= 0:
                cur_content = self._remove_spaces(cur_content)
        self._cur_qualifier.value = self._normalize_spaces(cur_content)

    def base_count(self, content):
        self.data.base_counts = content

    def origin_name(self, content):
        self.data.origin = content

    def contig_location(self, content):
        """Signal that we have contig information to add to the record.
        """
        self.data.contig = self._clean_location(content)

    def sequence(self, content):
        """Add sequence information to a list of sequence strings.

        This removes spaces in the data and uppercases the sequence, and
        then adds it to a list of sequences. Later on we'll join this
        list together to make the final sequence. This is faster than
        adding on the new string every time.
        """
        new_seq = content.replace(' ', '')
        self._seq_data.append(new_seq.upper())

    def record_end(self, content):
        """Signal the end of the record and do any necessary clean-up.
        """
        # add together all of the sequence parts to create the
        # final sequence string
        self.data.sequence = "".join(self._seq_data)
        # add on the last feature
        self._add_feature()
1167 1168
class NCBIDictionary:
    """Access GenBank using a read-only dictionary interface.
    """
    VALID_DATABASES = ['nucleotide', 'protein', 'genome']
    VALID_FORMATS = ['genbank', 'fasta']

    def __init__(self, database, format, parser = None):
        """Initialize an NCBI dictionary to retrieve sequences.

        Create a new Dictionary to access GenBank.  Valid values for
        database are 'nucleotide' and 'protein'.
        Valid values for format are 'genbank' (for nucleotide genbank and
        protein genpept) and 'fasta'.
        parser is an optional parser object to change the results into
        another form.  If unspecified, then the raw contents of the file
        will be returned.

        Raises ValueError when database or format is not recognized.
        """
        self.parser = parser
        if database not in self.__class__.VALID_DATABASES:
            raise ValueError("Invalid database %s, should be one of %s" %
                             (database, self.__class__.VALID_DATABASES))
        if format not in self.__class__.VALID_FORMATS:
            raise ValueError("Invalid format %s, should be one of %s" %
                             (format, self.__class__.VALID_FORMATS))

        # Entrez uses 'gb' as the rettype for GenBank flat files
        if format == "genbank":
            format = "gb"
        self.db = database
        self.format = format

    # The write/inspect parts of the dictionary interface are deliberately
    # unsupported: GenBank is remote and effectively read-only.  The
    # exceptions use the call form raise E("msg") -- valid in both Python 2
    # and 3, and consistent with the ValueError calls above -- instead of
    # the Python 2 only statement form raise E, "msg".
    def __len__(self):
        raise NotImplementedError("GenBank contains lots of entries")

    def clear(self):
        raise NotImplementedError("This is a read-only dictionary")

    def __setitem__(self, key, item):
        raise NotImplementedError("This is a read-only dictionary")

    def update(self):
        raise NotImplementedError("This is a read-only dictionary")

    def copy(self):
        raise NotImplementedError("You don't need to do this...")

    def keys(self):
        raise NotImplementedError("You don't really want to do this...")

    def items(self):
        raise NotImplementedError("You don't really want to do this...")

    def values(self):
        raise NotImplementedError("You don't really want to do this...")

    def has_key(self, id):
        """S.has_key(id) -> bool"""
        try:
            self[id]
        except KeyError:
            return 0
        return 1

    def get(self, id, failobj=None):
        """Return the entry for id, or failobj when the fetch fails."""
        try:
            return self[id]
        except KeyError:
            return failobj
        # the unreachable 'raise "How did I get here?"' was removed: both
        # branches above return, and raising a plain string is illegal

    def __getitem__(self, id):
        """Return the GenBank entry specified by the GenBank ID.

        Raises a KeyError if there's an error.
        """
        handle = Entrez.efetch(db = self.db, id = id, rettype = self.format)
        # Parse the record if a parser was passed in.
        if self.parser is not None:
            return self.parser.parse(handle)
        return handle.read()
1241
def search_for(search, database='nucleotide',
               reldate=None, mindate=None, maxdate=None,
               start_id = 0, max_ids = 50000000):
    """search_for(search[, reldate][, mindate][, maxdate]
    [, start_id][, max_ids]) -> ids

    Search GenBank and return a list of the GenBank identifiers (gi's)
    matching the query string search.  Valid values for database are
    'nucleotide', 'protein', 'popset' and 'genome'.  reldate restricts
    the search to the given number of days before today, while mindate
    and maxdate (e.g. 2002/01/01) restrict it to a date range.  start_id
    is the offset to begin retrieval at, and max_ids caps how many ids
    are returned.

    batchsize, delay and callback_fn are old parameters kept only for
    compatibility -- do not set them.
    """
    # translate the date arguments into an EUtils restriction object;
    # reldate wins over the mindate/maxdate range when both are given
    if reldate:
        date_restrict = EUtils.WithinNDays(reldate)
    elif mindate:
        date_restrict = EUtils.DateRange(mindate, maxdate)
    else:
        date_restrict = None

    client = DBIdsClient.DBIdsClient()
    results = client.search(search, database, daterange = date_restrict,
                            retstart = start_id, retmax = max_ids)
    # each search result wraps a DBIds record; the gi is its first id
    return [result.dbids.ids[0] for result in results]
1274
def download_many(ids, database = 'nucleotide'):
    """download_many(ids, database) -> handle of results

    Download many records from GenBank.  ids is a list of gis or
    accessions.

    callback_fn, broken_fn, delay, faildelay, batchsize, parser are old
    parameters kept only for compatibility.  They should not be used.
    """
    # map the database name onto the Entrez rettype for its flat file
    rettype_by_db = {'nucleotide': 'gb', 'protein': 'gp'}
    if database not in rettype_by_db:
        raise ValueError("Unexpected database: %s" % database)
    format = rettype_by_db[database]

    client = DBIdsClient.from_dbids(DBIds(database, ids))
    result_handle = client.efetch(retmode = "text", rettype = format)
    # copy the response into an in-memory handle so the caller does not
    # have to manage the network handle's lifetime
    return cStringIO.StringIO(result_handle.read())
1295