Package Bio :: Package GenBank :: Module Scanner
[hide private]
[frames] | no frames]

Source Code for Module Bio.GenBank.Scanner

   1  # Copyright 2007-2009 by Peter Cock.  All rights reserved. 
   2  # This code is part of the Biopython distribution and governed by its 
   3  # license.  Please see the LICENSE file that should have been included 
   4  # as part of this package. 
   5  # 
   6  # This code is NOT intended for direct use.  It provides a basic scanner 
   7  # (for use with a event consumer such as Bio.GenBank._FeatureConsumer) 
   8  # to parse a GenBank or EMBL file (with their shared INSDC feature table). 
   9  # 
  10  # It is used by Bio.GenBank to parse GenBank files 
  11  # It is also used by Bio.SeqIO to parse GenBank and EMBL files 
  12  # 
  13  # Feature Table Documentation: 
  14  # http://www.insdc.org/files/feature_table.html 
  15  # http://www.ncbi.nlm.nih.gov/projects/collab/FT/index.html 
  16  # ftp://ftp.ncbi.nih.gov/genbank/docs/ 
  17  # 
  18  # 17-MAR-2009: added wgs, wgs_scafld for GenBank whole genome shotgun master records. 
  19  # These are GenBank files that summarize the content of a project, and provide lists of 
  20  # scaffold and contig files in the project. These will be in annotations['wgs'] and 
  21  # annotations['wgs_scafld']. These GenBank files do not have sequences. See 
  22  # http://groups.google.com/group/bionet.molbio.genbank/browse_thread/thread/51fb88bf39e7dc36 
  23  # http://is.gd/nNgk 
  24  # for more details of this format, and an example. 
  25  # Added by Ying Huang & Iddo Friedberg 
  26   
  27  import warnings 
  28  import os 
  29  from Bio.Seq import Seq 
  30  from Bio.SeqRecord import SeqRecord 
  31  from Bio.Alphabet import generic_alphabet, generic_protein 
  32   
33 -class InsdcScanner :
34 """Basic functions for breaking up a GenBank/EMBL file into sub sections. 35 36 The International Nucleotide Sequence Database Collaboration (INSDC) 37 between the DDBJ, EMBL, and GenBank. These organisations all use the 38 same "Feature Table" layout in their plain text flat file formats. 39 40 However, the header and sequence sections of an EMBL file are very 41 different in layout to those produced by GenBank/DDBJ.""" 42 43 #These constants get redefined with sensible values in the sub classes: 44 RECORD_START = "XXX" # "LOCUS " or "ID " 45 HEADER_WIDTH = 3 # 12 or 5 46 FEATURE_START_MARKERS = ["XXX***FEATURES***XXX"] 47 FEATURE_END_MARKERS = ["XXX***END FEATURES***XXX"] 48 FEATURE_QUALIFIER_INDENT = 0 49 FEATURE_QUALIFIER_SPACER = "" 50 SEQUENCE_HEADERS=["XXX"] #with right hand side spaces removed 51
52 - def __init__(self, debug=0) :
53 assert len(self.RECORD_START)==self.HEADER_WIDTH 54 for marker in self.SEQUENCE_HEADERS : 55 assert marker==marker.rstrip() 56 assert len(self.FEATURE_QUALIFIER_SPACER)==self.FEATURE_QUALIFIER_INDENT 57 self.debug = debug 58 self.line = None
59
60 - def set_handle(self, handle) :
61 self.handle = handle 62 self.line = ""
63
64 - def find_start(self) :
65 """Read in lines until find the ID/LOCUS line, which is returned. 66 67 Any preamble (such as the header used by the NCBI on *.seq.gz archives) 68 will we ignored.""" 69 while True : 70 if self.line : 71 line = self.line 72 self.line = "" 73 else : 74 line = self.handle.readline() 75 if not line : 76 if self.debug : print "End of file" 77 return None 78 if line[:self.HEADER_WIDTH]==self.RECORD_START : 79 if self.debug > 1: print "Found the start of a record:\n" + line 80 break 81 line = line.rstrip() 82 if line == "//" : 83 if self.debug > 1: print "Skipping // marking end of last record" 84 elif line == "" : 85 if self.debug > 1: print "Skipping blank line before record" 86 else : 87 #Ignore any header before the first ID/LOCUS line. 88 if self.debug > 1: 89 print "Skipping header line before record:\n" + line 90 self.line = line 91 return line
92
93 - def parse_header(self) :
94 """Return list of strings making up the header 95 96 New line characters are removed. 97 98 Assumes you have just read in the ID/LOCUS line. 99 """ 100 assert self.line[:self.HEADER_WIDTH]==self.RECORD_START, \ 101 "Not at start of record" 102 103 header_lines = [] 104 while True : 105 line = self.handle.readline() 106 if not line : 107 raise ValueError("Premature end of line during sequence data") 108 line = line.rstrip() 109 if line in self.FEATURE_START_MARKERS : 110 if self.debug : print "Found header table" 111 break 112 #if line[:self.HEADER_WIDTH]==self.FEATURE_START_MARKER[:self.HEADER_WIDTH] : 113 # if self.debug : print "Found header table (?)" 114 # break 115 if line[:self.HEADER_WIDTH].rstrip() in self.SEQUENCE_HEADERS : 116 if self.debug : print "Found start of sequence" 117 break 118 if line == "//" : 119 raise ValueError("Premature end of sequence data marker '//' found") 120 header_lines.append(line) 121 self.line = line 122 return header_lines
123
124 - def parse_features(self, skip=False) :
125 """Return list of tuples for the features (if present) 126 127 Each feature is returned as a tuple (key, location, qualifiers) 128 where key and location are strings (e.g. "CDS" and 129 "complement(join(490883..490885,1..879))") while qualifiers 130 is a list of two string tuples (feature qualifier keys and values). 131 132 Assumes you have already read to the start of the features table. 133 """ 134 if self.line.rstrip() not in self.FEATURE_START_MARKERS : 135 if self.debug : print "Didn't find any feature table" 136 return [] 137 138 while self.line.rstrip() in self.FEATURE_START_MARKERS : 139 self.line = self.handle.readline() 140 141 features = [] 142 line = self.line 143 while True : 144 if not line : 145 raise ValueError("Premature end of line during features table") 146 if line[:self.HEADER_WIDTH].rstrip() in self.SEQUENCE_HEADERS : 147 if self.debug : print "Found start of sequence" 148 break 149 line = line.rstrip() 150 if line == "//" : 151 raise ValueError("Premature end of features table, marker '//' found") 152 if line in self.FEATURE_END_MARKERS : 153 if self.debug : print "Found end of features" 154 line = self.handle.readline() 155 break 156 if line[2:self.FEATURE_QUALIFIER_INDENT].strip() == "" : 157 raise ValueError("Expected a feature qualifier in line '%s'" % line) 158 159 if skip : 160 line = self.handle.readline() 161 while line[:self.FEATURE_QUALIFIER_INDENT] == self.FEATURE_QUALIFIER_SPACER : 162 line = self.handle.readline() 163 else : 164 #Build up a list of the lines making up this feature: 165 feature_key = line[2:self.FEATURE_QUALIFIER_INDENT].strip() 166 feature_lines = [line[self.FEATURE_QUALIFIER_INDENT:]] 167 line = self.handle.readline() 168 while line[:self.FEATURE_QUALIFIER_INDENT] == self.FEATURE_QUALIFIER_SPACER \ 169 or line.rstrip() == "" : # cope with blank lines in the midst of a feature 170 feature_lines.append(line[self.FEATURE_QUALIFIER_INDENT:].rstrip()) 171 line = self.handle.readline() 172 features.append(self.parse_feature(feature_key, feature_lines)) 173 self.line = line 174 return features
175
176 - def parse_feature(self, feature_key, lines) :
177 """Expects a feature as a list of strings, returns a tuple (key, location, qualifiers) 178 179 For example given this GenBank feature: 180 181 CDS complement(join(490883..490885,1..879)) 182 /locus_tag="NEQ001" 183 /note="conserved hypothetical [Methanococcus jannaschii]; 184 COG1583:Uncharacterized ACR; IPR001472:Bipartite nuclear 185 localization signal; IPR002743: Protein of unknown 186 function DUF57" 187 /codon_start=1 188 /transl_table=11 189 /product="hypothetical protein" 190 /protein_id="NP_963295.1" 191 /db_xref="GI:41614797" 192 /db_xref="GeneID:2732620" 193 /translation="MRLLLELKALNSIDKKQLSNYLIQGFIYNILKNTEYSWLHNWKK 194 EKYFNFTLIPKKDIIENKRYYLIISSPDKRFIEVLHNKIKDLDIITIGLAQFQLRKTK 195 KFDPKLRFPWVTITPIVLREGKIVILKGDKYYKVFVKRLEELKKYNLIKKKEPILEEP 196 IEISLNQIKDGWKIIDVKDRYYDFRNKSFSAFSNWLRDLKEQSLRKYNNFCGKNFYFE 197 EAIFEGFTFYKTVSIRIRINRGEAVYIGTLWKELNVYRKLDKEEREFYKFLYDCGLGS 198 LNSMGFGFVNTKKNSAR" 199 200 Then should give input key="CDS" and the rest of the data as a list of strings 201 lines=["complement(join(490883..490885,1..879))", ..., "LNSMGFGFVNTKKNSAR"] 202 where the leading spaces and trailing newlines have been removed. 203 204 Returns tuple containing: (key as string, location string, qualifiers as list) 205 as follows for this example: 206 207 key = "CDS", string 208 location = "complement(join(490883..490885,1..879))", string 209 qualifiers = list of string tuples: 210 211 [('locus_tag', '"NEQ001"'), 212 ('note', '"conserved hypothetical [Methanococcus jannaschii];\nCOG1583:..."'), 213 ('codon_start', '1'), 214 ('transl_table', '11'), 215 ('product', '"hypothetical protein"'), 216 ('protein_id', '"NP_963295.1"'), 217 ('db_xref', '"GI:41614797"'), 218 ('db_xref', '"GeneID:2732620"'), 219 ('translation', '"MRLLLELKALNSIDKKQLSNYLIQGFIYNILKNTEYSWLHNWKK\nEKYFNFT..."')] 220 221 In the above example, the "note" and "translation" were edited for compactness, 222 and they would contain multiple new line characters (displayed above as \n) 223 224 If a qualifier is quoted (in this case, everything except codon_start and 225 transl_table) then the quotes are NOT removed. 226 227 Note that no whitespace is removed. 228 """ 229 #Skip any blank lines 230 iterator = iter(filter(None, lines)) 231 try : 232 line = iterator.next() 233 234 feature_location = line.strip() 235 while feature_location[-1:]=="," : 236 #Multiline location, still more to come! 237 feature_location += iterator.next().strip() 238 239 qualifiers=[] 240 241 for line in iterator : 242 if line[0]=="/" : 243 #New qualifier 244 i = line.find("=") 245 key = line[1:i] #does not work if i==-1 246 value = line[i+1:] #we ignore 'value' if i==-1 247 if i==-1 : 248 #Qualifier with no key, e.g. /pseudo 249 key = line[1:] 250 qualifiers.append((key,None)) 251 elif value[0]=='"' : 252 #Quoted... 253 if value[-1]!='"' or value!='"' : 254 #No closing quote on the first line... 255 while value[-1] != '"' : 256 value += "\n" + iterator.next() 257 else : 258 #One single line (quoted) 259 assert value == '"' 260 if self.debug : print "Quoted line %s:%s" % (key, value) 261 #DO NOT remove the quotes... 262 qualifiers.append((key,value)) 263 else : 264 #Unquoted 265 #if debug : print "Unquoted line %s:%s" % (key,value) 266 qualifiers.append((key,value)) 267 else : 268 #Unquoted continuation 269 assert len(qualifiers) > 0 270 assert key==qualifiers[-1][0] 271 #if debug : print "Unquoted Cont %s:%s" % (key, line) 272 qualifiers[-1] = (key, qualifiers[-1][1] + "\n" + line) 273 return (feature_key, feature_location, qualifiers) 274 except StopIteration: 275 #Bummer 276 raise ValueError("Problem with '%s' feature:\n%s" \ 277 % (feature_key, "\n".join(lines)))
278 299
300 - def _feed_first_line(self, consumer, line) :
301 """Handle the LOCUS/ID line, passing data to the comsumer 302 303 This should be implemented by the EMBL / GenBank specific subclass 304 305 Used by the parse_records() and parse() methods. 306 """ 307 pass
308
309 - def _feed_header_lines(self, consumer, lines) :
310 """Handle the header lines (list of strings), passing data to the comsumer 311 312 This should be implemented by the EMBL / GenBank specific subclass 313 314 Used by the parse_records() and parse() methods. 315 """ 316 pass
317 318
319 - def _feed_feature_table(self, consumer, feature_tuples) :
320 """Handle the feature table (list of tuples), passing data to the comsumer 321 322 Used by the parse_records() and parse() methods. 323 """ 324 consumer.start_feature_table() 325 for feature_key, location_string, qualifiers in feature_tuples : 326 consumer.feature_key(feature_key) 327 consumer.location(location_string) 328 for q_key, q_value in qualifiers : 329 consumer.feature_qualifier_name([q_key]) 330 if q_value is not None : 331 consumer.feature_qualifier_description(q_value.replace("\n"," "))
332
333 - def _feed_misc_lines(self, consumer, lines) :
334 """Handle any lines between features and sequence (list of strings), passing data to the consumer 335 336 This should be implemented by the EMBL / GenBank specific subclass 337 338 Used by the parse_records() and parse() methods. 339 """ 340 pass
341
342 - def feed(self, handle, consumer, do_features=True) :
343 """Feed a set of data into the consumer. 344 345 This method is intended for use with the "old" code in Bio.GenBank 346 347 Arguments: 348 handle - A handle with the information to parse. 349 consumer - The consumer that should be informed of events. 350 do_features - Boolean, should the features be parsed? 351 Skipping the features can be much faster. 352 353 Return values: 354 true - Passed a record 355 false - Did not find a record 356 """ 357 #Should work with both EMBL and GenBank files provided the 358 #equivalent Bio.GenBank._FeatureConsumer methods are called... 359 self.set_handle(handle) 360 if not self.find_start() : 361 #Could not find (another) record 362 consumer.data=None 363 return False 364 365 #We use the above class methods to parse the file into a simplified format. 366 #The first line, header lines and any misc lines after the features will be 367 #dealt with by GenBank / EMBL specific derived classes. 368 369 #First line and header: 370 self._feed_first_line(consumer, self.line) 371 self._feed_header_lines(consumer, self.parse_header()) 372 373 #Features (common to both EMBL and GenBank): 374 if do_features : 375 self._feed_feature_table(consumer, self.parse_features(skip=False)) 376 else : 377 self.parse_features(skip=True) # ignore the data 378 379 #Footer and sequence 380 misc_lines, sequence_string = self.parse_footer() 381 self._feed_misc_lines(consumer, misc_lines) 382 383 consumer.sequence(sequence_string) 384 #Calls to consumer.base_number() do nothing anyway 385 consumer.record_end("//") 386 387 assert self.line == "//" 388 389 #And we are done 390 return True
391
392 - def parse(self, handle, do_features=True) :
393 """Returns a SeqRecord (with SeqFeatures if do_features=True) 394 395 See also the method parse_records() for use on multi-record files. 396 """ 397 from Bio.GenBank import _FeatureConsumer 398 from Bio.GenBank.utils import FeatureValueCleaner 399 400 consumer = _FeatureConsumer(use_fuzziness = 1, 401 feature_cleaner = FeatureValueCleaner()) 402 403 if self.feed(handle, consumer, do_features) : 404 return consumer.data 405 else : 406 return None
407 408
409 - def parse_records(self, handle, do_features=True) :
410 """Returns a SeqRecord object iterator 411 412 Each record (from the ID/LOCUS line to the // line) becomes a SeqRecord 413 414 The SeqRecord objects include SeqFeatures if do_features=True 415 416 This method is intended for use in Bio.SeqIO 417 """ 418 #This is a generator function 419 while True : 420 record = self.parse(handle, do_features) 421 if record is None : break 422 assert record.id is not None 423 assert record.name != "<unknown name>" 424 assert record.description != "<unknown description>" 425 yield record
426
427 - def parse_cds_features(self, handle, 428 alphabet=generic_protein, 429 tags2id=('protein_id','locus_tag','product')) :
430 """Returns SeqRecord object iterator 431 432 Each CDS feature becomes a SeqRecord. 433 434 alphabet - Used for any sequence found in a translation field. 435 tags2id - Tupple of three strings, the feature keys to use 436 for the record id, name and description, 437 438 This method is intended for use in Bio.SeqIO 439 """ 440 self.set_handle(handle) 441 while self.find_start() : 442 #Got an EMBL or GenBank record... 443 self.parse_header() # ignore header lines! 444 feature_tuples = self.parse_features() 445 #self.parse_footer() # ignore footer lines! 446 while True : 447 line = self.handle.readline() 448 if not line : break 449 if line[:2]=="//" : break 450 self.line = line.rstrip() 451 452 #Now go though those features... 453 for key, location_string, qualifiers in feature_tuples : 454 if key=="CDS" : 455 #Create SeqRecord 456 #================ 457 #SeqRecord objects cannot be created with annotations, they 458 #must be added afterwards. So create an empty record and 459 #then populate it: 460 record = SeqRecord(seq=None) 461 annotations = record.annotations 462 463 #Should we add a location object to the annotations? 464 #I *think* that only makes sense for SeqFeatures with their 465 #sub features... 466 annotations['raw_location'] = location_string.replace(' ','') 467 468 for (qualifier_name, qualifier_data) in qualifiers : 469 if qualifier_data is not None \ 470 and qualifier_data[0]=='"' and qualifier_data[-1]=='"' : 471 #Remove quotes 472 qualifier_data = qualifier_data[1:-1] 473 #Append the data to the annotation qualifier... 474 if qualifier_name == "translation" : 475 assert record.seq is None, "Multiple translations!" 476 record.seq = Seq(qualifier_data.replace("\n",""), alphabet) 477 elif qualifier_name == "db_xref" : 478 #its a list, possibly empty. Its safe to extend 479 record.dbxrefs.append(qualifier_data) 480 else : 481 if qualifier_data is not None : 482 qualifier_data = qualifier_data.replace("\n"," ").replace(" "," ") 483 try : 484 annotations[qualifier_name] += " " + qualifier_data 485 except KeyError : 486 #Not an addition to existing data, its the first bit 487 annotations[qualifier_name]= qualifier_data 488 489 #Fill in the ID, Name, Description 490 #================================= 491 try : 492 record.id = annotations[tags2id[0]] 493 except KeyError : 494 pass 495 try : 496 record.name = annotations[tags2id[1]] 497 except KeyError : 498 pass 499 try : 500 record.description = annotations[tags2id[2]] 501 except KeyError : 502 pass 503 504 yield record
505
506 -class EmblScanner(InsdcScanner) :
507 """For extracting chunks of information in EMBL files""" 508 509 RECORD_START = "ID " 510 HEADER_WIDTH = 5 511 FEATURE_START_MARKERS = ["FH Key Location/Qualifiers","FH"] 512 FEATURE_END_MARKERS = ["XX"] #XX can also mark the end of many things! 513 FEATURE_QUALIFIER_INDENT = 21 514 FEATURE_QUALIFIER_SPACER = "FT" + " " * (FEATURE_QUALIFIER_INDENT-2) 515 SEQUENCE_HEADERS=["SQ"] #Remove trailing spaces 516 550
551 - def _feed_first_line(self, consumer, line) :
552 assert line[:self.HEADER_WIDTH].rstrip() == "ID" 553 if line[self.HEADER_WIDTH:].count(";") == 6 : 554 #Looks like the semi colon separated style introduced in 2006 555 self._feed_first_line_new(consumer, line) 556 elif line[self.HEADER_WIDTH:].count(";") == 3 : 557 #Looks like the pre 2006 style 558 self._feed_first_line_old(consumer, line) 559 else : 560 raise ValueError('Did not recognise the ID line layout:\n' + line)
561
562 - def _feed_first_line_old(self, consumer, line) :
563 #Expects an ID line in the style before 2006, e.g. 564 #ID SC10H5 standard; DNA; PRO; 4870 BP. 565 #ID BSUB9999 standard; circular DNA; PRO; 4214630 BP. 566 assert line[:self.HEADER_WIDTH].rstrip() == "ID" 567 fields = [line[self.HEADER_WIDTH:].split(None,1)[0]] 568 fields.extend(line[self.HEADER_WIDTH:].split(None,1)[1].split(";")) 569 fields = [entry.strip() for entry in fields] 570 """ 571 The tokens represent: 572 0. Primary accession number 573 (space sep) 574 1. ??? (e.g. standard) 575 (semi-colon) 576 2. Topology and/or Molecule type (e.g. 'circular DNA' or 'DNA') 577 3. Taxonomic division (e.g. 'PRO') 578 4. Sequence length (e.g. '4639675 BP.') 579 """ 580 consumer.locus(fields[0]) #Should we also call the accession consumer? 581 consumer.residue_type(fields[2]) 582 consumer.data_file_division(fields[3]) 583 self._feed_seq_length(consumer, fields[4])
584
585 - def _feed_first_line_new(self, consumer, line) :
586 #Expects an ID line in the style introduced in 2006, e.g. 587 #ID X56734; SV 1; linear; mRNA; STD; PLN; 1859 BP. 588 #ID CD789012; SV 4; linear; genomic DNA; HTG; MAM; 500 BP. 589 assert line[:self.HEADER_WIDTH].rstrip() == "ID" 590 fields = [data.strip() for data in line[self.HEADER_WIDTH:].strip().split(";")] 591 assert len(fields) == 7 592 """ 593 The tokens represent: 594 0. Primary accession number 595 1. Sequence version number 596 2. Topology: 'circular' or 'linear' 597 3. Molecule type (e.g. 'genomic DNA') 598 4. Data class (e.g. 'STD') 599 5. Taxonomic division (e.g. 'PRO') 600 6. Sequence length (e.g. '4639675 BP.') 601 """ 602 603 consumer.locus(fields[0]) 604 605 #Call the accession consumer now, to make sure we record 606 #something as the record.id, in case there is no AC line 607 consumer.accession(fields[0]) 608 609 #TODO - How to deal with the version field? At the moment the consumer 610 #will try and use this for the ID which isn't ideal for EMBL files. 611 version_parts = fields[1].split() 612 if len(version_parts)==2 \ 613 and version_parts[0]=="SV" \ 614 and version_parts[1].isdigit() : 615 consumer.version_suffix(version_parts[1]) 616 617 #Based on how the old GenBank parser worked, merge these two: 618 consumer.residue_type(" ".join(fields[2:4])) #TODO - Store as two fields? 619 620 #consumer.xxx(fields[4]) #TODO - What should we do with the data class? 621 622 consumer.data_file_division(fields[5]) 623 624 self._feed_seq_length(consumer, fields[6])
625
626 - def _feed_seq_length(self, consumer, text) :
627 length_parts = text.split() 628 assert len(length_parts) == 2 629 assert length_parts[1].upper() in ["BP", "BP."] 630 consumer.size(length_parts[0])
631
632 - def _feed_header_lines(self, consumer, lines) :
633 EMBL_INDENT = self.HEADER_WIDTH 634 EMBL_SPACER = " " * EMBL_INDENT 635 consumer_dict = { 636 'AC' : 'accession', 637 'SV' : 'version', # SV line removed in June 2006, now part of ID line 638 'DE' : 'definition', 639 #'RN' : 'reference_num', 640 #'RP' : 'reference_bases', 641 #'RX' : reference cross reference... DOI or Pubmed 642 'RA' : 'authors', 643 'RT' : 'title', 644 'RL' : 'journal', 645 'OS' : 'organism', 646 'OC' : 'taxonomy', 647 #'DR' : data reference? 648 'CC' : 'comment', 649 #'XX' : splitter 650 } 651 #We have to handle the following specially: 652 #RX (depending on reference type...) 653 lines = filter(None,lines) 654 line_iter = iter(lines) 655 try : 656 while True : 657 try : 658 line = line_iter.next() 659 except StopIteration : 660 break 661 if not line : break 662 line_type = line[:EMBL_INDENT].strip() 663 data = line[EMBL_INDENT:].strip() 664 665 if line_type == 'XX' : 666 pass 667 elif line_type == 'RN' : 668 # Reformat reference numbers for the GenBank based consumer 669 # e.g. '[1]' becomes '1' 670 if data[0] == "[" and data[-1] == "]" : data = data[1:-1] 671 consumer.reference_num(data) 672 elif line_type == 'RP' : 673 # Reformat reference numbers for the GenBank based consumer 674 # e.g. '1-4639675' becomes '(bases 1 to 4639675)' 675 assert data.count("-")==1 676 consumer.reference_bases("(bases " + data.replace("-", " to ") + ")") 677 elif line_type == 'RX' : 678 # EMBL support three reference types at the moment: 679 # - PUBMED PUBMED bibliographic database (NLM) 680 # - DOI Digital Object Identifier (International DOI Foundation) 681 # - AGRICOLA US National Agriculture Library (NAL) of the US Department 682 # of Agriculture (USDA) 683 # 684 # Format: 685 # RX resource_identifier; identifier. 686 # 687 # e.g. 688 # RX DOI; 10.1016/0024-3205(83)90010-3. 689 # RX PUBMED; 264242. 690 # 691 # Currently our reference object only supports PUBMED and MEDLINE 692 # (as these were in GenBank files?). 693 key, value = data.split(";",1) 694 if value.endswith(".") : value = value[:-1] 695 value = value.strip() 696 if key == "PUBMED" : 697 consumer.pubmed_id(value) 698 #TODO - Handle other reference types (here and in BioSQL bindings) 699 elif line_type == 'CC' : 700 # Have to pass a list of strings for this one (not just a string) 701 consumer.comment([data]) 702 elif line_type == 'DR' : 703 # Database Cross-reference, format: 704 # DR database_identifier; primary_identifier; secondary_identifier. 705 # 706 # e.g. 707 # DR MGI; 98599; Tcrb-V4. 708 # 709 # TODO - Data reference... 710 # How should we store the secondary identifier (if present)? Ignore it? 711 pass 712 elif line_type in consumer_dict : 713 #Its a semi-automatic entry! 714 getattr(consumer, consumer_dict[line_type])(data) 715 else : 716 if self.debug : 717 print "Ignoring EMBL header line:\n%s" % line 718 except StopIteration : 719 raise ValueError("Problem with header")
720
721 - def _feed_misc_lines(self, consumer, lines) :
722 #TODO - Should we do something with the information on the SQ line(s)? 723 pass
724
725 -class GenBankScanner(InsdcScanner) :
726 """For extracting chunks of information in GenBank files""" 727 728 RECORD_START = "LOCUS " 729 HEADER_WIDTH = 12 730 FEATURE_START_MARKERS = ["FEATURES Location/Qualifiers","FEATURES"] 731 FEATURE_END_MARKERS = [] 732 FEATURE_QUALIFIER_INDENT = 21 733 FEATURE_QUALIFIER_SPACER = " " * FEATURE_QUALIFIER_INDENT 734 SEQUENCE_HEADERS=["CONTIG", "ORIGIN", "BASE COUNT", "WGS"] # trailing spaces removed 735 778
779 - def _feed_first_line(self, consumer, line) :
780 ##################################### 781 # LOCUS line # 782 ##################################### 783 GENBANK_INDENT = self.HEADER_WIDTH 784 GENBANK_SPACER = " "*GENBANK_INDENT 785 assert line[0:GENBANK_INDENT] == 'LOCUS ', \ 786 'LOCUS line does not start correctly:\n' + line 787 788 #Have to break up the locus line, and handle the different bits of it. 789 #There are at least two different versions of the locus line... 790 if line[29:33] in [' bp ', ' aa ',' rc '] : 791 #Old... 792 # 793 # Positions Contents 794 # --------- -------- 795 # 00:06 LOCUS 796 # 06:12 spaces 797 # 12:?? Locus name 798 # ??:?? space 799 # ??:29 Length of sequence, right-justified 800 # 29:33 space, bp, space 801 # 33:41 strand type 802 # 41:42 space 803 # 42:51 Blank (implies linear), linear or circular 804 # 51:52 space 805 # 52:55 The division code (e.g. BCT, VRL, INV) 806 # 55:62 space 807 # 62:73 Date, in the form dd-MMM-yyyy (e.g., 15-MAR-1991) 808 # 809 assert line[29:33] in [' bp ', ' aa ',' rc '] , \ 810 'LOCUS line does not contain size units at expected position:\n' + line 811 assert line[41:42] == ' ', \ 812 'LOCUS line does not contain space at position 42:\n' + line 813 assert line[42:51].strip() in ['','linear','circular'], \ 814 'LOCUS line does not contain valid entry (linear, circular, ...):\n' + line 815 assert line[51:52] == ' ', \ 816 'LOCUS line does not contain space at position 52:\n' + line 817 assert line[55:62] == ' ', \ 818 'LOCUS line does not contain spaces from position 56 to 62:\n' + line 819 if line[62:73].strip() : 820 assert line[64:65] == '-', \ 821 'LOCUS line does not contain - at position 65 in date:\n' + line 822 assert line[68:69] == '-', \ 823 'LOCUS line does not contain - at position 69 in date:\n' + line 824 825 name_and_length_str = line[GENBANK_INDENT:29] 826 while name_and_length_str.find(' ')!=-1 : 827 name_and_length_str = name_and_length_str.replace(' ',' ') 828 name_and_length = name_and_length_str.split(' ') 829 assert len(name_and_length)<=2, \ 830 'Cannot parse the name and length in the LOCUS line:\n' + line 831 assert len(name_and_length)!=1, \ 832 'Name and length collide in the LOCUS line:\n' + line 833 #Should be possible to split them based on position, if 834 #a clear definition of the standard exists THAT AGREES with 835 #existing files. 836 consumer.locus(name_and_length[0]) 837 consumer.size(name_and_length[1]) 838 #consumer.residue_type(line[33:41].strip()) 839 840 if line[33:51].strip() == "" and line[29:33] == ' aa ' : 841 #Amino acids -> protein (even if there is no residue type given) 842 #We want to use a protein alphabet in this case, rather than a 843 #generic one. Not sure if this is the best way to achieve this, 844 #but it works because the scanner checks for this: 845 consumer.residue_type("PROTEIN") 846 else : 847 consumer.residue_type(line[33:51].strip()) 848 849 consumer.data_file_division(line[52:55]) 850 if line[62:73].strip() : 851 consumer.date(line[62:73]) 852 elif line[40:44] in [' bp ', ' aa ',' rc '] : 853 #New... 854 # 855 # Positions Contents 856 # --------- -------- 857 # 00:06 LOCUS 858 # 06:12 spaces 859 # 12:?? Locus name 860 # ??:?? space 861 # ??:40 Length of sequence, right-justified 862 # 40:44 space, bp, space 863 # 44:47 Blank, ss-, ds-, ms- 864 # 47:54 Blank, DNA, RNA, tRNA, mRNA, uRNA, snRNA, cDNA 865 # 54:55 space 866 # 55:63 Blank (implies linear), linear or circular 867 # 63:64 space 868 # 64:67 The division code (e.g. BCT, VRL, INV) 869 # 67:68 space 870 # 68:79 Date, in the form dd-MMM-yyyy (e.g., 15-MAR-1991) 871 # 872 assert line[40:44] in [' bp ', ' aa ',' rc '] , \ 873 'LOCUS line does not contain size units at expected position:\n' + line 874 assert line[44:47] in [' ', 'ss-', 'ds-', 'ms-'], \ 875 'LOCUS line does not have valid strand type (Single stranded, ...):\n' + line 876 assert line[47:54].strip() == "" \ 877 or line[47:54].strip().find('DNA') != -1 \ 878 or line[47:54].strip().find('RNA') != -1, \ 879 'LOCUS line does not contain valid sequence type (DNA, RNA, ...):\n' + line 880 assert line[54:55] == ' ', \ 881 'LOCUS line does not contain space at position 55:\n' + line 882 assert line[55:63].strip() in ['','linear','circular'], \ 883 'LOCUS line does not contain valid entry (linear, circular, ...):\n' + line 884 assert line[63:64] == ' ', \ 885 'LOCUS line does not contain space at position 64:\n' + line 886 assert line[67:68] == ' ', \ 887 'LOCUS line does not contain space at position 68:\n' + line 888 if line[68:79].strip() : 889 assert line[70:71] == '-', \ 890 'LOCUS line does not contain - at position 71 in date:\n' + line 891 assert line[74:75] == '-', \ 892 'LOCUS line does not contain - at position 75 in date:\n' + line 893 894 name_and_length_str = line[GENBANK_INDENT:40] 895 while name_and_length_str.find(' ')!=-1 : 896 name_and_length_str = name_and_length_str.replace(' ',' ') 897 name_and_length = name_and_length_str.split(' ') 898 assert len(name_and_length)<=2, \ 899 'Cannot parse the name and length in the LOCUS line:\n' + line 900 assert len(name_and_length)!=1, \ 901 'Name and length collide in the LOCUS line:\n' + line 902 #Should be possible to split them based on position, if 903 #a clear definition of the stand exists THAT AGREES with 904 #existing files. 905 consumer.locus(name_and_length[0]) 906 consumer.size(name_and_length[1]) 907 908 if line[44:54].strip() == "" and line[40:44] == ' aa ' : 909 #Amino acids -> protein (even if there is no residue type given) 910 #We want to use a protein alphabet in this case, rather than a 911 #generic one. Not sure if this is the best way to achieve this, 912 #but it works because the scanner checks for this: 913 consumer.residue_type(("PROTEIN " + line[54:63]).strip()) 914 else : 915 consumer.residue_type(line[44:63].strip()) 916 917 consumer.data_file_division(line[64:67]) 918 if line[68:79].strip() : 919 consumer.date(line[68:79]) 920 elif line[GENBANK_INDENT:].strip().count(" ")==0 : 921 #Truncated LOCUS line, as produced by some EMBOSS tools - see bug 1762 922 # 923 #e.g. 924 # 925 # "LOCUS U00096" 926 # 927 #rather than: 928 # 929 # "LOCUS U00096 4639675 bp DNA circular BCT" 930 # 931 # Positions Contents 932 # --------- -------- 933 # 00:06 LOCUS 934 # 06:12 spaces 935 # 12:?? Locus name 936 if line[GENBANK_INDENT:].strip() != "" : 937 consumer.locus(line[GENBANK_INDENT:].strip()) 938 else : 939 #Must just have just "LOCUS ", is this even legitimate? 940 #We should be able to continue parsing... we need real world testcases! 941 warnings.warn("Minimal LOCUS line found - is this correct?\n" + line) 942 elif len(line.split())>=4 and line.split()[3] in ["aa","bp"] : 943 #Cope with EMBOSS seqret output where it seems the locus id can cause 944 #the other fields to overflow. We just IGNORE the other fields! 945 consumer.locus(line.split()[1]) 946 consumer.size(line.split()[2]) 947 warnings.warn("Malformed LOCUS line found - is this correct?\n" + line) 948 else : 949 raise ValueError('Did not recognise the LOCUS line layout:\n' + line)
950 951
952 - def _feed_header_lines(self, consumer, lines) :
953 #Following dictionary maps GenBank lines to the associated 954 #consumer methods - the special cases like LOCUS where one 955 #genbank line triggers several consumer calls have to be 956 #handled individually. 957 GENBANK_INDENT = self.HEADER_WIDTH 958 GENBANK_SPACER = " "*GENBANK_INDENT 959 consumer_dict = { 960 'DEFINITION' : 'definition', 961 'ACCESSION' : 'accession', 962 'NID' : 'nid', 963 'PID' : 'pid', 964 'DBSOURCE' : 'db_source', 965 'KEYWORDS' : 'keywords', 966 'SEGMENT' : 'segment', 967 'SOURCE' : 'source', 968 'AUTHORS' : 'authors', 969 'CONSRTM' : 'consrtm', 970 'PROJECT' : 'project', 971 'DBLINK' : 'dblink', 972 'TITLE' : 'title', 973 'JOURNAL' : 'journal', 974 'MEDLINE' : 'medline_id', 975 'PUBMED' : 'pubmed_id', 976 'REMARK' : 'remark'} 977 #We have to handle the following specially: 978 #ORIGIN (locus, size, residue_type, data_file_division and date) 979 #COMMENT (comment) 980 #VERSION (version and gi) 981 #REFERENCE (eference_num and reference_bases) 982 #ORGANISM (organism and taxonomy) 983 lines = filter(None,lines) 984 lines.append("") #helps avoid getting StopIteration all the time 985 line_iter = iter(lines) 986 try : 987 line = line_iter.next() 988 while True : 989 if not line : break 990 line_type = line[:GENBANK_INDENT].strip() 991 data = line[GENBANK_INDENT:].strip() 992 993 if line_type == 'VERSION' : 994 #Need to call consumer.version(), and maybe also consumer.gi() as well. 995 #e.g. 996 # VERSION AC007323.5 GI:6587720 997 while data.find(' ')!=-1: 998 data = data.replace(' ',' ') 999 if data.find(' GI:')==-1 : 1000 consumer.version(data) 1001 else : 1002 if self.debug : print "Version [" + data.split(' GI:')[0] + "], gi [" + data.split(' GI:')[1] + "]" 1003 consumer.version(data.split(' GI:')[0]) 1004 consumer.gi(data.split(' GI:')[1]) 1005 #Read in the next line! 1006 line = line_iter.next() 1007 elif line_type == 'REFERENCE' : 1008 if self.debug >1 : print "Found reference [" + data + "]" 1009 #Need to call consumer.reference_num() and consumer.reference_bases() 1010 #e.g. 1011 # REFERENCE 1 (bases 1 to 86436) 1012 # 1013 #Note that this can be multiline, see Bug 1968, e.g. 1014 # 1015 # REFERENCE 42 (bases 1517 to 1696; 3932 to 4112; 17880 to 17975; 21142 to 1016 # 28259) 1017 # 1018 #For such cases we will call the consumer once only. 1019 data = data.strip() 1020 1021 #Read in the next line, and see if its more of the reference: 1022 while True: 1023 line = line_iter.next() 1024 if line[:GENBANK_INDENT] == GENBANK_SPACER : 1025 #Add this continuation to the data string 1026 data += " " + line[GENBANK_INDENT:] 1027 if self.debug >1 : print "Extended reference text [" + data + "]" 1028 else : 1029 #End of the reference, leave this text in the variable "line" 1030 break 1031 1032 #We now have all the reference line(s) stored in a string, data, 1033 #which we pass to the consumer 1034 while data.find(' ')!=-1: 1035 data = data.replace(' ',' ') 1036 if data.find(' ')==-1 : 1037 if self.debug >2 : print 'Reference number \"' + data + '\"' 1038 consumer.reference_num(data) 1039 else : 1040 if self.debug >2 : print 'Reference number \"' + data[:data.find(' ')] + '\", \"' + data[data.find(' ')+1:] + '\"' 1041 consumer.reference_num(data[:data.find(' ')]) 1042 consumer.reference_bases(data[data.find(' ')+1:]) 1043 elif line_type == 'ORGANISM' : 1044 #Typically the first line is the organism, and subsequent lines 1045 #are the taxonomy lineage. However, given longer and longer 1046 #species names (as more and more strains and sub strains get 1047 #sequenced) the oragnism name can now get wrapped onto multiple 1048 #lines. The NCBI say we have to recognise the lineage line by 1049 #the presense of semi-colon delimited entries. In the long term, 1050 #they are considering adding a new keyword (e.g. LINEAGE). 1051 #See Bug 2591 for details. 1052 organism_data = data 1053 lineage_data = "" 1054 while True : 1055 line = line_iter.next() 1056 if line[0:GENBANK_INDENT] == GENBANK_SPACER : 1057 if lineage_data or ";" in line : 1058 lineage_data += " " + line[GENBANK_INDENT:] 1059 else : 1060 organism_data += " " + line[GENBANK_INDENT:].strip() 1061 else : 1062 #End of organism and taxonomy 1063 break 1064 consumer.organism(organism_data) 1065 if lineage_data.strip() == "" and self.debug > 1 : 1066 print "Taxonomy line(s) missing or blank" 1067 consumer.taxonomy(lineage_data.strip()) 1068 del organism_data, lineage_data 1069 elif line_type == 'COMMENT' : 1070 if self.debug > 1 : print "Found comment" 1071 #This can be multiline, and should call consumer.comment() once 1072 #with a list where each entry is a line. 1073 comment_list=[] 1074 comment_list.append(data) 1075 while True: 1076 line = line_iter.next() 1077 if line[0:GENBANK_INDENT] == GENBANK_SPACER : 1078 data = line[GENBANK_INDENT:] 1079 comment_list.append(data) 1080 if self.debug > 2 : print "Comment continuation [" + data + "]" 1081 else : 1082 #End of the comment 1083 break 1084 consumer.comment(comment_list) 1085 del comment_list 1086 elif line_type in consumer_dict : 1087 #Its a semi-automatic entry! 1088 #Now, this may be a multi line entry... 1089 while True : 1090 line = line_iter.next() 1091 if line[0:GENBANK_INDENT] == GENBANK_SPACER : 1092 data += ' ' + line[GENBANK_INDENT:] 1093 else : 1094 #We now have all the data for this entry: 1095 getattr(consumer, consumer_dict[line_type])(data) 1096 #End of continuation - return to top of loop! 1097 break 1098 else : 1099 if self.debug : 1100 print "Ignoring GenBank header line:\n" % line 1101 #Read in next line 1102 line = line_iter.next() 1103 except StopIteration : 1104 raise ValueError("Problem in header")
1105
1106 - def _feed_misc_lines(self, consumer, lines) :
1107 #Deals with a few misc lines between the features and the sequence 1108 GENBANK_INDENT = self.HEADER_WIDTH 1109 GENBANK_SPACER = " "*GENBANK_INDENT 1110 lines.append("") 1111 line_iter = iter(lines) 1112 try : 1113 for line in line_iter : 1114 if line.find('BASE COUNT')==0 : 1115 line = line[10:].strip() 1116 if line : 1117 if self.debug : print "base_count = " + line 1118 consumer.base_count(line) 1119 if line.find("ORIGIN")==0 : 1120 line = line[6:].strip() 1121 if line : 1122 if self.debug : print "origin_name = " + line 1123 consumer.origin_name(line) 1124 if line.find("WGS ")==0 : 1125 line = line[3:].strip() 1126 consumer.wgs(line) 1127 if line.find("WGS_SCAFLD")==0 : 1128 line = line[10:].strip() 1129 consumer.add_wgs_scafld(line) 1130 if line.find("CONTIG")==0 : 1131 line = line[6:].strip() 1132 contig_location = line 1133 while True : 1134 line = line_iter.next() 1135 if not line : 1136 break 1137 elif line[:GENBANK_INDENT]==GENBANK_SPACER : 1138 #Don't need to preseve the whitespace here. 1139 contig_location += line[GENBANK_INDENT:].rstrip() 1140 else: 1141 raise ValueError('Expected CONTIG continuation line, got:\n' + line) 1142 consumer.contig_location(contig_location) 1143 return 1144 except StopIteration : 1145 raise ValueError("Problem in misc lines before sequence")
1146 1147 if __name__ == "__main__" : 1148 from StringIO import StringIO 1149 1150 gbk_example = \ 1151 """LOCUS SCU49845 5028 bp DNA PLN 21-JUN-1999 1152 DEFINITION Saccharomyces cerevisiae TCP1-beta gene, partial cds, and Axl2p 1153 (AXL2) and Rev7p (REV7) genes, complete cds. 1154 ACCESSION U49845 1155 VERSION U49845.1 GI:1293613 1156 KEYWORDS . 1157 SOURCE Saccharomyces cerevisiae (baker's yeast) 1158 ORGANISM Saccharomyces cerevisiae 1159 Eukaryota; Fungi; Ascomycota; Saccharomycotina; Saccharomycetes; 1160 Saccharomycetales; Saccharomycetaceae; Saccharomyces. 1161 REFERENCE 1 (bases 1 to 5028) 1162 AUTHORS Torpey,L.E., Gibbs,P.E., Nelson,J. and Lawrence,C.W. 1163 TITLE Cloning and sequence of REV7, a gene whose function is required for 1164 DNA damage-induced mutagenesis in Saccharomyces cerevisiae 1165 JOURNAL Yeast 10 (11), 1503-1509 (1994) 1166 PUBMED 7871890 1167 REFERENCE 2 (bases 1 to 5028) 1168 AUTHORS Roemer,T., Madden,K., Chang,J. and Snyder,M. 1169 TITLE Selection of axial growth sites in yeast requires Axl2p, a novel 1170 plasma membrane glycoprotein 1171 JOURNAL Genes Dev. 10 (7), 777-793 (1996) 1172 PUBMED 8846915 1173 REFERENCE 3 (bases 1 to 5028) 1174 AUTHORS Roemer,T. 1175 TITLE Direct Submission 1176 JOURNAL Submitted (22-FEB-1996) Terry Roemer, Biology, Yale University, New 1177 Haven, CT, USA 1178 FEATURES Location/Qualifiers 1179 source 1..5028 1180 /organism="Saccharomyces cerevisiae" 1181 /db_xref="taxon:4932" 1182 /chromosome="IX" 1183 /map="9" 1184 CDS <1..206 1185 /codon_start=3 1186 /product="TCP1-beta" 1187 /protein_id="AAA98665.1" 1188 /db_xref="GI:1293614" 1189 /translation="SSIYNGISTSGLDLNNGTIADMRQLGIVESYKLKRAVVSSASEA 1190 AEVLLRVDNIIRARPRTANRQHM" 1191 gene 687..3158 1192 /gene="AXL2" 1193 CDS 687..3158 1194 /gene="AXL2" 1195 /note="plasma membrane glycoprotein" 1196 /codon_start=1 1197 /function="required for axial budding pattern of S. 1198 cerevisiae" 1199 /product="Axl2p" 1200 /protein_id="AAA98666.1" 1201 /db_xref="GI:1293615" 1202 /translation="MTQLQISLLLTATISLLHLVVATPYEAYPIGKQYPPVARVNESF 1203 TFQISNDTYKSSVDKTAQITYNCFDLPSWLSFDSSSRTFSGEPSSDLLSDANTTLYFN 1204 VILEGTDSADSTSLNNTYQFVVTNRPSISLSSDFNLLALLKNYGYTNGKNALKLDPNE 1205 VFNVTFDRSMFTNEESIVSYYGRSQLYNAPLPNWLFFDSGELKFTGTAPVINSAIAPE 1206 TSYSFVIIATDIEGFSAVEVEFELVIGAHQLTTSIQNSLIINVTDTGNVSYDLPLNYV 1207 YLDDDPISSDKLGSINLLDAPDWVALDNATISGSVPDELLGKNSNPANFSVSIYDTYG 1208 DVIYFNFEVVSTTDLFAISSLPNINATRGEWFSYYFLPSQFTDYVNTNVSLEFTNSSQ 1209 DHDWVKFQSSNLTLAGEVPKNFDKLSLGLKANQGSQSQELYFNIIGMDSKITHSNHSA 1210 NATSTRSSHHSTSTSSYTSSTYTAKISSTSAAATSSAPAALPAANKTSSHNKKAVAIA 1211 CGVAIPLGVILVALICFLIFWRRRRENPDDENLPHAISGPDLNNPANKPNQENATPLN 1212 NPFDDDASSYDDTSIARRLAALNTLKLDNHSATESDISSVDEKRDSLSGMNTYNDQFQ 1213 SQSKEELLAKPPVQPPESPFFDPQNRSSSVYMDSEPAVNKSWRYTGNLSPVSDIVRDS 1214 YGSQKTVDTEKLFDLEAPEKEKRTSRDVTMSSLDPWNSNISPSPVRKSVTPSPYNVTK 1215 HRNRHLQNIQDSQSGKNGITPTTMSTSSSDDFVPVKDGENFCWVHSMEPDRRPSKKRL 1216 VDFSNKSNVNVGQVKDIHGRIPEML" 1217 gene complement(3300..4037) 1218 /gene="REV7" 1219 CDS complement(3300..4037) 1220 /gene="REV7" 1221 /codon_start=1 1222 /product="Rev7p" 1223 /protein_id="AAA98667.1" 1224 /db_xref="GI:1293616" 1225 /translation="MNRWVEKWLRVYLKCYINLILFYRNVYPPQSFDYTTYQSFNLPQ 1226 FVPINRHPALIDYIEELILDVLSKLTHVYRFSICIINKKNDLCIEKYVLDFSELQHVD 1227 KDDQIITETEVFDEFRSSLNSLIMHLEKLPKVNDDTITFEAVINAIELELGHKLDRNR 1228 RVDSLEEKAEIERDSNWVKCQEDENLPDNNGFQPPKIKLTSLVGSDVGPLIIHQFSEK 1229 LISGDDKILNGVYSQYEEGESIFGSLF" 1230 ORIGIN 1231 1 gatcctccat atacaacggt atctccacct caggtttaga tctcaacaac ggaaccattg 1232 61 ccgacatgag acagttaggt atcgtcgaga gttacaagct aaaacgagca gtagtcagct 1233 121 ctgcatctga agccgctgaa gttctactaa gggtggataa catcatccgt gcaagaccaa 1234 181 gaaccgccaa tagacaacat atgtaacata tttaggatat acctcgaaaa taataaaccg 1235 241 ccacactgtc attattataa ttagaaacag aacgcaaaaa ttatccacta tataattcaa 1236 301 agacgcgaaa aaaaaagaac aacgcgtcat agaacttttg gcaattcgcg tcacaaataa 1237 361 attttggcaa cttatgtttc ctcttcgagc agtactcgag ccctgtctca agaatgtaat 1238 421 aatacccatc gtaggtatgg ttaaagatag catctccaca acctcaaagc tccttgccga 1239 481 gagtcgccct cctttgtcga gtaattttca cttttcatat gagaacttat tttcttattc 1240 541 tttactctca catcctgtag tgattgacac tgcaacagcc accatcacta gaagaacaga 1241 601 acaattactt aatagaaaaa ttatatcttc ctcgaaacga tttcctgctt ccaacatcta 1242 661 cgtatatcaa gaagcattca cttaccatga cacagcttca gatttcatta ttgctgacag 1243 721 ctactatatc actactccat ctagtagtgg ccacgcccta tgaggcatat cctatcggaa 1244 781 aacaataccc cccagtggca agagtcaatg aatcgtttac atttcaaatt tccaatgata 1245 841 cctataaatc gtctgtagac aagacagctc aaataacata caattgcttc gacttaccga 1246 901 gctggctttc gtttgactct agttctagaa cgttctcagg tgaaccttct tctgacttac 1247 961 tatctgatgc gaacaccacg ttgtatttca atgtaatact cgagggtacg gactctgccg 1248 1021 acagcacgtc tttgaacaat acataccaat ttgttgttac aaaccgtcca tccatctcgc 1249 1081 tatcgtcaga tttcaatcta ttggcgttgt taaaaaacta tggttatact aacggcaaaa 1250 1141 acgctctgaa actagatcct aatgaagtct tcaacgtgac ttttgaccgt tcaatgttca 1251 1201 ctaacgaaga atccattgtg tcgtattacg gacgttctca gttgtataat gcgccgttac 1252 1261 ccaattggct gttcttcgat tctggcgagt tgaagtttac tgggacggca ccggtgataa 1253 1321 actcggcgat tgctccagaa acaagctaca gttttgtcat catcgctaca gacattgaag 1254 1381 gattttctgc cgttgaggta gaattcgaat tagtcatcgg ggctcaccag ttaactacct 1255 1441 ctattcaaaa tagtttgata atcaacgtta ctgacacagg taacgtttca tatgacttac 1256 1501 ctctaaacta tgtttatctc gatgacgatc ctatttcttc tgataaattg ggttctataa 1257 1561 acttattgga tgctccagac tgggtggcat tagataatgc taccatttcc gggtctgtcc 1258 1621 cagatgaatt actcggtaag aactccaatc ctgccaattt ttctgtgtcc atttatgata 1259 1681 cttatggtga tgtgatttat ttcaacttcg aagttgtctc cacaacggat ttgtttgcca 1260 1741 ttagttctct tcccaatatt aacgctacaa ggggtgaatg gttctcctac tattttttgc 1261 1801 cttctcagtt tacagactac gtgaatacaa acgtttcatt agagtttact aattcaagcc 1262 1861 aagaccatga ctgggtgaaa ttccaatcat ctaatttaac attagctgga gaagtgccca 1263 1921 agaatttcga caagctttca ttaggtttga aagcgaacca aggttcacaa tctcaagagc 1264 1981 tatattttaa catcattggc atggattcaa agataactca ctcaaaccac agtgcgaatg 1265 2041 caacgtccac aagaagttct caccactcca cctcaacaag ttcttacaca tcttctactt 1266 2101 acactgcaaa aatttcttct acctccgctg ctgctacttc ttctgctcca gcagcgctgc 1267 2161 cagcagccaa taaaacttca tctcacaata aaaaagcagt agcaattgcg tgcggtgttg 1268 2221 ctatcccatt aggcgttatc ctagtagctc tcatttgctt cctaatattc tggagacgca 1269 2281 gaagggaaaa tccagacgat gaaaacttac cgcatgctat tagtggacct gatttgaata 1270 2341 atcctgcaaa taaaccaaat caagaaaacg ctacaccttt gaacaacccc tttgatgatg 1271 2401 atgcttcctc gtacgatgat acttcaatag caagaagatt ggctgctttg aacactttga 1272 2461 aattggataa ccactctgcc actgaatctg atatttccag cgtggatgaa aagagagatt 1273 2521 ctctatcagg tatgaataca tacaatgatc agttccaatc ccaaagtaaa gaagaattat 1274 2581 tagcaaaacc cccagtacag cctccagaga gcccgttctt tgacccacag aataggtctt 1275 2641 cttctgtgta tatggatagt gaaccagcag taaataaatc ctggcgatat actggcaacc 1276 2701 tgtcaccagt ctctgatatt gtcagagaca gttacggatc acaaaaaact gttgatacag 1277 2761 aaaaactttt cgatttagaa gcaccagaga aggaaaaacg tacgtcaagg gatgtcacta 1278 2821 tgtcttcact ggacccttgg aacagcaata ttagcccttc tcccgtaaga aaatcagtaa 1279 2881 caccatcacc atataacgta acgaagcatc gtaaccgcca cttacaaaat attcaagact 1280 2941 ctcaaagcgg taaaaacgga atcactccca caacaatgtc aacttcatct tctgacgatt 1281 3001 ttgttccggt taaagatggt gaaaattttt gctgggtcca tagcatggaa ccagacagaa 1282 3061 gaccaagtaa gaaaaggtta gtagattttt caaataagag taatgtcaat gttggtcaag 1283 3121 ttaaggacat tcacggacgc atcccagaaa tgctgtgatt atacgcaacg atattttgct 1284 3181 taattttatt ttcctgtttt attttttatt agtggtttac agatacccta tattttattt 1285 3241 agtttttata cttagagaca tttaatttta attccattct tcaaatttca tttttgcact 1286 3301 taaaacaaag atccaaaaat gctctcgccc tcttcatatt gagaatacac tccattcaaa 1287 3361 attttgtcgt caccgctgat taatttttca ctaaactgat gaataatcaa aggccccacg 1288 3421 tcagaaccga ctaaagaagt gagttttatt ttaggaggtt gaaaaccatt attgtctggt 1289 3481 aaattttcat cttcttgaca tttaacccag tttgaatccc tttcaatttc tgctttttcc 1290 3541 tccaaactat cgaccctcct gtttctgtcc aacttatgtc ctagttccaa ttcgatcgca 1291 3601 ttaataactg cttcaaatgt tattgtgtca tcgttgactt taggtaattt ctccaaatgc 1292 3661 ataatcaaac tatttaagga agatcggaat tcgtcgaaca cttcagtttc cgtaatgatc 1293 3721 tgatcgtctt tatccacatg ttgtaattca ctaaaatcta aaacgtattt ttcaatgcat 1294 3781 aaatcgttct ttttattaat aatgcagatg gaaaatctgt aaacgtgcgt taatttagaa 1295 3841 agaacatcca gtataagttc ttctatatag tcaattaaag caggatgcct attaatggga 1296 3901 acgaactgcg gcaagttgaa tgactggtaa gtagtgtagt cgaatgactg aggtgggtat 1297 3961 acatttctat aaaataaaat caaattaatg tagcatttta agtataccct cagccacttc 1298 4021 tctacccatc tattcataaa gctgacgcaa cgattactat tttttttttc ttcttggatc 1299 4081 tcagtcgtcg caaaaacgta taccttcttt ttccgacctt ttttttagct ttctggaaaa 1300 4141 gtttatatta gttaaacagg gtctagtctt agtgtgaaag ctagtggttt cgattgactg 1301 4201 atattaagaa agtggaaatt aaattagtag tgtagacgta tatgcatatg tatttctcgc 1302 4261 ctgtttatgt ttctacgtac ttttgattta tagcaagggg aaaagaaata catactattt 1303 4321 tttggtaaag gtgaaagcat aatgtaaaag ctagaataaa atggacgaaa taaagagagg 1304 4381 cttagttcat cttttttcca aaaagcaccc aatgataata actaaaatga aaaggatttg 1305 4441 ccatctgtca gcaacatcag ttgtgtgagc aataataaaa tcatcacctc cgttgccttt 1306 4501 agcgcgtttg tcgtttgtat cttccgtaat tttagtctta tcaatgggaa tcataaattt 1307 4561 tccaatgaat tagcaatttc gtccaattct ttttgagctt cttcatattt gctttggaat 1308 4621 tcttcgcact tcttttccca ttcatctctt tcttcttcca aagcaacgat ccttctaccc 1309 4681 atttgctcag agttcaaatc ggcctctttc agtttatcca ttgcttcctt cagtttggct 1310 4741 tcactgtctt ctagctgttg ttctagatcc tggtttttct tggtgtagtt ctcattatta 1311 4801 gatctcaagt tattggagtc ttcagccaat tgctttgtat cagacaattg actctctaac 1312 4861 ttctccactt cactgtcgag ttgctcgttt ttagcggaca aagatttaat ctcgttttct 1313 4921 ttttcagtgt tagattgctc taattctttg agctgttctc tcagctcctc atatttttct 1314 4981 tgccatgact cagattctaa ttttaagcta ttcaatttct ctttgatc 1315 //""" 1316 1317 # GenBank format protein (aka GenPept) file from: 1318 # http://www.molecularevolution.org/resources/fileformats/ 1319 gbk_example2 = \ 1320 """LOCUS AAD51968 143 aa linear BCT 21-AUG-2001 1321 DEFINITION transcriptional regulator RovA [Yersinia enterocolitica]. 1322 ACCESSION AAD51968 1323 VERSION AAD51968.1 GI:5805369 1324 DBSOURCE locus AF171097 accession AF171097.1 1325 KEYWORDS . 1326 SOURCE Yersinia enterocolitica 1327 ORGANISM Yersinia enterocolitica 1328 Bacteria; Proteobacteria; Gammaproteobacteria; Enterobacteriales; 1329 Enterobacteriaceae; Yersinia. 1330 REFERENCE 1 (residues 1 to 143) 1331 AUTHORS Revell,P.A. and Miller,V.L. 1332 TITLE A chromosomally encoded regulator is required for expression of the 1333 Yersinia enterocolitica inv gene and for virulence 1334 JOURNAL Mol. Microbiol. 35 (3), 677-685 (2000) 1335 MEDLINE 20138369 1336 PUBMED 10672189 1337 REFERENCE 2 (residues 1 to 143) 1338 AUTHORS Revell,P.A. and Miller,V.L. 1339 TITLE Direct Submission 1340 JOURNAL Submitted (22-JUL-1999) Molecular Microbiology, Washington 1341 University School of Medicine, Campus Box 8230, 660 South Euclid, 1342 St. Louis, MO 63110, USA 1343 COMMENT Method: conceptual translation. 1344 FEATURES Location/Qualifiers 1345 source 1..143 1346 /organism="Yersinia enterocolitica" 1347 /mol_type="unassigned DNA" 1348 /strain="JB580v" 1349 /serotype="O:8" 1350 /db_xref="taxon:630" 1351 Protein 1..143 1352 /product="transcriptional regulator RovA" 1353 /name="regulates inv expression" 1354 CDS 1..143 1355 /gene="rovA" 1356 /coded_by="AF171097.1:380..811" 1357 /note="regulator of virulence" 1358 /transl_table=11 1359 ORIGIN 1360 1 mestlgsdla rlvrvwrali dhrlkplelt qthwvtlhni nrlppeqsqi qlakaigieq 1361 61 pslvrtldql eekglitrht candrrakri klteqsspii eqvdgvicst rkeilggisp 1362 121 deiellsgli dklerniiql qsk 1363 // 1364 """ 1365 1366 embl_example="""ID X56734; SV 1; linear; mRNA; STD; PLN; 1859 BP. 1367 XX 1368 AC X56734; S46826; 1369 XX 1370 DT 12-SEP-1991 (Rel. 29, Created) 1371 DT 25-NOV-2005 (Rel. 85, Last updated, Version 11) 1372 XX 1373 DE Trifolium repens mRNA for non-cyanogenic beta-glucosidase 1374 XX 1375 KW beta-glucosidase. 1376 XX 1377 OS Trifolium repens (white clover) 1378 OC Eukaryota; Viridiplantae; Streptophyta; Embryophyta; Tracheophyta; 1379 OC Spermatophyta; Magnoliophyta; eudicotyledons; core eudicotyledons; rosids; 1380 OC eurosids I; Fabales; Fabaceae; Papilionoideae; Trifolieae; Trifolium. 1381 XX 1382 RN [5] 1383 RP 1-1859 1384 RX PUBMED; 1907511. 1385 RA Oxtoby E., Dunn M.A., Pancoro A., Hughes M.A.; 1386 RT "Nucleotide and derived amino acid sequence of the cyanogenic 1387 RT beta-glucosidase (linamarase) from white clover (Trifolium repens L.)"; 1388 RL Plant Mol. Biol. 17(2):209-219(1991). 1389 XX 1390 RN [6] 1391 RP 1-1859 1392 RA Hughes M.A.; 1393 RT ; 1394 RL Submitted (19-NOV-1990) to the EMBL/GenBank/DDBJ databases. 1395 RL Hughes M.A., University of Newcastle Upon Tyne, Medical School, Newcastle 1396 RL Upon Tyne, NE2 4HH, UK 1397 XX 1398 FH Key Location/Qualifiers 1399 FH 1400 FT source 1..1859 1401 FT /organism="Trifolium repens" 1402 FT /mol_type="mRNA" 1403 FT /clone_lib="lambda gt10" 1404 FT /clone="TRE361" 1405 FT /tissue_type="leaves" 1406 FT /db_xref="taxon:3899" 1407 FT CDS 14..1495 1408 FT /product="beta-glucosidase" 1409 FT /EC_number="3.2.1.21" 1410 FT /note="non-cyanogenic" 1411 FT /db_xref="GOA:P26204" 1412 FT /db_xref="InterPro:IPR001360" 1413 FT /db_xref="InterPro:IPR013781" 1414 FT /db_xref="UniProtKB/Swiss-Prot:P26204" 1415 FT /protein_id="CAA40058.1" 1416 FT /translation="MDFIVAIFALFVISSFTITSTNAVEASTLLDIGNLSRSSFPRGFI 1417 FT FGAGSSAYQFEGAVNEGGRGPSIWDTFTHKYPEKIRDGSNADITVDQYHRYKEDVGIMK 1418 FT DQNMDSYRFSISWPRILPKGKLSGGINHEGIKYYNNLINELLANGIQPFVTLFHWDLPQ 1419 FT VLEDEYGGFLNSGVINDFRDYTDLCFKEFGDRVRYWSTLNEPWVFSNSGYALGTNAPGR 1420 FT CSASNVAKPGDSGTGPYIVTHNQILAHAEAVHVYKTKYQAYQKGKIGITLVSNWLMPLD 1421 FT DNSIPDIKAAERSLDFQFGLFMEQLTTGDYSKSMRRIVKNRLPKFSKFESSLVNGSFDF 1422 FT IGINYYSSSYISNAPSHGNAKPSYSTNPMTNISFEKHGIPLGPRAASIWIYVYPYMFIQ 1423 FT EDFEIFCYILKINITILQFSITENGMNEFNDATLPVEEALLNTYRIDYYYRHLYYIRSA 1424 FT IRAGSNVKGFYAWSFLDCNEWFAGFTVRFGLNFVD" 1425 FT mRNA 1..1859 1426 FT /experiment="experimental evidence, no additional details 1427 FT recorded" 1428 XX 1429 SQ Sequence 1859 BP; 609 A; 314 C; 355 G; 581 T; 0 other; 1430 aaacaaacca aatatggatt ttattgtagc catatttgct ctgtttgtta ttagctcatt 60 1431 cacaattact tccacaaatg cagttgaagc ttctactctt cttgacatag gtaacctgag 120 1432 tcggagcagt tttcctcgtg gcttcatctt tggtgctgga tcttcagcat accaatttga 180 1433 aggtgcagta aacgaaggcg gtagaggacc aagtatttgg gataccttca cccataaata 240 1434 tccagaaaaa ataagggatg gaagcaatgc agacatcacg gttgaccaat atcaccgcta 300 1435 caaggaagat gttgggatta tgaaggatca aaatatggat tcgtatagat tctcaatctc 360 1436 ttggccaaga atactcccaa agggaaagtt gagcggaggc ataaatcacg aaggaatcaa 420 1437 atattacaac aaccttatca acgaactatt ggctaacggt atacaaccat ttgtaactct 480 1438 ttttcattgg gatcttcccc aagtcttaga agatgagtat ggtggtttct taaactccgg 540 1439 tgtaataaat gattttcgag actatacgga tctttgcttc aaggaatttg gagatagagt 600 1440 gaggtattgg agtactctaa atgagccatg ggtgtttagc aattctggat atgcactagg 660 1441 aacaaatgca ccaggtcgat gttcggcctc caacgtggcc aagcctggtg attctggaac 720 1442 aggaccttat atagttacac acaatcaaat tcttgctcat gcagaagctg tacatgtgta 780 1443 taagactaaa taccaggcat atcaaaaggg aaagataggc ataacgttgg tatctaactg 840 1444 gttaatgcca cttgatgata atagcatacc agatataaag gctgccgaga gatcacttga 900 1445 cttccaattt ggattgttta tggaacaatt aacaacagga gattattcta agagcatgcg 960 1446 gcgtatagtt aaaaaccgat tacctaagtt ctcaaaattc gaatcaagcc tagtgaatgg 1020 1447 ttcatttgat tttattggta taaactatta ctcttctagt tatattagca atgccccttc 1080 1448 acatggcaat gccaaaccca gttactcaac aaatcctatg accaatattt catttgaaaa 1140 1449 acatgggata cccttaggtc caagggctgc ttcaatttgg atatatgttt atccatatat 1200 1450 gtttatccaa gaggacttcg agatcttttg ttacatatta aaaataaata taacaatcct 1260 1451 gcaattttca atcactgaaa atggtatgaa tgaattcaac gatgcaacac ttccagtaga 1320 1452 agaagctctt ttgaatactt acagaattga ttactattac cgtcacttat actacattcg 1380 1453 ttctgcaatc agggctggct caaatgtgaa gggtttttac gcatggtcat ttttggactg 1440 1454 taatgaatgg tttgcaggct ttactgttcg ttttggatta aactttgtag attagaaaga 1500 1455 tggattaaaa aggtacccta agctttctgc ccaatggtac aagaactttc tcaaaagaaa 1560 1456 ctagctagta ttattaaaag aactttgtag tagattacag tacatcgttt gaagttgagt 1620 1457 tggtgcacct aattaaataa aagaggttac tcttaacata tttttaggcc attcgttgtg 1680 1458 aagttgttag gctgttattt ctattatact atgttgtagt aataagtgca ttgttgtacc 1740 1459 agaagctatg atcataacta taggttgatc cttcatgtat cagtttgatg ttgagaatac 1800 1460 tttgaattaa aagtcttttt ttattttttt aaaaaaaaaa aaaaaaaaaa aaaaaaaaa 1859 1461 // 1462 """ 1463 1464 print "GenBank CDS Iteration" 1465 print "=====================" 1466 1467 g = GenBankScanner() 1468 for record in g.parse_cds_features(StringIO(gbk_example)) : 1469 print record 1470 1471 g = GenBankScanner() 1472 for record in g.parse_cds_features(StringIO(gbk_example2), 1473 tags2id=('gene','locus_tag','product')) : 1474 print record 1475 1476 g = GenBankScanner() 1477 for record in g.parse_cds_features(StringIO(gbk_example + "\n" + gbk_example2), 1478 tags2id=('gene','locus_tag','product')) : 1479 print record 1480 1481 print 1482 print "GenBank Iteration" 1483 print "=================" 1484 g = GenBankScanner() 1485 for record in g.parse_records(StringIO(gbk_example),do_features=False) : 1486 print record.id, record.name, record.description 1487 print record.seq 1488 1489 g = GenBankScanner() 1490 for record in g.parse_records(StringIO(gbk_example),do_features=True) : 1491 print record.id, record.name, record.description 1492 print record.seq 1493 1494 g = GenBankScanner() 1495 for record in g.parse_records(StringIO(gbk_example2),do_features=False) : 1496 print record.id, record.name, record.description 1497 print record.seq 1498 1499 g = GenBankScanner() 1500 for record in g.parse_records(StringIO(gbk_example2),do_features=True) : 1501 print record.id, record.name, record.description 1502 print record.seq 1503 1504 print 1505 print "EMBL CDS Iteration" 1506 print "==================" 1507 1508 e = EmblScanner() 1509 for record in e.parse_cds_features(StringIO(embl_example)) : 1510 print record 1511 1512 print 1513 print "EMBL Iteration" 1514 print "==============" 1515 e = EmblScanner() 1516 for record in e.parse_records(StringIO(embl_example),do_features=True) : 1517 print record.id, record.name, record.description 1518 print record.seq 1519