Package Bio :: Package SwissProt :: Module SProt
[hide private]
[frames] | no frames]

Source Code for Module Bio.SwissProt.SProt

   1  # Copyright 1999 by Jeffrey Chang.  All rights reserved. 
   2  # This code is part of the Biopython distribution and governed by its 
   3  # license.  Please see the LICENSE file that should have been included 
   4  # as part of this package. 
   5   
   6  """ 
   7  This module provides code to work with the sprotXX.dat file from 
   8  SwissProt. 
   9  http://www.expasy.ch/sprot/sprot-top.html 
  10   
  11  Tested with: 
  12  Release 37, Release 38, Release 39 
  13   
  14  Limited testing with: 
  15  Release 51, 54 
  16   
  17   
  18  Classes: 
  19  Record             Holds SwissProt data. 
  20  Reference          Holds reference data from a SwissProt entry. 
  21  Iterator           Iterates over entries in a SwissProt file. 
  22  Dictionary         Accesses a SwissProt file using a dictionary interface. 
  23  RecordParser       Parses a SwissProt record into a Record object. 
  24  SequenceParser     Parses a SwissProt record into a SeqRecord object. 
  25   
  26  _Scanner           Scans SwissProt-formatted data. 
  27  _RecordConsumer    Consumes SwissProt data to a SProt.Record object. 
  28  _SequenceConsumer  Consumes SwissProt data to a SeqRecord object. 
  29   
  30   
  31  Functions: 
  32  index_file         Index a SwissProt file for a Dictionary. 
  33   
  34  """ 
  35  from types import * 
  36  import os 
  37  from Bio import File 
  38  from Bio import Index 
  39  from Bio import Alphabet 
  40  from Bio import Seq 
  41  from Bio import SeqRecord 
  42  from Bio.ParserSupport import * 
  43   
  44  _CHOMP = " \n\r\t.,;" #whitespace and trailing punctuation 
  45   
46 -class Record:
47 """Holds information from a SwissProt record. 48 49 Members: 50 entry_name Name of this entry, e.g. RL1_ECOLI. 51 data_class Either 'STANDARD' or 'PRELIMINARY'. 52 molecule_type Type of molecule, 'PRT', 53 sequence_length Number of residues. 54 55 accessions List of the accession numbers, e.g. ['P00321'] 56 created A tuple of (date, release). 57 sequence_update A tuple of (date, release). 58 annotation_update A tuple of (date, release). 59 60 description Free-format description. 61 gene_name Gene name. See userman.txt for description. 62 organism The source of the sequence. 63 organelle The origin of the sequence. 64 organism_classification The taxonomy classification. List of strings. 65 (http://www.ncbi.nlm.nih.gov/Taxonomy/) 66 taxonomy_id A list of NCBI taxonomy id's. 67 host_organism A list of NCBI taxonomy id's of the hosts of a virus, 68 if any. 69 references List of Reference objects. 70 comments List of strings. 71 cross_references List of tuples (db, id1[, id2][, id3]). See the docs. 72 keywords List of the keywords. 73 features List of tuples (key name, from, to, description). 74 from and to can be either integers for the residue 75 numbers, '<', '>', or '?' 76 77 seqinfo tuple of (length, molecular weight, CRC32 value) 78 sequence The sequence. 79 80 """
81 - def __init__(self):
82 self.entry_name = None 83 self.data_class = None 84 self.molecule_type = None 85 self.sequence_length = None 86 87 self.accessions = [] 88 self.created = None 89 self.sequence_update = None 90 self.annotation_update = None 91 92 self.description = '' 93 self.gene_name = '' 94 self.organism = '' 95 self.organelle = '' 96 self.organism_classification = [] 97 self.taxonomy_id = [] 98 self.host_organism = [] 99 self.references = [] 100 self.comments = [] 101 self.cross_references = [] 102 self.keywords = [] 103 self.features = [] 104 105 self.seqinfo = None 106 self.sequence = ''
107
108 -class Reference:
109 """Holds information from 1 references in a SwissProt entry. 110 111 Members: 112 number Number of reference in an entry. 113 positions Describes extent of work. list of strings. 114 comments Comments. List of (token, text). 115 references References. List of (dbname, identifier) 116 authors The authors of the work. 117 title Title of the work. 118 location A citation for the work. 119 120 """
121 - def __init__(self):
122 self.number = None 123 self.positions = [] 124 self.comments = [] 125 self.references = [] 126 self.authors = '' 127 self.title = '' 128 self.location = ''
129
130 -class Iterator:
131 """Returns one record at a time from a SwissProt file. 132 133 Methods: 134 next Return the next record from the stream, or None. 135 136 """
137 - def __init__(self, handle, parser=None):
138 """__init__(self, handle, parser=None) 139 140 Create a new iterator. handle is a file-like object. parser 141 is an optional Parser object to change the results into another form. 142 If set to None, then the raw contents of the file will be returned. 143 144 """ 145 import warnings 146 warnings.warn("Bio.SwissProt.SProt.Iterator is deprecated. Please use the function Bio.SwissProt.parse instead if you want to get a SwissProt.SProt.Record, or Bio.SeqIO.parse if you want to get a SeqRecord. If these solutions do not work for you, please get in contact with the Biopython developers (biopython-dev@biopython.org).", 147 DeprecationWarning) 148 149 if type(handle) is not FileType and type(handle) is not InstanceType: 150 raise ValueError, "I expected a file handle or file-like object" 151 self._uhandle = File.UndoHandle(handle) 152 self._parser = parser
153
154 - def next(self):
155 """next(self) -> object 156 157 Return the next swissprot record from the file. If no more records, 158 return None. 159 160 """ 161 lines = [] 162 while 1: 163 line = self._uhandle.readline() 164 if not line: 165 break 166 lines.append(line) 167 if line[:2] == '//': 168 break 169 170 if not lines: 171 return None 172 173 data = ''.join(lines) 174 if self._parser is not None: 175 return self._parser.parse(File.StringHandle(data)) 176 return data
177
178 - def __iter__(self):
179 return iter(self.next, None)
180
181 -class Dictionary:
182 """Accesses a SwissProt file using a dictionary interface. 183 184 """ 185 __filename_key = '__filename' 186
187 - def __init__(self, indexname, parser=None):
188 """__init__(self, indexname, parser=None) 189 190 Open a SwissProt Dictionary. indexname is the name of the 191 index for the dictionary. The index should have been created 192 using the index_file function. parser is an optional Parser 193 object to change the results into another form. If set to None, 194 then the raw contents of the file will be returned. 195 196 """ 197 self._index = Index.Index(indexname) 198 self._handle = open(self._index[self.__filename_key]) 199 self._parser = parser
200
201 - def __len__(self):
202 return len(self._index)
203
204 - def __getitem__(self, key):
205 start, len = self._index[key] 206 self._handle.seek(start) 207 data = self._handle.read(len) 208 if self._parser is not None: 209 return self._parser.parse(File.StringHandle(data)) 210 return data
211
212 - def __getattr__(self, name):
213 return getattr(self._index, name)
214
215 - def keys(self):
216 # I only want to expose the keys for SwissProt. 217 k = self._index.keys() 218 k.remove(self.__filename_key) 219 return k
220
221 -class ExPASyDictionary:
222 """Access SwissProt at ExPASy using a read-only dictionary interface. 223 224 """
225 - def __init__(self, delay=5.0, parser=None):
226 """__init__(self, delay=5.0, parser=None) 227 228 Create a new Dictionary to access SwissProt. parser is an optional 229 parser (e.g. SProt.RecordParser) object to change the results 230 into another form. If set to None, then the raw contents of the 231 file will be returned. delay is the number of seconds to wait 232 between each query. 233 234 """ 235 import warnings 236 from Bio.WWW import RequestLimiter 237 warnings.warn("Bio.SwissProt.ExPASyDictionary is deprecated. Please use the function Bio.ExPASy.get_sprot_raw instead.", 238 DeprecationWarning) 239 self.parser = parser 240 self.limiter = RequestLimiter(delay)
241
242 - def __len__(self):
243 raise NotImplementedError, "SwissProt contains lots of entries"
244 - def clear(self):
245 raise NotImplementedError, "This is a read-only dictionary"
246 - def __setitem__(self, key, item):
247 raise NotImplementedError, "This is a read-only dictionary"
248 - def update(self):
249 raise NotImplementedError, "This is a read-only dictionary"
250 - def copy(self):
251 raise NotImplementedError, "You don't need to do this..."
252 - def keys(self):
253 raise NotImplementedError, "You don't really want to do this..."
254 - def items(self):
255 raise NotImplementedError, "You don't really want to do this..."
256 - def values(self):
257 raise NotImplementedError, "You don't really want to do this..."
258
259 - def has_key(self, id):
260 """has_key(self, id) -> bool""" 261 try: 262 self[id] 263 except KeyError: 264 return 0 265 return 1
266
267 - def get(self, id, failobj=None):
268 try: 269 return self[id] 270 except KeyError: 271 return failobj 272 raise "How did I get here?"
273
274 - def __getitem__(self, id):
275 """__getitem__(self, id) -> object 276 277 Return a SwissProt entry. id is either the id or accession 278 for the entry. Raises a KeyError if there's an error. 279 280 """ 281 from Bio.WWW import ExPASy 282 # First, check to see if enough time has passed since my 283 # last query. 284 self.limiter.wait() 285 286 try: 287 handle = ExPASy.get_sprot_raw(id) 288 except IOError: 289 raise KeyError, id 290 291 if self.parser is not None: 292 return self.parser.parse(handle) 293 return handle.read()
294
295 -class RecordParser(AbstractParser):
296 """Parses SwissProt data into a Record object. 297 298 """
299 - def __init__(self):
300 self._scanner = _Scanner() 301 self._consumer = _RecordConsumer()
302
303 - def parse(self, handle):
304 self._scanner.feed(handle, self._consumer) 305 return self._consumer.data
306
307 -class SequenceParser(AbstractParser):
308 """Parses SwissProt data into a standard SeqRecord object. 309 """
310 - def __init__(self, alphabet = Alphabet.generic_protein):
311 """Initialize a SequenceParser. 312 313 Arguments: 314 o alphabet - The alphabet to use for the generated Seq objects. If 315 not supplied this will default to the generic protein alphabet. 316 """ 317 self._scanner = _Scanner() 318 self._consumer = _SequenceConsumer(alphabet)
319
320 - def parse(self, handle):
321 self._scanner.feed(handle, self._consumer) 322 return self._consumer.data
323
324 -class _Scanner:
325 """Scans SwissProt-formatted data. 326 327 Tested with: 328 Release 37 329 Release 38 330 """ 331
332 - def feed(self, handle, consumer):
333 """feed(self, handle, consumer) 334 335 Feed in SwissProt data for scanning. handle is a file-like 336 object that contains swissprot data. consumer is a 337 Consumer object that will receive events as the report is scanned. 338 339 """ 340 if isinstance(handle, File.UndoHandle): 341 uhandle = handle 342 else: 343 uhandle = File.UndoHandle(handle) 344 345 self._scan_record(uhandle, consumer)
346
347 - def _skip_starstar(self, uhandle) :
348 """Ignores any lines starting **""" 349 #See Bug 2353, some files from the EBI have extra lines 350 #starting "**" (two asterisks/stars), usually between the 351 #features and sequence but not all the time. They appear 352 #to be unofficial automated annotations. e.g. 353 #** 354 #** ################# INTERNAL SECTION ################## 355 #**HA SAM; Annotated by PicoHamap 1.88; MF_01138.1; 09-NOV-2003. 356 while "**" == uhandle.peekline()[:2] : 357 skip = uhandle.readline()
358 #print "Skipping line: %s" % skip.rstrip() 359
360 - def _scan_record(self, uhandle, consumer):
361 consumer.start_record() 362 for fn in self._scan_fns: 363 self._skip_starstar(uhandle) 364 fn(self, uhandle, consumer) 365 366 # In Release 38, ID N33_HUMAN has a DR buried within comments. 367 # Check for this and do more comments, if necessary. 368 # XXX handle this better 369 if fn is self._scan_dr.im_func: 370 self._scan_cc(uhandle, consumer) 371 self._scan_dr(uhandle, consumer) 372 consumer.end_record()
373
374 - def _scan_line(self, line_type, uhandle, event_fn, 375 exactly_one=None, one_or_more=None, any_number=None, 376 up_to_one=None):
377 # Callers must set exactly one of exactly_one, one_or_more, or 378 # any_number to a true value. I do not explicitly check to 379 # make sure this function is called correctly. 380 381 # This does not guarantee any parameter safety, but I 382 # like the readability. The other strategy I tried was have 383 # parameters min_lines, max_lines. 384 385 if exactly_one or one_or_more: 386 read_and_call(uhandle, event_fn, start=line_type) 387 if one_or_more or any_number: 388 while 1: 389 if not attempt_read_and_call(uhandle, event_fn, 390 start=line_type): 391 break 392 if up_to_one: 393 attempt_read_and_call(uhandle, event_fn, start=line_type)
394
395 - def _scan_id(self, uhandle, consumer):
396 self._scan_line('ID', uhandle, consumer.identification, exactly_one=1)
397
398 - def _scan_ac(self, uhandle, consumer):
399 # Until release 38, this used to match exactly_one. 400 # However, in release 39, 1A02_HUMAN has 2 AC lines, and the 401 # definition needed to be expanded. 402 self._scan_line('AC', uhandle, consumer.accession, any_number=1)
403
404 - def _scan_dt(self, uhandle, consumer):
405 self._scan_line('DT', uhandle, consumer.date, exactly_one=1) 406 self._scan_line('DT', uhandle, consumer.date, exactly_one=1) 407 # IPI doesn't necessarily contain the third line about annotations 408 self._scan_line('DT', uhandle, consumer.date, up_to_one=1)
409
410 - def _scan_de(self, uhandle, consumer):
411 # IPI can be missing a DE line 412 self._scan_line('DE', uhandle, consumer.description, any_number=1)
413
414 - def _scan_gn(self, uhandle, consumer):
415 self._scan_line('GN', uhandle, consumer.gene_name, any_number=1)
416
417 - def _scan_os(self, uhandle, consumer):
418 self._scan_line('OS', uhandle, consumer.organism_species, 419 one_or_more=1)
420
421 - def _scan_og(self, uhandle, consumer):
422 self._scan_line('OG', uhandle, consumer.organelle, any_number=1)
423
424 - def _scan_oc(self, uhandle, consumer):
425 self._scan_line('OC', uhandle, consumer.organism_classification, 426 one_or_more=1)
427
428 - def _scan_ox(self, uhandle, consumer):
429 self._scan_line('OX', uhandle, consumer.taxonomy_id, 430 any_number=1)
431
432 - def _scan_oh(self, uhandle, consumer):
433 # viral host organism. introduced after SwissProt 39. 434 self._scan_line('OH', uhandle, consumer.organism_host, any_number=1)
435
436 - def _scan_reference(self, uhandle, consumer):
437 while 1: 438 if safe_peekline(uhandle)[:2] != 'RN': 439 break 440 self._scan_rn(uhandle, consumer) 441 self._scan_rp(uhandle, consumer) 442 self._scan_rc(uhandle, consumer) 443 self._scan_rx(uhandle, consumer) 444 # ws:2001-12-05 added, for record with RL before RA 445 self._scan_rl(uhandle, consumer) 446 self._scan_ra(uhandle, consumer) 447 #EBI copy of P72010 is missing the RT line, and has one 448 #of their ** lines in its place noting "** /NO TITLE." 449 #See also bug 2353 450 self._skip_starstar(uhandle) 451 self._scan_rt(uhandle, consumer) 452 self._scan_rl(uhandle, consumer)
453
454 - def _scan_rn(self, uhandle, consumer):
455 self._scan_line('RN', uhandle, consumer.reference_number, 456 exactly_one=1)
457
458 - def _scan_rp(self, uhandle, consumer):
459 self._scan_line('RP', uhandle, consumer.reference_position, 460 one_or_more=1)
461
462 - def _scan_rc(self, uhandle, consumer):
463 self._scan_line('RC', uhandle, consumer.reference_comment, 464 any_number=1)
465
466 - def _scan_rx(self, uhandle, consumer):
467 self._scan_line('RX', uhandle, consumer.reference_cross_reference, 468 any_number=1)
469
470 - def _scan_ra(self, uhandle, consumer):
471 # In UniProt release 1.12 of 6/21/04, there is a new RG 472 # (Reference Group) line, which references a group instead of 473 # an author. Each block must have at least 1 RA or RG line. 474 self._scan_line('RA', uhandle, consumer.reference_author, 475 any_number=1) 476 self._scan_line('RG', uhandle, consumer.reference_author, 477 any_number=1) 478 # PRKN_HUMAN has RG lines, then RA lines. The best solution 479 # is to write code that accepts either of the line types. 480 # This is the quick solution... 481 self._scan_line('RA', uhandle, consumer.reference_author, 482 any_number=1)
483
484 - def _scan_rt(self, uhandle, consumer):
485 self._scan_line('RT', uhandle, consumer.reference_title, 486 any_number=1)
487
488 - def _scan_rl(self, uhandle, consumer):
489 # This was one_or_more, but P82909 in TrEMBL 16.0 does not 490 # have one. 491 self._scan_line('RL', uhandle, consumer.reference_location, 492 any_number=1)
493
494 - def _scan_cc(self, uhandle, consumer):
495 self._scan_line('CC', uhandle, consumer.comment, any_number=1)
496
497 - def _scan_dr(self, uhandle, consumer):
498 self._scan_line('DR', uhandle, consumer.database_cross_reference, 499 any_number=1)
500
501 - def _scan_kw(self, uhandle, consumer):
502 self._scan_line('KW', uhandle, consumer.keyword, any_number=1)
503
504 - def _scan_ft(self, uhandle, consumer):
505 self._scan_line('FT', uhandle, consumer.feature_table, any_number=1)
506
507 - def _scan_pe(self, uhandle, consumer):
508 self._scan_line('PE', uhandle, consumer.protein_existence, any_number=1)
509
510 - def _scan_sq(self, uhandle, consumer):
511 self._scan_line('SQ', uhandle, consumer.sequence_header, exactly_one=1)
512
513 - def _scan_sequence_data(self, uhandle, consumer):
514 self._scan_line(' ', uhandle, consumer.sequence_data, one_or_more=1)
515
516 - def _scan_terminator(self, uhandle, consumer):
517 self._scan_line('//', uhandle, consumer.terminator, exactly_one=1)
518 519 _scan_fns = [ 520 _scan_id, 521 _scan_ac, 522 _scan_dt, 523 _scan_de, 524 _scan_gn, 525 _scan_os, 526 _scan_og, 527 _scan_oc, 528 _scan_ox, 529 _scan_oh, 530 _scan_reference, 531 _scan_cc, 532 _scan_dr, 533 _scan_pe, 534 _scan_kw, 535 _scan_ft, 536 _scan_sq, 537 _scan_sequence_data, 538 _scan_terminator 539 ]
540
541 -class _RecordConsumer(AbstractConsumer):
542 """Consumer that converts a SwissProt record to a Record object. 543 544 Members: 545 data Record with SwissProt data. 546 547 """
548 - def __init__(self):
549 self.data = None
550
551 - def __repr__(self) :
552 return "Bio.SwissProt.SProt._RecordConsumer()"
553
554 - def start_record(self):
555 self.data = Record() 556 self._sequence_lines = []
557
558 - def end_record(self):
559 self._clean_record(self.data) 560 self.data.sequence = "".join(self._sequence_lines)
561
562 - def identification(self, line):
563 cols = line.split() 564 #Prior to release 51, included with MoleculeType: 565 #ID EntryName DataClass; MoleculeType; SequenceLength. 566 # 567 #Newer files lack the MoleculeType: 568 #ID EntryName DataClass; SequenceLength. 569 # 570 #Note that cols is split on white space, so the length 571 #should become two fields (number and units) 572 if len(cols) == 6 : 573 self.data.entry_name = cols[1] 574 self.data.data_class = cols[2].rstrip(_CHOMP) # don't want ';' 575 self.data.molecule_type = cols[3].rstrip(_CHOMP) # don't want ';' 576 self.data.sequence_length = int(cols[4]) 577 elif len(cols) == 5 : 578 self.data.entry_name = cols[1] 579 self.data.data_class = cols[2].rstrip(_CHOMP) # don't want ';' 580 self.data.molecule_type = None 581 self.data.sequence_length = int(cols[3]) 582 else : 583 #Should we print a warning an continue? 584 raise ValueError("ID line has unrecognised format:\n"+line) 585 586 # data class can be 'STANDARD' or 'PRELIMINARY' 587 # ws:2001-12-05 added IPI 588 # pjc:2006-11-02 added 'Reviewed' and 'Unreviewed' 589 if self.data.data_class not in ['STANDARD', 'PRELIMINARY', 'IPI', 590 'Reviewed', 'Unreviewed']: 591 raise ValueError, "Unrecognized data class %s in line\n%s" % \ 592 (self.data.data_class, line) 593 # molecule_type should be 'PRT' for PRoTein 594 # Note that has been removed in recent releases (set to None) 595 if self.data.molecule_type is not None \ 596 and self.data.molecule_type != 'PRT': 597 raise ValueError, "Unrecognized molecule type %s in line\n%s" % \ 598 (self.data.molecule_type, line)
599
600 - def accession(self, line):
601 cols = line[5:].rstrip(_CHOMP).strip().split(';') 602 for ac in cols: 603 if ac.strip() : 604 #remove any leading or trailing white space 605 self.data.accessions.append(ac.strip())
606
607 - def date(self, line):
608 uprline = string.upper(line) 609 cols = line.rstrip().split() 610 611 if uprline.find('CREATED') >= 0 \ 612 or uprline.find('LAST SEQUENCE UPDATE') >= 0 \ 613 or uprline.find('LAST ANNOTATION UPDATE') >= 0: 614 # Old style DT line 615 # ================= 616 # e.g. 617 # DT 01-FEB-1995 (Rel. 31, Created) 618 # DT 01-FEB-1995 (Rel. 31, Last sequence update) 619 # DT 01-OCT-2000 (Rel. 40, Last annotation update) 620 # 621 # or: 622 # DT 08-JAN-2002 (IPI Human rel. 2.3, Created) 623 # ... 624 625 # find where the version information will be located 626 # This is needed for when you have cases like IPI where 627 # the release verison is in a different spot: 628 # DT 08-JAN-2002 (IPI Human rel. 2.3, Created) 629 uprcols = uprline.split() 630 rel_index = -1 631 for index in range(len(uprcols)): 632 if uprcols[index].find("REL.") >= 0: 633 rel_index = index 634 assert rel_index >= 0, \ 635 "Could not find Rel. in DT line: %s" % (line) 636 version_index = rel_index + 1 637 # get the version information 638 str_version = cols[version_index].rstrip(_CHOMP) 639 # no version number 640 if str_version == '': 641 version = 0 642 # dot versioned 643 elif str_version.find(".") >= 0: 644 version = str_version 645 # integer versioned 646 else: 647 version = int(str_version) 648 649 if uprline.find('CREATED') >= 0: 650 self.data.created = cols[1], version 651 elif uprline.find('LAST SEQUENCE UPDATE') >= 0: 652 self.data.sequence_update = cols[1], version 653 elif uprline.find( 'LAST ANNOTATION UPDATE') >= 0: 654 self.data.annotation_update = cols[1], version 655 else: 656 assert False, "Shouldn't reach this line!" 657 elif uprline.find('INTEGRATED INTO') >= 0 \ 658 or uprline.find('SEQUENCE VERSION') >= 0 \ 659 or uprline.find('ENTRY VERSION') >= 0: 660 # New style DT line 661 # ================= 662 # As of UniProt Knowledgebase release 7.0 (including 663 # Swiss-Prot release 49.0 and TrEMBL release 32.0) the 664 # format of the DT lines and the version information 665 # in them was changed - the release number was dropped. 666 # 667 # For more information see bug 1948 and 668 # http://ca.expasy.org/sprot/relnotes/sp_news.html#rel7.0 669 # 670 # e.g. 671 # DT 01-JAN-1998, integrated into UniProtKB/Swiss-Prot. 672 # DT 15-OCT-2001, sequence version 3. 673 # DT 01-APR-2004, entry version 14. 674 # 675 #This is a new style DT line... 676 677 # The date should be in string cols[1] 678 # Get the version number if there is one. 679 # For the three DT lines above: 0, 3, 14 680 try: 681 version = int(cols[-1]) 682 except ValueError : 683 version = 0 684 685 # Re-use the historical property names, even though 686 # the meaning has changed slighty: 687 if uprline.find("INTEGRATED") >= 0: 688 self.data.created = cols[1], version 689 elif uprline.find('SEQUENCE VERSION') >= 0: 690 self.data.sequence_update = cols[1], version 691 elif uprline.find( 'ENTRY VERSION') >= 0: 692 self.data.annotation_update = cols[1], version 693 else: 694 assert False, "Shouldn't reach this line!" 695 else: 696 raise ValueError, "I don't understand the date line %s" % line
697
698 - def description(self, line):
699 self.data.description += line[5:]
700
701 - def gene_name(self, line):
702 self.data.gene_name += line[5:]
703
704 - def organism_species(self, line):
705 self.data.organism += line[5:]
706
707 - def organelle(self, line):
708 self.data.organelle += line[5:]
709
710 - def organism_classification(self, line):
711 line = line[5:].rstrip(_CHOMP) 712 cols = line.split(';') 713 for col in cols: 714 self.data.organism_classification.append(col.lstrip())
715
716 - def taxonomy_id(self, line):
717 # The OX line is in the format: 718 # OX DESCRIPTION=ID[, ID]...; 719 # If there are too many id's to fit onto a line, then the ID's 720 # continue directly onto the next line, e.g. 721 # OX DESCRIPTION=ID[, ID]... 722 # OX ID[, ID]...; 723 # Currently, the description is always "NCBI_TaxID". 724 725 # To parse this, I need to check to see whether I'm at the 726 # first line. If I am, grab the description and make sure 727 # it's an NCBI ID. Then, grab all the id's. 728 line = line[5:].rstrip(_CHOMP) 729 index = line.find('=') 730 if index >= 0: 731 descr = line[:index] 732 assert descr == "NCBI_TaxID", "Unexpected taxonomy type %s" % descr 733 ids = line[index+1:].split(',') 734 else: 735 ids = line.split(',') 736 self.data.taxonomy_id.extend([id.strip() for id in ids])
737
738 - def organism_host(self, line):
739 # Line type OH (Organism Host) for viral hosts 740 # same code as in taxonomy_id() 741 line = line[5:].rstrip(_CHOMP) 742 index = line.find('=') 743 if index >= 0: 744 descr = line[:index] 745 assert descr == "NCBI_TaxID", "Unexpected taxonomy type %s" % descr 746 ids = line[index+1:].split(',') 747 else: 748 ids = line.split(',') 749 self.data.host_organism.extend([id.strip() for id in ids])
750
751 - def reference_number(self, line):
752 rn = line[5:].rstrip() 753 assert rn[0] == '[' and rn[-1] == ']', "Missing brackets %s" % rn 754 ref = Reference() 755 ref.number = int(rn[1:-1]) 756 self.data.references.append(ref)
757
758 - def reference_position(self, line):
759 assert self.data.references, "RP: missing RN" 760 self.data.references[-1].positions.append(line[5:].rstrip())
761
762 - def reference_comment(self, line):
763 assert self.data.references, "RC: missing RN" 764 cols = line[5:].rstrip().split( ';') 765 ref = self.data.references[-1] 766 for col in cols: 767 if not col: # last column will be the empty string 768 continue 769 # The token is everything before the first '=' character. 770 index = col.find('=') 771 token, text = col[:index], col[index+1:] 772 # According to the spec, there should only be 1 '=' 773 # character. However, there are too many exceptions to 774 # handle, so we'll ease up and allow anything after the 775 # first '='. 776 #if col == ' STRAIN=TISSUE=BRAIN': 777 # # from CSP_MOUSE, release 38 778 # token, text = "TISSUE", "BRAIN" 779 #elif col == ' STRAIN=NCIB 9816-4, AND STRAIN=G7 / ATCC 17485': 780 # # from NDOA_PSEPU, release 38 781 # token, text = "STRAIN", "NCIB 9816-4 AND G7 / ATCC 17485" 782 #elif col == ' STRAIN=ISOLATE=NO 27, ANNO 1987' or \ 783 # col == ' STRAIN=ISOLATE=NO 27 / ANNO 1987': 784 # # from NU3M_BALPH, release 38, release 39 785 # token, text = "STRAIN", "ISOLATE NO 27, ANNO 1987" 786 #else: 787 # token, text = string.split(col, '=') 788 ref.comments.append((token.lstrip(), text))
789
790 - def reference_cross_reference(self, line):
791 assert self.data.references, "RX: missing RN" 792 # The basic (older?) RX line is of the form: 793 # RX MEDLINE; 85132727. 794 # but there are variants of this that need to be dealt with (see below) 795 796 # CLD1_HUMAN in Release 39 and DADR_DIDMA in Release 33 797 # have extraneous information in the RX line. Check for 798 # this and chop it out of the line. 799 # (noticed by katel@worldpath.net) 800 ind = line.find('[NCBI, ExPASy, Israel, Japan]') 801 if ind >= 0: 802 line = line[:ind] 803 804 # RX lines can also be used of the form 805 # RX PubMed=9603189; 806 # reported by edvard@farmasi.uit.no 807 # and these can be more complicated like: 808 # RX MEDLINE=95385798; PubMed=7656980; 809 # RX PubMed=15060122; DOI=10.1136/jmg 2003.012781; 810 # We look for these cases first and deal with them 811 if line.find("=") != -1: 812 cols = line[2:].split("; ") 813 cols = [x.strip() for x in cols] 814 cols = [x for x in cols if x] 815 for col in cols: 816 x = col.split("=") 817 assert len(x) == 2, "I don't understand RX line %s" % line 818 key, value = x[0].rstrip(_CHOMP), x[1].rstrip(_CHOMP) 819 ref = self.data.references[-1].references 820 ref.append((key, value)) 821 # otherwise we assume we have the type 'RX MEDLINE; 85132727.' 822 else: 823 cols = line.split() 824 # normally we split into the three parts 825 assert len(cols) == 3, "I don't understand RX line %s" % line 826 self.data.references[-1].references.append( 827 (cols[1].rstrip(_CHOMP), cols[2].rstrip(_CHOMP)))
828
829 - def reference_author(self, line):
830 assert self.data.references, "RA: missing RN" 831 ref = self.data.references[-1] 832 ref.authors += line[5:]
833
834 - def reference_title(self, line):
835 assert self.data.references, "RT: missing RN" 836 ref = self.data.references[-1] 837 ref.title += line[5:]
838
839 - def reference_location(self, line):
840 assert self.data.references, "RL: missing RN" 841 ref = self.data.references[-1] 842 ref.location += line[5:]
843
844 - def comment(self, line):
845 if line[5:8] == '-!-': # Make a new comment 846 self.data.comments.append(line[9:]) 847 elif line[5:8] == ' ': # add to the previous comment 848 if not self.data.comments: 849 # TCMO_STRGA in Release 37 has comment with no topic 850 self.data.comments.append(line[9:]) 851 else: 852 self.data.comments[-1] += line[9:] 853 elif line[5:8] == '---': 854 # If there are no comments, and it's not the closing line, 855 # make a new comment. 856 if not self.data.comments or self.data.comments[-1][:3] != '---': 857 self.data.comments.append(line[5:]) 858 else: 859 self.data.comments[-1] += line[5:] 860 else: # copyright notice 861 self.data.comments[-1] += line[5:]
862
863 - def database_cross_reference(self, line):
864 # From CLD1_HUMAN, Release 39: 865 # DR EMBL; [snip]; -. [EMBL / GenBank / DDBJ] [CoDingSequence] 866 # DR PRODOM [Domain structure / List of seq. sharing at least 1 domai 867 # DR SWISS-2DPAGE; GET REGION ON 2D PAGE. 868 line = line[5:] 869 # Remove the comments at the end of the line 870 i = line.find('[') 871 if i >= 0: 872 line = line[:i] 873 cols = line.rstrip(_CHOMP).split(';') 874 cols = [col.lstrip() for col in cols] 875 self.data.cross_references.append(tuple(cols))
876
877 - def keyword(self, line):
878 cols = line[5:].rstrip(_CHOMP).split(';') 879 self.data.keywords.extend([c.lstrip() for c in cols])
880
881 - def feature_table(self, line):
882 line = line[5:] # get rid of junk in front 883 name = line[0:8].rstrip() 884 try: 885 from_res = int(line[9:15]) 886 except ValueError: 887 from_res = line[9:15].lstrip() 888 try: 889 to_res = int(line[16:22]) 890 except ValueError: 891 to_res = line[16:22].lstrip() 892 description = line[29:70].rstrip() 893 #if there is a feature_id (FTId), store it away 894 if line[29:35]==r"/FTId=": 895 ft_id = line[35:70].rstrip()[:-1] 896 else: 897 ft_id ="" 898 if not name: # is continuation of last one 899 assert not from_res and not to_res 900 name, from_res, to_res, old_description,old_ft_id = self.data.features[-1] 901 del self.data.features[-1] 902 description = "%s %s" % (old_description, description) 903 904 # special case -- VARSPLIC, reported by edvard@farmasi.uit.no 905 if name == "VARSPLIC": 906 description = self._fix_varsplic_sequences(description) 907 self.data.features.append((name, from_res, to_res, description,ft_id))
908
909 - def _fix_varsplic_sequences(self, description):
910 """Remove unwanted spaces in sequences. 911 912 During line carryover, the sequences in VARSPLIC can get mangled 913 with unwanted spaces like: 914 'DISSTKLQALPSHGLESIQT -> PCRATGWSPFRRSSPC LPTH' 915 We want to check for this case and correct it as it happens. 916 """ 917 descr_cols = description.split(" -> ") 918 if len(descr_cols) == 2: 919 first_seq = descr_cols[0] 920 second_seq = descr_cols[1] 921 extra_info = '' 922 # we might have more information at the end of the 923 # second sequence, which should be in parenthesis 924 extra_info_pos = second_seq.find(" (") 925 if extra_info_pos != -1: 926 extra_info = second_seq[extra_info_pos:] 927 second_seq = second_seq[:extra_info_pos] 928 929 # now clean spaces out of the first and second string 930 first_seq = first_seq.replace(" ", "") 931 second_seq = second_seq.replace(" ", "") 932 933 # reassemble the description 934 description = first_seq + " -> " + second_seq + extra_info 935 936 return description
937
938 - def protein_existence(self, line):
939 #TODO - Record this information? 940 pass
941
942 - def sequence_header(self, line):
943 cols = line.split() 944 assert len(cols) == 8, "I don't understand SQ line %s" % line 945 # Do more checking here? 946 self.data.seqinfo = int(cols[2]), int(cols[4]), cols[6]
947
948 - def sequence_data(self, line):
949 #It should be faster to make a list of strings, and join them at the end. 950 self._sequence_lines.append(line.replace(" ", "").rstrip())
951
952 - def terminator(self, line):
953 pass
954 955 #def _clean(self, line, rstrip=1): 956 # if rstrip: 957 # return string.rstrip(line[5:]) 958 # return line[5:] 959
960 - def _clean_record(self, rec):
961 # Remove trailing newlines 962 members = ['description', 'gene_name', 'organism', 'organelle'] 963 for m in members: 964 attr = getattr(rec, m) 965 setattr(rec, m, attr.rstrip()) 966 for ref in rec.references: 967 self._clean_references(ref)
968
969 - def _clean_references(self, ref):
970 # Remove trailing newlines 971 members = ['authors', 'title', 'location'] 972 for m in members: 973 attr = getattr(ref, m) 974 setattr(ref, m, attr.rstrip())
975
976 -class _SequenceConsumer(AbstractConsumer):
977 """Consumer that converts a SwissProt record to a SeqRecord object. 978 979 Members: 980 data Record with SwissProt data. 981 alphabet The alphabet the generated Seq objects will have. 982 """ 983 #TODO - Cope with references as done for GenBank
984 - def __init__(self, alphabet = Alphabet.generic_protein):
985 """Initialize a Sequence Consumer 986 987 Arguments: 988 o alphabet - The alphabet to use for the generated Seq objects. If 989 not supplied this will default to the generic protein alphabet. 990 """ 991 self.data = None 992 self.alphabet = alphabet
993
994 - def start_record(self):
995 seq = Seq.Seq("", self.alphabet) 996 self.data = SeqRecord.SeqRecord(seq) 997 self.data.description = "" 998 self.data.name = "" 999 self._current_ref = None 1000 self._sequence_lines = []
1001
1002 - def end_record(self):
1003 if self._current_ref is not None: 1004 self.data.annotations['references'].append(self._current_ref) 1005 self._current_ref = None 1006 self.data.description = self.data.description.rstrip() 1007 self.data.seq = Seq.Seq("".join(self._sequence_lines), self.alphabet)
1008
1009 - def identification(self, line):
1010 cols = line.split() 1011 self.data.name = cols[1]
1012
1013 - def accession(self, line):
1014 #Note that files can and often do contain multiple AC lines. 1015 ids = line[5:].strip().split(';') 1016 #Remove any white space 1017 ids = [x.strip() for x in ids if x.strip()] 1018 1019 #Use the first as the ID, but record them ALL in the annotations 1020 try : 1021 self.data.annotations['accessions'].extend(ids) 1022 except KeyError : 1023 self.data.annotations['accessions'] = ids 1024 1025 #Use the FIRST accession as the ID, not the first on this line! 1026 self.data.id = self.data.annotations['accessions'][0]
1027 #self.data.id = ids[0] 1028
1029 - def description(self, line):
1030 self.data.description = self.data.description + \ 1031 line[5:].strip() + "\n"
1032
1033 - def sequence_data(self, line):
1034 #It should be faster to make a list of strings, and join them at the end. 1035 self._sequence_lines.append(line.replace(" ", "").rstrip())
1036
1037 - def gene_name(self, line):
1038 #We already store the identification/accession as the records name/id 1039 try : 1040 self.data.annotations['gene_name'] += line[5:] 1041 except KeyError : 1042 self.data.annotations['gene_name'] = line[5:]
1043
1044 - def comment(self, line):
1045 #Try and agree with SeqRecord convention from the GenBank parser, 1046 #which stores the comments as a long string with newlines 1047 #with key 'comment' 1048 try : 1049 self.data.annotations['comment'] += "\n" + line[5:] 1050 except KeyError : 1051 self.data.annotations['comment'] = line[5:]
1052 #TODO - Follow SwissProt conventions more closely? 1053
1054 - def database_cross_reference(self, line):
1055 #Format of the line is described in the manual dated 04-Dec-2007 as: 1056 #DR DATABASE; PRIMARY; SECONDARY[; TERTIARY][; QUATERNARY]. 1057 #However, some older files only seem to have a single identifier: 1058 #DR DATABASE; PRIMARY. 1059 # 1060 #Also must cope with things like this from Tests/SwissProt/sp007, 1061 #DR PRODOM [Domain structure / List of seq. sharing at least 1 domain] 1062 # 1063 #Store these in the dbxref list, but for consistency with 1064 #the GenBank parser and with what BioSQL can cope with, 1065 #store only DATABASE_IDENTIFIER:PRIMARY_IDENTIFIER 1066 parts = [x.strip() for x in line[5:].strip(_CHOMP).split(";")] 1067 if len(parts) > 1 : 1068 value = "%s:%s" % (parts[0], parts[1]) 1069 #Avoid duplicate entries 1070 if value not in self.data.dbxrefs : 1071 self.data.dbxrefs.append(value)
1072 #else : 1073 #print "Bad DR line:\n%s" % line 1074 1075
1076 - def date(self, line):
1077 date_str = line.split()[0] 1078 uprline = string.upper(line) 1079 if uprline.find('CREATED') >= 0 : 1080 #Try and agree with SeqRecord convention from the GenBank parser, 1081 #which stores the submitted date as 'date' 1082 self.data.annotations['date'] = date_str 1083 elif uprline.find('LAST SEQUENCE UPDATE') >= 0 : 1084 #There is no existing convention from the GenBank SeqRecord parser 1085 self.data.annotations['date_last_sequence_update'] = date_str 1086 elif uprline.find('LAST ANNOTATION UPDATE') >= 0: 1087 #There is no existing convention from the GenBank SeqRecord parser 1088 self.data.annotations['date_last_annotation_update'] = date_str
1089
1090 - def keyword(self, line):
1091 #Try and agree with SeqRecord convention from the GenBank parser, 1092 #which stores a list as 'keywords' 1093 cols = line[5:].rstrip(_CHOMP).split(';') 1094 cols = [c.strip() for c in cols] 1095 cols = filter(None, cols) 1096 try : 1097 #Extend any existing list of keywords 1098 self.data.annotations['keywords'].extend(cols) 1099 except KeyError : 1100 #Create the list of keywords 1101 self.data.annotations['keywords'] = cols
1102
1103 - def organism_species(self, line):
1104 #Try and agree with SeqRecord convention from the GenBank parser, 1105 #which stores the organism as a string with key 'organism' 1106 data = line[5:].rstrip(_CHOMP) 1107 try : 1108 #Append to any existing data split over multiple lines 1109 self.data.annotations['organism'] += " " + data 1110 except KeyError: 1111 self.data.annotations['organism'] = data
1112
1113 - def organism_host(self, line):
1114 #There is no SeqRecord convention from the GenBank parser, 1115 data = line[5:].rstrip(_CHOMP) 1116 index = data.find('=') 1117 if index >= 0: 1118 descr = data[:index] 1119 assert descr == "NCBI_TaxID", "Unexpected taxonomy type %s" % descr 1120 ids = data[index+1:].split(',') 1121 else: 1122 ids = data.split(',') 1123 1124 try : 1125 #Append to any existing data 1126 self.data.annotations['organism_host'].extend(ids) 1127 except KeyError: 1128 self.data.annotations['organism_host'] = ids
1129
1130 - def organism_classification(self, line):
1131 #Try and agree with SeqRecord convention from the GenBank parser, 1132 #which stores this taxonomy lineage ese as a list of strings with 1133 #key 'taxonomy'. 1134 #Note that 'ncbi_taxid' is used for the taxonomy ID (line OX) 1135 line = line[5:].rstrip(_CHOMP) 1136 cols = [col.strip() for col in line.split(';')] 1137 try : 1138 #Append to any existing data 1139 self.data.annotations['taxonomy'].extend(cols) 1140 except KeyError: 1141 self.data.annotations['taxonomy'] = cols
1142
1143 - def taxonomy_id(self, line):
1144 #Try and agree with SeqRecord convention expected in BioSQL 1145 #the NCBI taxon id with key 'ncbi_taxid'. 1146 #Note that 'taxonomy' is used for the taxonomy lineage 1147 #(held as a list of strings, line type OC) 1148 1149 line = line[5:].rstrip(_CHOMP) 1150 index = line.find('=') 1151 if index >= 0: 1152 descr = line[:index] 1153 assert descr == "NCBI_TaxID", "Unexpected taxonomy type %s" % descr 1154 ids = line[index+1:].split(',') 1155 else: 1156 ids = line.split(',') 1157 1158 try : 1159 #Append to any existing data 1160 self.data.annotations['ncbi_taxid'].extend(ids) 1161 except KeyError: 1162 self.data.annotations['ncbi_taxid'] = ids
1163
1164 - def reference_number(self, line):
1165 """RN line, reference number (start of new reference).""" 1166 from Bio.SeqFeature import Reference 1167 # if we have a current reference that hasn't been added to 1168 # the list of references, add it. 1169 if self._current_ref is not None: 1170 self.data.annotations['references'].append(self._current_ref) 1171 else: 1172 self.data.annotations['references'] = [] 1173 1174 self._current_ref = Reference()
1175
1176 - def reference_position(self, line):
1177 """RP line, reference position.""" 1178 assert self._current_ref is not None, "RP: missing RN" 1179 #Should try and store this in self._current_ref.location 1180 #but the SwissProt locations don't match easily to the 1181 #format used in GenBank... 1182 pass
1183
1184 - def reference_cross_reference(self, line):
1185 """RX line, reference cross-references.""" 1186 assert self._current_ref is not None, "RX: missing RN" 1187 # The basic (older?) RX line is of the form: 1188 # RX MEDLINE; 85132727. 1189 # or more recently: 1190 # RX MEDLINE=95385798; PubMed=7656980; 1191 # RX PubMed=15060122; DOI=10.1136/jmg 2003.012781; 1192 # We look for these cases first and deal with them 1193 if line.find("=") != -1: 1194 cols = line[2:].split("; ") 1195 cols = [x.strip() for x in cols] 1196 cols = [x for x in cols if x] 1197 for col in cols: 1198 x = col.split("=") 1199 assert len(x) == 2, "I don't understand RX line %s" % line 1200 key, value = x[0].rstrip(_CHOMP), x[1].rstrip(_CHOMP) 1201 if key == "MEDLINE" : 1202 self._current_ref.medline_id = value 1203 elif key == "PubMed" : 1204 self._current_ref.pubmed_id = value 1205 else : 1206 #Sadly the SeqFeature.Reference object doesn't 1207 #support anything else (yet) 1208 pass 1209 # otherwise we assume we have the type 'RX MEDLINE; 85132727.' 1210 else: 1211 # CLD1_HUMAN in Release 39 and DADR_DIDMA in Release 33 1212 # have extraneous information in the RX line. Check for 1213 # this and chop it out of the line. 1214 # (noticed by katel@worldpath.net) 1215 ind = line.find('[NCBI, ExPASy, Israel, Japan]') 1216 if ind >= 0: 1217 line = line[:ind] 1218 cols = line.split() 1219 # normally we split into the three parts 1220 assert len(cols) == 3, "I don't understand RX line %s" % line 1221 key = cols[1].rstrip(_CHOMP) 1222 value = cols[2].rstrip(_CHOMP) 1223 if key == "MEDLINE" : 1224 self._current_ref.medline_id = value 1225 elif key == "PubMed" : 1226 self._current_ref.pubmed_id = value 1227 else : 1228 #Sadly the SeqFeature.Reference object doesn't 1229 #support anything else (yet) 1230 pass
1231
1232 - def reference_author(self, line):
1233 """RA line, reference author(s).""" 1234 assert self._current_ref is not None, "RA: missing RN" 1235 self._current_ref.authors += line[5:].rstrip("\n")
1236
1237 - def reference_title(self, line):
1238 """RT line, reference title.""" 1239 assert self._current_ref is not None, "RT: missing RN" 1240 self._current_ref.title += line[5:].rstrip("\n")
1241
1242 - def reference_location(self, line):
1243 """RL line, reference 'location' - journal, volume, pages, year.""" 1244 assert self._current_ref is not None, "RL: missing RN" 1245 self._current_ref.journal += line[5:].rstrip("\n")
1246
1247 - def reference_comment(self, line):
1248 """RC line, reference comment.""" 1249 assert self._current_ref is not None, "RC: missing RN" 1250 #This has a key=value; structure... 1251 #Can we do a better job with the current Reference class? 1252 self._current_ref.comment += line[5:].rstrip("\n")
1253
1254 -def index_file(filename, indexname, rec2key=None):
1255 """index_file(filename, indexname, rec2key=None) 1256 1257 Index a SwissProt file. filename is the name of the file. 1258 indexname is the name of the dictionary. rec2key is an 1259 optional callback that takes a Record and generates a unique key 1260 (e.g. the accession number) for the record. If not specified, 1261 the entry name will be used. 1262 1263 """ 1264 from Bio.SwissProt import parse 1265 if not os.path.exists(filename): 1266 raise ValueError, "%s does not exist" % filename 1267 1268 index = Index.Index(indexname, truncate=1) 1269 index[Dictionary._Dictionary__filename_key] = filename 1270 1271 handle = open(filename) 1272 records = parse(handle) 1273 end = 0L 1274 for record in records: 1275 start = end 1276 end = long(handle.tell()) 1277 length = end - start 1278 1279 if rec2key is not None: 1280 key = rec2key(record) 1281 else: 1282 key = record.entry_name 1283 1284 if not key: 1285 raise KeyError, "empty sequence key was produced" 1286 elif index.has_key(key): 1287 raise KeyError, "duplicate key %s found" % key 1288 1289 index[key] = start, length
1290