Package Bio :: Package Prosite
[hide private]
[frames] | [no frames]

Source Code for Package Bio.Prosite

  1  # Copyright 1999 by Jeffrey Chang.  All rights reserved. 
  2  # Copyright 2000 by Jeffrey Chang.  All rights reserved. 
  3  # Revisions Copyright 2007 by Peter Cock.  All rights reserved. 
  4  # This code is part of the Biopython distribution and governed by its 
  5  # license.  Please see the LICENSE file that should have been included 
  6  # as part of this package. 
  7  """ 
  8  This module provides code to work with the prosite dat file from 
  9  Prosite. 
 10  http://www.expasy.ch/prosite/ 
 11   
 12  Tested with: 
 13  Release 15.0, July 1998 
 14  Release 16.0, July 1999 
 15  Release 17.0, Dec 2001 
 16  Release 19.0, Mar 2006 
 17   
 18   
 19  Functions: 
 20  parse                 Iterates over entries in a Prosite file. 
 21  scan_sequence_expasy  Scan a sequence for occurrences of Prosite patterns. 
 22  index_file            Index a Prosite file for a Dictionary. 
 23  _extract_record       Extract Prosite data from a web page. 
 24  _extract_pattern_hits Extract Prosite patterns from a web page. 
 25   
 26   
 27  Classes: 
 28  Record                Holds Prosite data. 
 29  PatternHit            Holds data from a hit against a Prosite pattern. 
 30  Dictionary            Accesses a Prosite file using a dictionary interface. 
 31  RecordParser          Parses a Prosite record into a Record object. 
 32  Iterator              Iterates over entries in a Prosite file; DEPRECATED. 
 33   
 34  _Scanner              Scans Prosite-formatted data. 
 35  _RecordConsumer       Consumes Prosite data to a Record object. 
 36   
 37  """ 
 38  from types import * 
 39  import re 
 40  import sgmllib 
 41  from Bio import File 
 42  from Bio import Index 
 43  from Bio.ParserSupport import * 
 44   
 45   
 46  # There is probably a cleaner way to write the read/parse functions 
 47  # if we don't use the "parser = RecordParser(); parser.parse(handle)" 
 48  # approach. Leaving that for the next revision of Bio.Prosite. 
def parse(handle):
    """Iterate over a Prosite file, yielding one Record per entry.

    Lines are buffered until a '//' terminator is seen, then the
    buffered chunk is handed to a RecordParser.  The copyright notice
    at the top of the file parses to a false value and is skipped.
    """
    import cStringIO
    record_parser = RecordParser()
    buffered = []
    for line in handle:
        buffered.append(line)
        if line.startswith('//'):
            chunk = cStringIO.StringIO("".join(buffered))
            buffered = []
            result = record_parser.parse(chunk)
            if result:
                yield result
62
63 -def read(handle):
64 parser = RecordParser() 65 try: 66 record = parser.parse(handle) 67 except ValueError, error: 68 if error.message=="There doesn't appear to be a record": 69 raise ValueError, "No Prosite record found" 70 else: 71 raise error 72 # We should have reached the end of the record by now 73 remainder = handle.read() 74 if remainder: 75 raise ValueError, "More than one Prosite record found" 76 return record
77
class Record:
    """Container for the data of one Prosite record.

    Members:
    name           ID of the record.  e.g. ADH_ZINC
    type           Type of entry.  e.g. PATTERN, MATRIX, or RULE
    accession      e.g. PS00387
    created        Date the entry was created.  (MMM-YYYY)
    data_update    Date the 'primary' data was last updated.
    info_update    Date data other than 'primary' data was last updated.
    pdoc           ID of the PROSITE DOCumentation.

    description    Free-format description.
    pattern        The PROSITE pattern.  See docs.
    matrix         List of strings that describes a matrix entry.
    rules          List of rule definitions (from RU lines).  (strings)
    prorules       List of prorules (from PR lines).  (strings)
    postprocessing List of post-processing directives (from PP lines).

    NUMERICAL RESULTS:
    nr_sp_release  SwissProt release.
    nr_sp_seqs     Number of seqs in that release of Swiss-Prot.  (int)
    nr_total       Number of hits in Swiss-Prot.  tuple of (hits, seqs)
    nr_positive    True positives.  tuple of (hits, seqs)
    nr_unknown     Could be positives.  tuple of (hits, seqs)
    nr_false_pos   False positives.  tuple of (hits, seqs)
    nr_false_neg   False negatives.  (int)
    nr_partial     False negatives, because they are fragments.  (int)

    COMMENTS:
    cc_taxo_range  Taxonomic range.  See docs for format
    cc_max_repeat  Maximum number of repetitions in a protein
    cc_site        Interesting site.  list of tuples (pattern pos, desc.)
    cc_skip_flag   Can this entry be ignored?
    cc_matrix_type, cc_scaling_db, cc_author, cc_ft_key, cc_ft_desc
                   Optional comment fields (only set when present).
    cc_version     Version number (introduced in release 19.0).

    DATA BANK REFERENCES - lists of tuples
    (swiss-prot accession, swiss-prot name):
    dr_positive, dr_false_neg, dr_false_pos, dr_potential (fingerprint
    region not yet available), dr_unknown (could possibly belong)

    pdb_structs    List of PDB entries.

    """
    def __init__(self):
        # Identification (ID/AC/DT/DO lines).
        self.name = ''
        self.type = ''
        self.accession = ''
        self.created = ''
        self.data_update = ''
        self.info_update = ''
        self.pdoc = ''

        # Description and pattern/matrix/rule payload.
        self.description = ''
        self.pattern = ''
        self.matrix = []
        self.rules = []
        self.prorules = []
        self.postprocessing = []

        # Numerical results (NR lines); hit counts default to a
        # "not seen" (None, None) pair.
        no_hits = (None, None)
        self.nr_sp_release = ''
        self.nr_sp_seqs = ''
        self.nr_total = no_hits
        self.nr_positive = no_hits
        self.nr_unknown = no_hits
        self.nr_false_pos = no_hits
        self.nr_false_neg = None
        self.nr_partial = None

        # Comments (CC lines).
        self.cc_taxo_range = ''
        self.cc_max_repeat = ''
        self.cc_site = []
        self.cc_skip_flag = ''

        # Data bank references (DR lines).
        self.dr_positive = []
        self.dr_false_neg = []
        self.dr_false_pos = []
        self.dr_potential = []
        self.dr_unknown = []

        # PDB cross references (3D lines).
        self.pdb_structs = []
167
class PatternHit:
    """One hit of a sequence against a Prosite pattern.

    Members:
    name         ID of the record.  e.g. ADH_ZINC
    accession    e.g. PS00387
    pdoc         ID of the PROSITE DOCumentation.
    description  Free-format description.
    matches      List of tuples (start, end, sequence) where
                 start and end are indexes of the match, and sequence is
                 the sequence matched.

    """
    def __init__(self):
        self.name = None
        self.accession = None
        self.pdoc = None
        self.description = None
        self.matches = []

    def __str__(self):
        # Header: accession, documentation id, name, then description
        # and a blank separator line.
        out = ["%s %s %s" % (self.accession, self.pdoc, self.name),
               self.description,
               '']
        multiple = len(self.matches) > 1
        if multiple:
            out.append("Number of matches: %s" % len(self.matches))
        for index, (start, end, seq) in enumerate(self.matches):
            span = "%d-%d" % (start, end)
            if multiple:
                # Number each match when there is more than one.
                out.append("%7d %10s %s" % (index + 1, span, seq))
            else:
                out.append("%7s %10s %s" % (' ', span, seq))
        return "\n".join(out)
202
class Iterator:
    """Returns one record at a time from a Prosite file.

    DEPRECATED (a DeprecationWarning is issued on construction); use the
    module-level parse() function instead.

    Methods:
    next   Return the next record from the stream, or None.

    """
    def __init__(self, handle, parser=None):
        """__init__(self, handle, parser=None)

        Create a new iterator.  handle is a file-like object.  parser
        is an optional Parser object to change the results into another form.
        If set to None, then the raw contents of the file will be returned.

        """
        import warnings
        # NOTE(review): the message below seems to be missing an "if"
        # before "you cannot"; left untouched since it is a runtime string.
        warnings.warn("Bio.Prosite.Iterator is deprecated; we recommend using the function Bio.Prosite.parse instead. Please contact the Biopython developers at biopython-dev@biopython.org you cannot use Bio.Prosite.parse instead of Bio.Prosite.Iterator.",
                      DeprecationWarning)
        # FileType and InstanceType come from the module's
        # 'from types import *' (Python 2 only).
        if type(handle) is not FileType and type(handle) is not InstanceType:
            raise ValueError, "I expected a file handle or file-like object"
        # UndoHandle allows peeking at the next line without consuming it.
        self._uhandle = File.UndoHandle(handle)
        self._parser = parser

    def next(self):
        """next(self) -> object

        Return the next Prosite record from the file.  If no more records,
        return None.

        """
        # Skip the copyright info, if it's the first record.
        line = self._uhandle.peekline()
        if line[:2] == 'CC':
            while 1:
                line = self._uhandle.readline()
                if not line:
                    break
                if line[:2] == '//':
                    break
                if line[:2] != 'CC':
                    # A non-CC line before the '//' terminator means this
                    # block was not a pure copyright notice after all.
                    raise ValueError, \
                          "Oops, where's the copyright?"

        # Collect raw lines up to and including the '//' terminator.
        lines = []
        while 1:
            line = self._uhandle.readline()
            if not line:
                break
            lines.append(line)
            if line[:2] == '//':
                break

        if not lines:
            # End of file: no record left.
            return None

        data = "".join(lines)
        if self._parser is not None:
            return self._parser.parse(File.StringHandle(data))
        return data

    def __iter__(self):
        # iter(callable, sentinel): call next() until it returns None.
        return iter(self.next, None)
265
class Dictionary:
    """Accesses a Prosite file using a dictionary interface.

    The index (built by index_file) maps each record key to a
    (start offset, length) pair inside the indexed flat file.
    """
    # Reserved index key under which index_file stores the path of the
    # indexed data file.
    __filename_key = '__filename'

    def __init__(self, indexname, parser=None):
        """__init__(self, indexname, parser=None)

        Open a Prosite Dictionary.  indexname is the name of the
        index for the dictionary.  The index should have been created
        using the index_file function.  parser is an optional Parser
        object to change the results into another form.  If set to None,
        then the raw contents of the file will be returned.

        """
        self._index = Index.Index(indexname)
        # NOTE: the data file handle stays open for the lifetime of this
        # object and is never explicitly closed.
        self._handle = open(self._index[Dictionary.__filename_key])
        self._parser = parser

    def __len__(self):
        return len(self._index)

    def __getitem__(self, key):
        """Return the record stored under key, raw or parsed."""
        # The local was previously named 'len', shadowing the builtin.
        start, length = self._index[key]
        self._handle.seek(start)
        data = self._handle.read(length)
        if self._parser is not None:
            return self._parser.parse(File.StringHandle(data))
        return data

    def __getattr__(self, name):
        # Delegate any other attribute access to the underlying index.
        return getattr(self._index, name)
299
class ExPASyDictionary:
    """Access PROSITE at ExPASy using a read-only dictionary interface.

    DEPRECATED (a DeprecationWarning is issued on construction); use
    Bio.ExPASy.get_prosite_raw instead.

    Only item lookup (and has_key/get, which build on it) is supported;
    the mutating and enumerating dictionary methods all raise
    NotImplementedError.
    """
    def __init__(self, delay=5.0, parser=None):
        """__init__(self, delay=5.0, parser=None)

        Create a new Dictionary to access PROSITE.  parser is an optional
        parser (e.g. Prosite.RecordParser) object to change the results
        into another form.  If set to None, then the raw contents of the
        file will be returned.  delay is the number of seconds to wait
        between each query.

        """
        import warnings
        from Bio.WWW import RequestLimiter
        warnings.warn("Bio.Prosite.ExPASyDictionary is deprecated. Please use the function Bio.ExPASy.get_prosite_raw instead.",
                      DeprecationWarning)
        self.parser = parser
        # Enforces the minimum delay between successive network queries.
        self.limiter = RequestLimiter(delay)

    def __len__(self):
        raise NotImplementedError("Prosite contains lots of entries")
    def clear(self):
        raise NotImplementedError("This is a read-only dictionary")
    def __setitem__(self, key, item):
        raise NotImplementedError("This is a read-only dictionary")
    def update(self):
        raise NotImplementedError("This is a read-only dictionary")
    def copy(self):
        raise NotImplementedError("You don't need to do this...")
    def keys(self):
        raise NotImplementedError("You don't really want to do this...")
    def items(self):
        raise NotImplementedError("You don't really want to do this...")
    def values(self):
        raise NotImplementedError("You don't really want to do this...")

    def has_key(self, id):
        """has_key(self, id) -> bool"""
        try:
            self[id]
        except KeyError:
            return 0
        return 1

    def get(self, id, failobj=None):
        """Return the entry for id, or failobj if it is not found."""
        try:
            return self[id]
        except KeyError:
            return failobj
        # The unreachable 'raise "How did I get here?"' (an illegal
        # string exception) that used to follow has been removed.

    def __getitem__(self, id):
        """__getitem__(self, id) -> object

        Return a Prosite entry.  id is either the id or accession
        for the entry.  Raises a KeyError if there's an error.

        """
        from Bio.WWW import ExPASy
        # First, check to see if enough time has passed since my
        # last query.
        self.limiter.wait()

        try:
            handle = ExPASy.get_prosite_entry(id)
        except IOError:
            # Treat a network/HTTP failure as a missing key.
            raise KeyError(id)
        try:
            handle = File.StringHandle(_extract_record(handle))
        except ValueError:
            # No PROSITE data found in the returned page.
            raise KeyError(id)

        if self.parser is not None:
            return self.parser.parse(handle)
        return handle.read()
377
class RecordParser(AbstractParser):
    """Parse Prosite-formatted text into Record objects.

    Wires a _Scanner (event producer) to a _RecordConsumer (event
    handler) and exposes the standard parse(handle) entry point.
    """
    def __init__(self):
        # The scanner emits line events; the consumer builds the Record.
        self._scanner = _Scanner()
        self._consumer = _RecordConsumer()

    def parse(self, handle):
        """Parse one record from the handle and return a Record."""
        consumer = self._consumer
        self._scanner.feed(handle, consumer)
        return consumer.data
389
class _Scanner:
    """Scans Prosite-formatted data.

    Tested with:
    Release 15.0, July 1998

    """
    def feed(self, handle, consumer):
        """feed(self, handle, consumer)

        Feed in Prosite data for scanning.  handle is a file-like
        object that contains prosite data.  consumer is a
        Consumer object that will receive events as the report is scanned.

        """
        # Wrap the handle so lines can be peeked at without consuming them.
        if isinstance(handle, File.UndoHandle):
            uhandle = handle
        else:
            uhandle = File.UndoHandle(handle)

        # The consumer's terminator() callback sets finished=True after a
        # record's '//' line (see _RecordConsumer.terminator).
        consumer.finished = False
        while not consumer.finished:
            line = uhandle.peekline()
            if not line:
                break
            elif is_blank_line(line):
                # Skip blank lines between records
                uhandle.readline()
                continue
            elif line[:2] == 'ID':
                self._scan_record(uhandle, consumer)
            elif line[:2] == 'CC':
                # A leading CC block is the file's copyright notice.
                self._scan_copyrights(uhandle, consumer)
            else:
                raise ValueError, "There doesn't appear to be a record"

    def _scan_copyrights(self, uhandle, consumer):
        # Consume the copyright CC lines up to and including the '//'.
        consumer.start_copyrights()
        self._scan_line('CC', uhandle, consumer.copyright, any_number=1)
        self._scan_terminator(uhandle, consumer)
        consumer.end_copyrights()

    def _scan_record(self, uhandle, consumer):
        # Drive the per-line-type scan functions in their expected order.
        consumer.start_record()
        for fn in self._scan_fns:
            fn(self, uhandle, consumer)

            # In Release 15.0, C_TYPE_LECTIN_1 has the DO line before
            # the 3D lines, instead of the other way around.
            # Thus, I'll give the 3D lines another chance after the DO lines
            # are finished.
            if fn is self._scan_do.im_func:
                self._scan_3d(uhandle, consumer)
        consumer.end_record()

    def _scan_line(self, line_type, uhandle, event_fn,
                   exactly_one=None, one_or_more=None, any_number=None,
                   up_to_one=None):
        # Callers must set exactly one of exactly_one, one_or_more, or
        # any_number to a true value.  I do not explicitly check to
        # make sure this function is called correctly.

        # This does not guarantee any parameter safety, but I
        # like the readability.  The other strategy I tried was have
        # parameters min_lines, max_lines.

        if exactly_one or one_or_more:
            read_and_call(uhandle, event_fn, start=line_type)
        if one_or_more or any_number:
            while 1:
                if not attempt_read_and_call(uhandle, event_fn,
                                             start=line_type):
                    break
        if up_to_one:
            attempt_read_and_call(uhandle, event_fn, start=line_type)

    def _scan_id(self, uhandle, consumer):
        self._scan_line('ID', uhandle, consumer.identification, exactly_one=1)

    def _scan_ac(self, uhandle, consumer):
        self._scan_line('AC', uhandle, consumer.accession, exactly_one=1)

    def _scan_dt(self, uhandle, consumer):
        self._scan_line('DT', uhandle, consumer.date, exactly_one=1)

    def _scan_de(self, uhandle, consumer):
        self._scan_line('DE', uhandle, consumer.description, exactly_one=1)

    def _scan_pa(self, uhandle, consumer):
        self._scan_line('PA', uhandle, consumer.pattern, any_number=1)

    def _scan_ma(self, uhandle, consumer):
        self._scan_line('MA', uhandle, consumer.matrix, any_number=1)
##        # ZN2_CY6_FUNGAL_2, DNAJ_2 in Release 15
##        # contain a CC line buried within an 'MA' line.  Need to check
##        # for that.
##        while 1:
##            if not attempt_read_and_call(uhandle, consumer.matrix, start='MA'):
##                line1 = uhandle.readline()
##                line2 = uhandle.readline()
##                uhandle.saveline(line2)
##                uhandle.saveline(line1)
##                if line1[:2] == 'CC' and line2[:2] == 'MA':
##                    read_and_call(uhandle, consumer.comment, start='CC')
##                else:
##                    break

    def _scan_pp(self, uhandle, consumer):
        # New PP line, PostProcessing, just after the MA line
        self._scan_line('PP', uhandle, consumer.postprocessing, any_number=1)

    def _scan_ru(self, uhandle, consumer):
        self._scan_line('RU', uhandle, consumer.rule, any_number=1)

    def _scan_nr(self, uhandle, consumer):
        self._scan_line('NR', uhandle, consumer.numerical_results,
                        any_number=1)

    def _scan_cc(self, uhandle, consumer):
        self._scan_line('CC', uhandle, consumer.comment, any_number=1)

    def _scan_dr(self, uhandle, consumer):
        self._scan_line('DR', uhandle, consumer.database_reference,
                        any_number=1)

    def _scan_3d(self, uhandle, consumer):
        self._scan_line('3D', uhandle, consumer.pdb_reference,
                        any_number=1)

    def _scan_pr(self, uhandle, consumer):
        # New PR line, ProRule, between 3D and DO lines
        self._scan_line('PR', uhandle, consumer.prorule, any_number=1)

    def _scan_do(self, uhandle, consumer):
        self._scan_line('DO', uhandle, consumer.documentation, exactly_one=1)

    def _scan_terminator(self, uhandle, consumer):
        self._scan_line('//', uhandle, consumer.terminator, exactly_one=1)

    # This is a list of scan functions in the order expected in the file.
    # The function definitions define how many times each line type is
    # expected (or if optional):
    _scan_fns = [
        _scan_id,
        _scan_ac,
        _scan_dt,
        _scan_de,
        _scan_pa,
        _scan_ma,
        _scan_pp,
        _scan_ru,
        _scan_nr,
        _scan_cc,

        # This is a really dirty hack, and should be fixed properly at
        # some point.  ZN2_CY6_FUNGAL_2, DNAJ_2 in Rel 15 and PS50309
        # in Rel 17 have lines out of order.  Thus, I have to rescan
        # these, which decreases performance.
        _scan_ma,
        _scan_nr,
        _scan_cc,

        _scan_dr,
        _scan_3d,
        _scan_pr,
        _scan_do,
        _scan_terminator
    ]
558
class _RecordConsumer(AbstractConsumer):
    """Consumer that converts a Prosite record to a Record object.

    Members:
    data    Record with Prosite data.

    Each method below is an event callback invoked by _Scanner with one
    raw line of the corresponding line type.
    """
    def __init__(self):
        self.data = None

    def start_record(self):
        # A fresh Record for each entry (triggered by the ID line).
        self.data = Record()

    def end_record(self):
        # NOTE(review): _clean_record is not defined in the portion of
        # the module reviewed here -- presumably defined elsewhere or on
        # the base class; confirm against the full source.
        self._clean_record(self.data)

    def identification(self, line):
        # e.g. "ID   ADH_ZINC; PATTERN."
        cols = line.split()
        if len(cols) != 3:
            raise ValueError("I don't understand identification line\n%s" %
                             line)
        self.data.name = self._chomp(cols[1])    # don't want ';'
        self.data.type = self._chomp(cols[2])    # don't want '.'

    def accession(self, line):
        cols = line.split()
        if len(cols) != 2:
            raise ValueError("I don't understand accession line\n%s" % line)
        self.data.accession = self._chomp(cols[1])

    def date(self, line):
        uprline = line.upper()
        cols = uprline.split()

        # Release 15.0 contains both 'INFO UPDATE' and 'INF UPDATE'
        if cols[2] != '(CREATED);' or \
           cols[4] != '(DATA' or cols[5] != 'UPDATE);' or \
           cols[7][:4] != '(INF' or cols[8] != 'UPDATE).':
            raise ValueError("I don't understand date line\n%s" % line)

        self.data.created = cols[1]
        self.data.data_update = cols[3]
        self.data.info_update = cols[6]

    def description(self, line):
        self.data.description = self._clean(line)

    def pattern(self, line):
        # PA entries may continue over several lines; concatenate them.
        self.data.pattern = self.data.pattern + self._clean(line)

    def matrix(self, line):
        self.data.matrix.append(self._clean(line))

    def postprocessing(self, line):
        # NOTE(review): this method's body was truncated in the source
        # under review; restored to split the PP line on ';' in the same
        # way as the PR handling below -- confirm against history.
        postprocessing = self._clean(line).split(";")
        self.data.postprocessing.extend(postprocessing)

    def rule(self, line):
        self.data.rules.append(self._clean(line))

    def numerical_results(self, line):
        # NR lines carry ';'-separated "/QUALIFIER=value" fields.
        cols = self._clean(line).split(";")
        for col in cols:
            if not col:
                continue
            qual, data = [word.lstrip() for word in col.split("=")]
            if qual == '/RELEASE':
                release, seqs = data.split(",")
                self.data.nr_sp_release = release
                self.data.nr_sp_seqs = int(seqs)
            elif qual == '/FALSE_NEG':
                self.data.nr_false_neg = int(data)
            elif qual == '/PARTIAL':
                self.data.nr_partial = int(data)
            elif qual in ['/TOTAL', '/POSITIVE', '/UNKNOWN', '/FALSE_POS']:
                # These values have the form "hits(seqs)".
                m = re.match(r'(\d+)\((\d+)\)', data)
                if not m:
                    # Was "raise error, ..." with an undefined name
                    # 'error'; raise ValueError like the other
                    # malformed-line cases.
                    raise ValueError("Broken data %s in comment line\n%s" %
                                     (repr(data), line))
                hits = tuple(map(int, m.groups()))
                if qual == "/TOTAL":
                    self.data.nr_total = hits
                elif qual == "/POSITIVE":
                    self.data.nr_positive = hits
                elif qual == "/UNKNOWN":
                    self.data.nr_unknown = hits
                elif qual == "/FALSE_POS":
                    self.data.nr_false_pos = hits
            else:
                raise ValueError("Unknown qual %s in comment line\n%s" %
                                 (repr(qual), line))

    def comment(self, line):
        # Expect CC lines like this:
        #   CC   /TAXO-RANGE=??EPV; /MAX-REPEAT=2;
        # Can (normally) split on ";" and then on "="
        cols = self._clean(line).split(";")
        for col in cols:
            if not col or col[:17] == 'Automatic scaling':
                # DNAJ_2 in Release 15 has a non-standard comment line:
                # CC   Automatic scaling using reversed database
                # Throw it away.  (Should I keep it?)
                continue
            if col.count("=") == 0:
                # Missing qualifier!  Can we recover gracefully?
                # For example, from Bug 2403, in PS50293 have:
                # CC /AUTHOR=K_Hofmann; N_Hulo
                continue
            qual, data = [word.lstrip() for word in col.split("=")]
            if qual == '/TAXO-RANGE':
                self.data.cc_taxo_range = data
            elif qual == '/MAX-REPEAT':
                self.data.cc_max_repeat = data
            elif qual == '/SITE':
                pos, desc = data.split(",")
                self.data.cc_site.append((int(pos), desc))
            elif qual == '/SKIP-FLAG':
                self.data.cc_skip_flag = data
            elif qual == '/MATRIX_TYPE':
                self.data.cc_matrix_type = data
            elif qual == '/SCALING_DB':
                self.data.cc_scaling_db = data
            elif qual == '/AUTHOR':
                self.data.cc_author = data
            elif qual == '/FT_KEY':
                self.data.cc_ft_key = data
            elif qual == '/FT_DESC':
                self.data.cc_ft_desc = data
            elif qual == '/VERSION':
                self.data.cc_version = data
            else:
                raise ValueError("Unknown qual %s in comment line\n%s" %
                                 (repr(qual), line))

    def database_reference(self, line):
        # DR lines carry ';'-separated "accession, name, flag" triples.
        refs = self._clean(line).split(";")
        for ref in refs:
            if not ref:
                continue
            # Local renamed from 'type' to avoid shadowing the builtin.
            acc, name, flag = [word.strip() for word in ref.split(",")]
            if flag == 'T':
                self.data.dr_positive.append((acc, name))
            elif flag == 'F':
                self.data.dr_false_pos.append((acc, name))
            elif flag == 'N':
                self.data.dr_false_neg.append((acc, name))
            elif flag == 'P':
                self.data.dr_potential.append((acc, name))
            elif flag == '?':
                self.data.dr_unknown.append((acc, name))
            else:
                raise ValueError("I don't understand type flag %s" % flag)

    def pdb_reference(self, line):
        cols = line.split()
        for pdb_id in cols[1:]:    # get all but the '3D' col
            self.data.pdb_structs.append(self._chomp(pdb_id))

    def prorule(self, line):
        # Assume that each PR line can contain multiple ";" separated rules
        rules = self._clean(line).split(";")
        self.data.prorules.extend(rules)

    def documentation(self, line):
        self.data.pdoc = self._chomp(self._clean(line))

    def terminator(self, line):
        # Signals _Scanner.feed() that this record is complete.
        self.finished = True

    def _chomp(self, word, to_chomp='.,;'):
        # Remove the punctuation at the end of a word.
        if word[-1] in to_chomp:
            return word[:-1]
        return word

    def _clean(self, line, rstrip=1):
        # Strip the two-letter line code and padding (first 5 columns).
        if rstrip:
            return line[5:].rstrip()
        return line[5:]
739
740 -def scan_sequence_expasy(seq=None, id=None, exclude_frequent=None):
741 """scan_sequence_expasy(seq=None, id=None, exclude_frequent=None) -> 742 list of PatternHit's 743 744 Search a sequence for occurrences of Prosite patterns. You can 745 specify either a sequence in seq or a SwissProt/trEMBL ID or accession 746 in id. Only one of those should be given. If exclude_frequent 747 is true, then the patterns with the high probability of occurring 748 will be excluded. 749 750 """ 751 from Bio import ExPASy 752 if (seq and id) or not (seq or id): 753 raise ValueError, "Please specify either a sequence or an id" 754 handle = ExPASy.scanprosite1(seq, id, exclude_frequent) 755 return _extract_pattern_hits(handle)
756
def _extract_pattern_hits(handle):
    """_extract_pattern_hits(handle) -> list of PatternHit's

    Extract hits from a web page.  Raises a ValueError if there
    was an error in the query.

    """
    # The hits live inside a <PRE> section; each one starts at an <HR>
    # tag and lists PDOC id, accession, name, description and matches in
    # that order.  _last_found tracks which field comes next.
    class parser(sgmllib.SGMLParser):
        def __init__(self):
            sgmllib.SGMLParser.__init__(self)
            self.hits = []
            self.broken_message = 'Some error occurred'
            self._in_pre = 0               # inside the <PRE> section?
            self._current_hit = None       # PatternHit being filled in
            self._last_found = None        # Save state of parsing
        def handle_data(self, data):
            # Error text can appear anywhere in the page, so check it
            # before the in-<PRE> filter.
            if data.find('try again') >= 0:
                self.broken_message = data
                return
            elif data == 'illegal':
                self.broken_message = 'Sequence contains illegal characters'
                return
            if not self._in_pre:
                return
            elif not data.strip():
                return
            if self._last_found is None and data[:4] == 'PDOC':
                self._current_hit.pdoc = data
                self._last_found = 'pdoc'
            elif self._last_found == 'pdoc':
                if data[:2] != 'PS':
                    raise ValueError, "Expected accession but got:\n%s" % data
                self._current_hit.accession = data
                self._last_found = 'accession'
            elif self._last_found == 'accession':
                self._current_hit.name = data
                self._last_found = 'name'
            elif self._last_found == 'name':
                self._current_hit.description = data
                self._last_found = 'description'
            elif self._last_found == 'description':
                # Remaining text holds "start-end sequence" matches.
                m = re.findall(r'(\d+)-(\d+) (\w+)', data)
                for start, end, seq in m:
                    self._current_hit.matches.append(
                        (int(start), int(end), seq))

        def do_hr(self, attrs):
            # <HR> inside a <PRE> section means a new hit.
            if self._in_pre:
                self._current_hit = PatternHit()
                self.hits.append(self._current_hit)
                self._last_found = None
        def start_pre(self, attrs):
            self._in_pre = 1
            self.broken_message = None   # Probably not broken
        def end_pre(self):
            self._in_pre = 0
    p = parser()
    p.feed(handle.read())
    if p.broken_message:
        raise ValueError, p.broken_message
    return p.hits
def index_file(filename, indexname, rec2key=None):
    """index_file(filename, indexname, rec2key=None)

    Index a Prosite file.  filename is the name of the file.
    indexname is the name of the dictionary.  rec2key is an
    optional callback that takes a Record and generates a unique key
    (e.g. the accession number) for the record.  If not specified,
    the id name will be used.

    """
    import os
    if not os.path.exists(filename):
        raise ValueError, "%s does not exist" % filename

    # Truncate any existing index, and remember which data file this
    # index describes (the Dictionary class reads this key back).
    index = Index.Index(indexname, truncate=1)
    index[Dictionary._Dictionary__filename_key] = filename

    handle = open(filename)
    records = parse(handle)
    end = 0L
    for record in records:
        # The record's byte span runs from the end of the previous
        # record to the current file position.
        # NOTE(review): parse() iterates the handle with a for loop, so
        # calling tell() here mixes iteration with file methods --
        # offsets depend on implementation buffering; confirm.
        start = end
        end = long(handle.tell())
        length = end - start

        if rec2key is not None:
            key = rec2key(record)
        else:
            key = record.name

        if not key:
            raise KeyError, "empty key was produced"
        elif index.has_key(key):
            raise KeyError, "duplicate key %s found" % key

        index[key] = start, length
859 860 # This function can be deprecated once Bio.Prosite.ExPASyDictionary 861 # is removed.
def _extract_record(handle):
    """_extract_record(handle) -> str

    Extract PROSITE data from a web page.  Raises a ValueError if no
    data was found in the web page.

    """
    # All the data appears between tags:
    # <pre width = 80>ID   NIR_SIR; PATTERN.
    # </PRE>
    class parser(sgmllib.SGMLParser):
        def __init__(self):
            sgmllib.SGMLParser.__init__(self)
            self._in_pre = 0     # true while inside the <PRE> block
            self.data = []       # collected text fragments
        def handle_data(self, data):
            # Keep only text that appears inside the <PRE> section.
            if self._in_pre:
                self.data.append(data)
        def do_br(self, attrs):
            # <BR> tags inside the <PRE> block stand for line breaks.
            if self._in_pre:
                self.data.append('\n')
        def start_pre(self, attrs):
            self._in_pre = 1
        def end_pre(self):
            self._in_pre = 0
    p = parser()
    p.feed(handle.read())
    if not p.data:
        raise ValueError, "No data found in web page."
    return "".join(p.data)