Package Bio :: Package Blast :: Module NCBIXML
[hide private]
[frames] | no frames]

Source Code for Module Bio.Blast.NCBIXML

  1  # Copyright 2000 by Bertrand Frottier .  All rights reserved. 
  2  # Revisions 2005-2006 copyright Michiel de Hoon 
  3  # Revisions 2006-2009 copyright Peter Cock 
  4  # This code is part of the Biopython distribution and governed by its 
  5  # license.  Please see the LICENSE file that should have been included 
  6  # as part of this package. 
  7  """This module provides code to work with the BLAST XML output 
  8  following the DTD available on the NCBI FTP 
  9  ftp://ftp.ncbi.nlm.nih.gov/blast/documents/xml/NCBI_BlastOutput.dtd 
 10   
 11  Classes: 
 12  BlastParser         Parses XML output from BLAST (direct use discouraged). 
 13                      This (now) returns a list of Blast records. 
 14                      Historically it returned a single Blast record. 
 15                      You are expected to use this via the parse function. 
 16   
 17  _XMLParser          Generic SAX parser (private). 
 18   
 19  Functions: 
 20  parse               Incremental parser, this is an iterator that returns 
 21                      Blast records.  It uses the BlastParser internally. 
 22  """ 
 23  from Bio.Blast import Record 
 24  import xml.sax 
 25  from xml.sax.handler import ContentHandler 
 26   
27 -class _XMLparser(ContentHandler):
28 """Generic SAX Parser 29 30 Just a very basic SAX parser. 31 32 Redefine the methods startElement, characters and endElement. 33 """
34 - def __init__(self, debug=0):
35 """Constructor 36 37 debug - integer, amount of debug information to print 38 """ 39 self._tag = [] 40 self._value = '' 41 self._debug = debug 42 self._debug_ignore_list = []
43
44 - def _secure_name(self, name):
45 """Removes 'dangerous' from tag names 46 47 name -- name to be 'secured' 48 """ 49 # Replace '-' with '_' in XML tag names 50 return name.replace('-', '_')
51
52 - def startElement(self, name, attr):
53 """Found XML start tag 54 55 No real need of attr, BLAST DTD doesn't use them 56 57 name -- name of the tag 58 59 attr -- tag attributes 60 """ 61 self._tag.append(name) 62 63 # Try to call a method (defined in subclasses) 64 method = self._secure_name('_start_' + name) 65 66 #Note could use try / except AttributeError 67 #BUT I found often triggered by nested errors... 68 if hasattr(self, method) : 69 eval("self.%s()" % method) 70 if self._debug > 4 : 71 print "NCBIXML: Parsed: " + method 72 else : 73 # Doesn't exist (yet) 74 if method not in self._debug_ignore_list : 75 if self._debug > 3 : 76 print "NCBIXML: Ignored: " + method 77 self._debug_ignore_list.append(method) 78 79 #We don't care about white space in parent tags like Hsp, 80 #but that white space doesn't belong to child tags like Hsp_midline 81 if self._value.strip() : 82 raise ValueError("What should we do with %s before the %s tag?" \ 83 % (repr(self._value), name)) 84 self._value = ""
85
86 - def characters(self, ch):
87 """Found some text 88 89 ch -- characters read 90 """ 91 self._value += ch # You don't ever get the whole string
92
93 - def endElement(self, name):
94 """Found XML end tag 95 96 name -- tag name 97 """ 98 # DON'T strip any white space, we may need it e.g. the hsp-midline 99 100 # Try to call a method (defined in subclasses) 101 method = self._secure_name('_end_' + name) 102 #Note could use try / except AttributeError 103 #BUT I found often triggered by nested errors... 104 if hasattr(self, method) : 105 eval("self.%s()" % method) 106 if self._debug > 2 : 107 print "NCBIXML: Parsed: " + method, self._value 108 else : 109 # Doesn't exist (yet) 110 if method not in self._debug_ignore_list : 111 if self._debug > 1 : 112 print "NCBIXML: Ignored: " + method, self._value 113 self._debug_ignore_list.append(method) 114 115 # Reset character buffer 116 self._value = ''
117
118 -class BlastParser(_XMLparser):
119 """Parse XML BLAST data into a Record.Blast object 120 121 All XML 'action' methods are private methods and may be: 122 _start_TAG called when the start tag is found 123 _end_TAG called when the end tag is found 124 """ 125
126 - def __init__(self, debug=0):
127 """Constructor 128 129 debug - integer, amount of debug information to print 130 """ 131 # Calling superclass method 132 _XMLparser.__init__(self, debug) 133 134 self._parser = xml.sax.make_parser() 135 self._parser.setContentHandler(self) 136 137 # To avoid ValueError: unknown url type: NCBI_BlastOutput.dtd 138 self._parser.setFeature(xml.sax.handler.feature_validation, 0) 139 self._parser.setFeature(xml.sax.handler.feature_namespaces, 0) 140 self._parser.setFeature(xml.sax.handler.feature_external_pes, 0) 141 self._parser.setFeature(xml.sax.handler.feature_external_ges, 0) 142 143 self.reset()
144
145 - def reset(self) :
146 """Reset all the data allowing reuse of the BlastParser() object""" 147 self._records = [] 148 self._header = Record.Header() 149 self._parameters = Record.Parameters() 150 self._parameters.filter = None #Maybe I should update the class?
151
152 - def _start_Iteration(self):
153 self._blast = Record.Blast() 154 pass
155
156 - def _end_Iteration(self):
157 # We stored a lot of generic "top level" information 158 # in self._header (an object of type Record.Header) 159 self._blast.reference = self._header.reference 160 self._blast.date = self._header.date 161 self._blast.version = self._header.version 162 self._blast.database = self._header.database 163 self._blast.application = self._header.application 164 165 # These are required for "old" pre 2.2.14 files 166 # where only <BlastOutput_query-ID>, <BlastOutput_query-def> 167 # and <BlastOutput_query-len> were used. Now they 168 # are suplemented/replaced by <Iteration_query-ID>, 169 # <Iteration_query-def> and <Iteration_query-len> 170 if not hasattr(self._blast, "query") \ 171 or not self._blast.query : 172 self._blast.query = self._header.query 173 if not hasattr(self._blast, "query_id") \ 174 or not self._blast.query_id : 175 self._blast.query_id = self._header.query_id 176 if not hasattr(self._blast, "query_letters") \ 177 or not self._blast.query_letters : 178 self._blast.query_letters = self._header.query_letters 179 180 # Hack to record the query length as both the query_letters and 181 # query_length properties (as in the plain text parser, see 182 # Bug 2176 comment 12): 183 self._blast.query_length = self._blast.query_letters 184 # Perhaps in the long term we should deprecate one, but I would 185 # prefer to drop query_letters - so we need a transition period 186 # with both. 187 188 # Hack to record the claimed database size as database_length 189 # (as well as in num_letters_in_database, see Bug 2176 comment 13): 190 self._blast.database_length = self._blast.num_letters_in_database 191 # TODO? Deprecate database_letters next? 192 193 # Apply the "top level" parameter information 194 self._blast.matrix = self._parameters.matrix 195 self._blast.num_seqs_better_e = self._parameters.num_seqs_better_e 196 self._blast.gap_penalties = self._parameters.gap_penalties 197 self._blast.filter = self._parameters.filter 198 self._blast.expect = self._parameters.expect 199 self._blast.sc_match = self._parameters.sc_match 200 self._blast.sc_mismatch = self._parameters.sc_mismatch 201 202 #Add to the list 203 self._records.append(self._blast) 204 #Clear the object (a new empty one is create in _start_Iteration) 205 self._blast = None 206 207 if self._debug : "NCBIXML: Added Blast record to results"
208 209 # Header
210 - def _end_BlastOutput_program(self):
211 """BLAST program, e.g., blastp, blastn, etc. 212 213 Save this to put on each blast record object 214 """ 215 self._header.application = self._value.upper()
216
217 - def _end_BlastOutput_version(self):
218 """version number and date of the BLAST engine. 219 220 e.g. "BLASTX 2.2.12 [Aug-07-2005]" but there can also be 221 variants like "BLASTP 2.2.18+" without the date. 222 223 Save this to put on each blast record object 224 """ 225 parts = self._value.split() 226 #TODO - Check the first word starts with BLAST? 227 228 #The version is the second word (field one) 229 self._header.version = parts[1] 230 231 #Check there is a third word (the date) 232 if len(parts) >= 3 : 233 if parts[2][0] == "[" and parts[2][-1] == "]" : 234 self._header.date = parts[2][1:-1] 235 else : 236 #Assume this is still a date, but without the 237 #square brackets 238 self._header.date = parts[2]
239
241 """a reference to the article describing the algorithm 242 243 Save this to put on each blast record object 244 """ 245 self._header.reference = self._value
246
247 - def _end_BlastOutput_db(self):
248 """the database(s) searched 249 250 Save this to put on each blast record object 251 """ 252 self._header.database = self._value
253
255 """the identifier of the query 256 257 Important in old pre 2.2.14 BLAST, for recent versions 258 <Iteration_query-ID> is enough 259 """ 260 self._header.query_id = self._value
261
263 """the definition line of the query 264 265 Important in old pre 2.2.14 BLAST, for recent versions 266 <Iteration_query-def> is enough 267 """ 268 self._header.query = self._value
269
271 """the length of the query 272 273 Important in old pre 2.2.14 BLAST, for recent versions 274 <Iteration_query-len> is enough 275 """ 276 self._header.query_letters = int(self._value)
277
278 - def _end_Iteration_query_ID(self):
279 """the identifier of the query 280 """ 281 self._blast.query_id = self._value
282
283 - def _end_Iteration_query_def(self):
284 """the definition line of the query 285 """ 286 self._blast.query = self._value
287
288 - def _end_Iteration_query_len(self):
289 """the length of the query 290 """ 291 self._blast.query_letters = int(self._value)
292 293 ## def _end_BlastOutput_query_seq(self): 294 ## """the query sequence 295 ## """ 296 ## pass # XXX Missing in Record.Blast ? 297 298 ## def _end_BlastOutput_iter_num(self): 299 ## """the psi-blast iteration number 300 ## """ 301 ## pass # XXX TODO PSI 302
303 - def _end_BlastOutput_hits(self):
304 """hits to the database sequences, one for every sequence 305 """ 306 self._blast.num_hits = int(self._value)
307 308 ## def _end_BlastOutput_message(self): 309 ## """error messages 310 ## """ 311 ## pass # XXX What to do ? 312 313 # Parameters
314 - def _end_Parameters_matrix(self):
315 """matrix used (-M) 316 """ 317 self._parameters.matrix = self._value
318
319 - def _end_Parameters_expect(self):
320 """expect values cutoff (-e) 321 """ 322 # NOTE: In old text output there was a line: 323 # Number of sequences better than 1.0e-004: 1 324 # As far as I can see, parameters.num_seqs_better_e 325 # would take the value of 1, and the expectation 326 # value was not recorded. 327 # 328 # Anyway we should NOT record this against num_seqs_better_e 329 self._parameters.expect = self._value
330 331 ## def _end_Parameters_include(self): 332 ## """inclusion threshold for a psi-blast iteration (-h) 333 ## """ 334 ## pass # XXX TODO PSI 335
336 - def _end_Parameters_sc_match(self):
337 """match score for nucleotide-nucleotide comparaison (-r) 338 """ 339 self._parameters.sc_match = int(self._value)
340
342 """mismatch penalty for nucleotide-nucleotide comparaison (-r) 343 """ 344 self._parameters.sc_mismatch = int(self._value)
345
346 - def _end_Parameters_gap_open(self):
347 """gap existence cost (-G) 348 """ 349 self._parameters.gap_penalties = int(self._value)
350
352 """gap extension cose (-E) 353 """ 354 self._parameters.gap_penalties = (self._parameters.gap_penalties, 355 int(self._value))
356
357 - def _end_Parameters_filter(self):
358 """filtering options (-F) 359 """ 360 self._parameters.filter = self._value
361 362 ## def _end_Parameters_pattern(self): 363 ## """pattern used for phi-blast search 364 ## """ 365 ## pass # XXX TODO PSI 366 367 ## def _end_Parameters_entrez_query(self): 368 ## """entrez query used to limit search 369 ## """ 370 ## pass # XXX TODO PSI 371 372 # Hits
373 - def _start_Hit(self):
374 self._blast.alignments.append(Record.Alignment()) 375 self._blast.descriptions.append(Record.Description()) 376 self._blast.multiple_alignment = [] 377 self._hit = self._blast.alignments[-1] 378 self._descr = self._blast.descriptions[-1] 379 self._descr.num_alignments = 0
380
381 - def _end_Hit(self):
382 #Cleanup 383 self._blast.multiple_alignment = None 384 self._hit = None 385 self._descr = None
386
387 - def _end_Hit_id(self):
388 """identifier of the database sequence 389 """ 390 self._hit.hit_id = self._value 391 self._hit.title = self._value + ' '
392
393 - def _end_Hit_def(self):
394 """definition line of the database sequence 395 """ 396 self._hit.hit_def = self._value 397 self._hit.title += self._value 398 self._descr.title = self._hit.title
399
400 - def _end_Hit_accession(self):
401 """accession of the database sequence 402 """ 403 self._hit.accession = self._value 404 self._descr.accession = self._value
405
406 - def _end_Hit_len(self):
407 self._hit.length = int(self._value)
408 409 # HSPs
410 - def _start_Hsp(self):
411 #Note that self._start_Hit() should have been called 412 #to setup things like self._blast.multiple_alignment 413 self._hit.hsps.append(Record.HSP()) 414 self._hsp = self._hit.hsps[-1] 415 self._descr.num_alignments += 1 416 self._blast.multiple_alignment.append(Record.MultipleAlignment()) 417 self._mult_al = self._blast.multiple_alignment[-1]
418 419 # Hsp_num is useless
420 - def _end_Hsp_score(self):
421 """raw score of HSP 422 """ 423 self._hsp.score = float(self._value) 424 if self._descr.score == None: 425 self._descr.score = float(self._value)
426
427 - def _end_Hsp_bit_score(self):
428 """bit score of HSP 429 """ 430 self._hsp.bits = float(self._value) 431 if self._descr.bits == None: 432 self._descr.bits = float(self._value)
433
434 - def _end_Hsp_evalue(self):
435 """expect value value of the HSP 436 """ 437 self._hsp.expect = float(self._value) 438 if self._descr.e == None: 439 self._descr.e = float(self._value)
440
441 - def _end_Hsp_query_from(self):
442 """offset of query at the start of the alignment (one-offset) 443 """ 444 self._hsp.query_start = int(self._value)
445
446 - def _end_Hsp_query_to(self):
447 """offset of query at the end of the alignment (one-offset) 448 """ 449 self._hsp.query_end = int(self._value)
450
451 - def _end_Hsp_hit_from(self):
452 """offset of the database at the start of the alignment (one-offset) 453 """ 454 self._hsp.sbjct_start = int(self._value)
455
456 - def _end_Hsp_hit_to(self):
457 """offset of the database at the end of the alignment (one-offset) 458 """ 459 self._hsp.sbjct_end = int(self._value)
460 461 ## def _end_Hsp_pattern_from(self): 462 ## """start of phi-blast pattern on the query (one-offset) 463 ## """ 464 ## pass # XXX TODO PSI 465 466 ## def _end_Hsp_pattern_to(self): 467 ## """end of phi-blast pattern on the query (one-offset) 468 ## """ 469 ## pass # XXX TODO PSI 470
471 - def _end_Hsp_query_frame(self):
472 """frame of the query if applicable 473 """ 474 self._hsp.frame = (int(self._value),)
475
476 - def _end_Hsp_hit_frame(self):
477 """frame of the database sequence if applicable 478 """ 479 self._hsp.frame += (int(self._value),)
480
481 - def _end_Hsp_identity(self):
482 """number of identities in the alignment 483 """ 484 self._hsp.identities = int(self._value)
485
486 - def _end_Hsp_positive(self):
487 """number of positive (conservative) substitutions in the alignment 488 """ 489 self._hsp.positives = int(self._value)
490
491 - def _end_Hsp_gaps(self):
492 """number of gaps in the alignment 493 """ 494 self._hsp.gaps = int(self._value)
495
496 - def _end_Hsp_align_len(self):
497 """length of the alignment 498 """ 499 self._hsp.align_length = int(self._value)
500 501 ## def _en_Hsp_density(self): 502 ## """score density 503 ## """ 504 ## pass # XXX ??? 505
506 - def _end_Hsp_qseq(self):
507 """alignment string for the query 508 """ 509 self._hsp.query = self._value
510
511 - def _end_Hsp_hseq(self):
512 """alignment string for the database 513 """ 514 self._hsp.sbjct = self._value
515
516 - def _end_Hsp_midline(self):
517 """Formatting middle line as normally seen in BLAST report 518 """ 519 self._hsp.match = self._value # do NOT strip spaces! 520 assert len(self._hsp.match)==len(self._hsp.query) 521 assert len(self._hsp.match)==len(self._hsp.sbjct)
522 523 # Statistics
524 - def _end_Statistics_db_num(self):
525 """number of sequences in the database 526 """ 527 self._blast.num_sequences_in_database = int(self._value)
528
529 - def _end_Statistics_db_len(self):
530 """number of letters in the database 531 """ 532 self._blast.num_letters_in_database = int(self._value)
533
534 - def _end_Statistics_hsp_len(self):
535 """the effective HSP length 536 """ 537 self._blast.effective_hsp_length = int(self._value)
538
540 """the effective search space 541 """ 542 self._blast.effective_search_space = float(self._value)
543
544 - def _end_Statistics_kappa(self):
545 """Karlin-Altschul parameter K 546 """ 547 self._blast.ka_params = float(self._value)
548
549 - def _end_Statistics_lambda(self):
550 """Karlin-Altschul parameter Lambda 551 """ 552 self._blast.ka_params = (float(self._value), 553 self._blast.ka_params)
554
555 - def _end_Statistics_entropy(self):
556 """Karlin-Altschul parameter H 557 """ 558 self._blast.ka_params = self._blast.ka_params + (float(self._value),)
559
560 -def read(handle, debug=0):
561 """Returns a single Blast record (assumes just one query). 562 563 This function is for use when there is one and only one BLAST 564 result in your XML file. 565 566 Use the Bio.Blast.NCBIXML.parse() function if you expect more than 567 one BLAST record (i.e. if you have more than one query sequence). 568 569 """ 570 iterator = parse(handle, debug) 571 try : 572 first = iterator.next() 573 except StopIteration : 574 first = None 575 if first is None : 576 raise ValueError("No records found in handle") 577 try : 578 second = iterator.next() 579 except StopIteration : 580 second = None 581 if second is not None : 582 raise ValueError("More than one record found in handle") 583 return first
584 585
586 -def parse(handle, debug=0):
587 """Returns an iterator a Blast record for each query. 588 589 handle - file handle to and XML file to parse 590 debug - integer, amount of debug information to print 591 592 This is a generator function that returns multiple Blast records 593 objects - one for each query sequence given to blast. The file 594 is read incrementally, returning complete records as they are read 595 in. 596 597 Should cope with new BLAST 2.2.14+ which gives a single XML file 598 for mutliple query records. 599 600 Should also cope with XML output from older versions BLAST which 601 gave multiple XML files concatenated together (giving a single file 602 which strictly speaking wasn't valid XML).""" 603 from xml.parsers import expat 604 BLOCK = 1024 605 MARGIN = 10 # must be at least length of newline + XML start 606 XML_START = "<?xml" 607 608 text = handle.read(BLOCK) 609 pending = "" 610 611 if not text : 612 #NO DATA FOUND! 613 raise ValueError("Your XML file was empty") 614 615 while text : 616 #We are now starting a new XML file 617 if not text.startswith(XML_START) : 618 raise ValueError("Your XML file did not start with %s..." \ 619 % XML_START) 620 621 expat_parser = expat.ParserCreate() 622 blast_parser = BlastParser(debug) 623 expat_parser.StartElementHandler = blast_parser.startElement 624 expat_parser.EndElementHandler = blast_parser.endElement 625 expat_parser.CharacterDataHandler = blast_parser.characters 626 627 expat_parser.Parse(text, False) 628 while blast_parser._records: 629 record = blast_parser._records[0] 630 blast_parser._records = blast_parser._records[1:] 631 yield record 632 633 while True : 634 #Read in another block of the file... 635 text, pending = pending + handle.read(BLOCK), "" 636 if not text: 637 #End of the file! 638 expat_parser.Parse("", True) # End of XML record 639 break 640 641 #Now read a little bit more so we can check for the 642 #start of another XML file... 643 pending = handle.read(MARGIN) 644 645 if (text+pending).find("\n" + XML_START) == -1 : 646 # Good - still dealing with the same XML file 647 expat_parser.Parse(text, False) 648 while blast_parser._records: 649 record = blast_parser._records[0] 650 blast_parser._records = blast_parser._records[1:] 651 yield record 652 else : 653 # This is output from pre 2.2.14 BLAST, 654 # one XML file for each query! 655 656 # Finish the old file: 657 text, pending = (text+pending).split("\n" + XML_START,1) 658 pending = XML_START + pending 659 660 expat_parser.Parse(text, True) # End of XML record 661 while blast_parser._records: 662 record = blast_parser._records[0] 663 blast_parser._records = blast_parser._records[1:] 664 yield record 665 666 #Now we are going to re-loop, reset the 667 #parsers and start reading the next XML file 668 text, pending = pending, "" 669 break 670 671 #At this point we have finished the first XML record. 672 #If the file is from an old version of blast, it may 673 #contain more XML records (check if text==""). 674 assert pending=="" 675 assert len(blast_parser._records) == 0 676 677 #We should have finished the file! 678 assert text=="" 679 assert pending=="" 680 assert len(blast_parser._records) == 0
681 682 if __name__ == '__main__': 683 import sys 684 import os 685 handle = open(sys.argv[1]) 686 r_list = parse(handle) 687 688 for r in r_list : 689 # Small test 690 print 'Blast of', r.query 691 print 'Found %s alignments with a total of %s HSPs' % (len(r.alignments), 692 reduce(lambda a,b: a+b, 693 [len(a.hsps) for a in r.alignments])) 694 695 for al in r.alignments: 696 print al.title[:50], al.length, 'bp', len(al.hsps), 'HSPs' 697 698 # Cookbook example 699 E_VALUE_THRESH = 0.04 700 for alignment in r.alignments: 701 for hsp in alignment.hsps: 702 if hsp.expect < E_VALUE_THRESH: 703 print '*****' 704 print 'sequence', alignment.title 705 print 'length', alignment.length 706 print 'e value', hsp.expect 707 print hsp.query[:75] + '...' 708 print hsp.match[:75] + '...' 709 print hsp.sbjct[:75] + '...' 710