Package Bio :: Package Blast :: Module NCBIStandalone
[hide private]
[frames] | no frames]

Source Code for Module Bio.Blast.NCBIStandalone

   1  # Copyright 1999-2000 by Jeffrey Chang.  All rights reserved. 
   2  # This code is part of the Biopython distribution and governed by its 
   3  # license.  Please see the LICENSE file that should have been included 
   4  # as part of this package. 
   5  # Patches by Mike Poidinger to support multiple databases. 
   6  # Updated by Peter Cock in 2007 to do a better job on BLAST 2.2.15 
   7   
   8  """ 
   9  This module provides code to work with the standalone version of 
  10  BLAST, either blastall or blastpgp, provided by the NCBI. 
  11  http://www.ncbi.nlm.nih.gov/BLAST/ 
  12   
  13  Classes: 
  14  LowQualityBlastError     Except that indicates low quality query sequences. 
  15  BlastParser              Parses output from blast. 
  16  BlastErrorParser         Parses output and tries to diagnose possible errors. 
  17  PSIBlastParser           Parses output from psi-blast. 
  18  Iterator                 Iterates over a file of blast results. 
  19   
  20  _Scanner                 Scans output from standalone BLAST. 
  21  _BlastConsumer           Consumes output from blast. 
  22  _PSIBlastConsumer        Consumes output from psi-blast. 
  23  _HeaderConsumer          Consumes header information. 
  24  _DescriptionConsumer     Consumes description information. 
  25  _AlignmentConsumer       Consumes alignment information. 
  26  _HSPConsumer             Consumes hsp information. 
  27  _DatabaseReportConsumer  Consumes database report information. 
  28  _ParametersConsumer      Consumes parameters information. 
  29   
  30  Functions: 
  31  blastall        Execute blastall. 
  32  blastpgp        Execute blastpgp. 
  33  rpsblast        Execute rpsblast. 
  34   
  35  """ 
  36   
  37  import os 
  38  import re 
  39   
  40  from Bio import File 
  41  from Bio.ParserSupport import * 
  42  from Bio.Blast import Record 
  43  from Bio.Application import _escape_filename 
  44   
45 -class LowQualityBlastError(Exception):
46 """Error caused by running a low quality sequence through BLAST. 47 48 When low quality sequences (like GenBank entries containing only 49 stretches of a single nucleotide) are BLASTed, they will result in 50 BLAST generating an error and not being able to perform the BLAST. 51 search. This error should be raised for the BLAST reports produced 52 in this case. 53 """ 54 pass
55
56 -class ShortQueryBlastError(Exception):
57 """Error caused by running a short query sequence through BLAST. 58 59 If the query sequence is too short, BLAST outputs warnings and errors: 60 Searching[blastall] WARNING: [000.000] AT1G08320: SetUpBlastSearch failed. 61 [blastall] ERROR: [000.000] AT1G08320: Blast: 62 [blastall] ERROR: [000.000] AT1G08320: Blast: Query must be at least wordsize 63 done 64 65 This exception is raised when that condition is detected. 66 67 """ 68 pass
69 70
71 -class _Scanner:
72 """Scan BLAST output from blastall or blastpgp. 73 74 Tested with blastall and blastpgp v2.0.10, v2.0.11 75 76 Methods: 77 feed Feed data into the scanner. 78 79 """
80 - def feed(self, handle, consumer):
81 """S.feed(handle, consumer) 82 83 Feed in a BLAST report for scanning. handle is a file-like 84 object that contains the BLAST report. consumer is a Consumer 85 object that will receive events as the report is scanned. 86 87 """ 88 if isinstance(handle, File.UndoHandle): 89 uhandle = handle 90 else: 91 uhandle = File.UndoHandle(handle) 92 93 # Try to fast-forward to the beginning of the blast report. 94 read_and_call_until(uhandle, consumer.noevent, contains='BLAST') 95 # Now scan the BLAST report. 96 self._scan_header(uhandle, consumer) 97 self._scan_rounds(uhandle, consumer) 98 self._scan_database_report(uhandle, consumer) 99 self._scan_parameters(uhandle, consumer)
100
101 - def _scan_header(self, uhandle, consumer):
102 # BLASTP 2.0.10 [Aug-26-1999] 103 # 104 # 105 # Reference: Altschul, Stephen F., Thomas L. Madden, Alejandro A. Schaf 106 # Jinghui Zhang, Zheng Zhang, Webb Miller, and David J. Lipman (1997), 107 # "Gapped BLAST and PSI-BLAST: a new generation of protein database sea 108 # programs", Nucleic Acids Res. 25:3389-3402. 109 # 110 # Query= test 111 # (140 letters) 112 # 113 # Database: sdqib40-1.35.seg.fa 114 # 1323 sequences; 223,339 total letters 115 # 116 # ======================================================== 117 # This next example is from the online version of Blast, 118 # note there are TWO references, an RID line, and also 119 # the database is BEFORE the query line. 120 # Note there possibleuse of non-ASCII in the author names. 121 # ======================================================== 122 # 123 # BLASTP 2.2.15 [Oct-15-2006] 124 # Reference: Altschul, Stephen F., Thomas L. Madden, Alejandro A. Sch??ffer, 125 # Jinghui Zhang, Zheng Zhang, Webb Miller, and David J. Lipman 126 # (1997), "Gapped BLAST and PSI-BLAST: a new generation of 127 # protein database search programs", Nucleic Acids Res. 25:3389-3402. 128 # 129 # Reference: Sch??ffer, Alejandro A., L. Aravind, Thomas L. Madden, Sergei 130 # Shavirin, John L. Spouge, Yuri I. Wolf, Eugene V. Koonin, and 131 # Stephen F. Altschul (2001), "Improving the accuracy of PSI-BLAST 132 # protein database searches with composition-based statistics 133 # and other refinements", Nucleic Acids Res. 29:2994-3005. 134 # 135 # RID: 1166022616-19998-65316425856.BLASTQ1 136 # 137 # 138 # Database: All non-redundant GenBank CDS 139 # translations+PDB+SwissProt+PIR+PRF excluding environmental samples 140 # 4,254,166 sequences; 1,462,033,012 total letters 141 # Query= gi:16127998 142 # Length=428 143 # 144 145 consumer.start_header() 146 147 read_and_call(uhandle, consumer.version, contains='BLAST') 148 read_and_call_while(uhandle, consumer.noevent, blank=1) 149 150 # There might be a <pre> line, for qblast output. 151 attempt_read_and_call(uhandle, consumer.noevent, start="<pre>") 152 153 # Read the reference(s) 154 while attempt_read_and_call(uhandle, 155 consumer.reference, start='Reference') : 156 # References are normally multiline terminated by a blank line 157 # (or, based on the old code, the RID line) 158 while 1: 159 line = uhandle.readline() 160 if is_blank_line(line) : 161 consumer.noevent(line) 162 break 163 elif line.startswith("RID"): 164 break 165 else : 166 #More of the reference 167 consumer.reference(line) 168 169 #Deal with the optional RID: ... 170 read_and_call_while(uhandle, consumer.noevent, blank=1) 171 attempt_read_and_call(uhandle, consumer.reference, start="RID:") 172 read_and_call_while(uhandle, consumer.noevent, blank=1) 173 174 # blastpgp may have a reference for compositional score matrix 175 # adjustment (see Bug 2502): 176 if attempt_read_and_call( 177 uhandle, consumer.reference, start="Reference"): 178 read_and_call_until(uhandle, consumer.reference, blank=1) 179 read_and_call_while(uhandle, consumer.noevent, blank=1) 180 181 # blastpgp has a Reference for composition-based statistics. 182 if attempt_read_and_call( 183 uhandle, consumer.reference, start="Reference"): 184 read_and_call_until(uhandle, consumer.reference, blank=1) 185 read_and_call_while(uhandle, consumer.noevent, blank=1) 186 187 line = uhandle.peekline() 188 assert line.strip() != "" 189 assert not line.startswith("RID:") 190 if line.startswith("Query=") : 191 #This is an old style query then database... 192 193 # Read the Query lines and the following blank line. 194 read_and_call(uhandle, consumer.query_info, start='Query=') 195 read_and_call_until(uhandle, consumer.query_info, blank=1) 196 read_and_call_while(uhandle, consumer.noevent, blank=1) 197 198 # Read the database lines and the following blank line. 199 read_and_call_until(uhandle, consumer.database_info, end='total letters') 200 read_and_call(uhandle, consumer.database_info, contains='sequences') 201 read_and_call_while(uhandle, consumer.noevent, blank=1) 202 elif line.startswith("Database:") : 203 #This is a new style database then query... 204 read_and_call_until(uhandle, consumer.database_info, end='total letters') 205 read_and_call(uhandle, consumer.database_info, contains='sequences') 206 read_and_call_while(uhandle, consumer.noevent, blank=1) 207 208 # Read the Query lines and the following blank line. 209 read_and_call(uhandle, consumer.query_info, start='Query=') 210 read_and_call_until(uhandle, consumer.query_info, blank=1) 211 read_and_call_while(uhandle, consumer.noevent, blank=1) 212 else : 213 raise ValueError("Invalid header?") 214 215 consumer.end_header()
216
217 - def _scan_rounds(self, uhandle, consumer):
218 # Scan a bunch of rounds. 219 # Each round begins with either a "Searching......" line 220 # or a 'Score E' line followed by descriptions and alignments. 221 # The email server doesn't give the "Searching....." line. 222 # If there is no 'Searching.....' line then you'll first see a 223 # 'Results from round' line 224 225 while 1: 226 line = safe_peekline(uhandle) 227 if (not line.startswith('Searching') and 228 not line.startswith('Results from round') and 229 re.search(r"Score +E", line) is None and 230 line.find('No hits found') == -1): 231 break 232 233 self._scan_descriptions(uhandle, consumer) 234 self._scan_alignments(uhandle, consumer)
235
236 - def _scan_descriptions(self, uhandle, consumer):
237 # Searching..................................................done 238 # Results from round 2 239 # 240 # 241 # Sc 242 # Sequences producing significant alignments: (b 243 # Sequences used in model and found again: 244 # 245 # d1tde_2 3.4.1.4.4 (119-244) Thioredoxin reductase [Escherichia ... 246 # d1tcob_ 1.31.1.5.16 Calcineurin regulatory subunit (B-chain) [B... 247 # d1symb_ 1.31.1.2.2 Calcyclin (S100) [RAT (RATTUS NORVEGICUS)] 248 # 249 # Sequences not found previously or not previously below threshold: 250 # 251 # d1osa__ 1.31.1.5.11 Calmodulin [Paramecium tetraurelia] 252 # d1aoza3 2.5.1.3.3 (339-552) Ascorbate oxidase [zucchini (Cucurb... 253 # 254 255 # If PSI-BLAST, may also have: 256 # 257 # CONVERGED! 258 259 consumer.start_descriptions() 260 261 # Read 'Searching' 262 # This line seems to be missing in BLASTN 2.1.2 (others?) 263 attempt_read_and_call(uhandle, consumer.noevent, start='Searching') 264 265 # blastpgp 2.0.10 from NCBI 9/19/99 for Solaris sometimes crashes here. 266 # If this happens, the handle will yield no more information. 267 if not uhandle.peekline(): 268 raise ValueError("Unexpected end of blast report. " + \ 269 "Looks suspiciously like a PSI-BLAST crash.") 270 271 # BLASTN 2.2.3 sometimes spews a bunch of warnings and errors here: 272 # Searching[blastall] WARNING: [000.000] AT1G08320: SetUpBlastSearch 273 # [blastall] ERROR: [000.000] AT1G08320: Blast: 274 # [blastall] ERROR: [000.000] AT1G08320: Blast: Query must be at leas 275 # done 276 # Reported by David Weisman. 277 # Check for these error lines and ignore them for now. Let 278 # the BlastErrorParser deal with them. 279 line = uhandle.peekline() 280 if line.find("ERROR:") != -1 or line.startswith("done"): 281 read_and_call_while(uhandle, consumer.noevent, contains="ERROR:") 282 read_and_call(uhandle, consumer.noevent, start="done") 283 284 # Check to see if this is PSI-BLAST. 285 # If it is, the 'Searching' line will be followed by: 286 # (version 2.0.10) 287 # Searching............................. 288 # Results from round 2 289 # or (version 2.0.11) 290 # Searching............................. 291 # 292 # 293 # Results from round 2 294 295 # Skip a bunch of blank lines. 296 read_and_call_while(uhandle, consumer.noevent, blank=1) 297 # Check for the results line if it's there. 298 if attempt_read_and_call(uhandle, consumer.round, start='Results'): 299 read_and_call_while(uhandle, consumer.noevent, blank=1) 300 301 # Three things can happen here: 302 # 1. line contains 'Score E' 303 # 2. line contains "No hits found" 304 # 3. no descriptions 305 # The first one begins a bunch of descriptions. The last two 306 # indicates that no descriptions follow, and we should go straight 307 # to the alignments. 308 if not attempt_read_and_call( 309 uhandle, consumer.description_header, 310 has_re=re.compile(r'Score +E')): 311 # Either case 2 or 3. Look for "No hits found". 312 attempt_read_and_call(uhandle, consumer.no_hits, 313 contains='No hits found') 314 read_and_call_while(uhandle, consumer.noevent, blank=1) 315 316 consumer.end_descriptions() 317 # Stop processing. 318 return 319 320 # Read the score header lines 321 read_and_call(uhandle, consumer.description_header, 322 start='Sequences producing') 323 324 # If PSI-BLAST, read the 'Sequences used in model' line. 325 attempt_read_and_call(uhandle, consumer.model_sequences, 326 start='Sequences used in model') 327 read_and_call_while(uhandle, consumer.noevent, blank=1) 328 329 # In BLAT, rather than a "No hits found" line, we just 330 # get no descriptions (and no alignments). This can be 331 # spotted because the next line is the database block: 332 if safe_peekline(uhandle).startswith(" Database:") : 333 consumer.end_descriptions() 334 # Stop processing. 335 return 336 337 # Read the descriptions and the following blank lines, making 338 # sure that there are descriptions. 339 if not uhandle.peekline().startswith('Sequences not found'): 340 read_and_call_until(uhandle, consumer.description, blank=1) 341 read_and_call_while(uhandle, consumer.noevent, blank=1) 342 343 # If PSI-BLAST, read the 'Sequences not found' line followed 344 # by more descriptions. However, I need to watch out for the 345 # case where there were no sequences not found previously, in 346 # which case there will be no more descriptions. 347 if attempt_read_and_call(uhandle, consumer.nonmodel_sequences, 348 start='Sequences not found'): 349 # Read the descriptions and the following blank lines. 350 read_and_call_while(uhandle, consumer.noevent, blank=1) 351 l = safe_peekline(uhandle) 352 # Brad -- added check for QUERY. On some PSI-BLAST outputs 353 # there will be a 'Sequences not found' line followed by no 354 # descriptions. Check for this case since the first thing you'll 355 # get is a blank line and then 'QUERY' 356 if not l.startswith('CONVERGED') and l[0] != '>' \ 357 and not l.startswith('QUERY'): 358 read_and_call_until(uhandle, consumer.description, blank=1) 359 read_and_call_while(uhandle, consumer.noevent, blank=1) 360 361 attempt_read_and_call(uhandle, consumer.converged, start='CONVERGED') 362 read_and_call_while(uhandle, consumer.noevent, blank=1) 363 364 consumer.end_descriptions()
365
366 - def _scan_alignments(self, uhandle, consumer):
367 # qblast inserts a helpful line here. 368 attempt_read_and_call(uhandle, consumer.noevent, start="ALIGNMENTS") 369 370 # First, check to see if I'm at the database report. 371 line = safe_peekline(uhandle) 372 if line.startswith(' Database'): 373 return 374 elif line[0] == '>': 375 # XXX make a better check here between pairwise and masterslave 376 self._scan_pairwise_alignments(uhandle, consumer) 377 else: 378 # XXX put in a check to make sure I'm in a masterslave alignment 379 self._scan_masterslave_alignment(uhandle, consumer)
380
381 - def _scan_pairwise_alignments(self, uhandle, consumer):
382 while 1: 383 line = safe_peekline(uhandle) 384 if line[0] != '>': 385 break 386 self._scan_one_pairwise_alignment(uhandle, consumer)
387
388 - def _scan_one_pairwise_alignment(self, uhandle, consumer):
389 consumer.start_alignment() 390 391 self._scan_alignment_header(uhandle, consumer) 392 393 # Scan a bunch of score/alignment pairs. 394 while 1: 395 line = safe_peekline(uhandle) 396 if not line.startswith(' Score'): 397 break 398 self._scan_hsp(uhandle, consumer) 399 consumer.end_alignment()
400
401 - def _scan_alignment_header(self, uhandle, consumer):
402 # >d1rip__ 2.24.7.1.1 Ribosomal S17 protein [Bacillus 403 # stearothermophilus] 404 # Length = 81 405 # 406 # Or, more recently with different white space: 407 # 408 # >gi|15799684|ref|NP_285696.1| threonine synthase ... 409 # gi|15829258|ref|NP_308031.1| threonine synthase 410 # ... 411 # Length=428 412 read_and_call(uhandle, consumer.title, start='>') 413 while 1: 414 line = safe_readline(uhandle) 415 if line.lstrip().startswith('Length =') \ 416 or line.lstrip().startswith('Length='): 417 consumer.length(line) 418 break 419 elif is_blank_line(line): 420 # Check to make sure I haven't missed the Length line 421 raise ValueError("I missed the Length in an alignment header") 422 consumer.title(line) 423 424 # Older versions of BLAST will have a line with some spaces. 425 # Version 2.0.14 (maybe 2.0.13?) and above print a true blank line. 426 if not attempt_read_and_call(uhandle, consumer.noevent, 427 start=' '): 428 read_and_call(uhandle, consumer.noevent, blank=1)
429
430 - def _scan_hsp(self, uhandle, consumer):
431 consumer.start_hsp() 432 self._scan_hsp_header(uhandle, consumer) 433 self._scan_hsp_alignment(uhandle, consumer) 434 consumer.end_hsp()
435
436 - def _scan_hsp_header(self, uhandle, consumer):
437 # Score = 22.7 bits (47), Expect = 2.5 438 # Identities = 10/36 (27%), Positives = 18/36 (49%) 439 # Strand = Plus / Plus 440 # Frame = +3 441 # 442 443 read_and_call(uhandle, consumer.score, start=' Score') 444 read_and_call(uhandle, consumer.identities, start=' Identities') 445 # BLASTN 446 attempt_read_and_call(uhandle, consumer.strand, start = ' Strand') 447 # BLASTX, TBLASTN, TBLASTX 448 attempt_read_and_call(uhandle, consumer.frame, start = ' Frame') 449 read_and_call(uhandle, consumer.noevent, blank=1)
450
451 - def _scan_hsp_alignment(self, uhandle, consumer):
452 # Query: 11 GRGVSACA-------TCDGFFYRNQKVAVIGGGNTAVEEALYLSNIASEVHLIHRRDGF 453 # GRGVS+ TC Y + + V GGG+ + EE L + I R+ 454 # Sbjct: 12 GRGVSSVVRRCIHKPTCKE--YAVKIIDVTGGGSFSAEEVQELREATLKEVDILRKVSG 455 # 456 # Query: 64 AEKILIKR 71 457 # I +K 458 # Sbjct: 70 PNIIQLKD 77 459 # 460 461 while 1: 462 # Blastn adds an extra line filled with spaces before Query 463 attempt_read_and_call(uhandle, consumer.noevent, start=' ') 464 read_and_call(uhandle, consumer.query, start='Query') 465 read_and_call(uhandle, consumer.align, start=' ') 466 read_and_call(uhandle, consumer.sbjct, start='Sbjct') 467 read_and_call_while(uhandle, consumer.noevent, blank=1) 468 line = safe_peekline(uhandle) 469 # Alignment continues if I see a 'Query' or the spaces for Blastn. 470 if not (line.startswith('Query') or line.startswith(' ')): 471 break
472
473 - def _scan_masterslave_alignment(self, uhandle, consumer):
474 consumer.start_alignment() 475 while 1: 476 line = safe_readline(uhandle) 477 # Check to see whether I'm finished reading the alignment. 478 # This is indicated by 1) database section, 2) next psi-blast 479 # round, which can also be a 'Results from round' if no 480 # searching line is present 481 # patch by chapmanb 482 if line.startswith('Searching') or \ 483 line.startswith('Results from round'): 484 uhandle.saveline(line) 485 break 486 elif line.startswith(' Database'): 487 uhandle.saveline(line) 488 break 489 elif is_blank_line(line): 490 consumer.noevent(line) 491 else: 492 consumer.multalign(line) 493 read_and_call_while(uhandle, consumer.noevent, blank=1) 494 consumer.end_alignment()
495
496 - def _scan_database_report(self, uhandle, consumer):
497 # Database: sdqib40-1.35.seg.fa 498 # Posted date: Nov 1, 1999 4:25 PM 499 # Number of letters in database: 223,339 500 # Number of sequences in database: 1323 501 # 502 # Lambda K H 503 # 0.322 0.133 0.369 504 # 505 # Gapped 506 # Lambda K H 507 # 0.270 0.0470 0.230 508 # 509 ########################################## 510 # Or, more recently Blast 2.2.15 gives less blank lines 511 ########################################## 512 # Database: All non-redundant GenBank CDS translations+PDB+SwissProt+PIR+PRF excluding 513 # environmental samples 514 # Posted date: Dec 12, 2006 5:51 PM 515 # Number of letters in database: 667,088,753 516 # Number of sequences in database: 2,094,974 517 # Lambda K H 518 # 0.319 0.136 0.395 519 # Gapped 520 # Lambda K H 521 # 0.267 0.0410 0.140 522 523 524 consumer.start_database_report() 525 526 # Subset of the database(s) listed below 527 # Number of letters searched: 562,618,960 528 # Number of sequences searched: 228,924 529 if attempt_read_and_call(uhandle, consumer.noevent, start=" Subset"): 530 read_and_call(uhandle, consumer.noevent, contains="letters") 531 read_and_call(uhandle, consumer.noevent, contains="sequences") 532 read_and_call(uhandle, consumer.noevent, start=" ") 533 534 # Sameet Mehta reported seeing output from BLASTN 2.2.9 that 535 # was missing the "Database" stanza completely. 536 while attempt_read_and_call(uhandle, consumer.database, 537 start=' Database'): 538 # BLAT output ends abruptly here, without any of the other 539 # information. Check to see if this is the case. If so, 540 # then end the database report here gracefully. 541 if not uhandle.peekline().strip() \ 542 or uhandle.peekline().startswith("BLAST"): 543 consumer.end_database_report() 544 return 545 546 # Database can span multiple lines. 547 read_and_call_until(uhandle, consumer.database, start=' Posted') 548 read_and_call(uhandle, consumer.posted_date, start=' Posted') 549 read_and_call(uhandle, consumer.num_letters_in_database, 550 start=' Number of letters') 551 read_and_call(uhandle, consumer.num_sequences_in_database, 552 start=' Number of sequences') 553 #There may not be a line starting with spaces... 554 attempt_read_and_call(uhandle, consumer.noevent, start=' ') 555 556 line = safe_readline(uhandle) 557 uhandle.saveline(line) 558 if line.find('Lambda') != -1: 559 break 560 561 read_and_call(uhandle, consumer.noevent, start='Lambda') 562 read_and_call(uhandle, consumer.ka_params) 563 564 #This blank line is optional: 565 attempt_read_and_call(uhandle, consumer.noevent, blank=1) 566 567 # not BLASTP 568 attempt_read_and_call(uhandle, consumer.gapped, start='Gapped') 569 # not TBLASTX 570 if attempt_read_and_call(uhandle, consumer.noevent, start='Lambda'): 571 read_and_call(uhandle, consumer.ka_params_gap) 572 573 # Blast 2.2.4 can sometimes skip the whole parameter section. 574 # Thus, I need to be careful not to read past the end of the 575 # file. 576 try: 577 read_and_call_while(uhandle, consumer.noevent, blank=1) 578 except ValueError, x: 579 if str(x) != "Unexpected end of stream.": 580 raise 581 consumer.end_database_report()
582
583 - def _scan_parameters(self, uhandle, consumer):
584 # Matrix: BLOSUM62 585 # Gap Penalties: Existence: 11, Extension: 1 586 # Number of Hits to DB: 50604 587 # Number of Sequences: 1323 588 # Number of extensions: 1526 589 # Number of successful extensions: 6 590 # Number of sequences better than 10.0: 5 591 # Number of HSP's better than 10.0 without gapping: 5 592 # Number of HSP's successfully gapped in prelim test: 0 593 # Number of HSP's that attempted gapping in prelim test: 1 594 # Number of HSP's gapped (non-prelim): 5 595 # length of query: 140 596 # length of database: 223,339 597 # effective HSP length: 39 598 # effective length of query: 101 599 # effective length of database: 171,742 600 # effective search space: 17345942 601 # effective search space used: 17345942 602 # T: 11 603 # A: 40 604 # X1: 16 ( 7.4 bits) 605 # X2: 38 (14.8 bits) 606 # X3: 64 (24.9 bits) 607 # S1: 41 (21.9 bits) 608 # S2: 42 (20.8 bits) 609 ########################################## 610 # Or, more recently Blast(x) 2.2.15 gives 611 ########################################## 612 # Matrix: BLOSUM62 613 # Gap Penalties: Existence: 11, Extension: 1 614 # Number of Sequences: 4535438 615 # Number of Hits to DB: 2,588,844,100 616 # Number of extensions: 60427286 617 # Number of successful extensions: 126433 618 # Number of sequences better than 2.0: 30 619 # Number of HSP's gapped: 126387 620 # Number of HSP's successfully gapped: 35 621 # Length of query: 291 622 # Length of database: 1,573,298,872 623 # Length adjustment: 130 624 # Effective length of query: 161 625 # Effective length of database: 983,691,932 626 # Effective search space: 158374401052 627 # Effective search space used: 158374401052 628 # Neighboring words threshold: 12 629 # Window for multiple hits: 40 630 # X1: 16 ( 7.3 bits) 631 # X2: 38 (14.6 bits) 632 # X3: 64 (24.7 bits) 633 # S1: 41 (21.7 bits) 634 # S2: 32 (16.9 bits) 635 636 637 # Blast 2.2.4 can sometimes skip the whole parameter section. 638 # BLAT also skips the whole parameter section. 639 # Thus, check to make sure that the parameter section really 640 # exists. 641 if not uhandle.peekline().strip(): 642 return 643 644 # BLASTN 2.2.9 looks like it reverses the "Number of Hits" and 645 # "Number of Sequences" lines. 646 consumer.start_parameters() 647 648 # Matrix line may be missing in BLASTN 2.2.9 649 attempt_read_and_call(uhandle, consumer.matrix, start='Matrix') 650 # not TBLASTX 651 attempt_read_and_call(uhandle, consumer.gap_penalties, start='Gap') 652 653 attempt_read_and_call(uhandle, consumer.num_sequences, 654 start='Number of Sequences') 655 read_and_call(uhandle, consumer.num_hits, 656 start='Number of Hits') 657 attempt_read_and_call(uhandle, consumer.num_sequences, 658 start='Number of Sequences') 659 read_and_call(uhandle, consumer.num_extends, 660 start='Number of extensions') 661 read_and_call(uhandle, consumer.num_good_extends, 662 start='Number of successful') 663 664 read_and_call(uhandle, consumer.num_seqs_better_e, 665 start='Number of sequences') 666 667 # not BLASTN, TBLASTX 668 if attempt_read_and_call(uhandle, consumer.hsps_no_gap, 669 start="Number of HSP's better"): 670 # BLASTN 2.2.9 671 if attempt_read_and_call(uhandle, consumer.noevent, 672 start="Number of HSP's gapped:"): 673 read_and_call(uhandle, consumer.noevent, 674 start="Number of HSP's successfully") 675 #This is ommitted in 2.2.15 676 attempt_read_and_call(uhandle, consumer.noevent, 677 start="Number of extra gapped extensions") 678 else: 679 read_and_call(uhandle, consumer.hsps_prelim_gapped, 680 start="Number of HSP's successfully") 681 read_and_call(uhandle, consumer.hsps_prelim_gap_attempted, 682 start="Number of HSP's that") 683 read_and_call(uhandle, consumer.hsps_gapped, 684 start="Number of HSP's gapped") 685 #e.g. BLASTX 2.2.15 where the "better" line is missing 686 elif attempt_read_and_call(uhandle, consumer.noevent, 687 start="Number of HSP's gapped"): 688 read_and_call(uhandle, consumer.noevent, 689 start="Number of HSP's successfully") 690 691 # not in blastx 2.2.1 692 attempt_read_and_call(uhandle, consumer.query_length, 693 has_re=re.compile(r"[Ll]ength of query")) 694 read_and_call(uhandle, consumer.database_length, 695 has_re=re.compile(r"[Ll]ength of \s*[Dd]atabase")) 696 697 # BLASTN 2.2.9 698 attempt_read_and_call(uhandle, consumer.noevent, 699 start="Length adjustment") 700 attempt_read_and_call(uhandle, consumer.effective_hsp_length, 701 start='effective HSP') 702 # Not in blastx 2.2.1 703 attempt_read_and_call( 704 uhandle, consumer.effective_query_length, 705 has_re=re.compile(r'[Ee]ffective length of query')) 706 707 # This is not in BLASTP 2.2.15 708 attempt_read_and_call( 709 uhandle, consumer.effective_database_length, 710 has_re=re.compile(r'[Ee]ffective length of \s*[Dd]atabase')) 711 # Not in blastx 2.2.1, added a ':' to distinguish between 712 # this and the 'effective search space used' line 713 attempt_read_and_call( 714 uhandle, consumer.effective_search_space, 715 has_re=re.compile(r'[Ee]ffective search space:')) 716 # Does not appear in BLASTP 2.0.5 717 attempt_read_and_call( 718 uhandle, consumer.effective_search_space_used, 719 has_re=re.compile(r'[Ee]ffective search space used')) 720 721 # BLASTX, TBLASTN, TBLASTX 722 attempt_read_and_call(uhandle, consumer.frameshift, start='frameshift') 723 724 # not in BLASTN 2.2.9 725 attempt_read_and_call(uhandle, consumer.threshold, start='T') 726 # In BLASTX 2.2.15 replaced by: "Neighboring words threshold: 12" 727 attempt_read_and_call(uhandle, consumer.threshold, start='Neighboring words threshold') 728 729 # not in BLASTX 2.2.15 730 attempt_read_and_call(uhandle, consumer.window_size, start='A') 731 # get this instead: "Window for multiple hits: 40" 732 attempt_read_and_call(uhandle, consumer.window_size, start='Window for multiple hits') 733 734 read_and_call(uhandle, consumer.dropoff_1st_pass, start='X1') 735 # not TBLASTN 736 attempt_read_and_call(uhandle, consumer.gap_x_dropoff, start='X2') 737 738 # not BLASTN, TBLASTX 739 attempt_read_and_call(uhandle, consumer.gap_x_dropoff_final, 740 start='X3') 741 742 # not TBLASTN 743 attempt_read_and_call(uhandle, consumer.gap_trigger, start='S1') 744 # not in blastx 2.2.1 745 # first we make sure we have additional lines to work with, if 746 # not then the file is done and we don't have a final S2 747 if not is_blank_line(uhandle.peekline(), allow_spaces=1): 748 read_and_call(uhandle, consumer.blast_cutoff, start='S2') 749 750 consumer.end_parameters()
751
752 -class BlastParser(AbstractParser):
753 """Parses BLAST data into a Record.Blast object. 754 755 """
756 - def __init__(self):
757 """__init__(self)""" 758 self._scanner = _Scanner() 759 self._consumer = _BlastConsumer()
760
761 - def parse(self, handle):
762 """parse(self, handle)""" 763 self._scanner.feed(handle, self._consumer) 764 return self._consumer.data
765
766 -class PSIBlastParser(AbstractParser):
767 """Parses BLAST data into a Record.PSIBlast object. 768 769 """
770 - def __init__(self):
771 """__init__(self)""" 772 self._scanner = _Scanner() 773 self._consumer = _PSIBlastConsumer()
774
775 - def parse(self, handle):
776 """parse(self, handle)""" 777 self._scanner.feed(handle, self._consumer) 778 return self._consumer.data
779
780 -class _HeaderConsumer:
781 - def start_header(self):
782 self._header = Record.Header()
783
784 - def version(self, line):
785 c = line.split() 786 self._header.application = c[0] 787 self._header.version = c[1] 788 self._header.date = c[2][1:-1]
789
790 - def reference(self, line):
791 if line.startswith('Reference: '): 792 self._header.reference = line[11:] 793 else: 794 self._header.reference = self._header.reference + line
795
796 - def query_info(self, line):
797 if line.startswith('Query= '): 798 self._header.query = line[7:] 799 elif not line.startswith(' '): # continuation of query_info 800 self._header.query = "%s%s" % (self._header.query, line) 801 else: 802 letters, = _re_search( 803 r"([0-9,]+) letters", line, 804 "I could not find the number of letters in line\n%s" % line) 805 self._header.query_letters = _safe_int(letters)
806
807 - def database_info(self, line):
808 line = line.rstrip() 809 if line.startswith('Database: '): 810 self._header.database = line[10:] 811 elif not line.endswith('total letters'): 812 self._header.database = self._header.database + line.strip() 813 else: 814 sequences, letters =_re_search( 815 r"([0-9,]+) sequences; ([0-9,-]+) total letters", line, 816 "I could not find the sequences and letters in line\n%s" %line) 817 self._header.database_sequences = _safe_int(sequences) 818 self._header.database_letters = _safe_int(letters)
819
820 - def end_header(self):
821 # Get rid of the trailing newlines 822 self._header.reference = self._header.reference.rstrip() 823 self._header.query = self._header.query.rstrip()
824
825 -class _DescriptionConsumer:
826 - def start_descriptions(self):
827 self._descriptions = [] 828 self._model_sequences = [] 829 self._nonmodel_sequences = [] 830 self._converged = 0 831 self._type = None 832 self._roundnum = None 833 834 self.__has_n = 0 # Does the description line contain an N value?
835
836 - def description_header(self, line):
837 if line.startswith('Sequences producing'): 838 cols = line.split() 839 if cols[-1] == 'N': 840 self.__has_n = 1
841
842 - def description(self, line):
843 dh = self._parse(line) 844 if self._type == 'model': 845 self._model_sequences.append(dh) 846 elif self._type == 'nonmodel': 847 self._nonmodel_sequences.append(dh) 848 else: 849 self._descriptions.append(dh)
850
851 - def model_sequences(self, line):
852 self._type = 'model'
853
854 - def nonmodel_sequences(self, line):
855 self._type = 'nonmodel'
856
857 - def converged(self, line):
858 self._converged = 1
859
860 - def no_hits(self, line):
861 pass
862
863 - def round(self, line):
864 if not line.startswith('Results from round'): 865 raise ValueError("I didn't understand the round line\n%s" % line) 866 self._roundnum = _safe_int(line[18:].strip())
867
868 - def end_descriptions(self):
869 pass
870
871 - def _parse(self, description_line):
872 line = description_line # for convenience 873 dh = Record.Description() 874 875 # I need to separate the score and p-value from the title. 876 # sp|P21297|FLBT_CAUCR FLBT PROTEIN [snip] 284 7e-77 877 # sp|P21297|FLBT_CAUCR FLBT PROTEIN [snip] 284 7e-77 1 878 # special cases to handle: 879 # - title must be preserved exactly (including whitespaces) 880 # - score could be equal to e-value (not likely, but what if??) 881 # - sometimes there's an "N" score of '1'. 882 cols = line.split() 883 if len(cols) < 3: 884 raise ValueError( \ 885 "Line does not appear to contain description:\n%s" % line) 886 if self.__has_n: 887 i = line.rfind(cols[-1]) # find start of N 888 i = line.rfind(cols[-2], 0, i) # find start of p-value 889 i = line.rfind(cols[-3], 0, i) # find start of score 890 else: 891 i = line.rfind(cols[-1]) # find start of p-value 892 i = line.rfind(cols[-2], 0, i) # find start of score 893 if self.__has_n: 894 dh.title, dh.score, dh.e, dh.num_alignments = \ 895 line[:i].rstrip(), cols[-3], cols[-2], cols[-1] 896 else: 897 dh.title, dh.score, dh.e, dh.num_alignments = \ 898 line[:i].rstrip(), cols[-2], cols[-1], 1 899 dh.num_alignments = _safe_int(dh.num_alignments) 900 dh.score = _safe_int(dh.score) 901 dh.e = _safe_float(dh.e) 902 return dh
903
904 -class _AlignmentConsumer:
905 # This is a little bit tricky. An alignment can either be a 906 # pairwise alignment or a multiple alignment. Since it's difficult 907 # to know a-priori which one the blast record will contain, I'm going 908 # to make one class that can parse both of them.
909 - def start_alignment(self):
910 self._alignment = Record.Alignment() 911 self._multiple_alignment = Record.MultipleAlignment()
912
913 - def title(self, line):
914 if self._alignment.title: 915 self._alignment.title += " " 916 self._alignment.title += line.strip()
917
918 - def length(self, line):
919 #e.g. "Length = 81" or more recently, "Length=428" 920 parts = line.replace(" ","").split("=") 921 assert len(parts)==2, "Unrecognised format length line" 922 self._alignment.length = parts[1] 923 self._alignment.length = _safe_int(self._alignment.length)
924
925 - def multalign(self, line):
926 # Standalone version uses 'QUERY', while WWW version uses blast_tmp. 927 if line.startswith('QUERY') or line.startswith('blast_tmp'): 928 # If this is the first line of the multiple alignment, 929 # then I need to figure out how the line is formatted. 930 931 # Format of line is: 932 # QUERY 1 acttg...gccagaggtggtttattcagtctccataagagaggggacaaacg 60 933 try: 934 name, start, seq, end = line.split() 935 except ValueError: 936 raise ValueError("I do not understand the line\n%s" % line) 937 self._start_index = line.index(start, len(name)) 938 self._seq_index = line.index(seq, 939 self._start_index+len(start)) 940 # subtract 1 for the space 941 self._name_length = self._start_index - 1 942 self._start_length = self._seq_index - self._start_index - 1 943 self._seq_length = line.rfind(end) - self._seq_index - 1 944 945 #self._seq_index = line.index(seq) 946 ## subtract 1 for the space 947 #self._seq_length = line.rfind(end) - self._seq_index - 1 948 #self._start_index = line.index(start) 949 #self._start_length = self._seq_index - self._start_index - 1 950 #self._name_length = self._start_index 951 952 # Extract the information from the line 953 name = line[:self._name_length] 954 name = name.rstrip() 955 start = line[self._start_index:self._start_index+self._start_length] 956 start = start.rstrip() 957 if start: 958 start = _safe_int(start) 959 end = line[self._seq_index+self._seq_length:].rstrip() 960 if end: 961 end = _safe_int(end) 962 seq = line[self._seq_index:self._seq_index+self._seq_length].rstrip() 963 # right pad the sequence with spaces if necessary 964 if len(seq) < self._seq_length: 965 seq = seq + ' '*(self._seq_length-len(seq)) 966 967 # I need to make sure the sequence is aligned correctly with the query. 968 # First, I will find the length of the query. Then, if necessary, 969 # I will pad my current sequence with spaces so that they will line 970 # up correctly. 971 972 # Two possible things can happen: 973 # QUERY 974 # 504 975 # 976 # QUERY 977 # 403 978 # 979 # Sequence 504 will need padding at the end. Since I won't know 980 # this until the end of the alignment, this will be handled in 981 # end_alignment. 982 # Sequence 403 will need padding before being added to the alignment. 983 984 align = self._multiple_alignment.alignment # for convenience 985 align.append((name, start, seq, end))
986 987 # This is old code that tried to line up all the sequences 988 # in a multiple alignment by using the sequence title's as 989 # identifiers. The problem with this is that BLAST assigns 990 # different HSP's from the same sequence the same id. Thus, 991 # in one alignment block, there may be multiple sequences with 992 # the same id. I'm not sure how to handle this, so I'm not 993 # going to. 994 995 # # If the sequence is the query, then just add it. 996 # if name == 'QUERY': 997 # if len(align) == 0: 998 # align.append((name, start, seq)) 999 # else: 1000 # aname, astart, aseq = align[0] 1001 # if name != aname: 1002 # raise ValueError, "Query is not the first sequence" 1003 # aseq = aseq + seq 1004 # align[0] = aname, astart, aseq 1005 # else: 1006 # if len(align) == 0: 1007 # raise ValueError, "I could not find the query sequence" 1008 # qname, qstart, qseq = align[0] 1009 # 1010 # # Now find my sequence in the multiple alignment. 1011 # for i in range(1, len(align)): 1012 # aname, astart, aseq = align[i] 1013 # if name == aname: 1014 # index = i 1015 # break 1016 # else: 1017 # # If I couldn't find it, then add a new one. 1018 # align.append((None, None, None)) 1019 # index = len(align)-1 1020 # # Make sure to left-pad it. 1021 # aname, astart, aseq = name, start, ' '*(len(qseq)-len(seq)) 1022 # 1023 # if len(qseq) != len(aseq) + len(seq): 1024 # # If my sequences are shorter than the query sequence, 1025 # # then I will need to pad some spaces to make them line up. 1026 # # Since I've already right padded seq, that means aseq 1027 # # must be too short. 1028 # aseq = aseq + ' '*(len(qseq)-len(aseq)-len(seq)) 1029 # aseq = aseq + seq 1030 # if astart is None: 1031 # astart = start 1032 # align[index] = aname, astart, aseq 1033
1034 - def end_alignment(self):
1035 # Remove trailing newlines 1036 if self._alignment: 1037 self._alignment.title = self._alignment.title.rstrip() 1038 1039 # This code is also obsolete. See note above. 1040 # If there's a multiple alignment, I will need to make sure 1041 # all the sequences are aligned. That is, I may need to 1042 # right-pad the sequences. 1043 # if self._multiple_alignment is not None: 1044 # align = self._multiple_alignment.alignment 1045 # seqlen = None 1046 # for i in range(len(align)): 1047 # name, start, seq = align[i] 1048 # if seqlen is None: 1049 # seqlen = len(seq) 1050 # else: 1051 # if len(seq) < seqlen: 1052 # seq = seq + ' '*(seqlen - len(seq)) 1053 # align[i] = name, start, seq 1054 # elif len(seq) > seqlen: 1055 # raise ValueError, \ 1056 # "Sequence %s is longer than the query" % name 1057 1058 # Clean up some variables, if they exist. 1059 try: 1060 del self._seq_index 1061 del self._seq_length 1062 del self._start_index 1063 del self._start_length 1064 del self._name_length 1065 except AttributeError: 1066 pass
1067
1068 -class _HSPConsumer:
1069 - def start_hsp(self):
1070 self._hsp = Record.HSP()
1071
1072 - def score(self, line):
1073 self._hsp.bits, self._hsp.score = _re_search( 1074 r"Score =\s*([0-9.e+]+) bits \(([0-9]+)\)", line, 1075 "I could not find the score in line\n%s" % line) 1076 self._hsp.score = _safe_float(self._hsp.score) 1077 self._hsp.bits = _safe_float(self._hsp.bits) 1078 1079 x, y = _re_search( 1080 r"Expect\(?(\d*)\)? = +([0-9.e\-|\+]+)", line, 1081 "I could not find the expect in line\n%s" % line) 1082 if x: 1083 self._hsp.num_alignments = _safe_int(x) 1084 else: 1085 self._hsp.num_alignments = 1 1086 self._hsp.expect = _safe_float(y)
1087
1088 - def identities(self, line):
1089 x, y = _re_search( 1090 r"Identities = (\d+)\/(\d+)", line, 1091 "I could not find the identities in line\n%s" % line) 1092 self._hsp.identities = _safe_int(x), _safe_int(y) 1093 self._hsp.align_length = _safe_int(y) 1094 1095 if line.find('Positives') != -1: 1096 x, y = _re_search( 1097 r"Positives = (\d+)\/(\d+)", line, 1098 "I could not find the positives in line\n%s" % line) 1099 self._hsp.positives = _safe_int(x), _safe_int(y) 1100 assert self._hsp.align_length == _safe_int(y) 1101 1102 if line.find('Gaps') != -1: 1103 x, y = _re_search( 1104 r"Gaps = (\d+)\/(\d+)", line, 1105 "I could not find the gaps in line\n%s" % line) 1106 self._hsp.gaps = _safe_int(x), _safe_int(y) 1107 assert self._hsp.align_length == _safe_int(y)
1108 1109
1110 - def strand(self, line):
1111 self._hsp.strand = _re_search( 1112 r"Strand = (\w+) / (\w+)", line, 1113 "I could not find the strand in line\n%s" % line)
1114
1115 - def frame(self, line):
1116 # Frame can be in formats: 1117 # Frame = +1 1118 # Frame = +2 / +2 1119 if line.find('/') != -1: 1120 self._hsp.frame = _re_search( 1121 r"Frame = ([-+][123]) / ([-+][123])", line, 1122 "I could not find the frame in line\n%s" % line) 1123 else: 1124 self._hsp.frame = _re_search( 1125 r"Frame = ([-+][123])", line, 1126 "I could not find the frame in line\n%s" % line)
1127 1128 # Match a space, if one is available. Masahir Ishikawa found a 1129 # case where there's no space between the start and the sequence: 1130 # Query: 100tt 101 1131 # line below modified by Yair Benita, Sep 2004 1132 # Note that the colon is not always present. 2006 1133 _query_re = re.compile(r"Query(:?) \s*(\d+)\s*(.+) (\d+)")
1134 - def query(self, line):
1135 m = self._query_re.search(line) 1136 if m is None: 1137 raise ValueError("I could not find the query in line\n%s" % line) 1138 1139 # line below modified by Yair Benita, Sep 2004. 1140 # added the end attribute for the query 1141 colon, start, seq, end = m.groups() 1142 self._hsp.query = self._hsp.query + seq 1143 if self._hsp.query_start is None: 1144 self._hsp.query_start = _safe_int(start) 1145 1146 # line below added by Yair Benita, Sep 2004. 1147 # added the end attribute for the query 1148 self._hsp.query_end = _safe_int(end) 1149 1150 #Get index for sequence start (regular expression element 3) 1151 self._query_start_index = m.start(3) 1152 self._query_len = len(seq)
1153
1154 - def align(self, line):
1155 seq = line[self._query_start_index:].rstrip() 1156 if len(seq) < self._query_len: 1157 # Make sure the alignment is the same length as the query 1158 seq = seq + ' ' * (self._query_len-len(seq)) 1159 elif len(seq) < self._query_len: 1160 raise ValueError("Match is longer than the query in line\n%s" \ 1161 % line) 1162 self._hsp.match = self._hsp.match + seq
1163 1164 # To match how we do the query, cache the regular expression. 1165 # Note that the colon is not always present. 1166 _sbjct_re = re.compile(r"Sbjct(:?) \s*(\d+)\s*(.+) (\d+)")
1167 - def sbjct(self, line):
1168 m = self._sbjct_re.search(line) 1169 if m is None: 1170 raise ValueError("I could not find the sbjct in line\n%s" % line) 1171 colon, start, seq, end = m.groups() 1172 #mikep 26/9/00 1173 #On occasion, there is a blast hit with no subject match 1174 #so far, it only occurs with 1-line short "matches" 1175 #I have decided to let these pass as they appear 1176 if not seq.strip(): 1177 seq = ' ' * self._query_len 1178 self._hsp.sbjct = self._hsp.sbjct + seq 1179 if self._hsp.sbjct_start is None: 1180 self._hsp.sbjct_start = _safe_int(start) 1181 1182 self._hsp.sbjct_end = _safe_int(end) 1183 if len(seq) != self._query_len: 1184 raise ValueError( \ 1185 "QUERY and SBJCT sequence lengths don't match in line\n%s" \ 1186 % line) 1187 1188 del self._query_start_index # clean up unused variables 1189 del self._query_len
1190
1191 - def end_hsp(self):
1192 pass
1193
1194 -class _DatabaseReportConsumer:
1195
1196 - def start_database_report(self):
1197 self._dr = Record.DatabaseReport()
1198
1199 - def database(self, line):
1200 m = re.search(r"Database: (.+)$", line) 1201 if m: 1202 self._dr.database_name.append(m.group(1)) 1203 elif self._dr.database_name: 1204 # This must be a continuation of the previous name. 1205 self._dr.database_name[-1] = "%s%s" % (self._dr.database_name[-1], 1206 line.strip())
1207
1208 - def posted_date(self, line):
1209 self._dr.posted_date.append(_re_search( 1210 r"Posted date:\s*(.+)$", line, 1211 "I could not find the posted date in line\n%s" % line))
1212
1213 - def num_letters_in_database(self, line):
1214 letters, = _get_cols( 1215 line, (-1,), ncols=6, expected={2:"letters", 4:"database:"}) 1216 self._dr.num_letters_in_database.append(_safe_int(letters))
1217
1218 - def num_sequences_in_database(self, line):
1219 sequences, = _get_cols( 1220 line, (-1,), ncols=6, expected={2:"sequences", 4:"database:"}) 1221 self._dr.num_sequences_in_database.append(_safe_int(sequences))
1222
1223 - def ka_params(self, line):
1224 x = line.split() 1225 self._dr.ka_params = map(_safe_float, x)
1226
1227 - def gapped(self, line):
1228 self._dr.gapped = 1
1229
1230 - def ka_params_gap(self, line):
1231 x = line.split() 1232 self._dr.ka_params_gap = map(_safe_float, x)
1233
1234 - def end_database_report(self):
1235 pass
1236
1237 -class _ParametersConsumer:
1238 - def start_parameters(self):
1239 self._params = Record.Parameters()
1240
1241 - def matrix(self, line):
1242 self._params.matrix = line[8:].rstrip()
1243
1244 - def gap_penalties(self, line):
1245 x = _get_cols( 1246 line, (3, 5), ncols=6, expected={2:"Existence:", 4:"Extension:"}) 1247 self._params.gap_penalties = map(_safe_float, x)
1248
1249 - def num_hits(self, line):
1250 if line.find('1st pass') != -1: 1251 x, = _get_cols(line, (-4,), ncols=11, expected={2:"Hits"}) 1252 self._params.num_hits = _safe_int(x) 1253 else: 1254 x, = _get_cols(line, (-1,), ncols=6, expected={2:"Hits"}) 1255 self._params.num_hits = _safe_int(x)
1256
1257 - def num_sequences(self, line):
1258 if line.find('1st pass') != -1: 1259 x, = _get_cols(line, (-4,), ncols=9, expected={2:"Sequences:"}) 1260 self._params.num_sequences = _safe_int(x) 1261 else: 1262 x, = _get_cols(line, (-1,), ncols=4, expected={2:"Sequences:"}) 1263 self._params.num_sequences = _safe_int(x)
1264
1265 - def num_extends(self, line):
1266 if line.find('1st pass') != -1: 1267 x, = _get_cols(line, (-4,), ncols=9, expected={2:"extensions:"}) 1268 self._params.num_extends = _safe_int(x) 1269 else: 1270 x, = _get_cols(line, (-1,), ncols=4, expected={2:"extensions:"}) 1271 self._params.num_extends = _safe_int(x)
1272
1273 - def num_good_extends(self, line):
1274 if line.find('1st pass') != -1: 1275 x, = _get_cols(line, (-4,), ncols=10, expected={3:"extensions:"}) 1276 self._params.num_good_extends = _safe_int(x) 1277 else: 1278 x, = _get_cols(line, (-1,), ncols=5, expected={3:"extensions:"}) 1279 self._params.num_good_extends = _safe_int(x)
1280
1281 - def num_seqs_better_e(self, line):
1282 self._params.num_seqs_better_e, = _get_cols( 1283 line, (-1,), ncols=7, expected={2:"sequences"}) 1284 self._params.num_seqs_better_e = _safe_int( 1285 self._params.num_seqs_better_e)
1286
1287 - def hsps_no_gap(self, line):
1288 self._params.hsps_no_gap, = _get_cols( 1289 line, (-1,), ncols=9, expected={3:"better", 7:"gapping:"}) 1290 self._params.hsps_no_gap = _safe_int(self._params.hsps_no_gap)
1291
1292 - def hsps_prelim_gapped(self, line):
1293 self._params.hsps_prelim_gapped, = _get_cols( 1294 line, (-1,), ncols=9, expected={4:"gapped", 6:"prelim"}) 1295 self._params.hsps_prelim_gapped = _safe_int( 1296 self._params.hsps_prelim_gapped)
1297
1298 - def hsps_prelim_gapped_attempted(self, line):
1299 self._params.hsps_prelim_gapped_attempted, = _get_cols( 1300 line, (-1,), ncols=10, expected={4:"attempted", 7:"prelim"}) 1301 self._params.hsps_prelim_gapped_attempted = _safe_int( 1302 self._params.hsps_prelim_gapped_attempted)
1303
1304 - def hsps_gapped(self, line):
1305 self._params.hsps_gapped, = _get_cols( 1306 line, (-1,), ncols=6, expected={3:"gapped"}) 1307 self._params.hsps_gapped = _safe_int(self._params.hsps_gapped)
1308
1309 - def query_length(self, line):
1310 self._params.query_length, = _get_cols( 1311 line.lower(), (-1,), ncols=4, expected={0:"length", 2:"query:"}) 1312 self._params.query_length = _safe_int(self._params.query_length)
1313
1314 - def database_length(self, line):
1315 self._params.database_length, = _get_cols( 1316 line.lower(), (-1,), ncols=4, expected={0:"length", 2:"database:"}) 1317 self._params.database_length = _safe_int(self._params.database_length)
1318
1319 - def effective_hsp_length(self, line):
1320 self._params.effective_hsp_length, = _get_cols( 1321 line, (-1,), ncols=4, expected={1:"HSP", 2:"length:"}) 1322 self._params.effective_hsp_length = _safe_int( 1323 self._params.effective_hsp_length)
1324
1325 - def effective_query_length(self, line):
1326 self._params.effective_query_length, = _get_cols( 1327 line, (-1,), ncols=5, expected={1:"length", 3:"query:"}) 1328 self._params.effective_query_length = _safe_int( 1329 self._params.effective_query_length)
1330
1331 - def effective_database_length(self, line):
1332 self._params.effective_database_length, = _get_cols( 1333 line.lower(), (-1,), ncols=5, expected={1:"length", 3:"database:"}) 1334 self._params.effective_database_length = _safe_int( 1335 self._params.effective_database_length)
1336
1337 - def effective_search_space(self, line):
1338 self._params.effective_search_space, = _get_cols( 1339 line, (-1,), ncols=4, expected={1:"search"}) 1340 self._params.effective_search_space = _safe_int( 1341 self._params.effective_search_space)
1342
1343 - def effective_search_space_used(self, line):
1344 self._params.effective_search_space_used, = _get_cols( 1345 line, (-1,), ncols=5, expected={1:"search", 3:"used:"}) 1346 self._params.effective_search_space_used = _safe_int( 1347 self._params.effective_search_space_used)
1348
1349 - def frameshift(self, line):
1350 self._params.frameshift = _get_cols( 1351 line, (4, 5), ncols=6, expected={0:"frameshift", 2:"decay"})
1352
1353 - def threshold(self, line):
1354 if line[:2] == "T:" : 1355 #Assume its an old stlye line like "T: 123" 1356 self._params.threshold, = _get_cols( 1357 line, (1,), ncols=2, expected={0:"T:"}) 1358 elif line[:28] == "Neighboring words threshold:" : 1359 self._params.threshold, = _get_cols( 1360 line, (3,), ncols=4, expected={0:"Neighboring", 1:"words", 2:"threshold:"}) 1361 else : 1362 raise ValueError("Unrecognised threshold line:\n%s" % line) 1363 self._params.threshold = _safe_int(self._params.threshold)
1364
1365 - def window_size(self, line):
1366 if line[:2] == "A:" : 1367 self._params.window_size, = _get_cols( 1368 line, (1,), ncols=2, expected={0:"A:"}) 1369 elif line[:25] == "Window for multiple hits:" : 1370 self._params.window_size, = _get_cols( 1371 line, (4,), ncols=5, expected={0:"Window", 2:"multiple", 3:"hits:"}) 1372 else : 1373 raise ValueError("Unrecognised window size line:\n%s" % line) 1374 self._params.window_size = _safe_int(self._params.window_size)
1375
1376 - def dropoff_1st_pass(self, line):
1377 score, bits = _re_search( 1378 r"X1: (\d+) \(\s*([0-9,.]+) bits\)", line, 1379 "I could not find the dropoff in line\n%s" % line) 1380 self._params.dropoff_1st_pass = _safe_int(score), _safe_float(bits)
1381
1382 - def gap_x_dropoff(self, line):
1383 score, bits = _re_search( 1384 r"X2: (\d+) \(\s*([0-9,.]+) bits\)", line, 1385 "I could not find the gap dropoff in line\n%s" % line) 1386 self._params.gap_x_dropoff = _safe_int(score), _safe_float(bits)
1387
1388 - def gap_x_dropoff_final(self, line):
1389 score, bits = _re_search( 1390 r"X3: (\d+) \(\s*([0-9,.]+) bits\)", line, 1391 "I could not find the gap dropoff final in line\n%s" % line) 1392 self._params.gap_x_dropoff_final = _safe_int(score), _safe_float(bits)
1393
1394 - def gap_trigger(self, line):
1395 score, bits = _re_search( 1396 r"S1: (\d+) \(\s*([0-9,.]+) bits\)", line, 1397 "I could not find the gap trigger in line\n%s" % line) 1398 self._params.gap_trigger = _safe_int(score), _safe_float(bits)
1399
1400 - def blast_cutoff(self, line):
1401 score, bits = _re_search( 1402 r"S2: (\d+) \(\s*([0-9,.]+) bits\)", line, 1403 "I could not find the blast cutoff in line\n%s" % line) 1404 self._params.blast_cutoff = _safe_int(score), _safe_float(bits)
1405
1406 - def end_parameters(self):
1407 pass
1408 1409
1410 -class _BlastConsumer(AbstractConsumer, 1411 _HeaderConsumer, 1412 _DescriptionConsumer, 1413 _AlignmentConsumer, 1414 _HSPConsumer, 1415 _DatabaseReportConsumer, 1416 _ParametersConsumer 1417 ):
1418 # This Consumer is inherits from many other consumer classes that handle 1419 # the actual dirty work. An alternate way to do it is to create objects 1420 # of those classes and then delegate the parsing tasks to them in a 1421 # decorator-type pattern. The disadvantage of that is that the method 1422 # names will need to be resolved in this classes. However, using 1423 # a decorator will retain more control in this class (which may or 1424 # may not be a bad thing). In addition, having each sub-consumer as 1425 # its own object prevents this object's dictionary from being cluttered 1426 # with members and reduces the chance of member collisions.
1427 - def __init__(self):
1428 self.data = None
1429
1430 - def round(self, line):
1431 # Make sure nobody's trying to pass me PSI-BLAST data! 1432 raise ValueError("This consumer doesn't handle PSI-BLAST data")
1433
1434 - def start_header(self):
1435 self.data = Record.Blast() 1436 _HeaderConsumer.start_header(self)
1437
1438 - def end_header(self):
1439 _HeaderConsumer.end_header(self) 1440 self.data.__dict__.update(self._header.__dict__)
1441
1442 - def end_descriptions(self):
1443 self.data.descriptions = self._descriptions
1444
1445 - def end_alignment(self):
1446 _AlignmentConsumer.end_alignment(self) 1447 if self._alignment.hsps: 1448 self.data.alignments.append(self._alignment) 1449 if self._multiple_alignment.alignment: 1450 self.data.multiple_alignment = self._multiple_alignment
1451
1452 - def end_hsp(self):
1453 _HSPConsumer.end_hsp(self) 1454 try: 1455 self._alignment.hsps.append(self._hsp) 1456 except AttributeError: 1457 raise ValueError("Found an HSP before an alignment")
1458
1459 - def end_database_report(self):
1460 _DatabaseReportConsumer.end_database_report(self) 1461 self.data.__dict__.update(self._dr.__dict__)
1462
1463 - def end_parameters(self):
1464 _ParametersConsumer.end_parameters(self) 1465 self.data.__dict__.update(self._params.__dict__)
1466
1467 -class _PSIBlastConsumer(AbstractConsumer, 1468 _HeaderConsumer, 1469 _DescriptionConsumer, 1470 _AlignmentConsumer, 1471 _HSPConsumer, 1472 _DatabaseReportConsumer, 1473 _ParametersConsumer 1474 ):
1475 - def __init__(self):
1476 self.data = None
1477
1478 - def start_header(self):
1479 self.data = Record.PSIBlast() 1480 _HeaderConsumer.start_header(self)
1481
1482 - def end_header(self):
1483 _HeaderConsumer.end_header(self) 1484 self.data.__dict__.update(self._header.__dict__)
1485
1486 - def start_descriptions(self):
1487 self._round = Record.Round() 1488 self.data.rounds.append(self._round) 1489 _DescriptionConsumer.start_descriptions(self)
1490
1491 - def end_descriptions(self):
1492 _DescriptionConsumer.end_descriptions(self) 1493 self._round.number = self._roundnum 1494 if self._descriptions: 1495 self._round.new_seqs.extend(self._descriptions) 1496 self._round.reused_seqs.extend(self._model_sequences) 1497 self._round.new_seqs.extend(self._nonmodel_sequences) 1498 if self._converged: 1499 self.data.converged = 1
1500
1501 - def end_alignment(self):
1502 _AlignmentConsumer.end_alignment(self) 1503 if self._alignment.hsps: 1504 self._round.alignments.append(self._alignment) 1505 if self._multiple_alignment: 1506 self._round.multiple_alignment = self._multiple_alignment
1507
1508 - def end_hsp(self):
1509 _HSPConsumer.end_hsp(self) 1510 try: 1511 self._alignment.hsps.append(self._hsp) 1512 except AttributeError: 1513 raise ValueError("Found an HSP before an alignment")
1514
1515 - def end_database_report(self):
1516 _DatabaseReportConsumer.end_database_report(self) 1517 self.data.__dict__.update(self._dr.__dict__)
1518
1519 - def end_parameters(self):
1520 _ParametersConsumer.end_parameters(self) 1521 self.data.__dict__.update(self._params.__dict__)
1522
1523 -class Iterator:
1524 """Iterates over a file of multiple BLAST results. 1525 1526 Methods: 1527 next Return the next record from the stream, or None. 1528 1529 """
1530 - def __init__(self, handle, parser=None):
1531 """__init__(self, handle, parser=None) 1532 1533 Create a new iterator. handle is a file-like object. parser 1534 is an optional Parser object to change the results into another form. 1535 If set to None, then the raw contents of the file will be returned. 1536 1537 """ 1538 try: 1539 handle.readline 1540 except AttributeError: 1541 raise ValueError( 1542 "I expected a file handle or file-like object, got %s" 1543 % type(handle)) 1544 self._uhandle = File.UndoHandle(handle) 1545 self._parser = parser
1546
1547 - def next(self):
1548 """next(self) -> object 1549 1550 Return the next Blast record from the file. If no more records, 1551 return None. 1552 1553 """ 1554 lines = [] 1555 while 1: 1556 line = self._uhandle.readline() 1557 if not line: 1558 break 1559 # If I've reached the next one, then put the line back and stop. 1560 if lines and (line.startswith('BLAST') 1561 or line.startswith('BLAST', 1) 1562 or line.startswith('<?xml ')): 1563 self._uhandle.saveline(line) 1564 break 1565 lines.append(line) 1566 1567 if not lines: 1568 return None 1569 1570 data = ''.join(lines) 1571 if self._parser is not None: 1572 return self._parser.parse(File.StringHandle(data)) 1573 return data
1574
1575 - def __iter__(self):
1576 return iter(self.next, None)
1577
1578 -def blastall(blastcmd, program, database, infile, align_view='7', **keywds):
1579 """Execute and retrieve data from standalone BLASTPALL as handles. 1580 1581 Execute and retrieve data from blastall. blastcmd is the command 1582 used to launch the 'blastall' executable. program is the blast program 1583 to use, e.g. 'blastp', 'blastn', etc. database is the path to the database 1584 to search against. infile is the path to the file containing 1585 the sequence to search with. 1586 1587 The return values are two handles, for standard output and standard error. 1588 1589 You may pass more parameters to **keywds to change the behavior of 1590 the search. Otherwise, optional values will be chosen by blastall. 1591 The Blast output is by default in XML format. Use the align_view keyword 1592 for output in a different format. 1593 1594 Scoring 1595 matrix Matrix to use. 1596 gap_open Gap open penalty. 1597 gap_extend Gap extension penalty. 1598 nuc_match Nucleotide match reward. (BLASTN) 1599 nuc_mismatch Nucleotide mismatch penalty. (BLASTN) 1600 query_genetic_code Genetic code for Query. 1601 db_genetic_code Genetic code for database. (TBLAST[NX]) 1602 1603 Algorithm 1604 gapped Whether to do a gapped alignment. T/F (not for TBLASTX) 1605 expectation Expectation value cutoff. 1606 wordsize Word size. 1607 strands Query strands to search against database.([T]BLAST[NX]) 1608 keep_hits Number of best hits from a region to keep. 1609 xdrop Dropoff value (bits) for gapped alignments. 1610 hit_extend Threshold for extending hits. 1611 region_length Length of region used to judge hits. 1612 db_length Effective database length. 1613 search_length Effective length of search space. 1614 1615 Processing 1616 filter Filter query sequence for low complexity (with SEG)? T/F 1617 believe_query Believe the query defline. T/F 1618 restrict_gi Restrict search to these GI's. 1619 nprocessors Number of processors to use. 1620 oldengine Force use of old engine T/F 1621 1622 Formatting 1623 html Produce HTML output? T/F 1624 descriptions Number of one-line descriptions. 1625 alignments Number of alignments. 1626 align_view Alignment view. Integer 0-11, 1627 passed as a string or integer. 1628 show_gi Show GI's in deflines? T/F 1629 seqalign_file seqalign file to output. 1630 outfile Output file for report. Filename to write to, if 1631 ommitted standard output is used (which you can access 1632 from the returned handles). 1633 """ 1634 1635 _security_check_parameters(keywds) 1636 1637 att2param = { 1638 'matrix' : '-M', 1639 'gap_open' : '-G', 1640 'gap_extend' : '-E', 1641 'nuc_match' : '-r', 1642 'nuc_mismatch' : '-q', 1643 'query_genetic_code' : '-Q', 1644 'db_genetic_code' : '-D', 1645 1646 'gapped' : '-g', 1647 'expectation' : '-e', 1648 'wordsize' : '-W', 1649 'strands' : '-S', 1650 'keep_hits' : '-K', 1651 'xdrop' : '-X', 1652 'hit_extend' : '-f', 1653 'region_length' : '-L', 1654 'db_length' : '-z', 1655 'search_length' : '-Y', 1656 1657 'program' : '-p', 1658 'database' : '-d', 1659 'infile' : '-i', 1660 'filter' : '-F', 1661 'believe_query' : '-J', 1662 'restrict_gi' : '-l', 1663 'nprocessors' : '-a', 1664 'oldengine' : '-V', 1665 1666 'html' : '-T', 1667 'descriptions' : '-v', 1668 'alignments' : '-b', 1669 'align_view' : '-m', 1670 'show_gi' : '-I', 1671 'seqalign_file' : '-O', 1672 'outfile' : '-o', 1673 } 1674 from Applications import BlastallCommandline 1675 cline = BlastallCommandline(blastcmd) 1676 cline.set_parameter(att2param['program'], program) 1677 cline.set_parameter(att2param['database'], database) 1678 cline.set_parameter(att2param['infile'], infile) 1679 cline.set_parameter(att2param['align_view'], str(align_view)) 1680 for key, value in keywds.iteritems() : 1681 cline.set_parameter(att2param[key], str(value)) 1682 return _invoke_blast(cline)
1683 1684
1685 -def blastpgp(blastcmd, database, infile, align_view='7', **keywds):
1686 """Execute and retrieve data from standalone BLASTPGP as handles. 1687 1688 Execute and retrieve data from blastpgp. blastcmd is the command 1689 used to launch the 'blastpgp' executable. database is the path to the 1690 database to search against. infile is the path to the file containing 1691 the sequence to search with. 1692 1693 The return values are two handles, for standard output and standard error. 1694 1695 You may pass more parameters to **keywds to change the behavior of 1696 the search. Otherwise, optional values will be chosen by blastpgp. 1697 The Blast output is by default in XML format. Use the align_view keyword 1698 for output in a different format. 1699 1700 Scoring 1701 matrix Matrix to use. 1702 gap_open Gap open penalty. 1703 gap_extend Gap extension penalty. 1704 window_size Multiple hits window size. 1705 npasses Number of passes. 1706 passes Hits/passes. Integer 0-2. 1707 1708 Algorithm 1709 gapped Whether to do a gapped alignment. T/F 1710 expectation Expectation value cutoff. 1711 wordsize Word size. 1712 keep_hits Number of beset hits from a region to keep. 1713 xdrop Dropoff value (bits) for gapped alignments. 1714 hit_extend Threshold for extending hits. 1715 region_length Length of region used to judge hits. 1716 db_length Effective database length. 1717 search_length Effective length of search space. 1718 nbits_gapping Number of bits to trigger gapping. 1719 pseudocounts Pseudocounts constants for multiple passes. 1720 xdrop_final X dropoff for final gapped alignment. 1721 xdrop_extension Dropoff for blast extensions. 1722 model_threshold E-value threshold to include in multipass model. 1723 required_start Start of required region in query. 1724 required_end End of required region in query. 1725 1726 Processing 1727 XXX should document default values 1728 program The blast program to use. (PHI-BLAST) 1729 filter Filter query sequence for low complexity (with SEG)? T/F 1730 believe_query Believe the query defline? T/F 1731 nprocessors Number of processors to use. 1732 1733 Formatting 1734 html Produce HTML output? T/F 1735 descriptions Number of one-line descriptions. 1736 alignments Number of alignments. 1737 align_view Alignment view. Integer 0-11, 1738 passed as a string or integer. 1739 show_gi Show GI's in deflines? T/F 1740 seqalign_file seqalign file to output. 1741 align_outfile Output file for alignment. 1742 checkpoint_outfile Output file for PSI-BLAST checkpointing. 1743 restart_infile Input file for PSI-BLAST restart. 1744 hit_infile Hit file for PHI-BLAST. 1745 matrix_outfile Output file for PSI-BLAST matrix in ASCII. 1746 align_outfile Output file for alignment. Filename to write to, if 1747 ommitted standard output is used (which you can access 1748 from the returned handles). 1749 1750 align_infile Input alignment file for PSI-BLAST restart. 1751 1752 """ 1753 1754 _security_check_parameters(keywds) 1755 1756 att2param = { 1757 'matrix' : '-M', 1758 'gap_open' : '-G', 1759 'gap_extend' : '-E', 1760 'window_size' : '-A', 1761 'npasses' : '-j', 1762 'passes' : '-P', 1763 1764 'gapped' : '-g', 1765 'expectation' : '-e', 1766 'wordsize' : '-W', 1767 'keep_hits' : '-K', 1768 'xdrop' : '-X', 1769 'hit_extend' : '-f', 1770 'region_length' : '-L', 1771 'db_length' : '-Z', 1772 'search_length' : '-Y', 1773 'nbits_gapping' : '-N', 1774 'pseudocounts' : '-c', 1775 'xdrop_final' : '-Z', 1776 'xdrop_extension' : '-y', 1777 'model_threshold' : '-h', 1778 'required_start' : '-S', 1779 'required_end' : '-H', 1780 1781 'program' : '-p', 1782 'database' : '-d', 1783 'infile' : '-i', 1784 'filter' : '-F', 1785 'believe_query' : '-J', 1786 'nprocessors' : '-a', 1787 1788 'html' : '-T', 1789 'descriptions' : '-v', 1790 'alignments' : '-b', 1791 'align_view' : '-m', 1792 'show_gi' : '-I', 1793 'seqalign_file' : '-O', 1794 'align_outfile' : '-o', 1795 'checkpoint_outfile' : '-C', 1796 'restart_infile' : '-R', 1797 'hit_infile' : '-k', 1798 'matrix_outfile' : '-Q', 1799 'align_infile' : '-B', 1800 } 1801 from Applications import BlastpgpCommandline 1802 cline = BlastpgpCommandline(blastcmd) 1803 cline.set_parameter(att2param['database'], database) 1804 cline.set_parameter(att2param['infile'], infile) 1805 cline.set_parameter(att2param['align_view'], str(align_view)) 1806 for key, value in keywds.iteritems() : 1807 cline.set_parameter(att2param[key], str(value)) 1808 return _invoke_blast(cline)
1809 1810
1811 -def rpsblast(blastcmd, database, infile, align_view="7", **keywds):
1812 """Execute and retrieve data from standalone RPS-BLAST as handles. 1813 1814 Execute and retrieve data from standalone RPS-BLAST. blastcmd is the 1815 command used to launch the 'rpsblast' executable. database is the path 1816 to the database to search against. infile is the path to the file 1817 containing the sequence to search with. 1818 1819 The return values are two handles, for standard output and standard error. 1820 1821 You may pass more parameters to **keywds to change the behavior of 1822 the search. Otherwise, optional values will be chosen by rpsblast. 1823 1824 Please note that this function will give XML output by default, by 1825 setting align_view to seven (i.e. command line option -m 7). 1826 You should use the NCBIXML.parse() function to read the resulting output. 1827 This is because NCBIStandalone.BlastParser() does not understand the 1828 plain text output format from rpsblast. 1829 1830 WARNING - The following text and associated parameter handling has not 1831 received extensive testing. Please report any errors we might have made... 1832 1833 Algorithm/Scoring 1834 gapped Whether to do a gapped alignment. T/F 1835 multihit 0 for multiple hit (default), 1 for single hit 1836 expectation Expectation value cutoff. 1837 range_restriction Range restriction on query sequence (Format: start,stop) blastp only 1838 0 in 'start' refers to the beginning of the sequence 1839 0 in 'stop' refers to the end of the sequence 1840 Default = 0,0 1841 xdrop Dropoff value (bits) for gapped alignments. 1842 xdrop_final X dropoff for final gapped alignment (in bits). 1843 xdrop_extension Dropoff for blast extensions (in bits). 1844 search_length Effective length of search space. 1845 nbits_gapping Number of bits to trigger gapping. 1846 protein Query sequence is protein. T/F 1847 db_length Effective database length. 1848 1849 Processing 1850 filter Filter query sequence for low complexity? T/F 1851 case_filter Use lower case filtering of FASTA sequence T/F, default F 1852 believe_query Believe the query defline. T/F 1853 nprocessors Number of processors to use. 1854 logfile Name of log file to use, default rpsblast.log 1855 1856 Formatting 1857 html Produce HTML output? T/F 1858 descriptions Number of one-line descriptions. 1859 alignments Number of alignments. 1860 align_view Alignment view. Integer 0-11, 1861 passed as a string or integer. 1862 show_gi Show GI's in deflines? T/F 1863 seqalign_file seqalign file to output. 1864 align_outfile Output file for alignment. Filename to write to, if 1865 ommitted standard output is used (which you can access 1866 from the returned handles). 1867 """ 1868 1869 _security_check_parameters(keywds) 1870 1871 att2param = { 1872 'multihit' : '-P', 1873 'gapped' : '-g', 1874 'expectation' : '-e', 1875 'range_restriction' : '-L', 1876 'xdrop' : '-X', 1877 'xdrop_final' : '-Z', 1878 'xdrop_extension' : '-y', 1879 'search_length' : '-Y', 1880 'nbits_gapping' : '-N', 1881 'protein' : '-p', 1882 'db_length' : '-z', 1883 1884 'database' : '-d', 1885 'infile' : '-i', 1886 'filter' : '-F', 1887 'case_filter' : '-U', 1888 'believe_query' : '-J', 1889 'nprocessors' : '-a', 1890 'logfile' : '-l', 1891 1892 'html' : '-T', 1893 'descriptions' : '-v', 1894 'alignments' : '-b', 1895 'align_view' : '-m', 1896 'show_gi' : '-I', 1897 'seqalign_file' : '-O', 1898 'align_outfile' : '-o', 1899 } 1900 1901 from Applications import RpsBlastCommandline 1902 cline = RpsBlastCommandline(blastcmd) 1903 cline.set_parameter(att2param['database'], database) 1904 cline.set_parameter(att2param['infile'], infile) 1905 cline.set_parameter(att2param['align_view'], str(align_view)) 1906 for key, value in keywds.iteritems() : 1907 cline.set_parameter(att2param[key], str(value)) 1908 return _invoke_blast(cline)
1909 1910
1911 -def _re_search(regex, line, error_msg):
1912 m = re.search(regex, line) 1913 if not m: 1914 raise ValueError(error_msg) 1915 return m.groups()
1916
1917 -def _get_cols(line, cols_to_get, ncols=None, expected={}):
1918 cols = line.split() 1919 1920 # Check to make sure number of columns is correct 1921 if ncols is not None and len(cols) != ncols: 1922 raise ValueError("I expected %d columns (got %d) in line\n%s" \ 1923 % (ncols, len(cols), line)) 1924 1925 # Check to make sure columns contain the correct data 1926 for k in expected.keys(): 1927 if cols[k] != expected[k]: 1928 raise ValueError("I expected '%s' in column %d in line\n%s" \ 1929 % (expected[k], k, line)) 1930 1931 # Construct the answer tuple 1932 results = [] 1933 for c in cols_to_get: 1934 results.append(cols[c]) 1935 return tuple(results)
1936
1937 -def _safe_int(str):
1938 try: 1939 return int(str) 1940 except ValueError: 1941 # Something went wrong. Try to clean up the string. 1942 # Remove all commas from the string 1943 str = str.replace(',', '') 1944 try: 1945 # try again. 1946 return int(str) 1947 except ValueError: 1948 pass 1949 # If it fails again, maybe it's too long? 1950 # XXX why converting to float? 1951 return long(float(str))
1952
1953 -def _safe_float(str):
1954 # Thomas Rosleff Soerensen (rosleff@mpiz-koeln.mpg.de) noted that 1955 # float('e-172') does not produce an error on his platform. Thus, 1956 # we need to check the string for this condition. 1957 1958 # Sometimes BLAST leaves of the '1' in front of an exponent. 1959 if str and str[0] in ['E', 'e']: 1960 str = '1' + str 1961 try: 1962 return float(str) 1963 except ValueError: 1964 # Remove all commas from the string 1965 str = str.replace(',', '') 1966 # try again. 1967 return float(str)
1968 1969
1970 -def _invoke_blast(cline) :
1971 """Start BLAST and returns handles for stdout and stderr (PRIVATE). 1972 1973 Expects a command line wrapper object from Bio.Blast.Applications 1974 """ 1975 import subprocess, sys 1976 blast_cmd = cline.program_name 1977 if not os.path.exists(blast_cmd): 1978 raise ValueError("BLAST executable does not exist at %s" % blast_cmd) 1979 #We don't need to supply any piped input, but we setup the 1980 #standard input pipe anyway as a work around for a python 1981 #bug if this is called from a Windows GUI program. For 1982 #details, see http://bugs.python.org/issue1124861 1983 blast_process = subprocess.Popen(str(cline), 1984 stdin=subprocess.PIPE, 1985 stdout=subprocess.PIPE, 1986 stderr=subprocess.PIPE, 1987 shell=(sys.platform!="win32")) 1988 blast_process.stdin.close() 1989 return blast_process.stdout, blast_process.stderr
1990 1991
1992 -def _security_check_parameters(param_dict) :
1993 """Look for any attempt to insert a command into a parameter. 1994 1995 e.g. blastall(..., matrix='IDENTITY -F 0; rm -rf /etc/passwd') 1996 1997 Looks for ";" or "&&" in the strings (Unix and Windows syntax 1998 for appending a command line), or ">", "<" or "|" (redirection) 1999 and if any are found raises an exception. 2000 """ 2001 for key, value in param_dict.iteritems() : 2002 str_value = str(value) # Could easily be an int or a float 2003 for bad_str in [";", "&&", ">", "<", "|"] : 2004 if bad_str in str_value : 2005 raise ValueError("Rejecting suspicious argument for %s" % key)
2006
2007 -class _BlastErrorConsumer(_BlastConsumer):
2008 - def __init__(self):
2010 - def noevent(self, line):
2011 if line.find("Query must be at least wordsize") != -1: 2012 raise ShortQueryBlastError("Query must be at least wordsize") 2013 # Now pass the line back up to the superclass. 2014 method = getattr(_BlastConsumer, 'noevent', 2015 _BlastConsumer.__getattr__(self, 'noevent')) 2016 method(line)
2017
2018 -class BlastErrorParser(AbstractParser):
2019 """Attempt to catch and diagnose BLAST errors while parsing. 2020 2021 This utilizes the BlastParser module but adds an additional layer 2022 of complexity on top of it by attempting to diagnose ValueErrors 2023 that may actually indicate problems during BLAST parsing. 2024 2025 Current BLAST problems this detects are: 2026 o LowQualityBlastError - When BLASTing really low quality sequences 2027 (ie. some GenBank entries which are just short streches of a single 2028 nucleotide), BLAST will report an error with the sequence and be 2029 unable to search with this. This will lead to a badly formatted 2030 BLAST report that the parsers choke on. The parser will convert the 2031 ValueError to a LowQualityBlastError and attempt to provide useful 2032 information. 2033 2034 """
2035 - def __init__(self, bad_report_handle = None):
2036 """Initialize a parser that tries to catch BlastErrors. 2037 2038 Arguments: 2039 o bad_report_handle - An optional argument specifying a handle 2040 where bad reports should be sent. This would allow you to save 2041 all of the bad reports to a file, for instance. If no handle 2042 is specified, the bad reports will not be saved. 2043 """ 2044 self._bad_report_handle = bad_report_handle 2045 2046 #self._b_parser = BlastParser() 2047 self._scanner = _Scanner() 2048 self._consumer = _BlastErrorConsumer()
2049
2050 - def parse(self, handle):
2051 """Parse a handle, attempting to diagnose errors. 2052 """ 2053 results = handle.read() 2054 2055 try: 2056 self._scanner.feed(File.StringHandle(results), self._consumer) 2057 except ValueError, msg: 2058 # if we have a bad_report_file, save the info to it first 2059 if self._bad_report_handle: 2060 # send the info to the error handle 2061 self._bad_report_handle.write(results) 2062 2063 # now we want to try and diagnose the error 2064 self._diagnose_error( 2065 File.StringHandle(results), self._consumer.data) 2066 2067 # if we got here we can't figure out the problem 2068 # so we should pass along the syntax error we got 2069 raise 2070 return self._consumer.data
2071
2072 - def _diagnose_error(self, handle, data_record):
2073 """Attempt to diagnose an error in the passed handle. 2074 2075 Arguments: 2076 o handle - The handle potentially containing the error 2077 o data_record - The data record partially created by the consumer. 2078 """ 2079 line = handle.readline() 2080 2081 while line: 2082 # 'Searchingdone' instead of 'Searching......done' seems 2083 # to indicate a failure to perform the BLAST due to 2084 # low quality sequence 2085 if line.startswith('Searchingdone'): 2086 raise LowQualityBlastError("Blast failure occured on query: ", 2087 data_record.query) 2088 line = handle.readline()
2089