Package Bio :: Package expressions :: Package blast :: Module ncbiblast
[hide private]
[frames] | no frames]

Source Code for Module Bio.expressions.blast.ncbiblast

  1  import warnings 
  2  warnings.warn("Bio.expressions was deprecated, as it does not work with recent versions of mxTextTools. If you want to continue to use this module, please get in contact with the Biopython developers at biopython-dev@biopython.org to avoid permanent removal of this module from Biopython", DeprecationWarning) 
  3   
  4   
  5   
  6  from Martel import * 
  7  from Bio import Std 
  8   
  9  spaces_line = Opt(Spaces()) + AnyEol() 
 10   
 11  blast_filter = Str("BLASTN", "BLASTP", "BLASTX", "TBLASTX", "TBLASTN") 
 12   
 13  ncbi_version = Std.application_version(Group("appversion", Re(r"2.\d+\.\d+"))) 
 14   
 15  # BLASTN 2.0.11 [Jan-20-2000] 
 16  blastn_appheader = (Std.application_name(Str("BLASTN"), {"app": "blastn"}) +  
 17                      Str(" ") + 
 18                      ncbi_version + 
 19                      Str(" ") + 
 20                      ToEol()) 
 21   
 22   
 23  # BLASTP 2.0.11 [Jan-20-2000] 
 24  blastp_appheader = (Std.application_name(Str("BLASTP"), {"app": "blastp"}) +  
 25                      Str(" ") + 
 26                      ncbi_version + 
 27                      Str(" ") + 
 28                      ToEol()) 
 29   
 30  # BLASTX 2.0.11 [Jan-20-2000] 
 31  blastx_appheader = (Std.application_name(Str("BLASTX"), {"app": "blastx"}) +  
 32                      Str(" ") + 
 33                      ncbi_version + 
 34                      Str(" ") + 
 35                      ToEol()) 
 36   
 37  # TBLASTN 2.0.11 [Jan-20-2000] 
 38  tblastn_appheader = (Std.application_name(Str("TBLASTN"), {"app": "tblastn"}) + 
 39                       Str(" ") + 
 40                       ncbi_version + 
 41                       Str(" ") + 
 42                       ToEol()) 
 43   
 44  # TBLASTX 2.0.11 [Jan-20-2000] 
 45  tblastx_appheader = (Std.application_name(Str("TBLASTX"), {"app": "tblastx"}) + 
 46                       Str(" ") + 
 47                       ncbi_version + 
 48                       Str(" ") + 
 49                       ToEol()) 
 50   
 51  # This is tricky because the description can include a statement like 
 52  #  '(492 letters)' at just the right place to confuse things. 
 53  query_letters = (Spaces() + 
 54                   Str("(") + 
 55                   Std.query_size(Digits()) + 
 56                   Str(" letters)") + 
 57                   AnyEol() + 
 58                   Assert(AnyEol()) 
 59                   ) 
 60   
 61  query_descr = (Str("Query=") + 
 62                 Opt(Spaces()) + 
 63                 Std.query_description(UntilEol()) + 
 64                 AnyEol() + 
 65                 Rep(AssertNot(query_letters) + 
 66                     Std.query_description(UntilEol()) + 
 67                     AnyEol()) 
 68                 ) 
 69   
 70  query_descr = Std.query_description_block(query_descr) 
 71   
 72  # This requires there to be at least one comma 
 73  _comma_digits = Re(r"\d(\d\d?)?(,\d\d\d)+") 
74 -def Number(name = None, attrs = {}):
75 if name is None: 76 assert not attrs 77 return _comma_digits | Digits() 78 return Group(name, _comma_digits, attrs) | \ 79 Digits(name, attrs)
80 81 82 query_database = (Str("Database:") + Opt(Spaces()) + 83 Std.database_name(UntilEol()) + AnyEol() + 84 Spaces() + 85 Std.database_num_sequences(Number(), 86 {"bioformat:decode": "int.comma"}) + 87 Str(" sequences;") + Spaces() + 88 Std.database_num_letters(Number(), 89 {"bioformat:decode": "int.comma"}) + 90 Spaces() + Str("total letters") + ToEol()) 91 92 93 # Score E 94 # Sequences producing significant alignments: (bits) Value 95 # 96 # U51677 Human non-histone chromatin protein HMG1 (HMG1) gene, co... 4129 0.0 97 # L38477 Mus musculus (clone Clebp-1) high mobility group 1 prote... 353 7e-95 98 99 to_table = Group("to_table", 100 (SkipLinesUntil(Str("Sequences producing significant")) + 101 ToEol() + 102 AnyEol())) 103 104 table_entry = Std.search_table_entry( 105 Group("table_entry", 106 (AssertNot(AnyEol()) + 107 Std.search_table_description(Re(r".{66}")) + Spaces() + 108 Std.search_table_value(Float(), {"name": "score", 109 "bioformat:decode": "float"}) + 110 Spaces() + 111 Std.search_table_value(Float(), {"name": "evalue", 112 "bioformat:decode": "float"}) + 113 AnyEol()) 114 ) 115 ) 116 117 table = Std.search_table(Rep(table_entry)) 118 119 120 header = Std.search_header( 121 SkipLinesUntil(Str("Query=")) + 122 query_descr + 123 query_letters + 124 AnyEol() + 125 Group("to_database", NullOp()) + # Needed for wu-blastx 126 query_database + 127 Opt(to_table + table) + 128 SkipLinesUntil(Str(">")) 129 ) 130 131 132 # >U51677 Human non-histone chromatin protein HMG1 (HMG1) gene, complete 133 # cds. 134 # Length = 2575 135 # 136 # Score = 4129 bits (2083), Expect = 0.0 137 # Identities = 2167/2209 (98%) 138 # Strand = Plus / Plus 139 140 # This is tricky since it's (theoretical) possible the description 141 # could have a "Length =" at exactly the right spot to confuse things. 142 # So stop before getting to the line with a 'Length =' where the 143 # line afterwards is blank. 144 hit_length = (Spaces() + Str("Length = ") + 145 Std.hit_length(Digits()) + AnyEol() + 146 Opt(Spaces()) + AnyEol()) 147 148 hit_descr = (Str(">") + Std.hit_description(UntilEol()) + AnyEol() + 149 Rep(AssertNot(hit_length) + 150 Std.hit_description(UntilEol()) + AnyEol()) 151 ) 152 153 154 num_bits = Std.hsp_value(Float(), {"name": "bits", 155 "bioformat:decode": "float", 156 }) 157 158 expect = Std.hsp_value(Float(), {"name": "expect", 159 "bioformat:decode": "float"}) 160 161 num_identical = Std.hsp_value(Digits(), {"name": "identical", 162 "bioformat:decode": "int"}) 163 164 hsp_length = Std.hsp_value(Digits(), {"name": "length", 165 "bioformat:decode": "int"}) 166 167 num_positives = Std.hsp_value(Digits(), {"name": "positives", 168 "bioformat:decode": "int"}) 169 170 # What's after the bits? 171 score = (Re(r" *Score = *") + num_bits + 172 Re(r" bits \((?P<XXX1>\d+)\), Expect = *") + 173 expect + AnyEol()) 174 175 # XXX What's the expect_num mean? 176 tblastn_score = (Re(r" *Score = *") + num_bits + 177 Re(r" bits \((?P<XXX1>\d+)\), " 178 r"Expect(\((?P<expect_num>\d+)\))? = *") + 179 expect + AnyEol()) 180 181 blastn_identical = (Re(r" *Identities = *") + 182 num_identical + 183 Str("/") + 184 hsp_length + 185 ToEol()) 186 187 # Identities = 154/168 (91%), Positives = 154/168 (91%) 188 blastp_identical = (Re(r" *Identities = *") + 189 num_identical + 190 Str("/") + 191 hsp_length + 192 ToSep(sep = ",") + 193 Re(" Positives = *") + 194 num_positives + 195 ToEol()) 196 197 # Frame = +1 198 # can be +1, +2, +3, -1, -2, -3 199 frame = (Str(" Frame = ") + Std.hsp_frame(UntilEol(), {"which": "query"}) + 200 AnyEol()) 201 tblastx_frame = (Str(" Frame = ") + 202 Std.hsp_frame(UntilSep(sep = " "), {"which": "query"}) + 203 Str(" / ") + 204 Std.hsp_frame(UntilEol(), {"which": "subject"}) + 205 AnyEol()) 206 207 # Strand = Plus / Plus 208 strand = (Re(r" *Strand = ") + 209 (Std.hsp_strand(Str("Plus"), {"strand": "+1", "which": "query"}) | 210 Std.hsp_strand(Str("Minus"), {"strand": "-1", "which": "query"})) + 211 Str(" / ") + 212 (Std.hsp_strand(Str("Plus"), {"strand": "+1", "which": "subject"}) | 213 Std.hsp_strand(Str("Minus"), {"strand": "-1", "which": "subject"}))+ 214 AnyEol()) 215 216 query_line = ( 217 Std.hsp_seqalign_query_leader( 218 Std.hsp_seqalign_query_name(Str("Query")) + 219 Re(": *") + 220 Std.hsp_seqalign_query_start(Digits()) + 221 Re(" *") 222 ) + 223 Std.hsp_seqalign_query_seq(UntilSep(sep = " ")) + 224 Re(" *") + 225 Std.hsp_seqalign_query_end(Digits()) + 226 AnyEol() 227 ) 228 229 homology_line = ( 230 Std.hsp_seqalign_homology_seq( 231 Str(" ") + # The blanks are for safety; to 232 UntilEol() # ensure we don't blindly read a line 233 ) + 234 AnyEol() 235 ) 236 237 subject_line = ( 238 Std.hsp_seqalign_subject_name(Str("Sbjct")) + 239 Re(": *") + 240 Std.hsp_seqalign_subject_start(Digits()) + 241 Re(" *") + 242 Std.hsp_seqalign_subject_seq(UntilSep(sep = " ")) + 243 Re(" *") + 244 Std.hsp_seqalign_subject_end(Digits()) + 245 AnyEol() 246 ) 247 248 249 blastn_hsp_header = (score + blastn_identical + strand + 250 spaces_line + spaces_line) 251 252 blastp_hsp_header = score + blastp_identical + spaces_line 253 254 blastx_hsp_header = score + blastp_identical + frame + spaces_line 255 256 tblastn_hsp_header = tblastn_score + blastp_identical + frame + spaces_line 257 258 tblastx_hsp_header = (tblastn_score + blastp_identical + tblastx_frame + 259 spaces_line + spaces_line) 260 261 262 alignment = Std.hsp_seqalign(query_line + homology_line + subject_line + 263 Rep1(spaces_line)) 264 265 hsp = Std.hsp(Group("hsp_header", NullOp()) + 266 Rep(alignment)) 267 268 hit = Std.hit(hit_descr + hit_length + 269 Rep(Group("hsp", hsp)) 270 ) 271 272 # Already know the database 273 274 # parse the name: value lines
275 -def _nv(s, expr):
276 return Str(s) + expr + AnyEol()
277 -def float_stat(name):
278 return Std.search_statistic(Float(), {"name": name, 279 "bioformat:decode": "float"})
280 -def _num_stat(name, signed, comma):
281 d = {"name": name, 282 "bioformat:decode": "int"} 283 if signed: 284 exp = Integer() 285 elif comma: 286 exp = Number() 287 d["bioformat:decode"] = "int.comma" 288 else: 289 exp = Digits() 290 return exp, d
291
292 -def int_stat(name, signed = 0, comma = 0):
293 exp, d = _num_stat(name, signed, comma) 294 return Std.search_statistic(exp, d)
295
296 -def int_param(name, signed = 0, comma = 0):
297 exp, d = _num_stat(name, signed, comma) 298 return Std.search_parameter(exp, d)
299
300 -def float_param(name):
301 return Std.search_parameter(Float(), {"name": name, 302 "bioformat:decode": "float"})
303 304 305 db_stats = (Str(" Database:") + ToEol() + 306 Str(" Posted date:") + ToEol() + 307 Str(" Number of letters") + ToEol() + 308 Str(" Number of sequences") + ToEol() + 309 spaces_line) 310 311 ungapped_lambda_stats = ( 312 Str("Lambda K H") + AnyEol() + 313 ( Spaces() + float_stat("lambda") + 314 Spaces() + float_stat("k") + 315 Spaces() + float_stat("h") + 316 ToEol() ) 317 ) 318 gapped_lambda_stats = ( 319 Opt(Str("Gapped") + AnyEol()) + 320 Str("Lambda K H") + AnyEol() + 321 ( Spaces() + float_stat("gapped_lambda") + 322 Spaces() + float_stat("gapped_k") + 323 Spaces() + float_stat("gapped_h") + 324 ToEol() ) 325 ) 326 lambda_stats = (ungapped_lambda_stats + 327 AnyEol() + 328 gapped_lambda_stats) 329 330 matrix_stats = ( 331 SkipLinesUntil(Str("Matrix:")) + 332 # Matrix: blastn matrix:1 -3 333 _nv("Matrix: ", Std.search_statistic(UntilEol(), {"name": "matrix"})) 334 ) 335 336 gap_penalties_stats = ( 337 # Gap Penalties: Existence: 5, Extension: 2 338 _nv("Gap Penalties: Existence: ", 339 int_param("existence_gap_penalty") + 340 Str(", Extension: ") + 341 int_param("extension_gap_penalty")) 342 ) 343 344 generic_info1 = ( 345 # Number of Hits to DB: 1123218 346 # !! I've seen negative numbers here !! 347 # Number of Hits to DB: -331804600 348 _nv("Number of Hits to DB: ", int_stat("num_hits_to_db", signed = 1)) + 349 350 # Number of Sequences: 442729 351 _nv("Number of Sequences: ", int_stat("num_sequences")) + 352 353 # Number of extensions: 1123218 354 _nv("Number of extensions: ", int_stat("num_extensions")) + 355 356 # Number of successful extensions: 86816 357 _nv("Number of successful extensions: ", 358 int_stat("num_successful_extensions")) + 359 360 # Number of sequences better than 10.0: 106 361 _nv("Number of sequences better than 10.0: ", 362 int_stat("num_sequences_better_than_10")) 363 ) 364 365 generic_info2 = ( 366 # length of query: 2575 367 _nv("length of query: ", int_stat("query_length")) + 368 369 # length of database: 675,252,082 370 _nv("length of database: ", int_stat("database_length", comma = 1)) + 371 372 # effective HSP length: 21 373 _nv("effective HSP length: ", int_stat("effective_hsp_length"))+ 374 375 # effective length of query: 2554 376 _nv("effective length of query: ", int_stat("effective_query_length")) + 377 378 # effective length of database: 665,954,773 379 _nv("effective length of database: ", 380 int_stat("effective_database_length", comma = 1)) + 381 382 # effective search space: 1700848490242 383 _nv("effective search space: ", int_stat("effective_search_space")) + 384 385 # effective search space used: 1700848490242 386 _nv("effective search space used: ", 387 int_stat("effective_search_space_used")) 388 ) 389 390 hsp_info = ( 391 # Number of HSP's better than 10.0 without gapping: 136 392 _nv("Number of HSP's better than 10.0 without gapping: ", 393 int_stat("num_hsps_better_than_10_without_gapping")) + 394 395 # Number of HSP's successfully gapped in prelim test: 14 396 _nv("Number of HSP's successfully gapped in prelim test: ", 397 int_stat("num_hsps_successfully_gapped_in_prelim_test")) + 398 399 # Number of HSP's that attempted gapping in prelim test: 880 400 _nv("Number of HSP's that attempted gapping in prelim test: ", 401 int_stat("num_hsps_attempted_gapping_in_prelim_test")) + 402 403 # Number of HSP's gapped (non-prelim): 291 404 _nv("Number of HSP's gapped (non-prelim): ", 405 int_stat("num_hsps_gapped")) 406 ) 407 408 # frameshift window, decay const: 50, 0.1 409 frameshift_info = _nv("frameshift window, decay const: ", 410 int_stat("frameshift_window") + 411 Re(r", *") + 412 float_stat("frameshift_decat_const")) 413
414 -def _bit_info(s, name):
415 return _nv(s, 416 int_stat(name) + 417 Spaces() + Str("(") + Opt(Spaces()) + 418 float_stat(name + "_bits") + 419 Str(" bits)"))
420 421 # T: 0 422 t_info = _nv("T: ", int_stat("t")) 423 424 # A: 0 425 a_info = _nv("A: ", int_stat("1")) 426 427 # X1: 6 (11.9 bits) 428 x1_info = _bit_info("X1: ", "x1") 429 430 # X2: 10 (19.8 bits) 431 x2_info = _bit_info("X2: ", "x2") 432 433 # X3: 64 (24.9 bits) 434 x3_info = _bit_info("X3: ", "x3") 435 436 # S1: 12 (24.3 bits) 437 s1_info = _bit_info("S1: ", "s1") 438 439 # S2: 19 (38.2 bits) 440 s2_info = _bit_info("S2: ", "s2") 441 442 443 blastn_ending = (db_stats + 444 lambda_stats + 445 matrix_stats + 446 gap_penalties_stats + 447 generic_info1 + 448 Opt(hsp_info) + 449 generic_info2 + 450 t_info + 451 a_info + 452 x1_info + 453 x2_info + 454 s1_info + 455 s2_info 456 ) 457 458 blastp_ending = (db_stats + 459 lambda_stats + 460 matrix_stats + 461 gap_penalties_stats + 462 generic_info1 + 463 hsp_info + 464 generic_info2 + 465 t_info + 466 a_info + 467 x1_info + 468 x2_info + 469 x3_info + 470 s1_info + 471 s2_info 472 ) 473 474 blastx_ending = (db_stats + 475 lambda_stats + 476 matrix_stats + 477 gap_penalties_stats + 478 generic_info1 + 479 hsp_info + 480 generic_info2 + 481 frameshift_info + 482 t_info + 483 a_info + 484 x1_info + 485 x2_info + 486 x3_info + 487 s1_info + 488 s2_info 489 ) 490 tblastn_ending = blastx_ending 491 tblastx_ending = (db_stats + 492 ungapped_lambda_stats + 493 matrix_stats + 494 generic_info1 + 495 generic_info2 + 496 frameshift_info + 497 t_info + 498 a_info + 499 x1_info + 500 x2_info + 501 s1_info + 502 s2_info 503 ) 504 505 format = Std.record(Group("appheader", NullOp()) + 506 header + 507 Rep(hit) + 508 Group("ending", NullOp())) 509 510 blastn = replace_groups(format, 511 (("appheader", blastn_appheader), 512 ("hsp_header", blastn_hsp_header), 513 ("ending", blastn_ending))) 514 515 blastp = replace_groups(format, 516 (("appheader", blastp_appheader), 517 ("hsp_header", blastp_hsp_header), 518 ("ending", blastp_ending))) 519 520 blastx = replace_groups(format, 521 (("appheader", blastx_appheader), 522 ("hsp_header", blastx_hsp_header), 523 ("ending", blastx_ending))) 524 525 tblastn = replace_groups(format, 526 (("appheader", tblastn_appheader), 527 ("hsp_header", tblastn_hsp_header), 528 ("ending", tblastn_ending))) 529 530 tblastx = replace_groups(format, 531 (("appheader", tblastx_appheader), 532 ("hsp_header", tblastx_hsp_header), 533 ("ending", tblastx_ending))) 534