Package Bio :: Package AlignIO :: Module FastaIO
[hide private]
[frames] | no frames]

Source Code for Module Bio.AlignIO.FastaIO

  1  # Copyright 2008 by Peter Cock.  All rights reserved. 
  2  # 
  3  # This code is part of the Biopython distribution and governed by its 
  4  # license.  Please see the LICENSE file that should have been included 
  5  # as part of this package. 
  6  """ 
  7  Bio.AlignIO support for "fasta-m10" output from Bill Pearson's FASTA tools. 
  8   
  9  You are expected to use this module via the Bio.AlignIO functions (or the 
 10  Bio.SeqIO functions if you want to work directly with the gapped sequences). 
 11   
 12  This module contains a parser for the pairwise alignments produced by Bill 
 13  Pearson's FASTA tools, for use from the Bio.AlignIO interface where it is 
 14  refered to as the "fasta-m10" file format (as we only support the machine 
 15  readable output format selected with the -m 10 command line option). 
 16   
 17  This module does NOT cover the generic "fasta" file format originally 
 18  developed as an input format to the FASTA tools.  The Bio.AlignIO and 
 19  Bio.SeqIO both use the Bio.SeqIO.FastaIO module to deal with these files, 
 20  which can also be used to store a multiple sequence alignments. 
 21  """ 
 22   
 23  from Bio.Align.Generic import Alignment 
 24  from Interfaces import AlignmentIterator 
 25  from Bio.Alphabet import single_letter_alphabet, generic_dna, generic_protein 
 26  from Bio.Alphabet import Gapped 
 27   
 28   
29 -class FastaM10Iterator(AlignmentIterator) :
30 """Alignment iterator for the FASTA tool's pairwise alignment output. 31 32 This is for reading the pairwise alignments output by Bill Pearson's 33 FASTA program when called with the -m 10 command line option for machine 34 readable output. For more details about the FASTA tools, see the website 35 http://fasta.bioch.virginia.edu/ and the paper: 36 37 W.R. Pearson & D.J. Lipman PNAS (1988) 85:2444-2448 38 39 This class is intended to be used via the Bio.AlignIO.parse() function 40 by specifying the format as "fasta-m10" as shown in the following code: 41 42 from Bio import AlignIO 43 handle = ... 44 for a in AlignIO.parse(handle, "fasta-m10") : 45 assert len(a.get_all_seqs()) == 2, "Should be pairwise!" 46 print "Alignment length %i" % a.get_alignment_length() 47 for record in a : 48 print record.seq, record.name, record.id 49 50 Note that this is not a full blown parser for all the information 51 in the FASTA output - for example, most of the header and all of the 52 footer is ignored. Also, the alignments are not batched according to 53 the input queries. 54 55 Also note that there can be up to about 30 letters of flanking region 56 included in the raw FASTA output as contextual information. This is NOT 57 part of the alignment itself, and is not included in the resulting 58 Alignment objects returned. 59 """ 60
61 - def next(self) :
62 """Reads from the handle to construct and return the next alignment. 63 64 This returns the pairwise alignment of query and match/library 65 sequences as an Alignment object containing two rows.""" 66 handle = self.handle 67 68 try : 69 #Header we saved from when we were parsing 70 #the previous alignment. 71 line = self._header 72 del self._header 73 except AttributeError: 74 line = handle.readline() 75 if not line: 76 return None 77 78 if line.startswith("#") : 79 #Skip the file header before the alignments. e.g. 80 line = self._skip_file_header(line) 81 while ">>>" in line and not line.startswith(">>>") : 82 #Moved onto the next query sequence! 83 self._query_descr = "" 84 self._query_header_annotation = {} 85 #Read in the query header 86 line = self._parse_query_header(line) 87 #Now should be some alignments, but if not we move onto the next query 88 if not line : 89 #End of file 90 return None 91 if ">>><<<" in line : 92 #Reached the end of the alignments, no need to read the footer... 93 return None 94 95 96 #Should start >>... and not >>>... 97 assert line[0:2] == ">>" and not line[2] == ">", line 98 99 query_seq_parts, match_seq_parts = [], [] 100 query_annotation, match_annotation = {}, {} 101 match_descr = "" 102 alignment_annotation = {} 103 104 #This should be followed by the target match ID line, then more tags. 105 #e.g. 106 """ 107 >>gi|152973545|ref|YP_001338596.1| putative plasmid SOS inhibition protein A [Klebsiella pneumoniae subsp. pneumoniae MGH 78578] 108 ; fa_frame: f 109 ; fa_initn: 52 110 ; fa_init1: 52 111 ; fa_opt: 70 112 ; fa_z-score: 105.5 113 ; fa_bits: 27.5 114 ; fa_expect: 0.082 115 ; sw_score: 70 116 ; sw_ident: 0.279 117 ; sw_sim: 0.651 118 ; sw_overlap: 43 119 """ 120 if (not line[0:2] == ">>") or line[0:3] == ">>>" : 121 raise ValueError("Expected target line starting '>>'") 122 match_descr = line[2:].strip() 123 #Handle the following "alignment hit" tagged data, e.g. 124 line = handle.readline() 125 line = self._parse_tag_section(line, alignment_annotation) 126 assert not line[0:2] == "; " 127 128 #Then we have the alignment numbers and sequence for the query 129 """ 130 >gi|10955265| .. 131 ; sq_len: 346 132 ; sq_offset: 1 133 ; sq_type: p 134 ; al_start: 197 135 ; al_stop: 238 136 ; al_display_start: 167 137 DFMCSILNMKEIVEQKNKEFNVDIKKETIESELHSKLPKSIDKIHEDIKK 138 QLSC-SLIMKKIDVEMEDYSTYCFSALRAIEGFIYQILNDVCNPSSSKNL 139 GEYFTENKPKYIIREIHQET 140 """ 141 if not (line[0] == ">" and line.strip().endswith("..")): 142 raise ValueError("Expected line starting '>' and ending '..'") 143 assert self._query_descr.startswith(line[1:].split(None,1)[0]) 144 145 #Handle the following "query alignment" tagged data 146 line = handle.readline() 147 line = self._parse_tag_section(line, query_annotation) 148 assert not line[0:2] == "; " 149 150 #Now should have the aligned query sequence (with leading flanking region) 151 while not line[0] == ">" : 152 query_seq_parts.append(line.strip()) 153 line = handle.readline() 154 155 #Handle the following "match alignment" data 156 """ 157 >gi|152973545|ref|YP_001338596.1| .. 158 ; sq_len: 242 159 ; sq_type: p 160 ; al_start: 52 161 ; al_stop: 94 162 ; al_display_start: 22 163 IMTVEEARQRGARLPSMPHVRTFLRLLTGCSRINSDVARRIPGIHRDPKD 164 RLSSLKQVEEALDMLISSHGEYCPLPLTMDVQAENFPEVLHTRTVRRLKR 165 QDFAFTRKMRREARQVEQSW 166 """ 167 #Match identifier 168 if not (line[0] == ">" and line.strip().endswith("..")): 169 raise ValueError("Expected line starting '>' and ending '..', got '%s'" % repr(line)) 170 assert match_descr.startswith(line[1:].split(None,1)[0]) 171 172 #Tagged data, 173 line = handle.readline() 174 line = self._parse_tag_section(line, match_annotation) 175 assert not line[0:2] == "; " 176 177 #Now should have the aligned query sequence with flanking region... 178 #but before that, since FASTA 35.4.1 there can be an consensus here, 179 """ 180 ; al_cons: 181 .::. : :. ---. :: :. . : ..-:::-: :.: ..:...: 182 etc 183 """ 184 while not (line[0:2] == "; " or line[0] == ">" or ">>>" in line): 185 match_seq_parts.append(line.strip()) 186 line = handle.readline() 187 if line[0:2] == "; " : 188 assert line.strip() == "; al_cons:" 189 align_consensus_parts = [] 190 line = handle.readline() 191 while not (line[0:2] == "; " or line[0] == ">" or ">>>" in line): 192 align_consensus_parts.append(line.strip()) 193 line = handle.readline() 194 #If we do anything with this in future, must remove any flanking region. 195 align_consensus = "".join(align_consensus_parts) 196 del align_consensus_parts 197 assert not line[0:2] == "; " 198 else : 199 align_consensus = None 200 assert (line[0] == ">" or ">>>" in line) 201 self._header = line 202 203 #We built a list of strings and then joined them because 204 #its faster than appending to a string. 205 query_seq = "".join(query_seq_parts) 206 match_seq = "".join(match_seq_parts) 207 del query_seq_parts, match_seq_parts 208 #Note, query_seq and match_seq will usually be of different lengths, apparently 209 #because in the m10 format leading gaps are added but not trailing gaps! 210 211 #Remove the flanking regions, 212 query_align_seq = self._extract_alignment_region(query_seq, query_annotation) 213 match_align_seq = self._extract_alignment_region(match_seq, match_annotation) 214 #How can we do this for the (optional) consensus? 215 216 #The "sq_offset" values can be specified with the -X command line option. 217 #They appear to just shift the origin used in the calculation of the coordinates. 218 219 if len(query_align_seq) != len(match_align_seq) : 220 raise ValueError("Problem parsing the alignment sequence coordinates, " 221 "following should be the same length but are not:\n" 222 "%s - len %i\n%s - len %i" % (query_align_seq, 223 len(query_align_seq), 224 match_align_seq, 225 len(match_align_seq))) 226 if "sw_overlap" in alignment_annotation : 227 if int(alignment_annotation["sw_overlap"]) != len(query_align_seq) : 228 raise ValueError("Specified sw_overlap = %s does not match expected value %i" \ 229 % (alignment_annotation["sw_overlap"], 230 len(query_align_seq))) 231 232 #TODO - Look at the "sq_type" to assign a sensible alphabet? 233 alphabet = self.alphabet 234 alignment = Alignment(alphabet) 235 236 #TODO - Introduce an annotated alignment class? 237 #For now, store the annotation a new private property: 238 alignment._annotations = {} 239 240 #Want to record both the query header tags, and the alignment tags. 241 for key, value in self._query_header_annotation.iteritems() : 242 alignment._annotations[key] = value 243 for key, value in alignment_annotation.iteritems() : 244 alignment._annotations[key] = value 245 246 247 #TODO - Once the alignment object gets an append method, use it. 248 #(i.e. an add SeqRecord method) 249 alignment.add_sequence(self._query_descr, query_align_seq) 250 record = alignment.get_all_seqs()[-1] 251 assert record.id == self._query_descr or record.description == self._query_descr 252 #assert record.seq.tostring() == query_align_seq 253 record.id = self._query_descr.split(None,1)[0].strip(",") 254 record.name = "query" 255 record.annotations["original_length"] = int(query_annotation["sq_len"]) 256 257 #TODO - What if a specific alphabet has been requested? 258 #TODO - Use an IUPAC alphabet? 259 #TODO - Can FASTA output RNA? 260 if alphabet == single_letter_alphabet and "sq_type" in query_annotation : 261 if query_annotation["sq_type"] == "D" : 262 record.seq.alphabet = generic_dna 263 elif query_annotation["sq_type"] == "p" : 264 record.seq.alphabet = generic_protein 265 if "-" in query_align_seq : 266 if not hasattr(record.seq.alphabet,"gap_char") : 267 record.seq.alphabet = Gapped(record.seq.alphabet, "-") 268 269 alignment.add_sequence(match_descr, match_align_seq) 270 record = alignment.get_all_seqs()[-1] 271 assert record.id == match_descr or record.description == match_descr 272 #assert record.seq.tostring() == match_align_seq 273 record.id = match_descr.split(None,1)[0].strip(",") 274 record.name = "match" 275 record.annotations["original_length"] = int(match_annotation["sq_len"]) 276 277 #This is still a very crude way of dealing with the alphabet: 278 if alphabet == single_letter_alphabet and "sq_type" in match_annotation : 279 if match_annotation["sq_type"] == "D" : 280 record.seq.alphabet = generic_dna 281 elif match_annotation["sq_type"] == "p" : 282 record.seq.alphabet = generic_protein 283 if "-" in match_align_seq : 284 if not hasattr(record.seq.alphabet,"gap_char") : 285 record.seq.alphabet = Gapped(record.seq.alphabet, "-") 286 287 return alignment
288
289 - def _skip_file_header(self, line) :
290 """Helper function for the main parsing code. 291 292 Skips over the file header region. 293 """ 294 #e.g. This region: 295 """ 296 # /home/xxx/Downloads/FASTA/fasta-35.3.6/fasta35 -Q -H -E 1 -m 10 -X "-5 -5" NC_002127.faa NC_009649.faa 297 FASTA searches a protein or DNA sequence data bank 298 version 35.03 Feb. 18, 2008 299 Please cite: 300 W.R. Pearson & D.J. Lipman PNAS (1988) 85:2444-2448 301 302 Query: NC_002127.faa 303 """ 304 #Note that there is no point recording the command line here 305 #from the # line, as it is included again in each alignment 306 #under the "pg_argv" tag. Likewise for the program version. 307 while ">>>" not in line : 308 line = self.handle.readline() 309 return line
310
311 - def _parse_query_header(self, line) :
312 """Helper function for the main parsing code. 313 314 Skips over the free format query header, extracting the tagged parameters. 315 316 If there are no hits for the current query, it is skipped entirely.""" 317 #e.g. this region (where there is often a histogram too): 318 """ 319 2>>>gi|10955264|ref|NP_052605.1| hypothetical protein pOSAK1_02 [Escherichia coli O157:H7 s 126 aa - 126 aa 320 Library: NC_009649.faa 45119 residues in 180 sequences 321 322 45119 residues in 180 sequences 323 Statistics: (shuffled [500]) Expectation_n fit: rho(ln(x))= 5.0398+/-0.00968; mu= 2.8364+/- 0.508 324 mean_var=44.7937+/-10.479, 0's: 0 Z-trim: 0 B-trim: 0 in 0/32 325 Lambda= 0.191631 326 Algorithm: FASTA (3.5 Sept 2006) [optimized] 327 Parameters: BL50 matrix (15:-5) ktup: 2 328 join: 36, opt: 24, open/ext: -10/-2, width: 16 329 Scan time: 0.040 330 331 The best scores are: opt bits E(180) 332 gi|152973462|ref|YP_001338513.1| hypothetical prot ( 101) 58 23.3 0.22 333 gi|152973501|ref|YP_001338552.1| hypothetical prot ( 245) 55 22.5 0.93 334 """ 335 #Sometimes have queries with no matches, in which case we continue to the 336 #next query block: 337 """ 338 2>>>gi|152973838|ref|YP_001338875.1| hypothetical protein KPN_pKPN7p10263 [Klebsiella pneumoniae subsp. pneumonia 76 aa - 76 aa 339 vs NC_002127.faa library 340 341 579 residues in 3 sequences 342 Altschul/Gish params: n0: 76 Lambda: 0.158 K: 0.019 H: 0.100 343 344 FASTA (3.5 Sept 2006) function [optimized, BL50 matrix (15:-5)] ktup: 2 345 join: 36, opt: 24, open/ext: -10/-2, width: 16 346 Scan time: 0.000 347 !! No library sequences with E() < 0.5 348 """ 349 350 self._query_header_annotation = {} 351 self._query_descr = "" 352 353 assert ">>>" in line and not line[0:3] == ">>>" 354 #There is nothing useful in this line, the query description is truncated. 355 356 line = self.handle.readline() 357 #We ignore the free form text... 358 while not line[0:3] == ">>>" : 359 #print "Ignoring %s" % line.strip() 360 line = self.handle.readline() 361 if not line : 362 raise ValueError("Premature end of file!") 363 if ">>><<<" in line : 364 #End of alignments, looks like the last query 365 #or queries had no hits. 366 return line 367 368 #Now want to parse this section: 369 """ 370 >>>gi|10955264|ref|NP_052605.1|, 126 aa vs NC_009649.faa library 371 ; pg_name: /home/pjcock/Downloads/FASTA/fasta-35.3.6/fasta35 372 ; pg_ver: 35.03 373 ; pg_argv: /home/pjcock/Downloads/FASTA/fasta-35.3.6/fasta35 -Q -H -E 1 -m 10 -X -5 -5 NC_002127.faa NC_009649.faa 374 ; pg_name: FASTA 375 ; pg_ver: 3.5 Sept 2006 376 ; pg_matrix: BL50 (15:-5) 377 ; pg_open-ext: -10 -2 378 ; pg_ktup: 2 379 ; pg_optcut: 24 380 ; pg_cgap: 36 381 ; mp_extrap: 60000 500 382 ; mp_stats: (shuffled [500]) Expectation_n fit: rho(ln(x))= 5.0398+/-0.00968; mu= 2.8364+/- 0.508 mean_var=44.7937+/-10.479, 0's: 0 Z-trim: 0 B-trim: 0 in 0/32 Lambda= 0.191631 383 ; mp_KS: -0.0000 (N=1066338402) at 20 384 ; mp_Algorithm: FASTA (3.5 Sept 2006) [optimized] 385 ; mp_Parameters: BL50 matrix (15:-5) ktup: 2 join: 36, opt: 24, open/ext: -10/-2, width: 16 386 """ 387 388 assert line[0:3] == ">>>", line 389 self._query_descr = line[3:].strip() 390 391 #Handle the following "program" tagged data, 392 line = self.handle.readline() 393 line = self._parse_tag_section(line, self._query_header_annotation) 394 assert not line[0:2] == "; ", line 395 assert line[0:2] == ">>" or ">>>" in line, line 396 return line
397 398
399 - def _extract_alignment_region(self, alignment_seq_with_flanking, annotation) :
400 """Helper function for the main parsing code. 401 402 To get the actual pairwise alignment sequences, we must first 403 translate the un-gapped sequence based coordinates into positions 404 in the gapped sequence (which may have a flanking region shown 405 using leading - characters). To date, I have never seen any 406 trailing flanking region shown in the m10 file, but the 407 following code should also cope with that. 408 409 Note that this code seems to work fine even when the "sq_offset" 410 entries are prsent as a result of using the -X command line option. 411 """ 412 align_stripped = alignment_seq_with_flanking.strip("-") 413 display_start = int(annotation['al_display_start']) 414 if int(annotation['al_start']) <= int(annotation['al_stop']) : 415 start = int(annotation['al_start']) \ 416 - display_start 417 end = int(annotation['al_stop']) \ 418 - display_start \ 419 + align_stripped.count("-") + 1 420 else : 421 #FASTA has flipped this sequence... 422 start = display_start \ 423 - int(annotation['al_start']) 424 end = display_start \ 425 - int(annotation['al_stop']) \ 426 + align_stripped.count("-") + 1 427 assert 0 <= start and start < end and end <= len(align_stripped), \ 428 "Problem with sequence start/stop,\n%s[%i:%i]\n%s" \ 429 % (alignment_seq_with_flanking, start, end, annotation) 430 return align_stripped[start:end]
431 432
433 - def _parse_tag_section(self, line, dictionary) :
434 """Helper function for the main parsing code. 435 436 line - supply line just read from the handle containing the start of 437 the tagged section. 438 dictionary - where to record the tagged values 439 440 Returns a string, the first line following the tagged section.""" 441 if not line[0:2] == "; " : 442 raise ValueError("Expected line starting '; '") 443 while line[0:2] == "; " : 444 tag, value = line[2:].strip().split(": ",1) 445 #fasta34 and early versions of fasta35 will reuse the pg_name and 446 #pg_ver tags for the program executable and name, and the program 447 #version and the algorithm version, respectively. This is fixed 448 #in FASTA 35.4.1, but we can't assume the tags are unique: 449 #if tag in dictionary : 450 # raise ValueError("Repeated tag '%s' in section" % tag) 451 dictionary[tag] = value 452 line = self.handle.readline() 453 return line
454 455 if __name__ == "__main__" : 456 print "Running a quick self-test" 457 458 #http://emboss.sourceforge.net/docs/themes/alnformats/align.simple 459 simple_example = \ 460 """# /opt/fasta/fasta34 -Q -H -E 1 -m 10 NC_002127.faa NC_009649.faa 461 FASTA searches a protein or DNA sequence data bank 462 version 34.26 January 12, 2007 463 Please cite: 464 W.R. Pearson & D.J. Lipman PNAS (1988) 85:2444-2448 465 466 Query library NC_002127.faa vs NC_009649.faa library 467 searching NC_009649.faa library 468 469 1>>>gi|10955263|ref|NP_052604.1| plasmid mobilization [Escherichia coli O157:H7 s 107 aa - 107 aa 470 vs NC_009649.faa library 471 472 45119 residues in 180 sequences 473 Expectation_n fit: rho(ln(x))= 6.9146+/-0.0249; mu= -5.7948+/- 1.273 474 mean_var=53.6859+/-13.609, 0's: 0 Z-trim: 1 B-trim: 9 in 1/25 475 Lambda= 0.175043 476 477 FASTA (3.5 Sept 2006) function [optimized, BL50 matrix (15:-5)] ktup: 2 478 join: 36, opt: 24, open/ext: -10/-2, width: 16 479 Scan time: 0.000 480 The best scores are: opt bits E(180) 481 gi|152973457|ref|YP_001338508.1| ATPase with chape ( 931) 71 24.9 0.58 482 gi|152973588|ref|YP_001338639.1| F pilus assembly ( 459) 63 23.1 0.99 483 484 >>>gi|10955263|ref|NP_052604.1|, 107 aa vs NC_009649.faa library 485 ; pg_name: /opt/fasta/fasta34 486 ; pg_ver: 34.26 487 ; pg_argv: /opt/fasta/fasta34 -Q -H -E 1 -m 10 NC_002127.faa NC_009649.faa 488 ; pg_name: FASTA 489 ; pg_ver: 3.5 Sept 2006 490 ; pg_matrix: BL50 (15:-5) 491 ; pg_open-ext: -10 -2 492 ; pg_ktup: 2 493 ; pg_optcut: 24 494 ; pg_cgap: 36 495 ; mp_extrap: 60000 180 496 ; mp_stats: Expectation_n fit: rho(ln(x))= 6.9146+/-0.0249; mu= -5.7948+/- 1.273 mean_var=53.6859+/-13.609, 0's: 0 Z-trim: 1 B-trim: 9 in 1/25 Lambda= 0.175043 497 ; mp_KS: -0.0000 (N=0) at 8159228 498 >>gi|152973457|ref|YP_001338508.1| ATPase with chaperone activity, ATP-binding subunit [Klebsiella pneumoniae subsp. pneumoniae MGH 78578] 499 ; fa_frame: f 500 ; fa_initn: 65 501 ; fa_init1: 43 502 ; fa_opt: 71 503 ; fa_z-score: 90.3 504 ; fa_bits: 24.9 505 ; fa_expect: 0.58 506 ; sw_score: 71 507 ; sw_ident: 0.250 508 ; sw_sim: 0.574 509 ; sw_overlap: 108 510 >gi|10955263| .. 511 ; sq_len: 107 512 ; sq_offset: 1 513 ; sq_type: p 514 ; al_start: 5 515 ; al_stop: 103 516 ; al_display_start: 1 517 --------------------------MTKRSGSNT-RRRAISRPVRLTAE 518 ED---QEIRKRAAECGKTVSGFLRAAALGKKVNSLTDDRVLKEVM----- 519 RLGALQKKLFIDGKRVGDREYAEVLIAITEYHRALLSRLMAD 520 >gi|152973457|ref|YP_001338508.1| .. 521 ; sq_len: 931 522 ; sq_type: p 523 ; al_start: 96 524 ; al_stop: 195 525 ; al_display_start: 66 526 SDFFRIGDDATPVAADTDDVVDASFGEPAAAGSGAPRRRGSGLASRISEQ 527 SEALLQEAAKHAAEFGRS------EVDTEHLLLALADSDVVKTILGQFKI 528 KVDDLKRQIESEAKR-GDKPF-EGEIGVSPRVKDALSRAFVASNELGHSY 529 VGPEHFLIGLAEEGEGLAANLLRRYGLTPQ 530 >>gi|152973588|ref|YP_001338639.1| F pilus assembly protein [Klebsiella pneumoniae subsp. pneumoniae MGH 78578] 531 ; fa_frame: f 532 ; fa_initn: 33 533 ; fa_init1: 33 534 ; fa_opt: 63 535 ; fa_z-score: 86.1 536 ; fa_bits: 23.1 537 ; fa_expect: 0.99 538 ; sw_score: 63 539 ; sw_ident: 0.266 540 ; sw_sim: 0.656 541 ; sw_overlap: 64 542 >gi|10955263| .. 543 ; sq_len: 107 544 ; sq_offset: 1 545 ; sq_type: p 546 ; al_start: 32 547 ; al_stop: 94 548 ; al_display_start: 2 549 TKRSGSNTRRRAISRPVRLTAEEDQEIRKRAAECGKTVSGFLRAAALGKK 550 VNSLTDDRVLKEV-MRLGALQKKLFIDGKRVGDREYAEVLIAITEYHRAL 551 LSRLMAD 552 >gi|152973588|ref|YP_001338639.1| .. 553 ; sq_len: 459 554 ; sq_type: p 555 ; al_start: 191 556 ; al_stop: 248 557 ; al_display_start: 161 558 VGGLFPRTQVAQQKVCQDIAGESNIFSDWAASRQGCTVGG--KMDSVQDK 559 ASDKDKERVMKNINIMWNALSKNRLFDG----NKELKEFIMTLTGTLIFG 560 ENSEITPLPARTTDQDLIRAMMEGGTAKIYHCNDSDKCLKVVADATVTIT 561 SNKALKSQISALLSSIQNKAVADEKLTDQE 562 2>>>gi|10955264|ref|NP_052605.1| hypothetical protein pOSAK1_02 [Escherichia coli O157:H7 s 126 aa - 126 aa 563 vs NC_009649.faa library 564 565 45119 residues in 180 sequences 566 Expectation_n fit: rho(ln(x))= 7.1374+/-0.0246; mu= -7.6540+/- 1.313 567 mean_var=51.1189+/-13.171, 0's: 0 Z-trim: 1 B-trim: 8 in 1/25 568 Lambda= 0.179384 569 570 FASTA (3.5 Sept 2006) function [optimized, BL50 matrix (15:-5)] ktup: 2 571 join: 36, opt: 24, open/ext: -10/-2, width: 16 572 Scan time: 0.000 573 The best scores are: opt bits E(180) 574 gi|152973462|ref|YP_001338513.1| hypothetical prot ( 101) 58 22.9 0.29 575 576 >>>gi|10955264|ref|NP_052605.1|, 126 aa vs NC_009649.faa library 577 ; pg_name: /opt/fasta/fasta34 578 ; pg_ver: 34.26 579 ; pg_argv: /opt/fasta/fasta34 -Q -H -E 1 -m 10 NC_002127.faa NC_009649.faa 580 ; pg_name: FASTA 581 ; pg_ver: 3.5 Sept 2006 582 ; pg_matrix: BL50 (15:-5) 583 ; pg_open-ext: -10 -2 584 ; pg_ktup: 2 585 ; pg_optcut: 24 586 ; pg_cgap: 36 587 ; mp_extrap: 60000 180 588 ; mp_stats: Expectation_n fit: rho(ln(x))= 7.1374+/-0.0246; mu= -7.6540+/- 1.313 mean_var=51.1189+/-13.171, 0's: 0 Z-trim: 1 B-trim: 8 in 1/25 Lambda= 0.179384 589 ; mp_KS: -0.0000 (N=0) at 8159228 590 >>gi|152973462|ref|YP_001338513.1| hypothetical protein KPN_pKPN3p05904 [Klebsiella pneumoniae subsp. pneumoniae MGH 78578] 591 ; fa_frame: f 592 ; fa_initn: 50 593 ; fa_init1: 50 594 ; fa_opt: 58 595 ; fa_z-score: 95.8 596 ; fa_bits: 22.9 597 ; fa_expect: 0.29 598 ; sw_score: 58 599 ; sw_ident: 0.289 600 ; sw_sim: 0.632 601 ; sw_overlap: 38 602 >gi|10955264| .. 603 ; sq_len: 126 604 ; sq_offset: 1 605 ; sq_type: p 606 ; al_start: 1 607 ; al_stop: 38 608 ; al_display_start: 1 609 ------------------------------MKKDKKYQIEAIKNKDKTLF 610 IVYATDIYSPSEFFSKIESDLKKKKSKGDVFFDLIIPNGGKKDRYVYTSF 611 NGEKFSSYTLNKVTKTDEYN 612 >gi|152973462|ref|YP_001338513.1| .. 613 ; sq_len: 101 614 ; sq_type: p 615 ; al_start: 44 616 ; al_stop: 81 617 ; al_display_start: 14 618 DALLGEIQRLRKQVHQLQLERDILTKANELIKKDLGVSFLKLKNREKTLI 619 VDALKKKYPVAELLSVLQLARSCYFYQNVCTISMRKYA 620 3>>>gi|10955265|ref|NP_052606.1| hypothetical protein pOSAK1_03 [Escherichia coli O157:H7 s 346 aa - 346 aa 621 vs NC_009649.faa library 622 623 45119 residues in 180 sequences 624 Expectation_n fit: rho(ln(x))= 6.0276+/-0.0276; mu= 3.0670+/- 1.461 625 mean_var=37.1634+/- 8.980, 0's: 0 Z-trim: 1 B-trim: 14 in 1/25 626 Lambda= 0.210386 627 628 FASTA (3.5 Sept 2006) function [optimized, BL50 matrix (15:-5)] ktup: 2 629 join: 37, opt: 25, open/ext: -10/-2, width: 16 630 Scan time: 0.020 631 The best scores are: opt bits E(180) 632 gi|152973545|ref|YP_001338596.1| putative plasmid ( 242) 70 27.5 0.082 633 634 >>>gi|10955265|ref|NP_052606.1|, 346 aa vs NC_009649.faa library 635 ; pg_name: /opt/fasta/fasta34 636 ; pg_ver: 34.26 637 ; pg_argv: /opt/fasta/fasta34 -Q -H -E 1 -m 10 NC_002127.faa NC_009649.faa 638 ; pg_name: FASTA 639 ; pg_ver: 3.5 Sept 2006 640 ; pg_matrix: BL50 (15:-5) 641 ; pg_open-ext: -10 -2 642 ; pg_ktup: 2 643 ; pg_optcut: 25 644 ; pg_cgap: 37 645 ; mp_extrap: 60000 180 646 ; mp_stats: Expectation_n fit: rho(ln(x))= 6.0276+/-0.0276; mu= 3.0670+/- 1.461 mean_var=37.1634+/- 8.980, 0's: 0 Z-trim: 1 B-trim: 14 in 1/25 Lambda= 0.210386 647 ; mp_KS: -0.0000 (N=0) at 8159228 648 >>gi|152973545|ref|YP_001338596.1| putative plasmid SOS inhibition protein A [Klebsiella pneumoniae subsp. pneumoniae MGH 78578] 649 ; fa_frame: f 650 ; fa_initn: 52 651 ; fa_init1: 52 652 ; fa_opt: 70 653 ; fa_z-score: 105.5 654 ; fa_bits: 27.5 655 ; fa_expect: 0.082 656 ; sw_score: 70 657 ; sw_ident: 0.279 658 ; sw_sim: 0.651 659 ; sw_overlap: 43 660 >gi|10955265| .. 661 ; sq_len: 346 662 ; sq_offset: 1 663 ; sq_type: p 664 ; al_start: 197 665 ; al_stop: 238 666 ; al_display_start: 167 667 DFMCSILNMKEIVEQKNKEFNVDIKKETIESELHSKLPKSIDKIHEDIKK 668 QLSC-SLIMKKIDVEMEDYSTYCFSALRAIEGFIYQILNDVCNPSSSKNL 669 GEYFTENKPKYIIREIHQET 670 >gi|152973545|ref|YP_001338596.1| .. 671 ; sq_len: 242 672 ; sq_type: p 673 ; al_start: 52 674 ; al_stop: 94 675 ; al_display_start: 22 676 IMTVEEARQRGARLPSMPHVRTFLRLLTGCSRINSDVARRIPGIHRDPKD 677 RLSSLKQVEEALDMLISSHGEYCPLPLTMDVQAENFPEVLHTRTVRRLKR 678 QDFAFTRKMRREARQVEQSW 679 >>><<< 680 681 682 579 residues in 3 query sequences 683 45119 residues in 180 library sequences 684 Scomplib [34.26] 685 start: Tue May 20 16:38:45 2008 done: Tue May 20 16:38:45 2008 686 Total Scan time: 0.020 Total Display time: 0.010 687 688 Function used was FASTA [version 34.26 January 12, 2007] 689 690 """ 691 692 693 from StringIO import StringIO 694 695 alignments = list(FastaM10Iterator(StringIO(simple_example))) 696 assert len(alignments) == 4, len(alignments) 697 assert len(alignments[0].get_all_seqs()) == 2 698 for a in alignments : 699 print "Alignment %i sequences of length %i" \ 700 % (len(a.get_all_seqs()), a.get_alignment_length()) 701 for r in a : 702 print "%s %s %i" % (r.seq, r.id, r.annotations["original_length"]) 703 #print a.annotations 704 print "Done" 705 706 import os 707 path = "../../Tests/Fasta/" 708 files = [f for f in os.listdir(path) if os.path.splitext(f)[-1] == ".m10"] 709 files.sort() 710 for filename in files : 711 if os.path.splitext(filename)[-1] == ".m10" : 712 print 713 print filename 714 print "="*len(filename) 715 for i,a in enumerate(FastaM10Iterator(open(os.path.join(path,filename)))): 716 print "#%i, %s" % (i+1,a) 717 for r in a : 718 if "-" in r.seq : 719 assert r.seq.alphabet.gap_char == "-" 720 else : 721 assert not hasattr(r.seq.alphabet, "gap_char") 722