Package Bio :: Package AlignIO :: Module EmbossIO
[hide private]
[frames] | no frames]

Source Code for Module Bio.AlignIO.EmbossIO

  1  # Copyright 2008 by Peter Cock.  All rights reserved. 
  2  # 
  3  # This code is part of the Biopython distribution and governed by its 
  4  # license.  Please see the LICENSE file that should have been included 
  5  # as part of this package. 
  6   
  7  from Bio.Align.Generic import Alignment 
  8  from Interfaces import AlignmentIterator, SequentialAlignmentWriter 
  9   
10 -class EmbossWriter(SequentialAlignmentWriter) :
11 """Emboss alignment writer 12 13 Writes a simplfied version of the EMBOSS pairs/simple file format. 14 A lot of the information their tools record in their headers is not 15 available and is ommitted. 16 """ 17
18 - def write_header(self) :
19 handle = self.handle 20 handle.write("########################################\n") 21 handle.write("# Program: Biopython\n") 22 try : 23 handle.write("# Report_file: %s\n" % handle.name) 24 except AttributeError : 25 pass 26 handle.write("########################################\n")
27 32
33 - def write_alignment(self, alignment) :
34 """Use this to write (another) single alignment to an open file.""" 35 36 handle = self.handle 37 records = alignment.get_all_seqs() 38 39 handle.write("#=======================================\n") 40 handle.write("#\n") 41 handle.write("# Aligned_sequences: %i\n" % len(records)) 42 for i, record in enumerate(records) : 43 handle.write("# %i: %s\n" % (i+1, record.id)) 44 handle.write("#\n") 45 handle.write("# Length: %i\n" % alignment.get_alignment_length()) 46 handle.write("#\n") 47 handle.write("#=======================================\n") 48 handle.write("\n") 49 #... 50 assert False
51
52 -class EmbossIterator(AlignmentIterator) :
53 """Emboss alignment iterator 54 55 For reading the (pairwise) alignments from EMBOSS tools in what they 56 call the "pairs" and "simple" formats. 57 """ 58
59 - def next(self) :
60 61 handle = self.handle 62 63 try : 64 #Header we saved from when we were parsing 65 #the previous alignment. 66 line = self._header 67 del self._header 68 except AttributeError: 69 line = handle.readline() 70 if not line: 71 return None 72 73 while line.rstrip() <> "#=======================================" : 74 line = handle.readline() 75 if not line : 76 return None 77 78 length_of_seqs = None 79 number_of_seqs = None 80 ids = [] 81 seqs = [] 82 83 84 while line[0] == "#" : 85 #Read in the rest of this alignment header, 86 #try and discover the number of records expected 87 #and their length 88 parts = line[1:].split(":",1) 89 key = parts[0].lower().strip() 90 if key == "aligned_sequences" : 91 number_of_seqs = int(parts[1].strip()) 92 assert len(ids) == 0 93 # Should now expect the record identifiers... 94 for i in range(number_of_seqs) : 95 line = handle.readline() 96 parts = line[1:].strip().split(":",1) 97 assert i+1 == int(parts[0].strip()) 98 ids.append(parts[1].strip()) 99 assert len(ids) == number_of_seqs 100 if key == "length" : 101 length_of_seqs = int(parts[1].strip()) 102 103 #And read in another line... 104 line = handle.readline() 105 106 if number_of_seqs is None : 107 raise SyntaxError("Number of sequences missing!") 108 if length_of_seqs is None : 109 raise SyntaxError("Length of sequences missing!") 110 111 if self.records_per_alignment is not None \ 112 and self.records_per_alignment <> number_of_seqs : 113 raise ValueError("Found %i records in this alignment, told to expect %i" \ 114 % (number_of_seqs, self.records_per_alignment)) 115 116 seqs = ["" for id in ids] 117 index = 0 118 119 #Parse the seqs 120 while line : 121 if len(line) > 21 : 122 id_start = line[:21].strip().split(None, 1) 123 seq_end = line[21:].strip().split(None, 1) 124 if len(id_start) == 2 and len(seq_end) == 2: 125 #identifier, seq start position, seq, seq end position 126 #(an aligned seq is broken up into multiple lines) 127 id, start = id_start 128 seq, end = seq_end 129 130 #The identifier is truncated... 131 assert 0 <= index and index < number_of_seqs, \ 132 "Expected index %i in range [0,%i)" \ 133 % (index, number_of_seqs) 134 assert id==ids[index] or id == ids[index][:len(id)] 135 136 #Check the start... 137 assert int(start) - 1 == len(seqs[index].replace("-","")), \ 138 "Found %i chars so far for %s, file says start %i:\n%s" \ 139 % (len(seqs[index]), id, int(start), seqs[index]) 140 141 seqs[index] += seq 142 143 #Check the end ... 144 assert int(end) == len(seqs[index].replace("-","")), \ 145 "Found %i chars so far for %s, file says end %i:\n%s" \ 146 % (len(seqs[index]), id, int(end), seqs[index]) 147 148 index += 1 149 if index >= number_of_seqs : 150 index = 0 151 else : 152 #just a start value, this is just alignment annotation (?) 153 #print "Skipping: " + line.rstrip() 154 pass 155 elif line.strip() == "" : 156 #Just a spacer? 157 pass 158 else : 159 print line 160 assert False 161 162 line = handle.readline() 163 if line.rstrip() == "#---------------------------------------" \ 164 or line.rstrip() == "#=======================================" : 165 #End of alignment 166 self._header = line 167 break 168 169 assert index == 0 170 171 if self.records_per_alignment is not None \ 172 and self.records_per_alignment <> len(ids) : 173 raise ValueError("Found %i records in this alignment, told to expect %i" \ 174 % (len(ids), self.records_per_alignment)) 175 176 alignment = Alignment(self.alphabet) 177 for id, seq in zip(ids, seqs) : 178 if len(seq) <> length_of_seqs : 179 raise SyntaxError("Error parsing alignment - sequences of different length?") 180 alignment.add_sequence(id, seq) 181 return alignment
182 183 if __name__ == "__main__" : 184 print "Running a quick self-test" 185 186 #http://emboss.sourceforge.net/docs/themes/alnformats/align.simple 187 simple_example = \ 188 """######################################## 189 # Program: alignret 190 # Rundate: Wed Jan 16 17:16:13 2002 191 # Report_file: stdout 192 ######################################## 193 #======================================= 194 # 195 # Aligned_sequences: 4 196 # 1: IXI_234 197 # 2: IXI_235 198 # 3: IXI_236 199 # 4: IXI_237 200 # Matrix: EBLOSUM62 201 # Gap_penalty: 10.0 202 # Extend_penalty: 0.5 203 # 204 # Length: 131 205 # Identity: 95/131 (72.5%) 206 # Similarity: 127/131 (96.9%) 207 # Gaps: 25/131 (19.1%) 208 # Score: 100.0 209 # 210 # 211 #======================================= 212 213 IXI_234 1 TSPASIRPPAGPSSRPAMVSSRRTRPSPPGPRRPTGRPCCSAAPRRPQAT 50 214 IXI_235 1 TSPASIRPPAGPSSR---------RPSPPGPRRPTGRPCCSAAPRRPQAT 41 215 IXI_236 1 TSPASIRPPAGPSSRPAMVSSR--RPSPPPPRRPPGRPCCSAAPPRPQAT 48 216 IXI_237 1 TSPASLRPPAGPSSRPAMVSSRR-RPSPPGPRRPT----CSAAPRRPQAT 45 217 |||||:|||||||||::::::: |||||:||||:::::|||||:||||| 218 219 IXI_234 51 GGWKTCSGTCTTSTSTRHRGRSGWSARTTTAACLRASRKSMRAACSRSAG 100 220 IXI_235 42 GGWKTCSGTCTTSTSTRHRGRSGW----------RASRKSMRAACSRSAG 81 221 IXI_236 49 GGWKTCSGTCTTSTSTRHRGRSGWSARTTTAACLRASRKSMRAACSR--G 96 222 IXI_237 46 GGYKTCSGTCTTSTSTRHRGRSGYSARTTTAACLRASRKSMRAACSR--G 93 223 ||:||||||||||||||||||||:::::::::::||||||||||||| | 224 225 IXI_234 101 SRPNRFAPTLMSSCITSTTGPPAWAGDRSHE 131 226 IXI_235 82 SRPNRFAPTLMSSCITSTTGPPAWAGDRSHE 112 227 IXI_236 97 SRPPRFAPPLMSSCITSTTGPPPPAGDRSHE 127 228 IXI_237 94 SRPNRFAPTLMSSCLTSTTGPPAYAGDRSHE 124 229 |||:||||:|||||:|||||||::||||||| 230 231 232 #--------------------------------------- 233 #--------------------------------------- 234 235 """ 236 237 #http://emboss.sourceforge.net/docs/themes/alnformats/align.pair 238 pair_example = \ 239 """######################################## 240 # Program: water 241 # Rundate: Wed Jan 16 17:23:19 2002 242 # Report_file: stdout 243 ######################################## 244 #======================================= 245 # 246 # Aligned_sequences: 2 247 # 1: IXI_234 248 # 2: IXI_235 249 # Matrix: EBLOSUM62 250 # Gap_penalty: 10.0 251 # Extend_penalty: 0.5 252 # 253 # Length: 131 254 # Identity: 112/131 (85.5%) 255 # Similarity: 112/131 (85.5%) 256 # Gaps: 19/131 (14.5%) 257 # Score: 591.5 258 # 259 # 260 #======================================= 261 262 IXI_234 1 TSPASIRPPAGPSSRPAMVSSRRTRPSPPGPRRPTGRPCCSAAPRRPQAT 50 263 ||||||||||||||| |||||||||||||||||||||||||| 264 IXI_235 1 TSPASIRPPAGPSSR---------RPSPPGPRRPTGRPCCSAAPRRPQAT 41 265 266 IXI_234 51 GGWKTCSGTCTTSTSTRHRGRSGWSARTTTAACLRASRKSMRAACSRSAG 100 267 |||||||||||||||||||||||| |||||||||||||||| 268 IXI_235 42 GGWKTCSGTCTTSTSTRHRGRSGW----------RASRKSMRAACSRSAG 81 269 270 IXI_234 101 SRPNRFAPTLMSSCITSTTGPPAWAGDRSHE 131 271 ||||||||||||||||||||||||||||||| 272 IXI_235 82 SRPNRFAPTLMSSCITSTTGPPAWAGDRSHE 112 273 274 275 #--------------------------------------- 276 #--------------------------------------- 277 278 279 """ 280 281 pair_example2 = \ 282 """######################################## 283 # Program: needle 284 # Rundate: Sun 27 Apr 2007 17:20:35 285 # Commandline: needle 286 # [-asequence] Spo0F.faa 287 # [-bsequence] paired_r.faa 288 # -sformat2 pearson 289 # Align_format: srspair 290 # Report_file: ref_rec .needle 291 ######################################## 292 293 #======================================= 294 # 295 # Aligned_sequences: 2 296 # 1: ref_rec 297 # 2: gi|94968718|receiver 298 # Matrix: EBLOSUM62 299 # Gap_penalty: 10.0 300 # Extend_penalty: 0.5 301 # 302 # Length: 124 303 # Identity: 32/124 (25.8%) 304 # Similarity: 64/124 (51.6%) 305 # Gaps: 17/124 (13.7%) 306 # Score: 112.0 307 # 308 # 309 #======================================= 310 311 ref_rec 1 KILIVDD----QYGIRILLNEVFNKEGYQTFQAANGLQALDIVTKERPDL 46 312 :|:.|| :.|.|::|.: :.|.....:|.:|.||:.:..:..|.: 313 gi|94968718|r 1 -VLLADDHALVRRGFRLMLED--DPEIEIVAEAGDGAQAVKLAGELHPRV 47 314 315 ref_rec 47 VLLDMKIPGMDGIEILKRMKVIDENIRVIIMTAYGELDMIQESKELGALT 96 316 |::|..:|||.|::..|:::....:|.|:::|.:.|...::.:.|.||.. 317 gi|94968718|r 48 VVMDCAMPGMSGMDATKQIRTQWPDIAVLMLTMHSEDTWVRLALEAGANG 97 318 319 ref_rec 97 HFAK-PFDIDEIRDAV-------- 111 320 :..| ..|:|.|: || 321 gi|94968718|r 98 YILKSAIDLDLIQ-AVRRVANGET 120 322 323 324 #======================================= 325 # 326 # Aligned_sequences: 2 327 # 1: ref_rec 328 # 2: gi|94968761|receiver 329 # Matrix: EBLOSUM62 330 # Gap_penalty: 10.0 331 # Extend_penalty: 0.5 332 # 333 # Length: 119 334 # Identity: 34/119 (28.6%) 335 # Similarity: 58/119 (48.7%) 336 # Gaps: 9/119 ( 7.6%) 337 # Score: 154.0 338 # 339 # 340 #======================================= 341 342 ref_rec 1 KILIVDDQYGIRILLNEVFNKEGYQTFQAANGLQALDIVTKERPDLVLLD 50 343 ||||||:......|:..|...|::.....|.::||:|...:..||:|.| 344 gi|94968761|r 1 -ILIVDDEANTLASLSRAFRLAGHEATVCDNAVRALEIAKSKPFDLILSD 49 345 346 ref_rec 51 MKIPGMDGIEILKRMKVIDENIRVIIMTAYGELDMIQESKELGALTHFAK 100 347 :.:||.||:.:|:.:|.......|::|:....::|..::..||||....| 348 gi|94968761|r 50 VVMPGRDGLTLLEDLKTAGVQAPVVMMSGQAHIEMAVKATRLGALDFLEK 99 349 350 ref_rec 101 PFDIDEIRDAV-------- 111 351 |...|::...| 352 gi|94968761|r 100 PLSTDKLLLTVENALKLKR 118 353 354 355 #======================================= 356 # 357 # Aligned_sequences: 2 358 # 1: ref_rec 359 # 2: gi|94967506|receiver 360 # Matrix: EBLOSUM62 361 # Gap_penalty: 10.0 362 # Extend_penalty: 0.5 363 # 364 # Length: 120 365 # Identity: 29/120 (24.2%) 366 # Similarity: 53/120 (44.2%) 367 # Gaps: 9/120 ( 7.5%) 368 # Score: 121.0 369 # 370 # 371 #======================================= 372 373 ref_rec 1 -KILIVDDQYGIRILLNEVFNKEGYQTFQAANGLQALDIVTKERPDLVLL 49 374 .|::|||..|..:.:..||.:.|:..........|.:.:.....||.:: 375 gi|94967506|r 1 LHIVVVDDDPGTCVYIESVFAELGHTCKSFVRPEAAEEYILTHPVDLAIV 50 376 377 ref_rec 50 DMKIPGMDGIEILKRMKVIDENIRVIIMTAYGELDMIQESKELGALTHFA 99 378 |:.:....|:|:|:|.:|....:..:|:|....|:|...|...||:.:.. 379 gi|94967506|r 51 DVYLGSTTGVEVLRRCRVHRPKLYAVIITGQISLEMAARSIAEGAVDYIQ 100 380 381 ref_rec 100 KPFDIDEIRDAV-------- 111 382 ||.|||.:.:.. 383 gi|94967506|r 101 KPIDIDALLNIAERALEHKE 120 384 385 386 #======================================= 387 # 388 # Aligned_sequences: 2 389 # 1: ref_rec 390 # 2: gi|94970045|receiver 391 # Matrix: EBLOSUM62 392 # Gap_penalty: 10.0 393 # Extend_penalty: 0.5 394 # 395 # Length: 118 396 # Identity: 30/118 (25.4%) 397 # Similarity: 64/118 (54.2%) 398 # Gaps: 9/118 ( 7.6%) 399 # Score: 126.0 400 # 401 # 402 #======================================= 403 404 ref_rec 1 KILIVDDQYGIRILLNEVFNKEGYQTFQAANGLQALDIVTK--ERPDLVL 48 405 :|:|:|:..:|....:.....||:...|.:|.:||.:.:| ||.|::: 406 gi|94970045|r 1 -VLLVEDEEALRAAAGDFLETRGYKIMTARDGTEALSMASKFAERIDVLI 49 407 408 ref_rec 49 LDMKIPGMDGIEILKRMKVIDENIRVIIMTAYGELDMIQESKELGALTHF 98 409 .|:.:||:.|..:.:.:..|....:|:.|:.|.: :.:..:.|:.:.:.| 410 gi|94970045|r 50 TDLVMPGISGRVLAQELVKIHPETKVMYMSGYDD-ETVMVNGEIDSSSAF 98 411 412 ref_rec 99 -AKPFDID----EIRDAV 111 413 .|||.:| :||:.: 414 gi|94970045|r 99 LRKPFRMDALSAKIREVL 116 415 416 417 #======================================= 418 # 419 # Aligned_sequences: 2 420 # 1: ref_rec 421 # 2: gi|94970041|receiver 422 # Matrix: EBLOSUM62 423 # Gap_penalty: 10.0 424 # Extend_penalty: 0.5 425 # 426 # Length: 125 427 # Identity: 35/125 (28.0%) 428 # Similarity: 70/125 (56.0%) 429 # Gaps: 18/125 (14.4%) 430 # Score: 156.5 431 # 432 # 433 #======================================= 434 435 ref_rec 1 KILIVDDQYGIRILLNEVFNKEGYQTFQAANGLQALDIV--TKERPDLVL 48 436 .:|:|:|:.|:|.|:..:.:::||...:|.:|.:||:|| :.::.|::| 437 gi|94970041|r 1 TVLLVEDEEGVRKLVRGILSRQGYHVLEATSGEEALEIVRESTQKIDMLL 50 438 439 ref_rec 49 LDMKIPGMDGIEILKRMKVIDENIRVIIMTAYGELDMIQESKELGALTHF 98 440 .|:.:.||.|.|:.:|:::...:::||.|:.|.:..:::. |.||.. 441 gi|94970041|r 51 SDVVLVGMSGRELSERLRIQMPSLKVIYMSGYTDDAIVRH----GVLTES 96 442 443 ref_rec 99 A----KPFDIDEIRDAV-------- 111 444 | |||..|.:...| 445 gi|94970041|r 97 AEFLQKPFTSDSLLRKVRAVLQKRQ 121 446 447 448 #--------------------------------------- 449 #--------------------------------------- 450 451 """ 452 453 454 from StringIO import StringIO 455 456 alignments = list(EmbossIterator(StringIO(pair_example))) 457 assert len(alignments) == 1 458 assert len(alignments[0].get_all_seqs()) == 2 459 assert [r.id for r in alignments[0].get_all_seqs()] \ 460 == ["IXI_234", "IXI_235"] 461 462 alignments = list(EmbossIterator(StringIO(simple_example))) 463 assert len(alignments) == 1 464 assert len(alignments[0].get_all_seqs()) == 4 465 assert [r.id for r in alignments[0].get_all_seqs()] \ 466 == ["IXI_234", "IXI_235", "IXI_236", "IXI_237"] 467 468 alignments = list(EmbossIterator(StringIO(pair_example + simple_example))) 469 assert len(alignments) == 2 470 assert len(alignments[0].get_all_seqs()) == 2 471 assert len(alignments[1].get_all_seqs()) == 4 472 assert [r.id for r in alignments[0].get_all_seqs()] \ 473 == ["IXI_234", "IXI_235"] 474 assert [r.id for r in alignments[1].get_all_seqs()] \ 475 == ["IXI_234", "IXI_235", "IXI_236", "IXI_237"] 476 477 478 #for a in EmbossIterator(StringIO(pair_example2)) : 479 # print "Next:" 480 # for r in a.get_all_seqs() : 481 # print r.seq.tostring()[:20] + "...", r.id 482 483 alignments = list(EmbossIterator(StringIO(pair_example2))) 484 assert len(alignments) == 5 485 assert len(alignments[0].get_all_seqs()) == 2 486 assert [r.id for r in alignments[0].get_all_seqs()] \ 487 == ["ref_rec", "gi|94968718|receiver"] 488 assert [r.id for r in alignments[4].get_all_seqs()] \ 489 == ["ref_rec", "gi|94970041|receiver"] 490 491 print "Done" 492