1
2
3
4
5
6
7 from Bio.Align.Generic import Alignment
8 from Interfaces import AlignmentIterator, SequentialAlignmentWriter
9
11 """Emboss alignment writer
12
13 Writes a simplfied version of the EMBOSS pairs/simple file format.
14 A lot of the information their tools record in their headers is not
15 available and is ommitted.
16 """
17
27
32
51
53 """Emboss alignment iterator
54
55 For reading the (pairwise) alignments from EMBOSS tools in what they
56 call the "pairs" and "simple" formats.
57 """
58
60
61 handle = self.handle
62
63 try :
64
65
66 line = self._header
67 del self._header
68 except AttributeError:
69 line = handle.readline()
70 if not line:
71 return None
72
73 while line.rstrip() <> "#=======================================" :
74 line = handle.readline()
75 if not line :
76 return None
77
78 length_of_seqs = None
79 number_of_seqs = None
80 ids = []
81 seqs = []
82
83
84 while line[0] == "#" :
85
86
87
88 parts = line[1:].split(":",1)
89 key = parts[0].lower().strip()
90 if key == "aligned_sequences" :
91 number_of_seqs = int(parts[1].strip())
92 assert len(ids) == 0
93
94 for i in range(number_of_seqs) :
95 line = handle.readline()
96 parts = line[1:].strip().split(":",1)
97 assert i+1 == int(parts[0].strip())
98 ids.append(parts[1].strip())
99 assert len(ids) == number_of_seqs
100 if key == "length" :
101 length_of_seqs = int(parts[1].strip())
102
103
104 line = handle.readline()
105
106 if number_of_seqs is None :
107 raise SyntaxError("Number of sequences missing!")
108 if length_of_seqs is None :
109 raise SyntaxError("Length of sequences missing!")
110
111 if self.records_per_alignment is not None \
112 and self.records_per_alignment <> number_of_seqs :
113 raise ValueError("Found %i records in this alignment, told to expect %i" \
114 % (number_of_seqs, self.records_per_alignment))
115
116 seqs = ["" for id in ids]
117 index = 0
118
119
120 while line :
121 if len(line) > 21 :
122 id_start = line[:21].strip().split(None, 1)
123 seq_end = line[21:].strip().split(None, 1)
124 if len(id_start) == 2 and len(seq_end) == 2:
125
126
127 id, start = id_start
128 seq, end = seq_end
129
130
131 assert 0 <= index and index < number_of_seqs, \
132 "Expected index %i in range [0,%i)" \
133 % (index, number_of_seqs)
134 assert id==ids[index] or id == ids[index][:len(id)]
135
136
137 assert int(start) - 1 == len(seqs[index].replace("-","")), \
138 "Found %i chars so far for %s, file says start %i:\n%s" \
139 % (len(seqs[index]), id, int(start), seqs[index])
140
141 seqs[index] += seq
142
143
144 assert int(end) == len(seqs[index].replace("-","")), \
145 "Found %i chars so far for %s, file says end %i:\n%s" \
146 % (len(seqs[index]), id, int(end), seqs[index])
147
148 index += 1
149 if index >= number_of_seqs :
150 index = 0
151 else :
152
153
154 pass
155 elif line.strip() == "" :
156
157 pass
158 else :
159 print line
160 assert False
161
162 line = handle.readline()
163 if line.rstrip() == "#---------------------------------------" \
164 or line.rstrip() == "#=======================================" :
165
166 self._header = line
167 break
168
169 assert index == 0
170
171 if self.records_per_alignment is not None \
172 and self.records_per_alignment <> len(ids) :
173 raise ValueError("Found %i records in this alignment, told to expect %i" \
174 % (len(ids), self.records_per_alignment))
175
176 alignment = Alignment(self.alphabet)
177 for id, seq in zip(ids, seqs) :
178 if len(seq) <> length_of_seqs :
179 raise SyntaxError("Error parsing alignment - sequences of different length?")
180 alignment.add_sequence(id, seq)
181 return alignment
182
183 if __name__ == "__main__" :
184 print "Running a quick self-test"
185
186
187 simple_example = \
188 """########################################
189 # Program: alignret
190 # Rundate: Wed Jan 16 17:16:13 2002
191 # Report_file: stdout
192 ########################################
193 #=======================================
194 #
195 # Aligned_sequences: 4
196 # 1: IXI_234
197 # 2: IXI_235
198 # 3: IXI_236
199 # 4: IXI_237
200 # Matrix: EBLOSUM62
201 # Gap_penalty: 10.0
202 # Extend_penalty: 0.5
203 #
204 # Length: 131
205 # Identity: 95/131 (72.5%)
206 # Similarity: 127/131 (96.9%)
207 # Gaps: 25/131 (19.1%)
208 # Score: 100.0
209 #
210 #
211 #=======================================
212
213 IXI_234 1 TSPASIRPPAGPSSRPAMVSSRRTRPSPPGPRRPTGRPCCSAAPRRPQAT 50
214 IXI_235 1 TSPASIRPPAGPSSR---------RPSPPGPRRPTGRPCCSAAPRRPQAT 41
215 IXI_236 1 TSPASIRPPAGPSSRPAMVSSR--RPSPPPPRRPPGRPCCSAAPPRPQAT 48
216 IXI_237 1 TSPASLRPPAGPSSRPAMVSSRR-RPSPPGPRRPT----CSAAPRRPQAT 45
217 |||||:|||||||||::::::: |||||:||||:::::|||||:|||||
218
219 IXI_234 51 GGWKTCSGTCTTSTSTRHRGRSGWSARTTTAACLRASRKSMRAACSRSAG 100
220 IXI_235 42 GGWKTCSGTCTTSTSTRHRGRSGW----------RASRKSMRAACSRSAG 81
221 IXI_236 49 GGWKTCSGTCTTSTSTRHRGRSGWSARTTTAACLRASRKSMRAACSR--G 96
222 IXI_237 46 GGYKTCSGTCTTSTSTRHRGRSGYSARTTTAACLRASRKSMRAACSR--G 93
223 ||:||||||||||||||||||||:::::::::::||||||||||||| |
224
225 IXI_234 101 SRPNRFAPTLMSSCITSTTGPPAWAGDRSHE 131
226 IXI_235 82 SRPNRFAPTLMSSCITSTTGPPAWAGDRSHE 112
227 IXI_236 97 SRPPRFAPPLMSSCITSTTGPPPPAGDRSHE 127
228 IXI_237 94 SRPNRFAPTLMSSCLTSTTGPPAYAGDRSHE 124
229 |||:||||:|||||:|||||||::|||||||
230
231
232 #---------------------------------------
233 #---------------------------------------
234
235 """
236
237
238 pair_example = \
239 """########################################
240 # Program: water
241 # Rundate: Wed Jan 16 17:23:19 2002
242 # Report_file: stdout
243 ########################################
244 #=======================================
245 #
246 # Aligned_sequences: 2
247 # 1: IXI_234
248 # 2: IXI_235
249 # Matrix: EBLOSUM62
250 # Gap_penalty: 10.0
251 # Extend_penalty: 0.5
252 #
253 # Length: 131
254 # Identity: 112/131 (85.5%)
255 # Similarity: 112/131 (85.5%)
256 # Gaps: 19/131 (14.5%)
257 # Score: 591.5
258 #
259 #
260 #=======================================
261
262 IXI_234 1 TSPASIRPPAGPSSRPAMVSSRRTRPSPPGPRRPTGRPCCSAAPRRPQAT 50
263 ||||||||||||||| ||||||||||||||||||||||||||
264 IXI_235 1 TSPASIRPPAGPSSR---------RPSPPGPRRPTGRPCCSAAPRRPQAT 41
265
266 IXI_234 51 GGWKTCSGTCTTSTSTRHRGRSGWSARTTTAACLRASRKSMRAACSRSAG 100
267 |||||||||||||||||||||||| ||||||||||||||||
268 IXI_235 42 GGWKTCSGTCTTSTSTRHRGRSGW----------RASRKSMRAACSRSAG 81
269
270 IXI_234 101 SRPNRFAPTLMSSCITSTTGPPAWAGDRSHE 131
271 |||||||||||||||||||||||||||||||
272 IXI_235 82 SRPNRFAPTLMSSCITSTTGPPAWAGDRSHE 112
273
274
275 #---------------------------------------
276 #---------------------------------------
277
278
279 """
280
281 pair_example2 = \
282 """########################################
283 # Program: needle
284 # Rundate: Sun 27 Apr 2007 17:20:35
285 # Commandline: needle
286 # [-asequence] Spo0F.faa
287 # [-bsequence] paired_r.faa
288 # -sformat2 pearson
289 # Align_format: srspair
290 # Report_file: ref_rec .needle
291 ########################################
292
293 #=======================================
294 #
295 # Aligned_sequences: 2
296 # 1: ref_rec
297 # 2: gi|94968718|receiver
298 # Matrix: EBLOSUM62
299 # Gap_penalty: 10.0
300 # Extend_penalty: 0.5
301 #
302 # Length: 124
303 # Identity: 32/124 (25.8%)
304 # Similarity: 64/124 (51.6%)
305 # Gaps: 17/124 (13.7%)
306 # Score: 112.0
307 #
308 #
309 #=======================================
310
311 ref_rec 1 KILIVDD----QYGIRILLNEVFNKEGYQTFQAANGLQALDIVTKERPDL 46
312 :|:.|| :.|.|::|.: :.|.....:|.:|.||:.:..:..|.:
313 gi|94968718|r 1 -VLLADDHALVRRGFRLMLED--DPEIEIVAEAGDGAQAVKLAGELHPRV 47
314
315 ref_rec 47 VLLDMKIPGMDGIEILKRMKVIDENIRVIIMTAYGELDMIQESKELGALT 96
316 |::|..:|||.|::..|:::....:|.|:::|.:.|...::.:.|.||..
317 gi|94968718|r 48 VVMDCAMPGMSGMDATKQIRTQWPDIAVLMLTMHSEDTWVRLALEAGANG 97
318
319 ref_rec 97 HFAK-PFDIDEIRDAV-------- 111
320 :..| ..|:|.|: ||
321 gi|94968718|r 98 YILKSAIDLDLIQ-AVRRVANGET 120
322
323
324 #=======================================
325 #
326 # Aligned_sequences: 2
327 # 1: ref_rec
328 # 2: gi|94968761|receiver
329 # Matrix: EBLOSUM62
330 # Gap_penalty: 10.0
331 # Extend_penalty: 0.5
332 #
333 # Length: 119
334 # Identity: 34/119 (28.6%)
335 # Similarity: 58/119 (48.7%)
336 # Gaps: 9/119 ( 7.6%)
337 # Score: 154.0
338 #
339 #
340 #=======================================
341
342 ref_rec 1 KILIVDDQYGIRILLNEVFNKEGYQTFQAANGLQALDIVTKERPDLVLLD 50
343 ||||||:......|:..|...|::.....|.::||:|...:..||:|.|
344 gi|94968761|r 1 -ILIVDDEANTLASLSRAFRLAGHEATVCDNAVRALEIAKSKPFDLILSD 49
345
346 ref_rec 51 MKIPGMDGIEILKRMKVIDENIRVIIMTAYGELDMIQESKELGALTHFAK 100
347 :.:||.||:.:|:.:|.......|::|:....::|..::..||||....|
348 gi|94968761|r 50 VVMPGRDGLTLLEDLKTAGVQAPVVMMSGQAHIEMAVKATRLGALDFLEK 99
349
350 ref_rec 101 PFDIDEIRDAV-------- 111
351 |...|::...|
352 gi|94968761|r 100 PLSTDKLLLTVENALKLKR 118
353
354
355 #=======================================
356 #
357 # Aligned_sequences: 2
358 # 1: ref_rec
359 # 2: gi|94967506|receiver
360 # Matrix: EBLOSUM62
361 # Gap_penalty: 10.0
362 # Extend_penalty: 0.5
363 #
364 # Length: 120
365 # Identity: 29/120 (24.2%)
366 # Similarity: 53/120 (44.2%)
367 # Gaps: 9/120 ( 7.5%)
368 # Score: 121.0
369 #
370 #
371 #=======================================
372
373 ref_rec 1 -KILIVDDQYGIRILLNEVFNKEGYQTFQAANGLQALDIVTKERPDLVLL 49
374 .|::|||..|..:.:..||.:.|:..........|.:.:.....||.::
375 gi|94967506|r 1 LHIVVVDDDPGTCVYIESVFAELGHTCKSFVRPEAAEEYILTHPVDLAIV 50
376
377 ref_rec 50 DMKIPGMDGIEILKRMKVIDENIRVIIMTAYGELDMIQESKELGALTHFA 99
378 |:.:....|:|:|:|.:|....:..:|:|....|:|...|...||:.:..
379 gi|94967506|r 51 DVYLGSTTGVEVLRRCRVHRPKLYAVIITGQISLEMAARSIAEGAVDYIQ 100
380
381 ref_rec 100 KPFDIDEIRDAV-------- 111
382 ||.|||.:.:..
383 gi|94967506|r 101 KPIDIDALLNIAERALEHKE 120
384
385
386 #=======================================
387 #
388 # Aligned_sequences: 2
389 # 1: ref_rec
390 # 2: gi|94970045|receiver
391 # Matrix: EBLOSUM62
392 # Gap_penalty: 10.0
393 # Extend_penalty: 0.5
394 #
395 # Length: 118
396 # Identity: 30/118 (25.4%)
397 # Similarity: 64/118 (54.2%)
398 # Gaps: 9/118 ( 7.6%)
399 # Score: 126.0
400 #
401 #
402 #=======================================
403
404 ref_rec 1 KILIVDDQYGIRILLNEVFNKEGYQTFQAANGLQALDIVTK--ERPDLVL 48
405 :|:|:|:..:|....:.....||:...|.:|.:||.:.:| ||.|:::
406 gi|94970045|r 1 -VLLVEDEEALRAAAGDFLETRGYKIMTARDGTEALSMASKFAERIDVLI 49
407
408 ref_rec 49 LDMKIPGMDGIEILKRMKVIDENIRVIIMTAYGELDMIQESKELGALTHF 98
409 .|:.:||:.|..:.:.:..|....:|:.|:.|.: :.:..:.|:.:.:.|
410 gi|94970045|r 50 TDLVMPGISGRVLAQELVKIHPETKVMYMSGYDD-ETVMVNGEIDSSSAF 98
411
412 ref_rec 99 -AKPFDID----EIRDAV 111
413 .|||.:| :||:.:
414 gi|94970045|r 99 LRKPFRMDALSAKIREVL 116
415
416
417 #=======================================
418 #
419 # Aligned_sequences: 2
420 # 1: ref_rec
421 # 2: gi|94970041|receiver
422 # Matrix: EBLOSUM62
423 # Gap_penalty: 10.0
424 # Extend_penalty: 0.5
425 #
426 # Length: 125
427 # Identity: 35/125 (28.0%)
428 # Similarity: 70/125 (56.0%)
429 # Gaps: 18/125 (14.4%)
430 # Score: 156.5
431 #
432 #
433 #=======================================
434
435 ref_rec 1 KILIVDDQYGIRILLNEVFNKEGYQTFQAANGLQALDIV--TKERPDLVL 48
436 .:|:|:|:.|:|.|:..:.:::||...:|.:|.:||:|| :.::.|::|
437 gi|94970041|r 1 TVLLVEDEEGVRKLVRGILSRQGYHVLEATSGEEALEIVRESTQKIDMLL 50
438
439 ref_rec 49 LDMKIPGMDGIEILKRMKVIDENIRVIIMTAYGELDMIQESKELGALTHF 98
440 .|:.:.||.|.|:.:|:::...:::||.|:.|.:..:::. |.||..
441 gi|94970041|r 51 SDVVLVGMSGRELSERLRIQMPSLKVIYMSGYTDDAIVRH----GVLTES 96
442
443 ref_rec 99 A----KPFDIDEIRDAV-------- 111
444 | |||..|.:...|
445 gi|94970041|r 97 AEFLQKPFTSDSLLRKVRAVLQKRQ 121
446
447
448 #---------------------------------------
449 #---------------------------------------
450
451 """
452
453
454 from StringIO import StringIO
455
456 alignments = list(EmbossIterator(StringIO(pair_example)))
457 assert len(alignments) == 1
458 assert len(alignments[0].get_all_seqs()) == 2
459 assert [r.id for r in alignments[0].get_all_seqs()] \
460 == ["IXI_234", "IXI_235"]
461
462 alignments = list(EmbossIterator(StringIO(simple_example)))
463 assert len(alignments) == 1
464 assert len(alignments[0].get_all_seqs()) == 4
465 assert [r.id for r in alignments[0].get_all_seqs()] \
466 == ["IXI_234", "IXI_235", "IXI_236", "IXI_237"]
467
468 alignments = list(EmbossIterator(StringIO(pair_example + simple_example)))
469 assert len(alignments) == 2
470 assert len(alignments[0].get_all_seqs()) == 2
471 assert len(alignments[1].get_all_seqs()) == 4
472 assert [r.id for r in alignments[0].get_all_seqs()] \
473 == ["IXI_234", "IXI_235"]
474 assert [r.id for r in alignments[1].get_all_seqs()] \
475 == ["IXI_234", "IXI_235", "IXI_236", "IXI_237"]
476
477
478
479
480
481
482
483 alignments = list(EmbossIterator(StringIO(pair_example2)))
484 assert len(alignments) == 5
485 assert len(alignments[0].get_all_seqs()) == 2
486 assert [r.id for r in alignments[0].get_all_seqs()] \
487 == ["ref_rec", "gi|94968718|receiver"]
488 assert [r.id for r in alignments[4].get_all_seqs()] \
489 == ["ref_rec", "gi|94970041|receiver"]
490
491 print "Done"
492