1
2
3
4
5
6
7 from Bio.Align.Generic import Alignment
8 from Interfaces import AlignmentIterator, SequentialAlignmentWriter
9
11 """Clustalw alignment writer."""
13 """Use this to write (another) single alignment to an open file."""
14
15 if len(alignment.get_all_seqs()) == 0 :
16 raise ValueError("Must have at least one sequence")
17
18
19 try :
20 version = self._version
21 except AttributeError :
22 version = '1.81'
23 output = "CLUSTAL X (%s) multiple sequence alignment\n\n\n" % version
24
25 cur_char = 0
26 max_length = len(alignment._records[0].seq)
27
28 if max_length <= 0 :
29 raise ValueError("Non-empty sequences are required")
30
31
32 while cur_char != max_length:
33
34
35 if (cur_char + 50) > max_length:
36 show_num = max_length - cur_char
37 else:
38 show_num = 50
39
40
41
42
43 for record in alignment._records:
44
45
46
47 line = record.id[0:30].replace(" ","_").ljust(36)
48 line += record.seq.data[cur_char:(cur_char + show_num)]
49 output += line + "\n"
50
51
52
53 if hasattr(alignment, "_star_info") and alignment._star_info != '':
54 output += (" " * 36) + \
55 alignment._star_info[cur_char:(cur_char + show_num)] + "\n"
56
57 output += "\n"
58 cur_char += show_num
59
60
61 self.handle.write(output + "\n")
62
64 """Clustalw alignment iterator."""
65
67
68 handle = self.handle
69
70 try :
71
72
73 line = self._header
74 del self._header
75 except AttributeError:
76 line = handle.readline()
77 if not line:
78 return None
79 if line[:7] <> 'CLUSTAL':
80 raise ValueError("Did not find CLUSTAL header")
81
82
83
84 version = None
85 for word in line.split() :
86 if word[0]=='(' and word[-1]==')':
87 word = word[1:-1]
88 if word[0] in '0123456789':
89 version = word
90
91
92
93 line = handle.readline()
94 while line.strip() == "" :
95 line = handle.readline()
96
97
98
99
100 ids = []
101 seqs = []
102 consensus = ""
103 seq_cols = None
104
105
106 while line.strip() <> "" :
107 if line[0] <> " " :
108
109 fields = line.rstrip().split()
110
111
112
113 if len(fields) < 2 or len(fields) > 3:
114 raise ValueError("Could not parse line:\n%s" % line)
115
116 ids.append(fields[0])
117 seqs.append(fields[1])
118
119
120 if seq_cols is None :
121 start = len(fields[0]) + line[len(fields[0]):].find(fields[1])
122 end = start + len(fields[1])
123 seq_cols = slice(start, end)
124 del start, end
125 assert fields[1] == line[seq_cols]
126
127 if len(fields) == 3 :
128
129 try :
130 letters = int(fields[2])
131 except ValueError :
132 raise ValueError("Could not parse line, bad sequence number:\n%s" % line)
133 if len(fields[1].replace("-","")) <> letters :
134 raise ValueError("Could not parse line, invalid sequence number:\n%s" % line)
135 else :
136
137 assert seq_cols is not None
138 consensus = line[seq_cols]
139 assert not line[seq_cols.stop:].strip()
140 line = handle.readline()
141 if not line : break
142
143 assert line.strip() == ""
144 assert seq_cols is not None
145
146
147 done = False
148 while not done :
149
150
151
152 while (not line) or line.strip() == "" :
153 line = handle.readline()
154 if not line : break
155 if not line : break
156
157 if line[:7] == 'CLUSTAL':
158
159 done = True
160 self._header = line
161 break
162
163 for i in range(len(ids)) :
164 fields = line.rstrip().split()
165
166
167
168 if len(fields) < 2 or len(fields) > 3:
169 raise ValueError("Could not parse line:\n%s" % line)
170
171 if fields[0] <> ids[i] :
172 raise ValueError("Identifiers out of order? Got '%s' but expected '%s'" \
173 % (fields[0], ids[i]))
174
175 if fields[1] <> line[seq_cols] :
176 start = len(fields[0]) + line[len(fields[0]):].find(fields[1])
177 assert start == seq_cols.start, 'Old location %s -> %i:XX' % (seq_cols, start)
178 end = start + len(fields[1])
179 seq_cols = slice(start, end)
180 del start, end
181
182
183 seqs[i] += fields[1]
184
185 if len(fields) == 3 :
186
187 try :
188 letters = int(fields[2])
189 except ValueError :
190 raise ValueError("Could not parse line, bad sequence number:\n%s" % line)
191 if len(seqs[i].replace("-","")) <> letters :
192 raise ValueError("Could not parse line, invalid sequence number:\n%s" % line)
193
194
195 line = handle.readline()
196
197 if consensus :
198 assert seq_cols is not None
199 consensus += line[seq_cols]
200 assert not line[seq_cols.stop:].strip()
201
202 line = handle.readline()
203
204
205 assert len(ids) == len(seqs)
206 if len(seqs) == 0 or len(seqs[0]) == 0 :
207 return None
208
209 if self.records_per_alignment is not None \
210 and self.records_per_alignment <> len(ids) :
211 raise ValueError("Found %i records in this alignment, told to expect %i" \
212 % (len(ids), self.records_per_alignment))
213
214 alignment = Alignment(self.alphabet)
215 alignment_length = len(seqs[0])
216 for i in range(len(ids)) :
217 if len(seqs[i]) <> alignment_length:
218 raise ValueError("Error parsing alignment - sequences of different length?")
219 alignment.add_sequence(ids[i], seqs[i])
220
221
222 if version :
223 alignment._version = version
224 if consensus :
225 assert len(consensus) == alignment_length, \
226 "Alignment length is %i, consensus length is %i, '%s'" \
227 % (alignment_length, len(consensus), consensus)
228 alignment._star_info = consensus
229 return alignment
230
231 if __name__ == "__main__" :
232 print "Running a quick self-test"
233
234
235
236 aln_example1 = \
237 """CLUSTAL W (1.81) multiple sequence alignment
238
239
240 gi|4959044|gb|AAD34209.1|AF069 MENSDSNDKGSDQSAAQRRSQMDRLDREEAFYQFVNNLSEEDYRLMRDNN 50
241 gi|671626|emb|CAA85685.1| ---------MSPQTETKASVGFKAGVKEYKLTYYTPEYETKDTDILAAFR 41
242 * *: :: :. :* : :. : . :* :: .
243
244 gi|4959044|gb|AAD34209.1|AF069 LLGTPGESTEEELLRRLQQIKEGPPPQSPDENRAGESSDDVTNSDSIIDW 100
245 gi|671626|emb|CAA85685.1| VTPQPG-----------------VPPEEAGAAVAAESSTGT--------- 65
246 : ** **:... *.*** ..
247
248 gi|4959044|gb|AAD34209.1|AF069 LNSVRQTGNTTRSRQRGNQSWRAVSRTNPNSGDFRFSLEINVNRNNGSQT 150
249 gi|671626|emb|CAA85685.1| WTTVWTDGLTSLDRYKG-----RCYHIEPVPG------------------ 92
250 .:* * *: .* :* : :* .*
251
252 gi|4959044|gb|AAD34209.1|AF069 SENESEPSTRRLSVENMESSSQRQMENSASESASARPSRAERNSTEAVTE 200
253 gi|671626|emb|CAA85685.1| -EKDQCICYVAYPLDLFEEGSVTNMFTSIVGNVFGFKALRALRLEDLRIP 141
254 *::. . .:: :*..* :* .* .. . : . :
255
256 gi|4959044|gb|AAD34209.1|AF069 VPTTRAQRRA 210
257 gi|671626|emb|CAA85685.1| VAYVKTFQGP 151
258 *. .:: : .
259
260 """
261
262
263
264
265 aln_example2 = \
266 """CLUSTAL X (1.83) multiple sequence alignment
267
268
269 V_Harveyi_PATH --MKNWIKVAVAAIA--LSAA------------------TVQAATEVKVG
270 B_subtilis_YXEM MKMKKWTVLVVAALLAVLSACG------------NGNSSSKEDDNVLHVG
271 B_subtilis_GlnH_homo_YCKK MKKALLALFMVVSIAALAACGAGNDNQSKDNAKDGDLWASIKKKGVLTVG
272 YA80_HAEIN MKKLLFTTALLTGAIAFSTF-----------SHAGEIADRVEKTKTLLVG
273 FLIY_ECOLI MKLAHLGRQALMGVMAVALVAG---MSVKSFADEG-LLNKVKERGTLLVG
274 E_coli_GlnH --MKSVLKVSLAALTLAFAVS------------------SHAADKKLVVA
275 Deinococcus_radiodurans -MKKSLLSLKLSGLLVPSVLALS--------LSACSSPSSTLNQGTLKIA
276 HISJ_E_COLI MKKLVLSLSLVLAFSSATAAF-------------------AAIPQNIRIG
277 HISJ_E_COLI MKKLVLSLSLVLAFSSATAAF-------------------AAIPQNIRIG
278 : . : :.
279
280 V_Harveyi_PATH MSGRYFPFTFVKQ--DKLQGFEVDMWDEIGKRNDYKIEYVTANFSGLFGL
281 B_subtilis_YXEM ATGQSYPFAYKEN--GKLTGFDVEVMEAVAKKIDMKLDWKLLEFSGLMGE
282 B_subtilis_GlnH_homo_YCKK TEGTYEPFTYHDKDTDKLTGYDVEVITEVAKRLGLKVDFKETQWGSMFAG
283 YA80_HAEIN TEGTYAPFTFHDK-SGKLTGFDVEVIRKVAEKLGLKVEFKETQWDAMYAG
284 FLIY_ECOLI LEGTYPPFSFQGD-DGKLTGFEVEFAQQLAKHLGVEASLKPTKWDGMLAS
285 E_coli_GlnH TDTAFVPFEFKQG--DKYVGFDVDLWAAIAKELKLDYELKPMDFSGIIPA
286 Deinococcus_radiodurans MEGTYPPFTSKNE-QGELVGFDVDIAKAVAQKLNLKPEFVLTEWSGILAG
287 HISJ_E_COLI TDPTYAPFESKNS-QGELVGFDIDLAKELCKRINTQCTFVENPLDALIPS
288 HISJ_E_COLI TDPTYAPFESKNS-QGELVGFDIDLAKELCKRINTQCTFVENPLDALIPS
289 ** .: *::::. : :. . ..:
290
291 V_Harveyi_PATH LETGRIDTISNQITMTDARKAKYLFADPYVVDG-AQI
292 B_subtilis_YXEM LQTGKLDTISNQVAVTDERKETYNFTKPYAYAG-TQI
293 B_subtilis_GlnH_homo_YCKK LNSKRFDVVANQVG-KTDREDKYDFSDKYTTSR-AVV
294 YA80_HAEIN LNAKRFDVIANQTNPSPERLKKYSFTTPYNYSG-GVI
295 FLIY_ECOLI LDSKRIDVVINQVTISDERKKKYDFSTPYTISGIQAL
296 E_coli_GlnH LQTKNVDLALAGITITDERKKAIDFSDGYYKSG-LLV
297 Deinococcus_radiodurans LQANKYDVIVNQVGITPERQNSIGFSQPYAYSRPEII
298 HISJ_E_COLI LKAKKIDAIMSSLSITEKRQQEIAFTDKLYAADSRLV
299 HISJ_E_COLI LKAKKIDAIMSSLSITEKRQQEIAFTDKLYAADSRLV
300 *.: . * . * *: :
301
302 """
303
304 from StringIO import StringIO
305
306 alignments = list(ClustalIterator(StringIO(aln_example1)))
307 assert 1 == len(alignments)
308 assert alignments[0]._version == "1.81"
309 records = alignments[0].get_all_seqs()
310 assert 2 == len(records)
311 assert records[0].id == "gi|4959044|gb|AAD34209.1|AF069"
312 assert records[1].id == "gi|671626|emb|CAA85685.1|"
313 assert records[0].seq.tostring() == \
314 "MENSDSNDKGSDQSAAQRRSQMDRLDREEAFYQFVNNLSEEDYRLMRDNN" + \
315 "LLGTPGESTEEELLRRLQQIKEGPPPQSPDENRAGESSDDVTNSDSIIDW" + \
316 "LNSVRQTGNTTRSRQRGNQSWRAVSRTNPNSGDFRFSLEINVNRNNGSQT" + \
317 "SENESEPSTRRLSVENMESSSQRQMENSASESASARPSRAERNSTEAVTE" + \
318 "VPTTRAQRRA"
319
320 alignments = list(ClustalIterator(StringIO(aln_example2)))
321 assert 1 == len(alignments)
322 assert alignments[0]._version == "1.83"
323 records = alignments[0].get_all_seqs()
324 assert 9 == len(records)
325 assert records[-1].id == "HISJ_E_COLI"
326 assert records[-1].seq.tostring() == \
327 "MKKLVLSLSLVLAFSSATAAF-------------------AAIPQNIRIG" + \
328 "TDPTYAPFESKNS-QGELVGFDIDLAKELCKRINTQCTFVENPLDALIPS" + \
329 "LKAKKIDAIMSSLSITEKRQQEIAFTDKLYAADSRLV"
330
331 for alignment in ClustalIterator(StringIO(aln_example2 + aln_example1)) :
332 print "Alignment with %i records of length %i" \
333 % (len(alignment.get_all_seqs()),
334 alignment.get_alignment_length())
335
336 print "Checking empty file..."
337 assert 0 == len(list(ClustalIterator(StringIO(""))))
338
339 print "Checking write/read..."
340 alignments = list(ClustalIterator(StringIO(aln_example1))) \
341 + list(ClustalIterator(StringIO(aln_example2)))*2
342 handle = StringIO()
343 ClustalWriter(handle).write_file(alignments)
344 handle.seek(0)
345 for i,a in enumerate(ClustalIterator(handle)) :
346 assert a.get_alignment_length() == alignments[i].get_alignment_length()
347 handle.seek(0)
348
349 print "Testing write/read when there is only one sequence..."
350 alignment._records = alignment._records[0:1]
351 handle = StringIO()
352 ClustalWriter(handle).write_file([alignment])
353 handle.seek(0)
354 for i,a in enumerate(ClustalIterator(handle)) :
355 assert a.get_alignment_length() == alignment.get_alignment_length()
356 assert len(a.get_all_seqs()) == 1
357
358
359 print "The End"
360