1
2
3
4
5
6
7 """Bio.SeqIO support for the "clustal" (aka ClustalW) file format.
8
9 You were expected to use this module via the Bio.SeqIO functions.
10 This module has now been replaced by Bio.AlignIO.ClustalIO, and is
11 deprecated."""
12
13 import warnings
14 warnings.warn("Bio.SeqIO.ClustalIO is deprecated. You can continue to read" \
15 + " and write 'clustal' files with Bio.SeqIO, but this is now" \
16 + " handled via Bio.AlignIO internally.",
17 DeprecationWarning)
18
19
20 from Bio.Alphabet import single_letter_alphabet
21 from Bio.Seq import Seq
22 from Bio.SeqRecord import SeqRecord
23
24
25 from Bio.SeqIO.Interfaces import SequenceWriter
26 from Bio.Clustalw import ClustalAlignment
27
28
29
31 """Reads a Clustalw file returning a SeqRecord object iterator.
32
33 The entire file is loaded at once, but the SeqRecord objects
34 are only created "on request".
35
36 For more information on the file format, please see:
37 http://www.bioperl.org/wiki/ClustalW_multiple_alignment_format
38
39 You might like to look at Bio.Clustalw which has an interface
40 to the command line tool clustalw, and can also clustal alignment
41 files into Bio.Clustalw.ClustalAlignment objects.
42
43 We call this the "clustal" format which is consistent with EMBOSS.
44 Sadly BioPerl calls it the "clustalw" format, so we can't match
45 them both.
46 """
47 line = handle.readline()
48 if not line: return
49 if not line[:7] == 'CLUSTAL':
50 raise ValueError("Did not find CLUSTAL header")
51
52
53 line = handle.readline()
54 while line.strip() == "" :
55 line = handle.readline()
56
57
58
59
60 ids = []
61 seqs = []
62
63
64 while line.strip() <> "" :
65 if line[0] <> " " :
66
67 fields = line.rstrip().split()
68
69
70
71 if len(fields) < 2 or len(fields) > 3:
72 raise ValueError("Could not parse line:\n%s" % line)
73
74 ids.append(fields[0])
75 seqs.append(fields[1])
76
77 if len(fields) == 3 :
78
79 try :
80 letters = int(fields[2])
81 except ValueError :
82 raise ValueError("Could not parse line, bad sequence number:\n%s" % line)
83 if len(fields[1].replace("-","")) <> letters :
84 raise ValueError("Could not parse line, invalid sequence number:\n%s" % line)
85 else :
86
87 pass
88 line = handle.readline()
89 if not line : break
90
91 assert line.strip() == ""
92
93
94 while True :
95
96
97
98 while (not line) or line.strip() == "" or line[0]==" ":
99 line = handle.readline()
100 if not line : break
101 if not line : break
102
103 for i in range(len(ids)) :
104 fields = line.rstrip().split()
105
106
107
108 if len(fields) < 2 or len(fields) > 3:
109 raise ValueError("Could not parse line:\n%s" % line)
110
111 if fields[0] <> ids[i] :
112 raise ValueError("Identifiers out of order? Got '%s' but expected '%s'" \
113 % (fields[0], ids[i]))
114
115
116 seqs[i] += fields[1]
117
118 if len(fields) == 3 :
119
120 try :
121 letters = int(fields[2])
122 except ValueError :
123 raise ValueError("Could not parse line, bad sequence number:\n%s" % line)
124 if len(seqs[i].replace("-","")) <> letters :
125 raise ValueError("Could not parse line, invalid sequence number:\n%s" % line)
126
127
128 line = handle.readline()
129
130 assert len(ids) == len(seqs)
131 alignment_length = len(seqs[0])
132 for i in range(len(ids)) :
133 if len(seqs[i]) <> alignment_length:
134 raise ValueError("Error parsing alignment - sequences of different length?")
135 yield SeqRecord(Seq(seqs[i], alphabet), id=ids[i])
136
138 """Write Clustal sequence alignments."""
140 """Creates the writer object.
141
142 Use the method write_file() to actually record your sequence records."""
143 self.handle = handle
144
146 """Use this to write an entire file containing the given records.
147
148 records - a SeqRecord iterator, or list of SeqRecords
149
150 Right now this code uses Bio.Clustalw.ClustalAlignment to do
151 the hard work - this may change in the future.
152 """
153
154
155
156
157
158
159
160
161
162 length_of_sequences = None
163 alignment = ClustalAlignment()
164 for record in records :
165 if length_of_sequences is None :
166 length_of_sequences = len(record.seq)
167 elif length_of_sequences <> len(record.seq) :
168 raise ValueError("Sequences must all be the same length")
169
170 if length_of_sequences <= 0 :
171 raise ValueError("Non-empty sequences are required")
172
173
174
175
176
177
178
179
180
181
182
183
184
185 alignment.add_sequence(record.id.replace(" ","_"),
186 record.seq.tostring())
187
188 if len(alignment.get_all_seqs()) == 0 :
189 raise ValueError("Must have at least one sequence")
190
191 self.handle.write(str(alignment))
192
193
194
195
196
197
198 if __name__ == "__main__" :
199
200
201
202
203 aln_example1 = \
204 """CLUSTAL W (1.81) multiple sequence alignment
205
206
207 gi|4959044|gb|AAD34209.1|AF069 MENSDSNDKGSDQSAAQRRSQMDRLDREEAFYQFVNNLSEEDYRLMRDNN 50
208 gi|671626|emb|CAA85685.1| ---------MSPQTETKASVGFKAGVKEYKLTYYTPEYETKDTDILAAFR 41
209 * *: :: :. :* : :. : . :* :: .
210
211 gi|4959044|gb|AAD34209.1|AF069 LLGTPGESTEEELLRRLQQIKEGPPPQSPDENRAGESSDDVTNSDSIIDW 100
212 gi|671626|emb|CAA85685.1| VTPQPG-----------------VPPEEAGAAVAAESSTGT--------- 65
213 : ** **:... *.*** ..
214
215 gi|4959044|gb|AAD34209.1|AF069 LNSVRQTGNTTRSRQRGNQSWRAVSRTNPNSGDFRFSLEINVNRNNGSQT 150
216 gi|671626|emb|CAA85685.1| WTTVWTDGLTSLDRYKG-----RCYHIEPVPG------------------ 92
217 .:* * *: .* :* : :* .*
218
219 gi|4959044|gb|AAD34209.1|AF069 SENESEPSTRRLSVENMESSSQRQMENSASESASARPSRAERNSTEAVTE 200
220 gi|671626|emb|CAA85685.1| -EKDQCICYVAYPLDLFEEGSVTNMFTSIVGNVFGFKALRALRLEDLRIP 141
221 *::. . .:: :*..* :* .* .. . : . :
222
223 gi|4959044|gb|AAD34209.1|AF069 VPTTRAQRRA 210
224 gi|671626|emb|CAA85685.1| VAYVKTFQGP 151
225 *. .:: : .
226
227 """
228
229
230
231
232 aln_example2 = \
233 """CLUSTAL X (1.83) multiple sequence alignment
234
235
236 V_Harveyi_PATH --MKNWIKVAVAAIA--LSAA------------------TVQAATEVKVG
237 B_subtilis_YXEM MKMKKWTVLVVAALLAVLSACG------------NGNSSSKEDDNVLHVG
238 B_subtilis_GlnH_homo_YCKK MKKALLALFMVVSIAALAACGAGNDNQSKDNAKDGDLWASIKKKGVLTVG
239 YA80_HAEIN MKKLLFTTALLTGAIAFSTF-----------SHAGEIADRVEKTKTLLVG
240 FLIY_ECOLI MKLAHLGRQALMGVMAVALVAG---MSVKSFADEG-LLNKVKERGTLLVG
241 E_coli_GlnH --MKSVLKVSLAALTLAFAVS------------------SHAADKKLVVA
242 Deinococcus_radiodurans -MKKSLLSLKLSGLLVPSVLALS--------LSACSSPSSTLNQGTLKIA
243 HISJ_E_COLI MKKLVLSLSLVLAFSSATAAF-------------------AAIPQNIRIG
244 HISJ_E_COLI MKKLVLSLSLVLAFSSATAAF-------------------AAIPQNIRIG
245 : . : :.
246
247 V_Harveyi_PATH MSGRYFPFTFVKQ--DKLQGFEVDMWDEIGKRNDYKIEYVTANFSGLFGL
248 B_subtilis_YXEM ATGQSYPFAYKEN--GKLTGFDVEVMEAVAKKIDMKLDWKLLEFSGLMGE
249 B_subtilis_GlnH_homo_YCKK TEGTYEPFTYHDKDTDKLTGYDVEVITEVAKRLGLKVDFKETQWGSMFAG
250 YA80_HAEIN TEGTYAPFTFHDK-SGKLTGFDVEVIRKVAEKLGLKVEFKETQWDAMYAG
251 FLIY_ECOLI LEGTYPPFSFQGD-DGKLTGFEVEFAQQLAKHLGVEASLKPTKWDGMLAS
252 E_coli_GlnH TDTAFVPFEFKQG--DKYVGFDVDLWAAIAKELKLDYELKPMDFSGIIPA
253 Deinococcus_radiodurans MEGTYPPFTSKNE-QGELVGFDVDIAKAVAQKLNLKPEFVLTEWSGILAG
254 HISJ_E_COLI TDPTYAPFESKNS-QGELVGFDIDLAKELCKRINTQCTFVENPLDALIPS
255 HISJ_E_COLI TDPTYAPFESKNS-QGELVGFDIDLAKELCKRINTQCTFVENPLDALIPS
256 ** .: *::::. : :. . ..:
257
258 V_Harveyi_PATH LETGRIDTISNQITMTDARKAKYLFADPYVVDG-AQI
259 B_subtilis_YXEM LQTGKLDTISNQVAVTDERKETYNFTKPYAYAG-TQI
260 B_subtilis_GlnH_homo_YCKK LNSKRFDVVANQVG-KTDREDKYDFSDKYTTSR-AVV
261 YA80_HAEIN LNAKRFDVIANQTNPSPERLKKYSFTTPYNYSG-GVI
262 FLIY_ECOLI LDSKRIDVVINQVTISDERKKKYDFSTPYTISGIQAL
263 E_coli_GlnH LQTKNVDLALAGITITDERKKAIDFSDGYYKSG-LLV
264 Deinococcus_radiodurans LQANKYDVIVNQVGITPERQNSIGFSQPYAYSRPEII
265 HISJ_E_COLI LKAKKIDAIMSSLSITEKRQQEIAFTDKLYAADSRLV
266 HISJ_E_COLI LKAKKIDAIMSSLSITEKRQQEIAFTDKLYAADSRLV
267 *.: . * . * *: :
268
269 """
270
271 from StringIO import StringIO
272
273 records = list(ClustalIterator(StringIO(aln_example1)))
274 assert 2 == len(records)
275 assert records[0].id == "gi|4959044|gb|AAD34209.1|AF069"
276 assert records[1].id == "gi|671626|emb|CAA85685.1|"
277 assert records[0].seq.tostring() == \
278 "MENSDSNDKGSDQSAAQRRSQMDRLDREEAFYQFVNNLSEEDYRLMRDNN" + \
279 "LLGTPGESTEEELLRRLQQIKEGPPPQSPDENRAGESSDDVTNSDSIIDW" + \
280 "LNSVRQTGNTTRSRQRGNQSWRAVSRTNPNSGDFRFSLEINVNRNNGSQT" + \
281 "SENESEPSTRRLSVENMESSSQRQMENSASESASARPSRAERNSTEAVTE" + \
282 "VPTTRAQRRA"
283
284 records = list(ClustalIterator(StringIO(aln_example2)))
285 assert 9 == len(records)
286 assert records[-1].id == "HISJ_E_COLI"
287 assert records[-1].seq.tostring() == \
288 "MKKLVLSLSLVLAFSSATAAF-------------------AAIPQNIRIG" + \
289 "TDPTYAPFESKNS-QGELVGFDIDLAKELCKRINTQCTFVENPLDALIPS" + \
290 "LKAKKIDAIMSSLSITEKRQQEIAFTDKLYAADSRLV"
291