1
2
3
4
5
6 from Bio.Align.Generic import Alignment
7 from Interfaces import AlignmentIterator, SequentialAlignmentWriter
8
10 """Stockholm/PFAM alignment writer."""
11
12
13
14 pfam_gr_mapping = {"secondary_structure" : "SS",
15 "surface_accessibility" : "SA",
16 "transmembrane" : "TM",
17 "posterior_probability" : "PP",
18 "ligand_binding" : "LI",
19 "active_site" : "AS",
20 "intron" : "IN"}
21
22 pfam_gs_mapping = {"organism" : "OS",
23 "organism_classification" : "OC",
24 "look" : "LO"}
25
27 """Use this to write (another) single alignment to an open file.
28
29 Note that sequences and their annotation are recorded
30 together (rather than having a block of annotation followed
31 by a block of aligned sequences).
32 """
33 records = alignment.get_all_seqs()
34 count = len(records)
35
36 self._length_of_sequences = alignment.get_alignment_length()
37 self._ids_written = []
38
39
40
41
42 if count == 0 :
43 raise ValueError("Must have at least one sequence")
44 if self._length_of_sequences == 0 :
45 raise ValueError("Non-empty sequences are required")
46
47 self.handle.write("# STOCKHOLM 1.0\n")
48 self.handle.write("#=GF SQ %i\n" % count)
49 for record in records :
50 self._write_record(record)
51 self.handle.write("//\n")
52
54 """Write a single SeqRecord to the file"""
55 if self._length_of_sequences <> len(record.seq) :
56 raise ValueError("Sequences must all be the same length")
57
58
59 seq_name = record.id
60 if record.name is not None :
61 if "accession" in record.annotations :
62 if record.id == record.annotations["accession"] :
63 seq_name = record.name
64
65
66 seq_name = seq_name.replace(" ","_")
67
68 if "start" in record.annotations \
69 and "end" in record.annotations :
70 suffix = "/%s-%s" % (str(record.annotations["start"]),
71 str(record.annotations["end"]))
72 if seq_name[-len(suffix):] <> suffix :
73 seq_name = "%s/%s-%s" % (seq_name,
74 str(record.annotations["start"]),
75 str(record.annotations["end"]))
76
77 if seq_name in self._ids_written :
78 raise ValueError("Duplicate record identifier: %s" % seq_name)
79 self._ids_written.append(seq_name)
80 self.handle.write("%s %s\n" % (seq_name, record.seq.tostring()))
81
82
83
84
85
86
87
88
89
90
91
92
93
94 if "accession" in record.annotations :
95 self.handle.write("#=GS %s AC %s\n" \
96 % (seq_name, self.clean(record.annotations["accession"])))
97 elif record.id :
98 self.handle.write("#=GS %s AC %s\n" \
99 % (seq_name, self.clean(record.id)))
100
101
102 if record.description :
103 self.handle.write("#=GS %s DE %s\n" \
104 % (seq_name, self.clean(record.description)))
105
106
107 for xref in record.dbxrefs :
108 self.handle.write("#=GS %s DR %s\n" \
109 % (seq_name, self.clean(xref)))
110
111
112 for key in record.annotations :
113 if key in self.pfam_gs_mapping :
114 self.handle.write("#=GS %s %s %s\n" \
115 % (seq_name,
116 self.clean(self.pfam_gs_mapping[key]),
117 self.clean(str(record.annotations[key]))))
118 elif key in self.pfam_gr_mapping :
119 if len(str(record.annotations[key]))==len(record.seq) :
120 self.handle.write("#=GR %s %s %s\n" \
121 % (seq_name,
122 self.clean(self.pfam_gr_mapping[key]),
123 self.clean((record.annotations[key]))))
124 else :
125
126 pass
127 else :
128
129
130 pass
131
133 """Loads a Stockholm file from PFAM into Alignment objects.
134
135 The file may contain multiple concatenated alignments, which are loaded
136 and returned incrementally.
137
138 This parser will detect if the Stockholm file follows the PFAM
139 conventions for sequence specific meta-data (lines starting #=GS
140 and #=GR) and populates the SeqRecord fields accordingly.
141
142 Any annotation which does not follow the PFAM conventions is currently
143 ignored.
144
145 If an accession is provided for an entry in the meta data, IT WILL NOT
146 be used as the record.id (it will be recorded in the record's
147 annotations). This is because some files have (sub) sequences from
148 different parts of the same accession (differentiated by different
149 start-end positions).
150
151 Wrap-around alignments are not supported - each sequences must be on
152 a single line. However, interlaced sequences should work.
153
154 For more information on the file format, please see:
155 http://www.bioperl.org/wiki/Stockholm_multiple_alignment_format
156 http://www.cgb.ki.se/cgb/groups/sonnhammer/Stockholm.html
157
158 For consistency with BioPerl and EMBOSS we call this the "stockholm"
159 format.
160 """
161
162
163
164 pfam_gr_mapping = {"SS" : "secondary_structure",
165 "SA" : "surface_accessibility",
166 "TM" : "transmembrane",
167 "PP" : "posterior_probability",
168 "LI" : "ligand_binding",
169 "AS" : "active_site",
170 "IN" : "intron"}
171
172 pfam_gs_mapping = {"OS" : "organism",
173 "OC" : "organism_classification",
174 "LO" : "look"}
175
177 try :
178 line = self._header
179 del self._header
180 except AttributeError :
181 line = self.handle.readline()
182 if not line:
183
184 return
185 if not line.strip() == '# STOCKHOLM 1.0':
186 raise ValueError("Did not find STOCKHOLM header")
187
188
189
190
191
192
193
194
195 seqs = {}
196 ids = []
197 gs = {}
198 gr = {}
199 gf = {}
200 passed_end_alignment = False
201 while 1:
202 line = self.handle.readline()
203 if not line: break
204 line = line.strip()
205 if line == '# STOCKHOLM 1.0':
206 self._header = line
207 break
208 elif line == "//" :
209
210
211 passed_end_alignment = True
212 elif line == "" :
213
214 pass
215 elif line[0] <> "#" :
216
217
218 assert not passed_end_alignment
219 parts = [x.strip() for x in line.split(" ",1)]
220 if len(parts) <> 2 :
221
222 raise ValueError("Could not split line into identifier " \
223 + "and sequence:\n" + line)
224 id, seq = parts
225 if id not in ids :
226 ids.append(id)
227 seqs.setdefault(id, '')
228 seqs[id] += seq.replace(".","-")
229 elif len(line) >= 5 :
230
231 if line[:5] == "#=GF " :
232
233
234 feature, text = line[5:].strip().split(None,1)
235
236
237 if feature not in gf :
238 gf[feature] = [text]
239 else :
240 gf[feature].append(text)
241 elif line[:5] == '#=GC ':
242
243
244 pass
245 elif line[:5] == '#=GS ':
246
247
248 id, feature, text = line[5:].strip().split(None,2)
249
250
251 if id not in gs :
252 gs[id] = {}
253 if feature not in gs[id] :
254 gs[id][feature] = [text]
255 else :
256 gs[id][feature].append(text)
257 elif line[:5] == "#=GR " :
258
259
260 id, feature, text = line[5:].strip().split(None,2)
261
262
263 if id not in gr :
264 gr[id] = {}
265 if feature not in gr[id]:
266 gr[id][feature] = ""
267 gr[id][feature] += text.strip()
268
269
270
271
272
273
274 assert len(seqs) <= len(ids)
275
276
277
278 self.ids = ids
279 self.sequences = seqs
280 self.seq_annotation = gs
281 self.seq_col_annotation = gr
282
283 if ids and seqs :
284
285 if self.records_per_alignment is not None \
286 and self.records_per_alignment <> len(ids) :
287 raise ValueError("Found %i records in this alignment, told to expect %i" \
288 % (len(ids), self.records_per_alignment))
289
290 alignment = Alignment(self.alphabet)
291
292
293
294 alignment._annotations = gr
295
296 alignment_length = len(seqs.values()[0])
297 for id in ids :
298 seq = seqs[id]
299 if alignment_length <> len(seq) :
300 raise ValueError("Sequences have different lengths, or repeated identifier")
301 name, start, end = self._identifier_split(id)
302 alignment.add_sequence(id, seq, start=start, end=end)
303
304 record = alignment.get_all_seqs()[-1]
305
306 assert record.id == id or record.description == id
307
308 record.id = id
309 record.name = name
310 record.description = id
311
312
313
314 record.annotations["accession"]=name
315
316 self._populate_meta_data(id, record)
317 return alignment
318 else :
319 return None
320
321
323 """Returns (name,start,end) string tuple from an identier."""
324 if identifier.find("/")<>-1 :
325 start_end = identifier.split("/",1)[1]
326 if start_end.count("-")==1 :
327 start, end = map(int, start_end.split("-"))
328 name = identifier.split("/",1)[0]
329 return (name, start, end)
330 return (identifier, None, None)
331
364
395
396 if __name__ == "__main__" :
397 print "Testing..."
398 from cStringIO import StringIO
399
400
401
402
403 sth_example = \
404 """# STOCKHOLM 1.0
405 #=GF ID CBS
406 #=GF AC PF00571
407 #=GF DE CBS domain
408 #=GF AU Bateman A
409 #=GF CC CBS domains are small intracellular modules mostly found
410 #=GF CC in 2 or four copies within a protein.
411 #=GF SQ 67
412 #=GS O31698/18-71 AC O31698
413 #=GS O83071/192-246 AC O83071
414 #=GS O83071/259-312 AC O83071
415 #=GS O31698/88-139 AC O31698
416 #=GS O31698/88-139 OS Bacillus subtilis
417 O83071/192-246 MTCRAQLIAVPRASSLAE..AIACAQKM....RVSRVPVYERS
418 #=GR O83071/192-246 SA 999887756453524252..55152525....36463774777
419 O83071/259-312 MQHVSAPVFVFECTRLAY..VQHKLRAH....SRAVAIVLDEY
420 #=GR O83071/259-312 SS CCCCCHHHHHHHHHHHHH..EEEEEEEE....EEEEEEEEEEE
421 O31698/18-71 MIEADKVAHVQVGNNLEH..ALLVLTKT....GYTAIPVLDPS
422 #=GR O31698/18-71 SS CCCHHHHHHHHHHHHHHH..EEEEEEEE....EEEEEEEEHHH
423 O31698/88-139 EVMLTDIPRLHINDPIMK..GFGMVINN......GFVCVENDE
424 #=GR O31698/88-139 SS CCCCCCCHHHHHHHHHHH..HEEEEEEE....EEEEEEEEEEH
425 #=GC SS_cons CCCCCHHHHHHHHHHHHH..EEEEEEEE....EEEEEEEEEEH
426 O31699/88-139 EVMLTDIPRLHINDPIMK..GFGMVINN......GFVCVENDE
427 #=GR O31699/88-139 AS ________________*__________________________
428 #=GR_O31699/88-139_IN ____________1______________2__________0____
429 //
430 """
431
432
433
434 sth_example2 = \
435 """# STOCKHOLM 1.0
436 #=GC SS_cons .................<<<<<<<<...<<<<<<<........>>>>>>>..
437 AP001509.1 UUAAUCGAGCUCAACACUCUUCGUAUAUCCUC-UCAAUAUGG-GAUGAGGGU
438 #=GR AP001509.1 SS -----------------<<<<<<<<---..<<-<<-------->>->>..--
439 AE007476.1 AAAAUUGAAUAUCGUUUUACUUGUUUAU-GUCGUGAAU-UGG-CACGA-CGU
440 #=GR AE007476.1 SS -----------------<<<<<<<<-----<<.<<-------->>.>>----
441
442 #=GC SS_cons ......<<<<<<<.......>>>>>>>..>>>>>>>>...............
443 AP001509.1 CUCUAC-AGGUA-CCGUAAA-UACCUAGCUACGAAAAGAAUGCAGUUAAUGU
444 #=GR AP001509.1 SS -------<<<<<--------->>>>>--->>>>>>>>---------------
445 AE007476.1 UUCUACAAGGUG-CCGG-AA-CACCUAACAAUAAGUAAGUCAGCAGUGAGAU
446 #=GR AE007476.1 SS ------.<<<<<--------->>>>>.-->>>>>>>>---------------
447 //"""
448
449 print "--------"
450 print "StockholmIterator(stockholm alignment file)"
451 iterator = StockholmIterator(StringIO(sth_example))
452 count=0
453 for alignment in iterator :
454 for record in alignment.get_all_seqs() :
455 count=count+1
456 assert count == 5
457
458
459 assert record.id == 'O31699/88-139'
460 assert record.name == 'O31699'
461 assert record.description == 'O31699/88-139'
462 assert len(record.annotations)==4+1
463 assert record.annotations["accession"]=='O31699'
464 assert record.annotations["start"]==88
465 assert record.annotations["end"]==139
466 assert record.annotations["active_site"]=='________________*__________________________'
467
468 iterator = StockholmIterator(StringIO(sth_example))
469 count=0
470 for alignment in iterator :
471 for record in alignment.get_all_seqs() :
472 count=count+1
473 break
474 break
475 assert count==1
476
477 assert record.id == 'O83071/192-246'
478 assert record.name == 'O83071'
479 assert record.description == 'O83071/192-246'
480 assert len(record.annotations)==4 + 1
481 assert record.annotations["accession"]=='O83071'
482 assert record.annotations["start"]==192
483 assert record.annotations["end"]==246
484 assert record.annotations["surface_accessibility"]=="999887756453524252..55152525....36463774777"
485
486 assert [[r.id for r in a.get_all_seqs()] \
487 for a in StockholmIterator(StringIO(sth_example))] \
488 == [['O83071/192-246', 'O83071/259-312', 'O31698/18-71', 'O31698/88-139', 'O31699/88-139']]
489
490 assert [[r.name for r in a.get_all_seqs()] \
491 for a in StockholmIterator(StringIO(sth_example))] \
492 == [['O83071', 'O83071', 'O31698', 'O31698', 'O31699']]
493
494 assert [[r.description for r in a.get_all_seqs()] \
495 for a in StockholmIterator(StringIO(sth_example))] \
496 == [['O83071/192-246', 'O83071/259-312', 'O31698/18-71', 'O31698/88-139', 'O31699/88-139']]
497
498
499 print "--------"
500 print "StockholmIterator(interlaced stockholm alignment file)"
501 iterator = iter(StockholmIterator(StringIO(sth_example2)).next().get_all_seqs())
502 record = iterator.next()
503 assert record.id == "AP001509.1"
504 assert len(record.seq) == 104
505 assert "secondary_structure" in record.annotations
506 assert len(record.annotations["secondary_structure"]) == 104
507 record = iterator.next()
508 assert record.id == "AE007476.1"
509 assert len(record.seq) == 104
510 assert "secondary_structure" in record.annotations
511 assert len(record.annotations["secondary_structure"]) == 104
512 try :
513 record = iterator.next()
514 except StopIteration :
515 record = None
516 assert record is None
517
518 print "--------"
519 print "StockholmIterator(concatenated stockholm alignment files)"
520 assert 2 == len(list(StockholmIterator(StringIO(sth_example + sth_example2))))
521 assert 2 == len(list(StockholmIterator(StringIO(sth_example + "\n" + sth_example2))))
522 assert 2 == len(list(StockholmIterator(StringIO(sth_example + "\n\n" + sth_example2))))
523 assert 2 == len(list(StockholmIterator(StringIO(sth_example2 + "\n\n" + sth_example))))
524 assert 2 == len(list(StockholmIterator(StringIO(sth_example2 + "\n" + sth_example))))
525
526 print "--------"
527 print "Checking write/read"
528 handle = StringIO()
529 list1 = list(StockholmIterator(StringIO(sth_example2 + "\n" + sth_example)))
530 StockholmWriter(handle).write_file(list1)
531 handle.seek(0)
532 list2 = list(StockholmIterator(handle))
533 assert len(list1) == len(list2)
534 for a1,a2 in zip(list1, list2) :
535 assert len(a1.get_all_seqs()) == len(a2.get_all_seqs())
536 for r1, r2 in zip(a1.get_all_seqs(), a2.get_all_seqs()) :
537 assert r1.id == r2.id
538 assert r1.seq.tostring() == r2.seq.tostring()
539 print "Done"
540