1
2
3
4
5
6 """Alignment input/output designed to look similar to Bio.SeqIO.
7
8 Input
9 =====
10 For the typical special case when your file or handle contains one and only
11 one alignment, use the function Bio.AlignIO.read(). This takes an input file
12 handle, format string and optional number of sequences per alignment. It will
13 return a single Alignment object (or raise an exception if there isn't just
14 one alignment):
15
16 from Bio import AlignIO
17 handle = open("example.aln", "rU")
18 align = AlignIO.read(handle, "clustal")
19 handle.close()
20 print align
21
22 For the general case, when the handle could contain any number of alignments,
23 use the function Bio.AlignIO.parse(...) which takes the same arguments, but
24 returns an iterator giving Alignment objects. For example, using the output
25 from the EMBOSS water or needle pairwise alignment programs:
26
27 from Bio import AlignIO
28 handle = open("example.txt", "rU")
29 for alignment in AlignIO.parse(handle, "emboss") :
30 print alignment
31
32 If you want random access to the alignments by number, turn this into a list:
33
34 from Bio import AlignIO
35 handle = open("example.aln", "rU")
36 alignments = list(AlignIO.parse(handle, "clustal"))
37 print alignments[0]
38
39 Most alignment file formats can be concatenated so as to hold as many
40 different multiple sequence alignments as possible. One common example
41 is the output of the tool seqboot in the PHYLIP suite. Sometimes there
42 can be a file header and footer, as seen in the EMBOSS alignment output.
43
44 There is an optional argument for the number of sequences per alignment which
45 is usually only needed with the alignments stored in the FASTA format.
46 Without this information, there is no clear way to tell if you have say a
47 single alignment of 20 sequences, or four alignments of 5 sequences. e.g.
48
49 from Bio import AlignIO
50 handle = open("example.faa", "rU")
51 for alignment in AlignIO.parse(handle, "fasta", seq_count=5) :
52 print alignment
53
54 The above code would split up the FASTA files, and try and batch every five
55 sequences into an alignment.
56
57 Output
58 ======
59 Use the function Bio.AlignIO.write(...), which takes a complete set of
60 Alignment objects (either as a list, or an iterator), an output file handle
61 and of course the file format.
62
63 from Bio import AlignIO
64 alignments = ...
65 handle = open("example.faa", "w")
66 AlignIO.write(alignments, handle, "fasta")
67 handle.close()
68
69 In general, you are expected to call this function once (with all your
70 alignments) and then close the file handle. However, for file formats
71 like PHYLIP where multiple alignments are stored sequentially (with no file
72 header and footer), then multiple calls to the write function should work as
73 expected.
74
75 File Formats
76 ============
77 When specifying the file format, use lowercase strings. The same format
78 names are also used in Bio.SeqIO and include the following:
79
80 clustal - Output from Clustal W or X, see also the module Bio.Clustalw
81 which can be used to run the command line tool from Biopython.
82 emboss - The "pairs" and "simple" alignment format from the EMBOSS tools.
83 fasta - The generic sequence file format where each record starts with an
84 identifier line starting with a ">" character, followed by lines
85 of sequence.
86 fasta-m10 - For the pairwise alignments output by Bill Pearson's FASTA
87 tools when used with the -m 10 command line option for machine
88 readable output.
89 nexus - Output from NEXUS, see also the module Bio.Nexus which can also
90 read any phylogenetic trees in these files.
91 phylip - Used by the PHYLIP tools.
92 stockholm - A richly annotated alignment file format used by PFAM.
93
94 Further Information
95 ===================
96 See the wiki page biopython.org/wiki/AlignIO and also the Bio.AlignIO chapter
97 in the Biopython Tutorial and Cookbook which is also available online:
98
99 http://biopython.org/DIST/docs/tutorial/Tutorial.html
100 http://biopython.org/DIST/docs/tutorial/Tutorial.pdf
101 """
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117 import os
118
119 from StringIO import StringIO
120 from Bio.Alphabet import generic_alphabet, generic_protein
121 from Bio.Seq import Seq
122 from Bio.SeqRecord import SeqRecord
123 from Bio.Align.Generic import Alignment
124
125 import StockholmIO
126 import ClustalIO
127 import NexusIO
128 import PhylipIO
129 import EmbossIO
130 import FastaIO
131
132
133
134
# Lookup table used by parse(): lower case format name -> the callable that
# is invoked as callable(handle, seq_count) to obtain an alignment iterator.
# Formats absent from this table may still be handled through Bio.SeqIO.
_FormatToIterator = {
    "clustal": ClustalIO.ClustalIterator,
    "emboss": EmbossIO.EmbossIterator,
    "fasta-m10": FastaIO.FastaM10Iterator,
    "nexus": NexusIO.NexusIterator,
    "phylip": PhylipIO.PhylipIterator,
    "stockholm": StockholmIO.StockholmIterator,
}
143
# Lookup table used by write(): lower case format name -> writer class.
# write() instantiates the class with the output handle and then calls its
# write_file() method with the alignments.
_FormatToWriter = {
    "phylip": PhylipIO.PhylipWriter,
    "stockholm": StockholmIO.StockholmWriter,
    "clustal": ClustalIO.ClustalWriter,
}
150
def write(alignments, handle, format):
    """Write complete set of alignments to a file.

    alignments - A list (or iterator) of Alignment objects
    handle     - File handle object to write to
    format     - What format to use (lower case string).

    You should close the handle after calling this function.

    Raises TypeError if handle is a string (e.g. a filename), if format is
    not a string, or if a single Alignment is given instead of a collection.
    Raises ValueError if format is empty, is not lower case, or names a
    format that cannot be written.

    There is no return value.
    """
    from Bio import SeqIO

    # Catch the most common calling mistakes early, with clear messages.
    if isinstance(handle, basestring):
        raise TypeError("Need a file handle, not a string (i.e. not a filename)")
    if not isinstance(format, basestring):
        raise TypeError("Need a string for the file format (lower case)")
    if not format:
        raise ValueError("Format required (lower case string)")
    if format != format.lower():
        raise ValueError("Format string '%s' should be lower case" % format)
    if isinstance(alignments, Alignment):
        raise TypeError("Need a list of alignments, not simply an Alignment")

    if format in _FormatToWriter:
        # The membership test must be against the writer table (not the
        # iterator table), otherwise read-only formats hit a KeyError here.
        writer_class = _FormatToWriter[format]
        writer_class(handle).write_file(alignments)
    elif format in SeqIO._FormatToIterator:
        # No dedicated alignment writer: let Bio.SeqIO write the records of
        # each alignment in turn.  The handle has to be passed through.
        for alignment in alignments:
            SeqIO.write(alignment.get_all_seqs(), handle, format)
    elif format in _FormatToIterator:
        # Known to parse(), but there is no writer for it.
        raise ValueError("Reading format '%s' is supported, but not writing"
                         % format)
    else:
        raise ValueError("Unknown format '%s'" % format)
189
190
def _SeqIO_to_alignment_iterator(handle, format, seq_count=None):
    """Private function, uses Bio.SeqIO to create an Alignment iterator.

    handle    - handle to the file.
    format    - string describing the file format.
    seq_count - Optional integer, number of sequences expected in
                each alignment.  Recommended for fasta format files.

    If seq_count is omitted (default) then all the sequences in
    the file are combined into a single Alignment.  An empty file
    gives no alignments at all.

    Raises ValueError if seq_count is given and the number of records
    in the file is not a multiple of it.
    """
    from Bio import SeqIO

    # Callers (see parse) are expected to have checked this already.
    assert format in SeqIO._FormatToIterator

    if seq_count:
        # Batch the records into alignments of seq_count sequences each.
        records = []
        for record in SeqIO.parse(handle, format):
            records.append(record)
            if len(records) == seq_count:
                yield SeqIO.to_alignment(records)
                records = []
        if records:
            # Left-over records that do not fill a whole alignment.
            raise ValueError("Check seq_count argument, not enough sequences?")
    else:
        # Must assume that there is a single alignment using all
        # the SeqRecord objects; no records means no alignment.
        records = list(SeqIO.parse(handle, format))
        if records:
            yield SeqIO.to_alignment(records)
227
def parse(handle, format, seq_count=None):
    """Return an iterator giving the Alignment objects found in a file.

    handle    - handle to the file.
    format    - lower case string describing the file format.
    seq_count - Optional integer, number of sequences expected in
                each alignment.  Recommended for fasta format files.

    Given a file name in a string 'filename', use:

        from Bio import AlignIO
        my_iterator = AlignIO.parse(open(filename, "rU"), format)

    Given a string 'data' holding the file contents, use:

        from Bio import AlignIO
        from StringIO import StringIO
        my_iterator = AlignIO.parse(StringIO(data), format)

    Raises TypeError if handle is a string or format is not a string,
    and ValueError if format is empty, not lower case, or unknown.

    When exactly one alignment is expected, the function
    Bio.AlignIO.read(handle, format[, seq_count]) is more convenient.
    """
    from Bio import SeqIO

    # Argument sanity checks, most likely mistakes first.
    if isinstance(handle, basestring):
        raise TypeError("Need a file handle, not a string (i.e. not a filename)")
    if not isinstance(format, basestring):
        raise TypeError("Need a string for the file format (lower case)")
    if not format:
        raise ValueError("Format required (lower case string)")
    if format != format.lower():
        raise ValueError("Format string '%s' should be lower case" % format)

    # Formats with a dedicated alignment parser take priority.
    if format in _FormatToIterator:
        return _FormatToIterator[format](handle, seq_count)

    # Otherwise build alignments out of the records Bio.SeqIO can read.
    if format in SeqIO._FormatToIterator:
        return _SeqIO_to_alignment_iterator(handle, format, seq_count)

    raise ValueError("Unknown format '%s'" % format)
271
def read(handle, format, seq_count=None):
    """Return the one and only Alignment object held in a file.

    handle    - handle to the file.
    format    - string describing the file format.
    seq_count - Optional integer, number of sequences expected in
                the alignment to check you got what you expected.

    A ValueError is raised if the handle holds no alignments, or more
    than one.  For example, using a PFAM/Stockholm file containing one
    alignment:

        from Bio import AlignIO
        align = AlignIO.read(open("example.sth"), "stockholm")

    To take just the first alignment from a file which may contain
    several, this function is not suitable.  Instead use:

        from Bio import AlignIO
        align = AlignIO.parse(open("example.sth"), "stockholm").next()

    Use the Bio.AlignIO.parse() function to read multiple alignments
    from the handle.
    """
    iterator = parse(handle, format, seq_count)

    def _next_or_none():
        # End of data is reported as None, whether the iterator raises
        # StopIteration or itself returns None.
        try:
            return iterator.next()
        except StopIteration:
            return None

    alignment = _next_or_none()
    if alignment is None:
        raise ValueError("No records found in handle")

    # Only look for a second alignment once the first is known to exist.
    if _next_or_none() is not None:
        raise ValueError("More than one record found in handle")

    if seq_count:
        assert len(alignment.get_all_seqs()) == seq_count
    return alignment
313
314
315 if __name__ == "__main__" :
316
317 from Bio.Alphabet import generic_nucleotide
318 from sets import Set
319
320 for format in _FormatToIterator :
321 print "parse(handle to empty file)"
322 iterator = parse(StringIO(""), format=format)
323 assert len(list(iterator))==0
324 iterator = parse(StringIO(""), format=format, seq_count = 42)
325 assert len(list(iterator))==0
326 print
327
339
340
341
342 faa_example = \
343 """>V_Harveyi_PATH
344 mknwikvava aialsaatvq aatevkvgms gryfpftfvk qdklqgfevd mwdeigkrnd
345 ykieyvtanf sglfglletg ridtisnqit mtdarkakyl fadpyvvdga qitvrkgnds
346 iqgvedlagk tvavnlgsnf eqllrdydkd gkiniktydt giehdvalgr adafimdrls
347 alelikktgl plqlagepfe tiqnawpfvd nekgrklqae vnkalaemra dgtvekisvk
348 wfgaditk
349 >B_subtilis_YXEM
350 mkmkkwtvlv vaallavlsa cgngnssske ddnvlhvgat gqsypfayke ngkltgfdve
351 vmeavakkid mkldwkllef sglmgelqtg kldtisnqva vtderketyn ftkpyayagt
352 qivvkkdntd iksvddlkgk tvaavlgsnh aknleskdpd kkiniktyet qegtlkdvay
353 grvdayvnsr tvliaqikkt glplklagdp ivyeqvafpf akddahdklr kkvnkaldel
354 rkdgtlkkls ekyfneditv eqkh
355 >FLIY_ECOLI
356 mklahlgrqa lmgvmavalv agmsvksfad egllnkvker gtllvglegt yppfsfqgdd
357 gkltgfevef aqqlakhlgv easlkptkwd gmlasldskr idvvinqvti sderkkkydf
358 stpytisgiq alvkkgnegt iktaddlkgk kvgvglgtny eewlrqnvqg vdvrtydddp
359 tkyqdlrvgr idailvdrla aldlvkktnd tlavtgeafs rqesgvalrk gnedllkavn
360 daiaemqkdg tlqalsekwf gadvtk
361 >Deinococcus_radiodurans
362 mkksllslkl sgllvpsvla lslsacssps stlnqgtlki amegtyppft skneqgelvg
363 fdvdiakava qklnlkpefv ltewsgilag lqankydviv nqvgitperq nsigfsqpya
364 ysrpeiivak nntfnpqsla dlkgkrvgst lgsnyekqli dtgdikivty pgapeiladl
365 vagridaayn drlvvnyiin dqklpvrgag qigdaapvgi alkkgnsalk dqidkaltem
366 rsdgtfekis qkwfgqdvgq p
367 >B_subtilis_GlnH_homo_YCKK
368 mkkallalfm vvsiaalaac gagndnqskd nakdgdlwas ikkkgvltvg tegtyepfty
369 hdkdtdkltg ydveviteva krlglkvdfk etqwgsmfag lnskrfdvva nqvgktdred
370 kydfsdkytt sravvvtkkd nndikseadv kgktsaqslt snynklatna gakvegvegm
371 aqalqmiqqa rvdmtyndkl avlnylktsg nknvkiafet gepqstyftf rkgsgevvdq
372 vnkalkemke dgtlskiskk wfgedvsk
373 >YA80_HAEIN
374 mkkllfttal ltgaiafstf shageiadrv ektktllvgt egtyapftfh dksgkltgfd
375 vevirkvaek lglkvefket qwdamyagln akrfdvianq tnpsperlkk ysfttpynys
376 ggvivtkssd nsiksfedlk grksaqsats nwgkdakaag aqilvvdgla qslelikqgr
377 aeatindkla vldyfkqhpn sglkiaydrg dktptafafl qgedalitkf nqvlealrqd
378 gtlkqisiew fgyditq
379 >E_coli_GlnH
380 mksvlkvsla altlafavss haadkklvva tdtafvpfef kqgdkyvgfd vdlwaaiake
381 lkldyelkpm dfsgiipalq tknvdlalag ititderkka idfsdgyyks gllvmvkann
382 ndvksvkdld gkvvavksgt gsvdyakani ktkdlrqfpn idnaymelgt nradavlhdt
383 pnilyfikta gngqfkavgd sleaqqygia fpkgsdelrd kvngalktlr engtyneiyk
384 kwfgtepk
385 >HISJ_E_COLI
386 mkklvlslsl vlafssataa faaipqniri gtdptyapfe sknsqgelvg fdidlakelc
387 krintqctfv enpldalips lkakkidaim sslsitekrq qeiaftdkly aadsrlvvak
388 nsdiqptves lkgkrvgvlq gttqetfgne hwapkgieiv syqgqdniys dltagridaa
389 fqdevaaseg flkqpvgkdy kfggpsvkde klfgvgtgmg lrkednelre alnkafaemr
390 adgtyeklak kyfdfdvygg"""
391
392
393 aln_example = \
394 """CLUSTAL X (1.83) multiple sequence alignment
395
396
397 V_Harveyi_PATH --MKNWIKVAVAAIA--LSAA------------------TVQAATEVKVG
398 B_subtilis_YXEM MKMKKWTVLVVAALLAVLSACG------------NGNSSSKEDDNVLHVG
399 B_subtilis_GlnH_homo_YCKK MKKALLALFMVVSIAALAACGAGNDNQSKDNAKDGDLWASIKKKGVLTVG
400 YA80_HAEIN MKKLLFTTALLTGAIAFSTF-----------SHAGEIADRVEKTKTLLVG
401 FLIY_ECOLI MKLAHLGRQALMGVMAVALVAG---MSVKSFADEG-LLNKVKERGTLLVG
402 E_coli_GlnH --MKSVLKVSLAALTLAFAVS------------------SHAADKKLVVA
403 Deinococcus_radiodurans -MKKSLLSLKLSGLLVPSVLALS--------LSACSSPSSTLNQGTLKIA
404 HISJ_E_COLI MKKLVLSLSLVLAFSSATAAF-------------------AAIPQNIRIG
405 : . : :.
406
407 V_Harveyi_PATH MSGRYFPFTFVKQ--DKLQGFEVDMWDEIGKRNDYKIEYVTANFSGLFGL
408 B_subtilis_YXEM ATGQSYPFAYKEN--GKLTGFDVEVMEAVAKKIDMKLDWKLLEFSGLMGE
409 B_subtilis_GlnH_homo_YCKK TEGTYEPFTYHDKDTDKLTGYDVEVITEVAKRLGLKVDFKETQWGSMFAG
410 YA80_HAEIN TEGTYAPFTFHDK-SGKLTGFDVEVIRKVAEKLGLKVEFKETQWDAMYAG
411 FLIY_ECOLI LEGTYPPFSFQGD-DGKLTGFEVEFAQQLAKHLGVEASLKPTKWDGMLAS
412 E_coli_GlnH TDTAFVPFEFKQG--DKYVGFDVDLWAAIAKELKLDYELKPMDFSGIIPA
413 Deinococcus_radiodurans MEGTYPPFTSKNE-QGELVGFDVDIAKAVAQKLNLKPEFVLTEWSGILAG
414 HISJ_E_COLI TDPTYAPFESKNS-QGELVGFDIDLAKELCKRINTQCTFVENPLDALIPS
415 ** .: *::::. : :. . ..:
416
417 V_Harveyi_PATH LETGRIDTISNQITMTDARKAKYLFADPYVVDG-AQITVRKGNDSIQGVE
418 B_subtilis_YXEM LQTGKLDTISNQVAVTDERKETYNFTKPYAYAG-TQIVVKKDNTDIKSVD
419 B_subtilis_GlnH_homo_YCKK LNSKRFDVVANQVG-KTDREDKYDFSDKYTTSR-AVVVTKKDNNDIKSEA
420 YA80_HAEIN LNAKRFDVIANQTNPSPERLKKYSFTTPYNYSG-GVIVTKSSDNSIKSFE
421 FLIY_ECOLI LDSKRIDVVINQVTISDERKKKYDFSTPYTISGIQALVKKGNEGTIKTAD
422 E_coli_GlnH LQTKNVDLALAGITITDERKKAIDFSDGYYKSG-LLVMVKANNNDVKSVK
423 Deinococcus_radiodurans LQANKYDVIVNQVGITPERQNSIGFSQPYAYSRPEIIVAKNNTFNPQSLA
424 HISJ_E_COLI LKAKKIDAIMSSLSITEKRQQEIAFTDKLYAADSRLVVAKNSDIQP-TVE
425 *.: . * . * *: : : .
426
427 V_Harveyi_PATH DLAGKTVAVNLGSNFEQLLRDYDKDGKINIKTYDT--GIEHDVALGRADA
428 B_subtilis_YXEM DLKGKTVAAVLGSNHAKNLESKDPDKKINIKTYETQEGTLKDVAYGRVDA
429 B_subtilis_GlnH_homo_YCKK DVKGKTSAQSLTSNYNKLATN----AGAKVEGVEGMAQALQMIQQARVDM
430 YA80_HAEIN DLKGRKSAQSATSNWGKDAKA----AGAQILVVDGLAQSLELIKQGRAEA
431 FLIY_ECOLI DLKGKKVGVGLGTNYEEWLRQNV--QGVDVRTYDDDPTKYQDLRVGRIDA
432 E_coli_GlnH DLDGKVVAVKSGTGSVDYAKAN--IKTKDLRQFPNIDNAYMELGTNRADA
433 Deinococcus_radiodurans DLKGKRVGSTLGSNYEKQLIDTG---DIKIVTYPGAPEILADLVAGRIDA
434 HISJ_E_COLI SLKGKRVGVLQGTTQETFGNEHWAPKGIEIVSYQGQDNIYSDLTAGRIDA
435 .: *: . : .: : * :
436
437 V_Harveyi_PATH FIMDRLSALE-LIKKT-GLPLQLAGEPFETI-----QNAWPFVDNEKGRK
438 B_subtilis_YXEM YVNSRTVLIA-QIKKT-GLPLKLAGDPIVYE-----QVAFPFAKDDAHDK
439 B_subtilis_GlnH_homo_YCKK TYNDKLAVLN-YLKTSGNKNVKIAFETGEPQ-----STYFTFRKGS--GE
440 YA80_HAEIN TINDKLAVLD-YFKQHPNSGLKIAYDRGDKT-----PTAFAFLQGE--DA
441 FLIY_ECOLI ILVDRLAALD-LVKKT-NDTLAVTGEAFSRQ-----ESGVALRKGN--ED
442 E_coli_GlnH VLHDTPNILY-FIKTAGNGQFKAVGDSLEAQ-----QYGIAFPKGS--DE
443 Deinococcus_radiodurans AYNDRLVVNY-IINDQ-KLPVRGAGQIGDAA-----PVGIALKKGN--SA
444 HISJ_E_COLI AFQDEVAASEGFLKQPVGKDYKFGGPSVKDEKLFGVGTGMGLRKED--NE
445 . .: : . .
446
447 V_Harveyi_PATH LQAEVNKALAEMRADGTVEKISVKWFGADITK----
448 B_subtilis_YXEM LRKKVNKALDELRKDGTLKKLSEKYFNEDITVEQKH
449 B_subtilis_GlnH_homo_YCKK VVDQVNKALKEMKEDGTLSKISKKWFGEDVSK----
450 YA80_HAEIN LITKFNQVLEALRQDGTLKQISIEWFGYDITQ----
451 FLIY_ECOLI LLKAVNDAIAEMQKDGTLQALSEKWFGADVTK----
452 E_coli_GlnH LRDKVNGALKTLRENGTYNEIYKKWFGTEPK-----
453 Deinococcus_radiodurans LKDQIDKALTEMRSDGTFEKISQKWFGQDVGQP---
454 HISJ_E_COLI LREALNKAFAEMRADGTYEKLAKKYFDFDVYGG---
455 : .: .: :: :** . : ::*. :
456 """
457
458
459
460
461
462 phy_example = \
463 """ 8 286
464 V_Harveyi_ --MKNWIKVA VAAIA--LSA A--------- ---------T VQAATEVKVG
465 B_subtilis MKMKKWTVLV VAALLAVLSA CG-------- ----NGNSSS KEDDNVLHVG
466 B_subtilis MKKALLALFM VVSIAALAAC GAGNDNQSKD NAKDGDLWAS IKKKGVLTVG
467 YA80_HAEIN MKKLLFTTAL LTGAIAFSTF ---------- -SHAGEIADR VEKTKTLLVG
468 FLIY_ECOLI MKLAHLGRQA LMGVMAVALV AG---MSVKS FADEG-LLNK VKERGTLLVG
469 E_coli_Gln --MKSVLKVS LAALTLAFAV S--------- ---------S HAADKKLVVA
470 Deinococcu -MKKSLLSLK LSGLLVPSVL ALS------- -LSACSSPSS TLNQGTLKIA
471 HISJ_E_COL MKKLVLSLSL VLAFSSATAA F--------- ---------- AAIPQNIRIG
472
473 MSGRYFPFTF VKQ--DKLQG FEVDMWDEIG KRNDYKIEYV TANFSGLFGL
474 ATGQSYPFAY KEN--GKLTG FDVEVMEAVA KKIDMKLDWK LLEFSGLMGE
475 TEGTYEPFTY HDKDTDKLTG YDVEVITEVA KRLGLKVDFK ETQWGSMFAG
476 TEGTYAPFTF HDK-SGKLTG FDVEVIRKVA EKLGLKVEFK ETQWDAMYAG
477 LEGTYPPFSF QGD-DGKLTG FEVEFAQQLA KHLGVEASLK PTKWDGMLAS
478 TDTAFVPFEF KQG--DKYVG FDVDLWAAIA KELKLDYELK PMDFSGIIPA
479 MEGTYPPFTS KNE-QGELVG FDVDIAKAVA QKLNLKPEFV LTEWSGILAG
480 TDPTYAPFES KNS-QGELVG FDIDLAKELC KRINTQCTFV ENPLDALIPS
481
482 LETGRIDTIS NQITMTDARK AKYLFADPYV VDG-AQITVR KGNDSIQGVE
483 LQTGKLDTIS NQVAVTDERK ETYNFTKPYA YAG-TQIVVK KDNTDIKSVD
484 LNSKRFDVVA NQVG-KTDRE DKYDFSDKYT TSR-AVVVTK KDNNDIKSEA
485 LNAKRFDVIA NQTNPSPERL KKYSFTTPYN YSG-GVIVTK SSDNSIKSFE
486 LDSKRIDVVI NQVTISDERK KKYDFSTPYT ISGIQALVKK GNEGTIKTAD
487 LQTKNVDLAL AGITITDERK KAIDFSDGYY KSG-LLVMVK ANNNDVKSVK
488 LQANKYDVIV NQVGITPERQ NSIGFSQPYA YSRPEIIVAK NNTFNPQSLA
489 LKAKKIDAIM SSLSITEKRQ QEIAFTDKLY AADSRLVVAK NSDIQP-TVE
490
491 DLAGKTVAVN LGSNFEQLLR DYDKDGKINI KTYDT--GIE HDVALGRADA
492 DLKGKTVAAV LGSNHAKNLE SKDPDKKINI KTYETQEGTL KDVAYGRVDA
493 DVKGKTSAQS LTSNYNKLAT N----AGAKV EGVEGMAQAL QMIQQARVDM
494 DLKGRKSAQS ATSNWGKDAK A----AGAQI LVVDGLAQSL ELIKQGRAEA
495 DLKGKKVGVG LGTNYEEWLR QNV--QGVDV RTYDDDPTKY QDLRVGRIDA
496 DLDGKVVAVK SGTGSVDYAK AN--IKTKDL RQFPNIDNAY MELGTNRADA
497 DLKGKRVGST LGSNYEKQLI DTG---DIKI VTYPGAPEIL ADLVAGRIDA
498 SLKGKRVGVL QGTTQETFGN EHWAPKGIEI VSYQGQDNIY SDLTAGRIDA
499
500 FIMDRLSALE -LIKKT-GLP LQLAGEPFET I-----QNAW PFVDNEKGRK
501 YVNSRTVLIA -QIKKT-GLP LKLAGDPIVY E-----QVAF PFAKDDAHDK
502 TYNDKLAVLN -YLKTSGNKN VKIAFETGEP Q-----STYF TFRKGS--GE
503 TINDKLAVLD -YFKQHPNSG LKIAYDRGDK T-----PTAF AFLQGE--DA
504 ILVDRLAALD -LVKKT-NDT LAVTGEAFSR Q-----ESGV ALRKGN--ED
505 VLHDTPNILY -FIKTAGNGQ FKAVGDSLEA Q-----QYGI AFPKGS--DE
506 AYNDRLVVNY -IINDQ-KLP VRGAGQIGDA A-----PVGI ALKKGN--SA
507 AFQDEVAASE GFLKQPVGKD YKFGGPSVKD EKLFGVGTGM GLRKED--NE
508
509 LQAEVNKALA EMRADGTVEK ISVKWFGADI TK----
510 LRKKVNKALD ELRKDGTLKK LSEKYFNEDI TVEQKH
511 VVDQVNKALK EMKEDGTLSK ISKKWFGEDV SK----
512 LITKFNQVLE ALRQDGTLKQ ISIEWFGYDI TQ----
513 LLKAVNDAIA EMQKDGTLQA LSEKWFGADV TK----
514 LRDKVNGALK TLRENGTYNE IYKKWFGTEP K-----
515 LKDQIDKALT EMRSDGTFEK ISQKWFGQDV GQP---
516 LREALNKAFA EMRADGTYEK LAKKYFDFDV YGG---
517 """
518
519 nxs_example = \
520 """#NEXUS
521 BEGIN DATA;
522 dimensions ntax=8 nchar=286;
523 format missing=?
524 symbols="ABCDEFGHIKLMNPQRSTUVWXYZ"
525 interleave datatype=PROTEIN gap= -;
526
527 matrix
528 V_Harveyi_PATH --MKNWIKVAVAAIA--LSAA------------------TVQAATEVKVG
529 B_subtilis_YXEM MKMKKWTVLVVAALLAVLSACG------------NGNSSSKEDDNVLHVG
530 B_subtilis_GlnH_homo_YCKK MKKALLALFMVVSIAALAACGAGNDNQSKDNAKDGDLWASIKKKGVLTVG
531 YA80_HAEIN MKKLLFTTALLTGAIAFSTF-----------SHAGEIADRVEKTKTLLVG
532 FLIY_ECOLI MKLAHLGRQALMGVMAVALVAG---MSVKSFADEG-LLNKVKERGTLLVG
533 E_coli_GlnH --MKSVLKVSLAALTLAFAVS------------------SHAADKKLVVA
534 Deinococcus_radiodurans -MKKSLLSLKLSGLLVPSVLALS--------LSACSSPSSTLNQGTLKIA
535 HISJ_E_COLI MKKLVLSLSLVLAFSSATAAF-------------------AAIPQNIRIG
536
537 V_Harveyi_PATH MSGRYFPFTFVKQ--DKLQGFEVDMWDEIGKRNDYKIEYVTANFSGLFGL
538 B_subtilis_YXEM ATGQSYPFAYKEN--GKLTGFDVEVMEAVAKKIDMKLDWKLLEFSGLMGE
539 B_subtilis_GlnH_homo_YCKK TEGTYEPFTYHDKDTDKLTGYDVEVITEVAKRLGLKVDFKETQWGSMFAG
540 YA80_HAEIN TEGTYAPFTFHDK-SGKLTGFDVEVIRKVAEKLGLKVEFKETQWDAMYAG
541 FLIY_ECOLI LEGTYPPFSFQGD-DGKLTGFEVEFAQQLAKHLGVEASLKPTKWDGMLAS
542 E_coli_GlnH TDTAFVPFEFKQG--DKYVGFDVDLWAAIAKELKLDYELKPMDFSGIIPA
543 Deinococcus_radiodurans MEGTYPPFTSKNE-QGELVGFDVDIAKAVAQKLNLKPEFVLTEWSGILAG
544 HISJ_E_COLI TDPTYAPFESKNS-QGELVGFDIDLAKELCKRINTQCTFVENPLDALIPS
545
546 V_Harveyi_PATH LETGRIDTISNQITMTDARKAKYLFADPYVVDG-AQITVRKGNDSIQGVE
547 B_subtilis_YXEM LQTGKLDTISNQVAVTDERKETYNFTKPYAYAG-TQIVVKKDNTDIKSVD
548 B_subtilis_GlnH_homo_YCKK LNSKRFDVVANQVG-KTDREDKYDFSDKYTTSR-AVVVTKKDNNDIKSEA
549 YA80_HAEIN LNAKRFDVIANQTNPSPERLKKYSFTTPYNYSG-GVIVTKSSDNSIKSFE
550 FLIY_ECOLI LDSKRIDVVINQVTISDERKKKYDFSTPYTISGIQALVKKGNEGTIKTAD
551 E_coli_GlnH LQTKNVDLALAGITITDERKKAIDFSDGYYKSG-LLVMVKANNNDVKSVK
552 Deinococcus_radiodurans LQANKYDVIVNQVGITPERQNSIGFSQPYAYSRPEIIVAKNNTFNPQSLA
553 HISJ_E_COLI LKAKKIDAIMSSLSITEKRQQEIAFTDKLYAADSRLVVAKNSDIQP-TVE
554
555 V_Harveyi_PATH DLAGKTVAVNLGSNFEQLLRDYDKDGKINIKTYDT--GIEHDVALGRADA
556 B_subtilis_YXEM DLKGKTVAAVLGSNHAKNLESKDPDKKINIKTYETQEGTLKDVAYGRVDA
557 B_subtilis_GlnH_homo_YCKK DVKGKTSAQSLTSNYNKLATN----AGAKVEGVEGMAQALQMIQQARVDM
558 YA80_HAEIN DLKGRKSAQSATSNWGKDAKA----AGAQILVVDGLAQSLELIKQGRAEA
559 FLIY_ECOLI DLKGKKVGVGLGTNYEEWLRQNV--QGVDVRTYDDDPTKYQDLRVGRIDA
560 E_coli_GlnH DLDGKVVAVKSGTGSVDYAKAN--IKTKDLRQFPNIDNAYMELGTNRADA
561 Deinococcus_radiodurans DLKGKRVGSTLGSNYEKQLIDTG---DIKIVTYPGAPEILADLVAGRIDA
562 HISJ_E_COLI SLKGKRVGVLQGTTQETFGNEHWAPKGIEIVSYQGQDNIYSDLTAGRIDA
563
564 V_Harveyi_PATH FIMDRLSALE-LIKKT-GLPLQLAGEPFETI-----QNAWPFVDNEKGRK
565 B_subtilis_YXEM YVNSRTVLIA-QIKKT-GLPLKLAGDPIVYE-----QVAFPFAKDDAHDK
566 B_subtilis_GlnH_homo_YCKK TYNDKLAVLN-YLKTSGNKNVKIAFETGEPQ-----STYFTFRKGS--GE
567 YA80_HAEIN TINDKLAVLD-YFKQHPNSGLKIAYDRGDKT-----PTAFAFLQGE--DA
568 FLIY_ECOLI ILVDRLAALD-LVKKT-NDTLAVTGEAFSRQ-----ESGVALRKGN--ED
569 E_coli_GlnH VLHDTPNILY-FIKTAGNGQFKAVGDSLEAQ-----QYGIAFPKGS--DE
570 Deinococcus_radiodurans AYNDRLVVNY-IINDQ-KLPVRGAGQIGDAA-----PVGIALKKGN--SA
571 HISJ_E_COLI AFQDEVAASEGFLKQPVGKDYKFGGPSVKDEKLFGVGTGMGLRKED--NE
572
573 V_Harveyi_PATH LQAEVNKALAEMRADGTVEKISVKWFGADITK----
574 B_subtilis_YXEM LRKKVNKALDELRKDGTLKKLSEKYFNEDITVEQKH
575 B_subtilis_GlnH_homo_YCKK VVDQVNKALKEMKEDGTLSKISKKWFGEDVSK----
576 YA80_HAEIN LITKFNQVLEALRQDGTLKQISIEWFGYDITQ----
577 FLIY_ECOLI LLKAVNDAIAEMQKDGTLQALSEKWFGADVTK----
578 E_coli_GlnH LRDKVNGALKTLRENGTYNEIYKKWFGTEPK-----
579 Deinococcus_radiodurans LKDQIDKALTEMRSDGTFEKISQKWFGQDVGQP---
580 HISJ_E_COLI LREALNKAFAEMRADGTYEKLAKKYFDFDVYGG---
581 ;
582 end;
583 """
584
585
586
587 nxs_example2 = \
588 """#NEXUS
589
590 Begin data;
591 Dimensions ntax=10 nchar=705;
592 Format datatype=dna interleave=yes gap=- missing=?;
593 Matrix
594 Cow ATGGCATATCCCATACAACTAGGATTCCAAGATGCAACATCACCAATCATAGAAGAACTA
595 Carp ATGGCACACCCAACGCAACTAGGTTTCAAGGACGCGGCCATACCCGTTATAGAGGAACTT
596 Chicken ATGGCCAACCACTCCCAACTAGGCTTTCAAGACGCCTCATCCCCCATCATAGAAGAGCTC
597 Human ATGGCACATGCAGCGCAAGTAGGTCTACAAGACGCTACTTCCCCTATCATAGAAGAGCTT
598 Loach ATGGCACATCCCACACAATTAGGATTCCAAGACGCGGCCTCACCCGTAATAGAAGAACTT
599 Mouse ATGGCCTACCCATTCCAACTTGGTCTACAAGACGCCACATCCCCTATTATAGAAGAGCTA
600 Rat ATGGCTTACCCATTTCAACTTGGCTTACAAGACGCTACATCACCTATCATAGAAGAACTT
601 Seal ATGGCATACCCCCTACAAATAGGCCTACAAGATGCAACCTCTCCCATTATAGAGGAGTTA
602 Whale ATGGCATATCCATTCCAACTAGGTTTCCAAGATGCAGCATCACCCATCATAGAAGAGCTC
603 Frog ATGGCACACCCATCACAATTAGGTTTTCAAGACGCAGCCTCTCCAATTATAGAAGAATTA
604
605 Cow CTTCACTTTCATGACCACACGCTAATAATTGTCTTCTTAATTAGCTCATTAGTACTTTAC
606 Carp CTTCACTTCCACGACCACGCATTAATAATTGTGCTCCTAATTAGCACTTTAGTTTTATAT
607 Chicken GTTGAATTCCACGACCACGCCCTGATAGTCGCACTAGCAATTTGCAGCTTAGTACTCTAC
608 Human ATCACCTTTCATGATCACGCCCTCATAATCATTTTCCTTATCTGCTTCCTAGTCCTGTAT
609 Loach CTTCACTTCCATGACCATGCCCTAATAATTGTATTTTTGATTAGCGCCCTAGTACTTTAT
610 Mouse ATAAATTTCCATGATCACACACTAATAATTGTTTTCCTAATTAGCTCCTTAGTCCTCTAT
611 Rat ACAAACTTTCATGACCACACCCTAATAATTGTATTCCTCATCAGCTCCCTAGTACTTTAT
612 Seal CTACACTTCCATGACCACACATTAATAATTGTGTTCCTAATTAGCTCATTAGTACTCTAC
613 Whale CTACACTTTCACGATCATACACTAATAATCGTTTTTCTAATTAGCTCTTTAGTTCTCTAC
614 Frog CTTCACTTCCACGACCATACCCTCATAGCCGTTTTTCTTATTAGTACGCTAGTTCTTTAC
615
616 Cow ATTATTTCACTAATACTAACGACAAAGCTGACCCATACAAGCACGATAGATGCACAAGAA
617 Carp ATTATTACTGCAATGGTATCAACTAAACTTACTAATAAATATATTCTAGACTCCCAAGAA
618 Chicken CTTCTAACTCTTATACTTATAGAAAAACTATCA---TCAAACACCGTAGATGCCCAAGAA
619 Human GCCCTTTTCCTAACACTCACAACAAAACTAACTAATACTAACATCTCAGACGCTCAGGAA
620 Loach GTTATTATTACAACCGTCTCAACAAAACTCACTAACATATATATTTTGGACTCACAAGAA
621 Mouse ATCATCTCGCTAATATTAACAACAAAACTAACACATACAAGCACAATAGATGCACAAGAA
622 Rat ATTATTTCACTAATACTAACAACAAAACTAACACACACAAGCACAATAGACGCCCAAGAA
623 Seal ATTATCTCACTTATACTAACCACGAAACTCACCCACACAAGTACAATAGACGCACAAGAA
624 Whale ATTATTACCCTAATGCTTACAACCAAATTAACACATACTAGTACAATAGACGCCCAAGAA
625 Frog ATTATTACTATTATAATAACTACTAAACTAACTAATACAAACCTAATGGACGCACAAGAG
626
627 Cow GTAGAGACAATCTGAACCATTCTGCCCGCCATCATCTTAATTCTAATTGCTCTTCCTTCT
628 Carp ATCGAAATCGTATGAACCATTCTACCAGCCGTCATTTTAGTACTAATCGCCCTGCCCTCC
629 Chicken GTTGAACTAATCTGAACCATCCTACCCGCTATTGTCCTAGTCCTGCTTGCCCTCCCCTCC
630 Human ATAGAAACCGTCTGAACTATCCTGCCCGCCATCATCCTAGTCCTCATCGCCCTCCCATCC
631 Loach ATTGAAATCGTATGAACTGTGCTCCCTGCCCTAATCCTCATTTTAATCGCCCTCCCCTCA
632 Mouse GTTGAAACCATTTGAACTATTCTACCAGCTGTAATCCTTATCATAATTGCTCTCCCCTCT
633 Rat GTAGAAACAATTTGAACAATTCTCCCAGCTGTCATTCTTATTCTAATTGCCCTTCCCTCC
634 Seal GTGGAAACGGTGTGAACGATCCTACCCGCTATCATTTTAATTCTCATTGCCCTACCATCA
635 Whale GTAGAAACTGTCTGAACTATCCTCCCAGCCATTATCTTAATTTTAATTGCCTTGCCTTCA
636 Frog ATCGAAATAGTGTGAACTATTATACCAGCTATTAGCCTCATCATAATTGCCCTTCCATCC
637
638 Cow TTACGAATTCTATACATAATAGATGAAATCAATAACCCATCTCTTACAGTAAAAACCATA
639 Carp CTACGCATCCTGTACCTTATAGACGAAATTAACGACCCTCACCTGACAATTAAAGCAATA
640 Chicken CTCCAAATCCTCTACATAATAGACGAAATCGACGAACCTGATCTCACCCTAAAAGCCATC
641 Human CTACGCATCCTTTACATAACAGACGAGGTCAACGATCCCTCCCTTACCATCAAATCAATT
642 Loach CTACGAATTCTATATCTTATAGACGAGATTAATGACCCCCACCTAACAATTAAGGCCATG
643 Mouse CTACGCATTCTATATATAATAGACGAAATCAACAACCCCGTATTAACCGTTAAAACCATA
644 Rat CTACGAATTCTATACATAATAGACGAGATTAATAACCCAGTTCTAACAGTAAAAACTATA
645 Seal TTACGAATCCTCTACATAATGGACGAGATCAATAACCCTTCCTTGACCGTAAAAACTATA
646 Whale TTACGGATCCTTTACATAATAGACGAAGTCAATAACCCCTCCCTCACTGTAAAAACAATA
647 Frog CTTCGTATCCTATATTTAATAGATGAAGTTAATGATCCACACTTAACAATTAAAGCAATC
648
649 Cow GGACATCAGTGATACTGAAGCTATGAGTATACAGATTATGAGGACTTAAGCTTCGACTCC
650 Carp GGACACCAATGATACTGAAGTTACGAGTATACAGACTATGAAAATCTAGGATTCGACTCC
651 Chicken GGACACCAATGATACTGAACCTATGAATACACAGACTTCAAGGACCTCTCATTTGACTCC
652 Human GGCCACCAATGGTACTGAACCTACGAGTACACCGACTACGGCGGACTAATCTTCAACTCC
653 Loach GGGCACCAATGATACTGAAGCTACGAGTATACTGATTATGAAAACTTAAGTTTTGACTCC
654 Mouse GGGCACCAATGATACTGAAGCTACGAATATACTGACTATGAAGACCTATGCTTTGATTCA
655 Rat GGACACCAATGATACTGAAGCTATGAATATACTGACTATGAAGACCTATGCTTTGACTCC
656 Seal GGACATCAGTGATACTGAAGCTATGAGTACACAGACTACGAAGACCTGAACTTTGACTCA
657 Whale GGTCACCAATGATATTGAAGCTATGAGTATACCGACTACGAAGACCTAAGCTTCGACTCC
658 Frog GGCCACCAATGATACTGAAGCTACGAATATACTAACTATGAGGATCTCTCATTTGACTCT
659
660 Cow TACATAATTCCAACATCAGAATTAAAGCCAGGGGAGCTACGACTATTAGAAGTCGATAAT
661 Carp TATATAGTACCAACCCAAGACCTTGCCCCCGGACAATTCCGACTTCTGGAAACAGACCAC
662 Chicken TACATAACCCCAACAACAGACCTCCCCCTAGGCCACTTCCGCCTACTAGAAGTCGACCAT
663 Human TACATACTTCCCCCATTATTCCTAGAACCAGGCGACCTGCGACTCCTTGACGTTGACAAT
664 Loach TACATAATCCCCACCCAGGACCTAACCCCTGGACAATTCCGGCTACTAGAGACAGACCAC
665 Mouse TATATAATCCCAACAAACGACCTAAAACCTGGTGAACTACGACTGCTAGAAGTTGATAAC
666 Rat TACATAATCCCAACCAATGACCTAAAACCAGGTGAACTTCGTCTATTAGAAGTTGATAAT
667 Seal TATATGATCCCCACACAAGAACTAAAGCCCGGAGAACTACGACTGCTAGAAGTAGACAAT
668 Whale TATATAATCCCAACATCAGACCTAAAGCCAGGAGAACTACGATTATTAGAAGTAGATAAC
669 Frog TATATAATTCCAACTAATGACCTTACCCCTGGACAATTCCGGCTGCTAGAAGTTGATAAT
670
671 Cow CGAGTTGTACTACCAATAGAAATAACAATCCGAATGTTAGTCTCCTCTGAAGACGTATTA
672 Carp CGAATAGTTGTTCCAATAGAATCCCCAGTCCGTGTCCTAGTATCTGCTGAAGACGTGCTA
673 Chicken CGCATTGTAATCCCCATAGAATCCCCCATTCGAGTAATCATCACCGCTGATGACGTCCTC
674 Human CGAGTAGTACTCCCGATTGAAGCCCCCATTCGTATAATAATTACATCACAAGACGTCTTG
675 Loach CGAATGGTTGTTCCCATAGAATCCCCTATTCGCATTCTTGTTTCCGCCGAAGATGTACTA
676 Mouse CGAGTCGTTCTGCCAATAGAACTTCCAATCCGTATATTAATTTCATCTGAAGACGTCCTC
677 Rat CGGGTAGTCTTACCAATAGAACTTCCAATTCGTATACTAATCTCATCCGAAGACGTCCTG
678 Seal CGAGTAGTCCTCCCAATAGAAATAACAATCCGCATACTAATCTCATCAGAAGATGTACTC
679 Whale CGAGTTGTCTTACCTATAGAAATAACAATCCGAATATTAGTCTCATCAGAAGACGTACTC
680 Frog CGAATAGTAGTCCCAATAGAATCTCCAACCCGACTTTTAGTTACAGCCGAAGACGTCCTC
681
682 Cow CACTCATGAGCTGTGCCCTCTCTAGGACTAAAAACAGACGCAATCCCAGGCCGTCTAAAC
683 Carp CATTCTTGAGCTGTTCCATCCCTTGGCGTAAAAATGGACGCAGTCCCAGGACGACTAAAT
684 Chicken CACTCATGAGCCGTACCCGCCCTCGGGGTAAAAACAGACGCAATCCCTGGACGACTAAAT
685 Human CACTCATGAGCTGTCCCCACATTAGGCTTAAAAACAGATGCAATTCCCGGACGTCTAAAC
686 Loach CACTCCTGGGCCCTTCCAGCCATGGGGGTAAAGATAGACGCGGTCCCAGGACGCCTTAAC
687 Mouse CACTCATGAGCAGTCCCCTCCCTAGGACTTAAAACTGATGCCATCCCAGGCCGACTAAAT
688 Rat CACTCATGAGCCATCCCTTCACTAGGGTTAAAAACCGACGCAATCCCCGGCCGCCTAAAC
689 Seal CACTCATGAGCCGTACCGTCCCTAGGACTAAAAACTGATGCTATCCCAGGACGACTAAAC
690 Whale CACTCATGGGCCGTACCCTCCTTGGGCCTAAAAACAGATGCAATCCCAGGACGCCTAAAC
691 Frog CACTCGTGAGCTGTACCCTCCTTGGGTGTCAAAACAGATGCAATCCCAGGACGACTTCAT
692
693 Cow CAAACAACCCTTATATCGTCCCGTCCAGGCTTATATTACGGTCAATGCTCAGAAATTTGC
694 Carp CAAGCCGCCTTTATTGCCTCACGCCCAGGGGTCTTTTACGGACAATGCTCTGAAATTTGT
695 Chicken CAAACCTCCTTCATCACCACTCGACCAGGAGTGTTTTACGGACAATGCTCAGAAATCTGC
696 Human CAAACCACTTTCACCGCTACACGACCGGGGGTATACTACGGTCAATGCTCTGAAATCTGT
697 Loach CAAACCGCCTTTATTGCCTCCCGCCCCGGGGTATTCTATGGGCAATGCTCAGAAATCTGT
698 Mouse CAAGCAACAGTAACATCAAACCGACCAGGGTTATTCTATGGCCAATGCTCTGAAATTTGT
699 Rat CAAGCTACAGTCACATCAAACCGACCAGGTCTATTCTATGGCCAATGCTCTGAAATTTGC
700 Seal CAAACAACCCTAATAACCATACGACCAGGACTGTACTACGGTCAATGCTCAGAAATCTGT
701 Whale CAAACAACCTTAATATCAACACGACCAGGCCTATTTTATGGACAATGCTCAGAGATCTGC
702 Frog CAAACATCATTTATTGCTACTCGTCCGGGAGTATTTTACGGACAATGTTCAGAAATTTGC
703
704 Cow GGGTCAAACCACAGTTTCATACCCATTGTCCTTGAGTTAGTCCCACTAAAGTACTTTGAA
705 Carp GGAGCTAATCACAGCTTTATACCAATTGTAGTTGAAGCAGTACCTCTCGAACACTTCGAA
706 Chicken GGAGCTAACCACAGCTACATACCCATTGTAGTAGAGTCTACCCCCCTAAAACACTTTGAA
707 Human GGAGCAAACCACAGTTTCATGCCCATCGTCCTAGAATTAATTCCCCTAAAAATCTTTGAA
708 Loach GGAGCAAACCACAGCTTTATACCCATCGTAGTAGAAGCGGTCCCACTATCTCACTTCGAA
709 Mouse GGATCTAACCATAGCTTTATGCCCATTGTCCTAGAAATGGTTCCACTAAAATATTTCGAA
710 Rat GGCTCAAATCACAGCTTCATACCCATTGTACTAGAAATAGTGCCTCTAAAATATTTCGAA
711 Seal GGTTCAAACCACAGCTTCATACCTATTGTCCTCGAATTGGTCCCACTATCCCACTTCGAG
712 Whale GGCTCAAACCACAGTTTCATACCAATTGTCCTAGAACTAGTACCCCTAGAAGTCTTTGAA
713 Frog GGAGCAAACCACAGCTTTATACCAATTGTAGTTGAAGCAGTACCGCTAACCGACTTTGAA
714
715 Cow AAATGATCTGCGTCAATATTA---------------------TAA
716 Carp AACTGATCCTCATTAATACTAGAAGACGCCTCGCTAGGAAGCTAA
717 Chicken GCCTGATCCTCACTA------------------CTGTCATCTTAA
718 Human ATA---------------------GGGCCCGTATTTACCCTATAG
719 Loach AACTGGTCCACCCTTATACTAAAAGACGCCTCACTAGGAAGCTAA
720 Mouse AACTGATCTGCTTCAATAATT---------------------TAA
721 Rat AACTGATCAGCTTCTATAATT---------------------TAA
722 Seal AAATGATCTACCTCAATGCTT---------------------TAA
723 Whale AAATGATCTGTATCAATACTA---------------------TAA
724 Frog AACTGATCTTCATCAATACTA---GAAGCATCACTA------AGA
725 ;
726 End;
727 """
728
729
730
731 nxs_example3 = \
732 """#NEXUS
733
734 Begin data;
735 Dimensions ntax=10 nchar=234;
736 Format datatype=protein gap=- interleave;
737 Matrix
738 Cow MAYPMQLGFQDATSPIMEELLHFHDHTLMIVFLISSLVLYIISLMLTTKLTHTSTMDAQE
739 Carp MAHPTQLGFKDAAMPVMEELLHFHDHALMIVLLISTLVLYIITAMVSTKLTNKYILDSQE
740 Chicken MANHSQLGFQDASSPIMEELVEFHDHALMVALAICSLVLYLLTLMLMEKLS-SNTVDAQE
741 Human MAHAAQVGLQDATSPIMEELITFHDHALMIIFLICFLVLYALFLTLTTKLTNTNISDAQE
742 Loach MAHPTQLGFQDAASPVMEELLHFHDHALMIVFLISALVLYVIITTVSTKLTNMYILDSQE
743 Mouse MAYPFQLGLQDATSPIMEELMNFHDHTLMIVFLISSLVLYIISLMLTTKLTHTSTMDAQE
744 Rat MAYPFQLGLQDATSPIMEELTNFHDHTLMIVFLISSLVLYIISLMLTTKLTHTSTMDAQE
745 Seal MAYPLQMGLQDATSPIMEELLHFHDHTLMIVFLISSLVLYIISLMLTTKLTHTSTMDAQE
746 Whale MAYPFQLGFQDAASPIMEELLHFHDHTLMIVFLISSLVLYIITLMLTTKLTHTSTMDAQE
747 Frog MAHPSQLGFQDAASPIMEELLHFHDHTLMAVFLISTLVLYIITIMMTTKLTNTNLMDAQE
748
749 Cow VETIWTILPAIILILIALPSLRILYMMDEINNPSLTVKTMGHQWYWSYEYTDYEDLSFDS
750 Carp IEIVWTILPAVILVLIALPSLRILYLMDEINDPHLTIKAMGHQWYWSYEYTDYENLGFDS
751 Chicken VELIWTILPAIVLVLLALPSLQILYMMDEIDEPDLTLKAIGHQWYWTYEYTDFKDLSFDS
752 Human METVWTILPAIILVLIALPSLRILYMTDEVNDPSLTIKSIGHQWYWTYEYTDYGGLIFNS
753 Loach IEIVWTVLPALILILIALPSLRILYLMDEINDPHLTIKAMGHQWYWSYEYTDYENLSFDS
754 Mouse VETIWTILPAVILIMIALPSLRILYMMDEINNPVLTVKTMGHQWYWSYEYTDYEDLCFDS
755 Rat VETIWTILPAVILILIALPSLRILYMMDEINNPVLTVKTMGHQWYWSYEYTDYEDLCFDS
756 Seal VETVWTILPAIILILIALPSLRILYMMDEINNPSLTVKTMGHQWYWSYEYTDYEDLNFDS
757 Whale VETVWTILPAIILILIALPSLRILYMMDEVNNPSLTVKTMGHQWYWSYEYTDYEDLSFDS
758 Frog IEMVWTIMPAISLIMIALPSLRILYLMDEVNDPHLTIKAIGHQWYWSYEYTNYEDLSFDS
759
760 Cow YMIPTSELKPGELRLLEVDNRVVLPMEMTIRMLVSSEDVLHSWAVPSLGLKTDAIPGRLN
761 Carp YMVPTQDLAPGQFRLLETDHRMVVPMESPVRVLVSAEDVLHSWAVPSLGVKMDAVPGRLN
762 Chicken YMTPTTDLPLGHFRLLEVDHRIVIPMESPIRVIITADDVLHSWAVPALGVKTDAIPGRLN
763 Human YMLPPLFLEPGDLRLLDVDNRVVLPIEAPIRMMITSQDVLHSWAVPTLGLKTDAIPGRLN
764 Loach YMIPTQDLTPGQFRLLETDHRMVVPMESPIRILVSAEDVLHSWALPAMGVKMDAVPGRLN
765 Mouse YMIPTNDLKPGELRLLEVDNRVVLPMELPIRMLISSEDVLHSWAVPSLGLKTDAIPGRLN
766 Rat YMIPTNDLKPGELRLLEVDNRVVLPMELPIRMLISSEDVLHSWAIPSLGLKTDAIPGRLN
767 Seal YMIPTQELKPGELRLLEVDNRVVLPMEMTIRMLISSEDVLHSWAVPSLGLKTDAIPGRLN
768 Whale YMIPTSDLKPGELRLLEVDNRVVLPMEMTIRMLVSSEDVLHSWAVPSLGLKTDAIPGRLN
769 Frog YMIPTNDLTPGQFRLLEVDNRMVVPMESPTRLLVTAEDVLHSWAVPSLGVKTDAIPGRLH
770
771 Cow QTTLMSSRPGLYYGQCSEICGSNHSFMPIVLELVPLKYFEKWSASML-------
772 Carp QAAFIASRPGVFYGQCSEICGANHSFMPIVVEAVPLEHFENWSSLMLEDASLGS
773 Chicken QTSFITTRPGVFYGQCSEICGANHSYMPIVVESTPLKHFEAWSSL------LSS
774 Human QTTFTATRPGVYYGQCSEICGANHSFMPIVLELIPLKIFEM-------GPVFTL
775 Loach QTAFIASRPGVFYGQCSEICGANHSFMPIVVEAVPLSHFENWSTLMLKDASLGS
776 Mouse QATVTSNRPGLFYGQCSEICGSNHSFMPIVLEMVPLKYFENWSASMI-------
777 Rat QATVTSNRPGLFYGQCSEICGSNHSFMPIVLEMVPLKYFENWSASMI-------
778 Seal QTTLMTMRPGLYYGQCSEICGSNHSFMPIVLELVPLSHFEKWSTSML-------
779 Whale QTTLMSTRPGLFYGQCSEICGSNHSFMPIVLELVPLEVFEKWSVSML-------
780 Frog QTSFIATRPGVFYGQCSEICGANHSFMPIVVEAVPLTDFENWSSSML-EASL--
781 ;
782 End;
783 """
784
785
786
787 sth_example = \
788 """# STOCKHOLM 1.0
789 #=GF ID CBS
790 #=GF AC PF00571
791 #=GF DE CBS domain
792 #=GF AU Bateman A
793 #=GF CC CBS domains are small intracellular modules mostly found
794 #=GF CC in 2 or four copies within a protein.
795 #=GF SQ 67
796 #=GS O31698/18-71 AC O31698
797 #=GS O83071/192-246 AC O83071
798 #=GS O83071/259-312 AC O83071
799 #=GS O31698/88-139 AC O31698
800 #=GS O31698/88-139 OS Bacillus subtilis
801 O83071/192-246 MTCRAQLIAVPRASSLAE..AIACAQKM....RVSRVPVYERS
802 #=GR O83071/192-246 SA 999887756453524252..55152525....36463774777
803 O83071/259-312 MQHVSAPVFVFECTRLAY..VQHKLRAH....SRAVAIVLDEY
804 #=GR O83071/259-312 SS CCCCCHHHHHHHHHHHHH..EEEEEEEE....EEEEEEEEEEE
805 O31698/18-71 MIEADKVAHVQVGNNLEH..ALLVLTKT....GYTAIPVLDPS
806 #=GR O31698/18-71 SS CCCHHHHHHHHHHHHHHH..EEEEEEEE....EEEEEEEEHHH
807 O31698/88-139 EVMLTDIPRLHINDPIMK..GFGMVINN......GFVCVENDE
808 #=GR O31698/88-139 SS CCCCCCCHHHHHHHHHHH..HEEEEEEE....EEEEEEEEEEH
809 #=GC SS_cons CCCCCHHHHHHHHHHHHH..EEEEEEEE....EEEEEEEEEEH
810 O31699/88-139 EVMLTDIPRLHINDPIMK..GFGMVINN......GFVCVENDE
811 #=GR O31699/88-139 AS ________________*__________________________
812 #=GR_O31699/88-139_IN ____________1______________2__________0____
813 //
814 """
815
816
817
818 sth_example2 = \
819 """# STOCKHOLM 1.0
820 #=GC SS_cons .................<<<<<<<<...<<<<<<<........>>>>>>>..
821 AP001509.1 UUAAUCGAGCUCAACACUCUUCGUAUAUCCUC-UCAAUAUGG-GAUGAGGGU
822 #=GR AP001509.1 SS -----------------<<<<<<<<---..<<-<<-------->>->>..--
823 AE007476.1 AAAAUUGAAUAUCGUUUUACUUGUUUAU-GUCGUGAAU-UGG-CACGA-CGU
824 #=GR AE007476.1 SS -----------------<<<<<<<<-----<<.<<-------->>.>>----
825
826 #=GC SS_cons ......<<<<<<<.......>>>>>>>..>>>>>>>>...............
827 AP001509.1 CUCUAC-AGGUA-CCGUAAA-UACCUAGCUACGAAAAGAAUGCAGUUAAUGU
828 #=GR AP001509.1 SS -------<<<<<--------->>>>>--->>>>>>>>---------------
829 AE007476.1 UUCUACAAGGUG-CCGG-AA-CACCUAACAAUAAGUAAGUCAGCAGUGAGAU
830 #=GR AE007476.1 SS ------.<<<<<--------->>>>>.-->>>>>>>>---------------
831 //"""
832
833
834
835 gbk_example = \
836 """LOCUS SCU49845 5028 bp DNA PLN 21-JUN-1999
837 DEFINITION Saccharomyces cerevisiae TCP1-beta gene, partial cds, and Axl2p
838 (AXL2) and Rev7p (REV7) genes, complete cds.
839 ACCESSION U49845
840 VERSION U49845.1 GI:1293613
841 KEYWORDS .
842 SOURCE Saccharomyces cerevisiae (baker's yeast)
843 ORGANISM Saccharomyces cerevisiae
844 Eukaryota; Fungi; Ascomycota; Saccharomycotina; Saccharomycetes;
845 Saccharomycetales; Saccharomycetaceae; Saccharomyces.
846 REFERENCE 1 (bases 1 to 5028)
847 AUTHORS Torpey,L.E., Gibbs,P.E., Nelson,J. and Lawrence,C.W.
848 TITLE Cloning and sequence of REV7, a gene whose function is required for
849 DNA damage-induced mutagenesis in Saccharomyces cerevisiae
850 JOURNAL Yeast 10 (11), 1503-1509 (1994)
851 PUBMED 7871890
852 REFERENCE 2 (bases 1 to 5028)
853 AUTHORS Roemer,T., Madden,K., Chang,J. and Snyder,M.
854 TITLE Selection of axial growth sites in yeast requires Axl2p, a novel
855 plasma membrane glycoprotein
856 JOURNAL Genes Dev. 10 (7), 777-793 (1996)
857 PUBMED 8846915
858 REFERENCE 3 (bases 1 to 5028)
859 AUTHORS Roemer,T.
860 TITLE Direct Submission
861 JOURNAL Submitted (22-FEB-1996) Terry Roemer, Biology, Yale University, New
862 Haven, CT, USA
863 FEATURES Location/Qualifiers
864 source 1..5028
865 /organism="Saccharomyces cerevisiae"
866 /db_xref="taxon:4932"
867 /chromosome="IX"
868 /map="9"
869 CDS <1..206
870 /codon_start=3
871 /product="TCP1-beta"
872 /protein_id="AAA98665.1"
873 /db_xref="GI:1293614"
874 /translation="SSIYNGISTSGLDLNNGTIADMRQLGIVESYKLKRAVVSSASEA
875 AEVLLRVDNIIRARPRTANRQHM"
876 gene 687..3158
877 /gene="AXL2"
878 CDS 687..3158
879 /gene="AXL2"
880 /note="plasma membrane glycoprotein"
881 /codon_start=1
882 /function="required for axial budding pattern of S.
883 cerevisiae"
884 /product="Axl2p"
885 /protein_id="AAA98666.1"
886 /db_xref="GI:1293615"
887 /translation="MTQLQISLLLTATISLLHLVVATPYEAYPIGKQYPPVARVNESF
888 TFQISNDTYKSSVDKTAQITYNCFDLPSWLSFDSSSRTFSGEPSSDLLSDANTTLYFN
889 VILEGTDSADSTSLNNTYQFVVTNRPSISLSSDFNLLALLKNYGYTNGKNALKLDPNE
890 VFNVTFDRSMFTNEESIVSYYGRSQLYNAPLPNWLFFDSGELKFTGTAPVINSAIAPE
891 TSYSFVIIATDIEGFSAVEVEFELVIGAHQLTTSIQNSLIINVTDTGNVSYDLPLNYV
892 YLDDDPISSDKLGSINLLDAPDWVALDNATISGSVPDELLGKNSNPANFSVSIYDTYG
893 DVIYFNFEVVSTTDLFAISSLPNINATRGEWFSYYFLPSQFTDYVNTNVSLEFTNSSQ
894 DHDWVKFQSSNLTLAGEVPKNFDKLSLGLKANQGSQSQELYFNIIGMDSKITHSNHSA
895 NATSTRSSHHSTSTSSYTSSTYTAKISSTSAAATSSAPAALPAANKTSSHNKKAVAIA
896 CGVAIPLGVILVALICFLIFWRRRRENPDDENLPHAISGPDLNNPANKPNQENATPLN
897 NPFDDDASSYDDTSIARRLAALNTLKLDNHSATESDISSVDEKRDSLSGMNTYNDQFQ
898 SQSKEELLAKPPVQPPESPFFDPQNRSSSVYMDSEPAVNKSWRYTGNLSPVSDIVRDS
899 YGSQKTVDTEKLFDLEAPEKEKRTSRDVTMSSLDPWNSNISPSPVRKSVTPSPYNVTK
900 HRNRHLQNIQDSQSGKNGITPTTMSTSSSDDFVPVKDGENFCWVHSMEPDRRPSKKRL
901 VDFSNKSNVNVGQVKDIHGRIPEML"
902 gene complement(3300..4037)
903 /gene="REV7"
904 CDS complement(3300..4037)
905 /gene="REV7"
906 /codon_start=1
907 /product="Rev7p"
908 /protein_id="AAA98667.1"
909 /db_xref="GI:1293616"
910 /translation="MNRWVEKWLRVYLKCYINLILFYRNVYPPQSFDYTTYQSFNLPQ
911 FVPINRHPALIDYIEELILDVLSKLTHVYRFSICIINKKNDLCIEKYVLDFSELQHVD
912 KDDQIITETEVFDEFRSSLNSLIMHLEKLPKVNDDTITFEAVINAIELELGHKLDRNR
913 RVDSLEEKAEIERDSNWVKCQEDENLPDNNGFQPPKIKLTSLVGSDVGPLIIHQFSEK
914 LISGDDKILNGVYSQYEEGESIFGSLF"
915 ORIGIN
916 1 gatcctccat atacaacggt atctccacct caggtttaga tctcaacaac ggaaccattg
917 61 ccgacatgag acagttaggt atcgtcgaga gttacaagct aaaacgagca gtagtcagct
918 121 ctgcatctga agccgctgaa gttctactaa gggtggataa catcatccgt gcaagaccaa
919 181 gaaccgccaa tagacaacat atgtaacata tttaggatat acctcgaaaa taataaaccg
920 241 ccacactgtc attattataa ttagaaacag aacgcaaaaa ttatccacta tataattcaa
921 301 agacgcgaaa aaaaaagaac aacgcgtcat agaacttttg gcaattcgcg tcacaaataa
922 361 attttggcaa cttatgtttc ctcttcgagc agtactcgag ccctgtctca agaatgtaat
923 421 aatacccatc gtaggtatgg ttaaagatag catctccaca acctcaaagc tccttgccga
924 481 gagtcgccct cctttgtcga gtaattttca cttttcatat gagaacttat tttcttattc
925 541 tttactctca catcctgtag tgattgacac tgcaacagcc accatcacta gaagaacaga
926 601 acaattactt aatagaaaaa ttatatcttc ctcgaaacga tttcctgctt ccaacatcta
927 661 cgtatatcaa gaagcattca cttaccatga cacagcttca gatttcatta ttgctgacag
928 721 ctactatatc actactccat ctagtagtgg ccacgcccta tgaggcatat cctatcggaa
929 781 aacaataccc cccagtggca agagtcaatg aatcgtttac atttcaaatt tccaatgata
930 841 cctataaatc gtctgtagac aagacagctc aaataacata caattgcttc gacttaccga
931 901 gctggctttc gtttgactct agttctagaa cgttctcagg tgaaccttct tctgacttac
932 961 tatctgatgc gaacaccacg ttgtatttca atgtaatact cgagggtacg gactctgccg
933 1021 acagcacgtc tttgaacaat acataccaat ttgttgttac aaaccgtcca tccatctcgc
934 1081 tatcgtcaga tttcaatcta ttggcgttgt taaaaaacta tggttatact aacggcaaaa
935 1141 acgctctgaa actagatcct aatgaagtct tcaacgtgac ttttgaccgt tcaatgttca
936 1201 ctaacgaaga atccattgtg tcgtattacg gacgttctca gttgtataat gcgccgttac
937 1261 ccaattggct gttcttcgat tctggcgagt tgaagtttac tgggacggca ccggtgataa
938 1321 actcggcgat tgctccagaa acaagctaca gttttgtcat catcgctaca gacattgaag
939 1381 gattttctgc cgttgaggta gaattcgaat tagtcatcgg ggctcaccag ttaactacct
940 1441 ctattcaaaa tagtttgata atcaacgtta ctgacacagg taacgtttca tatgacttac
941 1501 ctctaaacta tgtttatctc gatgacgatc ctatttcttc tgataaattg ggttctataa
942 1561 acttattgga tgctccagac tgggtggcat tagataatgc taccatttcc gggtctgtcc
943 1621 cagatgaatt actcggtaag aactccaatc ctgccaattt ttctgtgtcc atttatgata
944 1681 cttatggtga tgtgatttat ttcaacttcg aagttgtctc cacaacggat ttgtttgcca
945 1741 ttagttctct tcccaatatt aacgctacaa ggggtgaatg gttctcctac tattttttgc
946 1801 cttctcagtt tacagactac gtgaatacaa acgtttcatt agagtttact aattcaagcc
947 1861 aagaccatga ctgggtgaaa ttccaatcat ctaatttaac attagctgga gaagtgccca
948 1921 agaatttcga caagctttca ttaggtttga aagcgaacca aggttcacaa tctcaagagc
949 1981 tatattttaa catcattggc atggattcaa agataactca ctcaaaccac agtgcgaatg
950 2041 caacgtccac aagaagttct caccactcca cctcaacaag ttcttacaca tcttctactt
951 2101 acactgcaaa aatttcttct acctccgctg ctgctacttc ttctgctcca gcagcgctgc
952 2161 cagcagccaa taaaacttca tctcacaata aaaaagcagt agcaattgcg tgcggtgttg
953 2221 ctatcccatt aggcgttatc ctagtagctc tcatttgctt cctaatattc tggagacgca
954 2281 gaagggaaaa tccagacgat gaaaacttac cgcatgctat tagtggacct gatttgaata
955 2341 atcctgcaaa taaaccaaat caagaaaacg ctacaccttt gaacaacccc tttgatgatg
956 2401 atgcttcctc gtacgatgat acttcaatag caagaagatt ggctgctttg aacactttga
957 2461 aattggataa ccactctgcc actgaatctg atatttccag cgtggatgaa aagagagatt
958 2521 ctctatcagg tatgaataca tacaatgatc agttccaatc ccaaagtaaa gaagaattat
959 2581 tagcaaaacc cccagtacag cctccagaga gcccgttctt tgacccacag aataggtctt
960 2641 cttctgtgta tatggatagt gaaccagcag taaataaatc ctggcgatat actggcaacc
961 2701 tgtcaccagt ctctgatatt gtcagagaca gttacggatc acaaaaaact gttgatacag
962 2761 aaaaactttt cgatttagaa gcaccagaga aggaaaaacg tacgtcaagg gatgtcacta
963 2821 tgtcttcact ggacccttgg aacagcaata ttagcccttc tcccgtaaga aaatcagtaa
964 2881 caccatcacc atataacgta acgaagcatc gtaaccgcca cttacaaaat attcaagact
965 2941 ctcaaagcgg taaaaacgga atcactccca caacaatgtc aacttcatct tctgacgatt
966 3001 ttgttccggt taaagatggt gaaaattttt gctgggtcca tagcatggaa ccagacagaa
967 3061 gaccaagtaa gaaaaggtta gtagattttt caaataagag taatgtcaat gttggtcaag
968 3121 ttaaggacat tcacggacgc atcccagaaa tgctgtgatt atacgcaacg atattttgct
969 3181 taattttatt ttcctgtttt attttttatt agtggtttac agatacccta tattttattt
970 3241 agtttttata cttagagaca tttaatttta attccattct tcaaatttca tttttgcact
971 3301 taaaacaaag atccaaaaat gctctcgccc tcttcatatt gagaatacac tccattcaaa
972 3361 attttgtcgt caccgctgat taatttttca ctaaactgat gaataatcaa aggccccacg
973 3421 tcagaaccga ctaaagaagt gagttttatt ttaggaggtt gaaaaccatt attgtctggt
974 3481 aaattttcat cttcttgaca tttaacccag tttgaatccc tttcaatttc tgctttttcc
975 3541 tccaaactat cgaccctcct gtttctgtcc aacttatgtc ctagttccaa ttcgatcgca
976 3601 ttaataactg cttcaaatgt tattgtgtca tcgttgactt taggtaattt ctccaaatgc
977 3661 ataatcaaac tatttaagga agatcggaat tcgtcgaaca cttcagtttc cgtaatgatc
978 3721 tgatcgtctt tatccacatg ttgtaattca ctaaaatcta aaacgtattt ttcaatgcat
979 3781 aaatcgttct ttttattaat aatgcagatg gaaaatctgt aaacgtgcgt taatttagaa
980 3841 agaacatcca gtataagttc ttctatatag tcaattaaag caggatgcct attaatggga
981 3901 acgaactgcg gcaagttgaa tgactggtaa gtagtgtagt cgaatgactg aggtgggtat
982 3961 acatttctat aaaataaaat caaattaatg tagcatttta agtataccct cagccacttc
983 4021 tctacccatc tattcataaa gctgacgcaa cgattactat tttttttttc ttcttggatc
984 4081 tcagtcgtcg caaaaacgta taccttcttt ttccgacctt ttttttagct ttctggaaaa
985 4141 gtttatatta gttaaacagg gtctagtctt agtgtgaaag ctagtggttt cgattgactg
986 4201 atattaagaa agtggaaatt aaattagtag tgtagacgta tatgcatatg tatttctcgc
987 4261 ctgtttatgt ttctacgtac ttttgattta tagcaagggg aaaagaaata catactattt
988 4321 tttggtaaag gtgaaagcat aatgtaaaag ctagaataaa atggacgaaa taaagagagg
989 4381 cttagttcat cttttttcca aaaagcaccc aatgataata actaaaatga aaaggatttg
990 4441 ccatctgtca gcaacatcag ttgtgtgagc aataataaaa tcatcacctc cgttgccttt
991 4501 agcgcgtttg tcgtttgtat cttccgtaat tttagtctta tcaatgggaa tcataaattt
992 4561 tccaatgaat tagcaatttc gtccaattct ttttgagctt cttcatattt gctttggaat
993 4621 tcttcgcact tcttttccca ttcatctctt tcttcttcca aagcaacgat ccttctaccc
994 4681 atttgctcag agttcaaatc ggcctctttc agtttatcca ttgcttcctt cagtttggct
995 4741 tcactgtctt ctagctgttg ttctagatcc tggtttttct tggtgtagtt ctcattatta
996 4801 gatctcaagt tattggagtc ttcagccaat tgctttgtat cagacaattg actctctaac
997 4861 ttctccactt cactgtcgag ttgctcgttt ttagcggaca aagatttaat ctcgttttct
998 4921 ttttcagtgt tagattgctc taattctttg agctgttctc tcagctcctc atatttttct
999 4981 tgccatgact cagattctaa ttttaagcta ttcaatttct ctttgatc
1000 //"""
1001
1002
1003
1004 gbk_example2 = \
1005 """LOCUS AAD51968 143 aa linear BCT 21-AUG-2001
1006 DEFINITION transcriptional regulator RovA [Yersinia enterocolitica].
1007 ACCESSION AAD51968
1008 VERSION AAD51968.1 GI:5805369
1009 DBSOURCE locus AF171097 accession AF171097.1
1010 KEYWORDS .
1011 SOURCE Yersinia enterocolitica
1012 ORGANISM Yersinia enterocolitica
1013 Bacteria; Proteobacteria; Gammaproteobacteria; Enterobacteriales;
1014 Enterobacteriaceae; Yersinia.
1015 REFERENCE 1 (residues 1 to 143)
1016 AUTHORS Revell,P.A. and Miller,V.L.
1017 TITLE A chromosomally encoded regulator is required for expression of the
1018 Yersinia enterocolitica inv gene and for virulence
1019 JOURNAL Mol. Microbiol. 35 (3), 677-685 (2000)
1020 MEDLINE 20138369
1021 PUBMED 10672189
1022 REFERENCE 2 (residues 1 to 143)
1023 AUTHORS Revell,P.A. and Miller,V.L.
1024 TITLE Direct Submission
1025 JOURNAL Submitted (22-JUL-1999) Molecular Microbiology, Washington
1026 University School of Medicine, Campus Box 8230, 660 South Euclid,
1027 St. Louis, MO 63110, USA
1028 COMMENT Method: conceptual translation.
1029 FEATURES Location/Qualifiers
1030 source 1..143
1031 /organism="Yersinia enterocolitica"
1032 /mol_type="unassigned DNA"
1033 /strain="JB580v"
1034 /serotype="O:8"
1035 /db_xref="taxon:630"
1036 Protein 1..143
1037 /product="transcriptional regulator RovA"
1038 /name="regulates inv expression"
1039 CDS 1..143
1040 /gene="rovA"
1041 /coded_by="AF171097.1:380..811"
1042 /note="regulator of virulence"
1043 /transl_table=11
1044 ORIGIN
1045 1 mestlgsdla rlvrvwrali dhrlkplelt qthwvtlhni nrlppeqsqi qlakaigieq
1046 61 pslvrtldql eekglitrht candrrakri klteqsspii eqvdgvicst rkeilggisp
1047 121 deiellsgli dklerniiql qsk
1048 //"""
1049
1050
1051
1052 print "#########################################################"
1053 print "# Sequence Input Tests #"
1054 print "#########################################################"
1055
1056
1057
# Fixture table for the self-test loop.  Each entry is a tuple of
#   (data, format, rec_count, last_id, last_seq, dict_check)
# where
#   data       - the example file contents, as a string
#   format     - the format name handed to read() / parse()
#   rec_count  - number of records expected in the single alignment
#   last_id    - expected id of the last record in that alignment
#   last_seq   - expected sequence of the last record, or None to skip
#                the sequence comparison
#   dict_check - boolean flag; unpacked by the loop but not otherwise
#                used by the checks visible in this part of the file
# Long sequences are split across adjacent string literals, which Python
# joins at compile time.
tests = [
    (aln_example, "clustal", 8, "HISJ_E_COLI",
     "MKKLVLSLSLVLAFSSATAAF-------------------AAIPQNIRIG"
     "TDPTYAPFESKNS-QGELVGFDIDLAKELCKRINTQCTFVENPLDALIPS"
     "LKAKKIDAIMSSLSITEKRQQEIAFTDKLYAADSRLVVAKNSDIQP-TVE"
     "SLKGKRVGVLQGTTQETFGNEHWAPKGIEIVSYQGQDNIYSDLTAGRIDA"
     "AFQDEVAASEGFLKQPVGKDYKFGGPSVKDEKLFGVGTGMGLRKED--NE"
     "LREALNKAFAEMRADGTYEKLAKKYFDFDVYGG---", True),
    (phy_example, "phylip", 8, "HISJ_E_COL", None, False),
    (nxs_example, "nexus", 8, "HISJ_E_COLI", None, True),
    (nxs_example2, "nexus", 10, "Frog",
     "ATGGCACACCCATCACAATTAGGTTTTCAAGACGCAGCCTCTCCAATTATAGAAGAATTA"
     "CTTCACTTCCACGACCATACCCTCATAGCCGTTTTTCTTATTAGTACGCTAGTTCTTTAC"
     "ATTATTACTATTATAATAACTACTAAACTAACTAATACAAACCTAATGGACGCACAAGAG"
     "ATCGAAATAGTGTGAACTATTATACCAGCTATTAGCCTCATCATAATTGCCCTTCCATCC"
     "CTTCGTATCCTATATTTAATAGATGAAGTTAATGATCCACACTTAACAATTAAAGCAATC"
     "GGCCACCAATGATACTGAAGCTACGAATATACTAACTATGAGGATCTCTCATTTGACTCT"
     "TATATAATTCCAACTAATGACCTTACCCCTGGACAATTCCGGCTGCTAGAAGTTGATAAT"
     "CGAATAGTAGTCCCAATAGAATCTCCAACCCGACTTTTAGTTACAGCCGAAGACGTCCTC"
     "CACTCGTGAGCTGTACCCTCCTTGGGTGTCAAAACAGATGCAATCCCAGGACGACTTCAT"
     "CAAACATCATTTATTGCTACTCGTCCGGGAGTATTTTACGGACAATGTTCAGAAATTTGC"
     "GGAGCAAACCACAGCTTTATACCAATTGTAGTTGAAGCAGTACCGCTAACCGACTTTGAA"
     "AACTGATCTTCATCAATACTA---GAAGCATCACTA------AGA", True),
    (nxs_example3, "nexus", 10, "Frog",
     "MAHPSQLGFQDAASPIMEELLHFHDHTLMAVFLISTLVLYIITIMMTTKLTNTNLMDAQE"
     "IEMVWTIMPAISLIMIALPSLRILYLMDEVNDPHLTIKAIGHQWYWSYEYTNYEDLSFDS"
     "YMIPTNDLTPGQFRLLEVDNRMVVPMESPTRLLVTAEDVLHSWAVPSLGVKTDAIPGRLH"
     "QTSFIATRPGVFYGQCSEICGANHSFMPIVVEAVPLTDFENWSSSML-EASL--", True),
    (sth_example, "stockholm", 5, "O31699/88-139",
     "EVMLTDIPRLHINDPIMK--GFGMVINN------GFVCVENDE", True),
    (sth_example2, "stockholm", 2, "AE007476.1",
     "AAAAUUGAAUAUCGUUUUACUUGUUUAU-GUCGUGAAU-UGG-CACGA-CGU"
     "UUCUACAAGGUG-CCGG-AA-CACCUAACAAUAAGUAAGUCAGCAGUGAGAU", True),
    (gbk_example, "genbank", 1, "U49845.1", None, True),
    (gbk_example2, "genbank", 1, "AAD51968.1",
     "MESTLGSDLARLVRVWRALIDHRLKPLELTQTHWVTLHNINRLPPEQSQIQLAKAIGIEQ"
     "PSLVRTLDQLEEKGLITRHTCANDRRAKRIKLTEQSSPIIEQVDGVICSTRKEILGGISP"
     "DEIELLSGLIDKLERNIIQLQSK", True),
]
1097
1098 for (data, format, rec_count, last_id, last_seq, dict_check) in tests:
1099
1100 print "%s file with %i records" % (format, rec_count)
1101
1102 print "Bio.AlignIO.read(handle, format)"
1103 alignment = read(StringIO(data), format)
1104 assert len(alignment.get_all_seqs()) == rec_count
1105
1106 print "Bio.AlignIO.read(handle, format, seq_count)"
1107 alignment = read(StringIO(data), format, rec_count)
1108 assert len(alignment.get_all_seqs()) == rec_count
1109
1110 print "Bio.AlignIO.parse(handle, format)"
1111
1112
1113 iterator = parse(StringIO(data), format=format)
1114 as_list = list(iterator)
1115
1116 assert len(as_list) == 1
1117 assert len(as_list[0].get_all_seqs()) == rec_count, \
1118 "Expected %i records, found %i" \
1119 % (rec_count, len(as_list[0].get_all_seqs()))
1120 assert as_list[0].get_all_seqs()[-1].id == last_id, \
1121 "Expected '%s' as last record ID, found '%s'" \
1122 % (last_id, as_list[0].get_all_seqs()[-1].id)
1123 if last_seq :
1124 assert as_list[0].get_all_seqs()[-1].seq.tostring() == last_seq
1125
1126 print "Bio.AlignIO.parse(handle, format, seq_count)"
1127 as_list2 = list(parse(StringIO(data), format=format, seq_count=rec_count))
1128 assert len(as_list2) == len(as_list)
1129 for a1, a2 in zip(as_list, as_list2) :
1130 assert align_cmp(a1, a2)
1131
1132 half = rec_count / 2
1133 if half*2 == rec_count :
1134
1135
1136
1137
1138 try :
1139 list(parse(StringIO(data), format=format, seq_count=half))
1140 assert format not in _FormatToIterator
1141 except ValueError, e :
1142 assert format in _FormatToIterator, \
1143 "Format %s, %s" % (format, str(e))
1144 del half
1145
1146
1147 print "Iteration using .next()"
1148
1149 iterator = parse(StringIO(data), format=format)
1150 count = 1
1151 alignment = iterator.next()
1152 assert alignment is not None
1153 assert str(alignment.__class__) == "Bio.Align.Generic.Alignment"
1154
1155 for alignment in iterator :
1156 assert len(alignment.get_all_seqs()) == len(as_list[0].get_all_seqs())
1157 count = count + 1
1158 assert count == len(as_list)
1159
1160
1161 iterator = parse(StringIO(data), format=format)
1162 count = 0
1163 while True :
1164 try :
1165 alignment = iterator.next()
1166 except StopIteration :
1167 break
1168 if alignment is None : break
1169 assert len(alignment.get_all_seqs()) == len(as_list[0].get_all_seqs())
1170 count = count + 1
1171 assert count == len(as_list)
1172
1173 print "parse(handle)"
1174 iterator = parse(StringIO(data), format=format)
1175 for (i, alignment) in enumerate(iterator) :
1176 pass
1177 assert i+1 == len(as_list)
1178
1179 if format not in ["nexus"] :
1180 print "Triple copy of data"
1181
1182
1183 iterator = parse(StringIO(data + "\n" + data + "\n" + data), format=format)
1184 triple_list = list(iterator)
1185 if format in _FormatToIterator :
1186
1187 assert len(triple_list) == 3
1188 for a in triple_list :
1189 assert len(a.get_all_seqs()) == rec_count
1190 else :
1191
1192 assert len(triple_list) == 1
1193 assert len(triple_list[0].get_all_seqs()) == 3 * rec_count
1194
1195
1196 assert 3==len(list(parse(StringIO(data + "\n" + data + "\n" + data), format, rec_count)))
1197
1198 try :
1199 alignment = read(StringIO(data + "\n" + data + "\n" + data), format, rec_count)
1200 assert False, "Should have failed"
1201 except ValueError :
1202
1203 pass
1204
1205 for out_format in _FormatToWriter :
1206 print "writing to %s" % out_format
1207
1208 handle = StringIO()
1209
1210 try :
1211 write(as_list, handle=handle, format=out_format)
1212 except ValueError, e :
1213
1214
1215 print "Failed: %s" % str(e)
1216
1217 continue
1218
1219 print
1220
1221 print "#########################################################"
1222 print "# AlignIO Tests finished #"
1223 print "#########################################################"
1224