Package Bio :: Package AlignIO :: Module NexusIO
[hide private]
[frames | no frames]

Source Code for Module Bio.AlignIO.NexusIO

  1  # Copyright 2008 by Peter Cock.  All rights reserved. 
  2  # 
  3  # This code is part of the Biopython distribution and governed by its 
  4  # license.  Please see the LICENSE file that should have been included 
  5  # as part of this package. 
  6  """Bio.AlignIO support for the "nexus" file format. 
  7   
  8  You are expected to use this module via the Bio.AlignIO functions 
  9  (or the Bio.SeqIO functions). 
 10   
 11  See also the Bio.Nexus module (which this code calls internally), 
 12  as this offers more than just accessing the alignment or its 
 13  sequences as SeqRecord objects. 
 14  """ 
 15   
 16  from Bio.Nexus import Nexus 
 17  from Bio.Align.Generic import Alignment 
 18  from Bio.SeqRecord import SeqRecord 
 19   
 20  #You can get a couple of example files here: 
 21  #http://www.molecularevolution.org/resources/fileformats/ 
 22       
 23  #This is a generator function! 
def NexusIterator(handle, seq_count=None):
    """Yield the single Alignment object found in a Nexus file.

    This is a generator function.  It uses the Bio.Nexus module to do
    the hard work.

    Arguments:
     - handle    - input handle, passed directly to Nexus.Nexus().
     - seq_count - optional expected number of sequences.  When given
                   (and non-zero) it is checked against the number of
                   taxon labels found in the file.

    Raises ValueError if seq_count disagrees with the number of taxon
    labels found.

    NOTE - We only expect ONE alignment matrix per Nexus file,
    meaning this iterator will only yield one Alignment.  If the parsed
    file has no matrix, nothing is yielded at all.
    """
    n = Nexus.Nexus(handle)
    if not n.matrix:
        # No alignment found.  End the generator with a plain return:
        # an explicit "raise StopIteration" inside a generator body is
        # turned into a RuntimeError by newer Python versions (PEP 479).
        return
    alignment = Alignment(n.alphabet)

    # Bio.Nexus deals with duplicated names by adding a '.copy' suffix.
    # The original names and the modified names are kept in these two lists:
    assert len(n.unaltered_taxlabels) == len(n.taxlabels)

    # This checks caller-supplied input, so raise a real exception rather
    # than using an assert (asserts are stripped when running with -O).
    if seq_count and seq_count != len(n.unaltered_taxlabels):
        raise ValueError("Found %i sequences, but seq_count=%i"
                         % (len(n.unaltered_taxlabels), seq_count))

    for old_name, new_name in zip(n.unaltered_taxlabels, n.taxlabels):
        # Internal sanity check on the Bio.Nexus renaming scheme.
        assert new_name.startswith(old_name)
        seq = n.matrix[new_name]  # already a Seq object with the alphabet set
        # ToDo - Can we extract any annotation too?
        # ToDo - Avoid abusing the private _records list
        alignment._records.append(SeqRecord(seq,
                                            id=new_name,
                                            name=old_name,
                                            description=""))
    # All done
    yield alignment
55 56 if __name__ == "__main__" : 57 from StringIO import StringIO 58 print "Quick self test" 59 print 60 print "Repeated names without a TAXA block" 61 handle = StringIO("""#NEXUS 62 [TITLE: NoName] 63 64 begin data; 65 dimensions ntax=4 nchar=50; 66 format interleave datatype=protein gap=- symbols="FSTNKEYVQMCLAWPHDRIG"; 67 68 matrix 69 CYS1_DICDI -----MKVIL LFVLAVFTVF VSS------- --------RG IPPEEQ---- 70 ALEU_HORVU MAHARVLLLA LAVLATAAVA VASSSSFADS NPIRPVTDRA ASTLESAVLG 71 CATH_HUMAN ------MWAT LPLLCAGAWL LGV------- -PVCGAAELS VNSLEK---- 72 CYS1_DICDI -----MKVIL LFVLAVFTVF VSS------- --------RG IPPEEQ---X 73 ; 74 end; 75 """) 76 for a in NexusIterator(handle) : 77 print a 78 for r in a : 79 print repr(r.seq), r.name, r.id 80 print "Done" 81 82 print 83 print "Repeated names with a TAXA block" 84 handle = StringIO("""#NEXUS 85 [TITLE: NoName] 86 87 begin taxa 88 CYS1_DICDI 89 ALEU_HORVU 90 CATH_HUMAN 91 CYS1_DICDI; 92 end; 93 94 begin data; 95 dimensions ntax=4 nchar=50; 96 format interleave datatype=protein gap=- symbols="FSTNKEYVQMCLAWPHDRIG"; 97 98 matrix 99 CYS1_DICDI -----MKVIL LFVLAVFTVF VSS------- --------RG IPPEEQ---- 100 ALEU_HORVU MAHARVLLLA LAVLATAAVA VASSSSFADS NPIRPVTDRA ASTLESAVLG 101 CATH_HUMAN ------MWAT LPLLCAGAWL LGV------- -PVCGAAELS VNSLEK---- 102 CYS1_DICDI -----MKVIL LFVLAVFTVF VSS------- --------RG IPPEEQ---X 103 ; 104 end; 105 """) 106 for a in NexusIterator(handle) : 107 print a 108 for r in a : 109 print repr(r.seq), r.name, r.id 110 print "Done" 111 print 112 print "Reading an empty file" 113 assert 0 == len(list(NexusIterator(StringIO()))) 114 print "Done" 115