Package Bio :: Package Nexus :: Module Nexus
[hide private]
[frames] | no frames]

Source Code for Module Bio.Nexus.Nexus

   1  # Nexus.py - a NEXUS parser 
   2  # 
   3  # Copyright 2005-2008 by Frank Kauff & Cymon J. Cox. All rights reserved. 
   4  # This code is part of the Biopython distribution and governed by its 
   5  # license. Please see the LICENSE file that should have been included 
   6  # as part of this package. 
   7  #  
   8  # Bug reports welcome: fkauff@biologie.uni-kl.de 
   9  # 
  10   
  11  """Nexus class. Parse the contents of a nexus file. 
  12  Based upon 'NEXUS: An extensible file format for systematic information' 
  13  Maddison, Swofford, Maddison. 1997. Syst. Biol. 46(4):590-621 
  14  """ 
  15   
  16  import os,sys, math, random, copy 
  17  import sets 
  18   
  19  from Bio.Alphabet import IUPAC 
  20  from Bio.Data import IUPACData 
  21  from Bio.Seq import Seq 
  22   
  23  from Trees import Tree,NodeData 
  24   
  25  try: 
  26      import cnexus 
  27  except ImportError: 
  28      C=False 
  29  else: 
  30      C=True 
  31   
  32  INTERLEAVE=70 
  33  SPECIAL_COMMANDS=['charstatelabels','charlabels','taxlabels', 'taxset', 'charset','charpartition','taxpartition',\ 
  34          'matrix','tree', 'utree','translate','codonposset','title'] 
  35  KNOWN_NEXUS_BLOCKS = ['trees','data', 'characters', 'taxa', 'sets','codons'] 
  36  PUNCTUATION='()[]{}/\,;:=*\'"`+-<>' 
  37  MRBAYESSAFE='abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ1234567890_' 
  38  WHITESPACE=' \t\n' 
  39  #SPECIALCOMMENTS=['!','&','%','/','\\','@'] #original list of special comments 
  40  SPECIALCOMMENTS=['&'] # supported special comment ('tree' command), all others are ignored 
  41  CHARSET='chars' 
  42  TAXSET='taxa' 
  43  CODONPOSITIONS='codonpositions' 
  44  DEFAULTNEXUS='#NEXUS\nbegin data; dimensions ntax=0 nchar=0; format datatype=dna; end; ' 
45 -class NexusError(Exception): pass
46
47 -class CharBuffer:
48 """Helps reading NEXUS-words and characters from a buffer."""
49 - def __init__(self,string):
50 if string: 51 self.buffer=list(string) 52 else: 53 self.buffer=[]
54
55 - def peek(self):
56 if self.buffer: 57 return self.buffer[0] 58 else: 59 return None
60
61 - def peek_nonwhitespace(self):
62 b=''.join(self.buffer).strip() 63 if b: 64 return b[0] 65 else: 66 return None
67
68 - def next(self):
69 if self.buffer: 70 return self.buffer.pop(0) 71 else: 72 return None
73
74 - def next_nonwhitespace(self):
75 while True: 76 p=self.next() 77 if p is None: 78 break 79 if p not in WHITESPACE: 80 return p 81 return None
82
83 - def skip_whitespace(self):
84 while self.buffer[0] in WHITESPACE: 85 self.buffer=self.buffer[1:]
86
87 - def next_until(self,target):
88 for t in target: 89 try: 90 pos=self.buffer.index(t) 91 except ValueError: 92 pass 93 else: 94 found=''.join(self.buffer[:pos]) 95 self.buffer=self.buffer[pos:] 96 return found 97 else: 98 return None
99
100 - def peek_word(self,word):
101 return ''.join(self.buffer[:len(word)])==word
102
103 - def next_word(self):
104 """Return the next NEXUS word from a string, dealing with single and double quotes, 105 whitespace and punctuation. 106 """ 107 108 word=[] 109 quoted=False 110 first=self.next_nonwhitespace() # get first character 111 if not first: # return empty if only whitespace left 112 return None 113 word.append(first) 114 if first=="'": # word starts with a quote 115 quoted=True 116 elif first in PUNCTUATION: # if it's punctuation, return immediately 117 return first 118 while True: 119 c=self.peek() 120 if c=="'": # a quote? 121 word.append(self.next()) # store quote 122 if self.peek()=="'": # double quote 123 skip=self.next() # skip second quote 124 elif quoted: # second single quote ends word 125 break 126 elif quoted: 127 word.append(self.next()) # if quoted, then add anything 128 elif not c or c in PUNCTUATION or c in WHITESPACE: # if not quoted and special character, stop 129 break 130 else: 131 word.append(self.next()) # standard character 132 return ''.join(word)
133
134 - def rest(self):
135 """Return the rest of the string without parsing.""" 136 return ''.join(self.buffer)
137
138 -class StepMatrix:
139 """Calculate a stepmatrix for weighted parsimony. 140 See Wheeler (1990), Cladistics 6:269-275. 141 """ 142
143 - def __init__(self,symbols,gap):
144 self.data={} 145 self.symbols=[s for s in symbols] 146 self.symbols.sort() 147 if gap: 148 self.symbols.append(gap) 149 for x in self.symbols: 150 for y in [s for s in self.symbols if s!=x]: 151 self.set(x,y,0)
152
153 - def set(self,x,y,value):
154 if x>y: 155 x,y=y,x 156 self.data[x+y]=value
157
158 - def add(self,x,y,value):
159 if x>y: 160 x,y=y,x 161 self.data[x+y]+=value
162
163 - def sum(self):
164 return reduce(lambda x,y:x+y,self.data.values())
165
166 - def transformation(self):
167 total=self.sum() 168 if total!=0: 169 for k in self.data: 170 self.data[k]=self.data[k]/float(total) 171 return self
172
173 - def weighting(self):
174 for k in self.data: 175 if self.data[k]!=0: 176 self.data[k]=-math.log(self.data[k]) 177 return self
178
179 - def smprint(self,name='your_name_here'):
180 matrix='usertype %s stepmatrix=%d\n' % (name,len(self.symbols)) 181 matrix+=' %s\n' % ' '.join(self.symbols) 182 for x in self.symbols: 183 matrix+='[%s]'.ljust(8) % x 184 for y in self.symbols: 185 if x==y: 186 matrix+=' . ' 187 else: 188 if x>y: 189 x1,y1=y,x 190 else: 191 x1,y1=x,y 192 if self.data[x1+y1]==0: 193 matrix+='inf. ' 194 else: 195 matrix+='%2.2f'.ljust(10) % (self.data[x1+y1]) 196 matrix+='\n' 197 matrix+=';\n' 198 return matrix
199
200 -def safename(name,mrbayes=False):
201 """Return a taxon identifier according to NEXUS standard. 202 Wrap quotes around names with punctuation or whitespace, and double single quotes. 203 mrbayes=True: write names without quotes, whitespace or punctuation for mrbayes. 204 """ 205 if mrbayes: 206 safe=name.replace(' ','_') 207 safe=''.join([c for c in safe if c in MRBAYESSAFE]) 208 else: 209 safe=name.replace("'","''") 210 if sets.Set(safe).intersection(sets.Set(WHITESPACE+PUNCTUATION)): 211 safe="'"+safe+"'" 212 return safe
213
214 -def quotestrip(word):
215 """Remove quotes and/or double quotes around identifiers.""" 216 if not word: 217 return None 218 while (word.startswith("'") and word.endswith("'")) or (word.startswith('"') and word.endswith('"')): 219 word=word[1:-1] 220 return word
221
222 -def get_start_end(sequence, skiplist=['-','?']):
223 """Return position of first and last character which is not in skiplist (defaults to ['-','?']).""" 224 225 length=len(sequence) 226 if length==0: 227 return None,None 228 end=length-1 229 while end>=0 and (sequence[end] in skiplist): 230 end-=1 231 start=0 232 while start<length and (sequence[start] in skiplist): 233 start+=1 234 if start==length and end==-1: # empty sequence 235 return -1,-1 236 else: 237 return start,end
238
239 -def _sort_keys_by_values(p):
240 """Returns a sorted list of keys of p sorted by values of p.""" 241 startpos=[(p[pn],pn) for pn in p if p[pn]] 242 startpos.sort() 243 return zip(*startpos)[1]
244
245 -def _make_unique(l):
246 """Check that all values in list are unique and return a pruned and sorted list.""" 247 l=list(sets.Set(l)) 248 l.sort() 249 return l
250
251 -def _unique_label(previous_labels,label):
252 """Returns a unique name if label is already in previous_labels.""" 253 while label in previous_labels: 254 if label.split('.')[-1].startswith('copy'): 255 label='.'.join(label.split('.')[:-1])+'.copy'+str(eval('0'+label.split('.')[-1][4:])+1) 256 else: 257 label+='.copy' 258 return label
259
260 -def _seqmatrix2strmatrix(matrix):
261 """Converts a Seq-object matrix to a plain sequence-string matrix.""" 262 return dict([(t,matrix[t].tostring()) for t in matrix])
263
264 -def _compact4nexus(orig_list):
265 """Transform [1 2 3 5 6 7 8 12 15 18 20] (baseindex 0, used in the Nexus class) 266 into '2-4 6-9 13-19\\3 21' (baseindex 1, used in programs like Paup or MrBayes.). 267 """ 268 269 if not orig_list: 270 return '' 271 orig_list=list(sets.Set(orig_list)) 272 orig_list.sort() 273 shortlist=[] 274 clist=orig_list[:] 275 clist.append(clist[-1]+.5) # dummy value makes it easier 276 while len(clist)>1: 277 step=1 278 for i,x in enumerate(clist): 279 if x==clist[0]+i*step: # are we still in the right step? 280 continue 281 elif i==1 and len(clist)>3 and clist[i+1]-x==x-clist[0]: 282 # second element, and possibly at least 3 elements to link, 283 # and the next one is in the right step 284 step=x-clist[0] 285 else: # pattern broke, add all values before current position to new list 286 sub=clist[:i] 287 if len(sub)==1: 288 shortlist.append(str(sub[0]+1)) 289 else: 290 if step==1: 291 shortlist.append('%d-%d' % (sub[0]+1,sub[-1]+1)) 292 else: 293 shortlist.append('%d-%d\\%d' % (sub[0]+1,sub[-1]+1,step)) 294 clist=clist[i:] 295 break 296 return ' '.join(shortlist)
297
298 -def combine(matrices):
299 """Combine matrices in [(name,nexus-instance),...] and return new nexus instance. 300 301 combined_matrix=combine([(name1,nexus_instance1),(name2,nexus_instance2),...] 302 Character sets, character partitions and taxon sets are prefixed, readjusted and present in 303 the combined matrix. 304 """ 305 306 if not matrices: 307 return None 308 name=matrices[0][0] 309 combined=copy.deepcopy(matrices[0][1]) # initiate with copy of first matrix 310 mixed_datatypes=(len(sets.Set([n[1].datatype for n in matrices]))>1) 311 if mixed_datatypes: 312 combined.datatype='None' # dealing with mixed matrices is application specific. You take care of that yourself! 313 # raise NexusError, 'Matrices must be of same datatype' 314 combined.charlabels=None 315 combined.statelabels=None 316 combined.interleave=False 317 combined.translate=None 318 319 # rename taxon sets and character sets and name them with prefix 320 for cn,cs in combined.charsets.items(): 321 combined.charsets['%s.%s' % (name,cn)]=cs 322 del combined.charsets[cn] 323 for tn,ts in combined.taxsets.items(): 324 combined.taxsets['%s.%s' % (name,tn)]=ts 325 del combined.taxsets[tn] 326 # previous partitions usually don't make much sense in combined matrix 327 # just initiate one new partition parted by single matrices 328 combined.charpartitions={'combined':{name:range(combined.nchar)}} 329 for n,m in matrices[1:]: # add all other matrices 330 both=[t for t in combined.taxlabels if t in m.taxlabels] 331 combined_only=[t for t in combined.taxlabels if t not in both] 332 m_only=[t for t in m.taxlabels if t not in both] 333 for t in both: 334 # concatenate sequences and unify gap and missing character symbols 335 combined.matrix[t]+=Seq(m.matrix[t].tostring().replace(m.gap,combined.gap).replace(m.missing,combined.missing),combined.alphabet) 336 # replace date of missing taxa with symbol for missing data 337 for t in combined_only: 338 combined.matrix[t]+=Seq(combined.missing*m.nchar,combined.alphabet) 339 for t in m_only: 340 combined.matrix[t]=Seq(combined.missing*combined.nchar,combined.alphabet)+\ 341 Seq(m.matrix[t].tostring().replace(m.gap,combined.gap).replace(m.missing,combined.missing),combined.alphabet) 342 combined.taxlabels.extend(m_only) # new taxon list 343 for cn,cs in m.charsets.items(): # adjust character sets for new matrix 344 combined.charsets['%s.%s' % (n,cn)]=[x+combined.nchar for x in cs] 345 if m.taxsets: 346 if not combined.taxsets: 347 combined.taxsets={} 348 combined.taxsets.update(dict([('%s.%s' % (n,tn),ts) for tn,ts in m.taxsets.items()])) # update taxon sets 349 combined.charpartitions['combined'][n]=range(combined.nchar,combined.nchar+m.nchar) # update new charpartition 350 # update charlabels 351 if m.charlabels: 352 if not combined.charlabels: 353 combined.charlabels={} 354 combined.charlabels.update(dict([(combined.nchar+i,label) for (i,label) in m.charlabels.items()])) 355 combined.nchar+=m.nchar # update nchar and ntax 356 combined.ntax+=len(m_only) 357 358 # some prefer partitions, some charsets: 359 # make separate charset for ecah initial dataset 360 for c in combined.charpartitions['combined']: 361 combined.charsets[c]=combined.charpartitions['combined'][c] 362 363 return combined
364
365 -def _kill_comments_and_break_lines(text):
366 """Delete []-delimited comments out of a file and break into lines separated by ';'. 367 368 stripped_text=_kill_comments_and_break_lines(text): 369 Nested and multiline comments are allowed. [ and ] symbols within single 370 or double quotes are ignored, newline ends a quote, all symbols with quotes are 371 treated the same (thus not quoting inside comments like [this character ']' ends a comment]) 372 Special [&...] and [\...] comments remain untouched, if not inside standard comment. 373 Quotes inside special [& and [\ are treated as normal characters, 374 but no nesting inside these special comments allowed (like [& [\ ]]). 375 ';' ist deleted from end of line. 376 377 NOTE: this function is very slow for large files, and obsolete when using C extension cnexus 378 """ 379 contents=CharBuffer(text) 380 newtext=[] 381 newline=[] 382 quotelevel='' 383 speciallevel=False 384 commlevel=0 385 while True: 386 #plain=contents.next_until(["'",'"','[',']','\n',';']) # search for next special character 387 #if not plain: 388 # newline.append(contents.rest) # not found, just add the rest 389 # break 390 #newline.append(plain) # add intermediate text 391 t=contents.next() # and get special character 392 if t is None: 393 break 394 if t==quotelevel and not (commlevel or speciallevel): # matching quote ends quotation 395 quotelevel='' 396 elif not quotelevel and not (commlevel or speciallevel) and (t=='"' or t=="'"): # single or double quote starts quotation 397 quotelevel=t 398 elif not quotelevel and t=='[': # opening bracket outside a quote 399 if contents.peek() in SPECIALCOMMENTS and commlevel==0 and not speciallevel: 400 speciallevel=True 401 else: 402 commlevel+=1 403 elif not quotelevel and t==']': # closing bracket ioutside a quote 404 if speciallevel: 405 speciallevel=False 406 else: 407 commlevel-=1 408 if commlevel<0: 409 raise NexusError, 'Nexus formatting error: unmatched ]' 410 continue 411 if commlevel==0: # copy if we're not in comment 412 if t==';' and not quotelevel: 413 newtext.append(''.join(newline)) 414 newline=[] 415 else: 416 newline.append(t) 417 #level of comments should be 0 at the end of the file 418 if newline: 419 newtext.append('\n'.join(newline)) 420 if commlevel>0: 421 raise NexusError, 'Nexus formatting error: unmatched [' 422 return newtext
423 424
425 -def _adjust_lines(lines):
426 """Adjust linebreaks to match ';', strip leading/trailing whitespace 427 428 list_of_commandlines=_adjust_lines(input_text) 429 Lines are adjusted so that no linebreaks occur within a commandline 430 (except matrix command line) 431 """ 432 formatted_lines=[] 433 for l in lines: 434 #Convert line endings 435 l=l.replace('\r\n','\n').replace('\r','\n').strip() 436 if l.lower().startswith('matrix'): 437 formatted_lines.append(l) 438 else: 439 l=l.replace('\n',' ') 440 if l: 441 formatted_lines.append(l) 442 return formatted_lines
443
444 -def _replace_parenthesized_ambigs(seq,rev_ambig_values):
445 """Replaces ambigs in xxx(ACG)xxx format by IUPAC ambiguity code.""" 446 447 opening=seq.find('(') 448 while opening>-1: 449 closing=seq.find(')') 450 if closing<0: 451 raise NexusError, 'Missing closing parenthesis in: '+seq 452 elif closing<opening: 453 raise NexusError, 'Missing opening parenthesis in: '+seq 454 ambig=[x for x in seq[opening+1:closing]] 455 ambig.sort() 456 ambig=''.join(ambig) 457 ambig_code=rev_ambig_values[ambig.upper()] 458 if ambig!=ambig.upper(): 459 ambig_code=ambig_code.lower() 460 seq=seq[:opening]+ambig_code+seq[closing+1:] 461 opening=seq.find('(') 462 return seq
463
464 -class Commandline:
465 """Represent a commandline as command and options.""" 466
467 - def __init__(self, line, title):
468 self.options={} 469 options=[] 470 self.command=None 471 try: 472 #Assume matrix (all other command lines have been stripped of \n) 473 self.command, options = line.strip().split('\n', 1) 474 except ValueError: #Not matrix 475 #self.command,options=line.split(' ',1) #no: could be tab or spaces (translate...) 476 self.command=line.split()[0] 477 options=' '.join(line.split()[1:]) 478 self.command = self.command.strip().lower() 479 if self.command in SPECIAL_COMMANDS: # special command that need newlines and order of options preserved 480 self.options=options.strip() 481 else: 482 if len(options) > 0: 483 try: 484 options = options.replace('=', ' = ').split() 485 valued_indices=[(n-1,n,n+1) for n in range(len(options)) if options[n]=='=' and n!=0 and n!=len((options))] 486 indices = [] 487 for sl in valued_indices: 488 indices.extend(sl) 489 token_indices = [n for n in range(len(options)) if n not in indices] 490 for opt in valued_indices: 491 #self.options[options[opt[0]].lower()] = options[opt[2]].lower() 492 self.options[options[opt[0]].lower()] = options[opt[2]] 493 for token in token_indices: 494 self.options[options[token].lower()] = None 495 except ValueError: 496 raise NexusError, 'Incorrect formatting in line: %s' % line
497
498 -class Block:
499 """Represent a NEXUS block with block name and list of commandlines ."""
500 - def __init__(self,title=None):
501 self.title=title 502 self.commandlines=[]
503
504 -class Nexus(object):
505 506 __slots__=['original_taxon_order','__dict__'] 507
508 - def __init__(self, input=None):
509 self.ntax=0 # number of taxa 510 self.nchar=0 # number of characters 511 self.unaltered_taxlabels=[] # taxlabels as the appear in the input file (incl. duplicates, etc.) 512 self.taxlabels=[] # labels for taxa, ordered by their id 513 self.charlabels=None # ... and for characters 514 self.statelabels=None # ... and for states 515 self.datatype='dna' # (standard), dna, rna, nucleotide, protein 516 self.respectcase=False # case sensitivity 517 self.missing='?' # symbol for missing characters 518 self.gap='-' # symbol for gap 519 self.symbols=None # set of symbols 520 self.equate=None # set of symbol synonyms 521 self.matchchar=None # matching char for matrix representation 522 self.labels=None # left, right, no 523 self.transpose=False # whether matrix is transposed 524 self.interleave=False # whether matrix is interleaved 525 self.tokens=False # unsupported 526 self.eliminate=None # unsupported 527 self.matrix=None # ... 528 self.unknown_blocks=[] # blocks we don't care about 529 self.taxsets={} 530 self.charsets={} 531 self.charpartitions={} 532 self.taxpartitions={} 533 self.trees=[] # list of Trees (instances of Tree class) 534 self.translate=None # Dict to translate taxon <-> taxon numbers 535 self.structured=[] # structured input representation 536 self.set={} # dict of the set command to set various options 537 self.options={} # dict of the options command in the data block 538 self.codonposset=None # name of the charpartition that defines codon positions 539 540 # some defaults 541 self.options['gapmode']='missing' 542 543 if input: 544 self.read(input) 545 else: 546 self.read(DEFAULTNEXUS)
547
548 - def get_original_taxon_order(self):
549 """Included for backwards compatibility.""" 550 return self.taxlabels
551 - def set_original_taxon_order(self,value):
552 """Included for backwards compatibility.""" 553 self.taxlabels=value
554 original_taxon_order=property(get_original_taxon_order,set_original_taxon_order) 555
556 - def read(self,input):
557 """Read and parse NEXUS imput (filename, file-handle, string.""" 558 559 # 1. Assume we have the name of a file in the execution dir 560 # Note we need to add parsing of the path to dir/filename 561 try: 562 file_contents = open(os.path.expanduser(input),'rU').read() 563 self.filename=input 564 except (TypeError,IOError,AttributeError): 565 #2 Assume we have a string from a fh.read() 566 if isinstance(input, str): 567 file_contents = input 568 self.filename='input_string' 569 #3 Assume we have a file object 570 elif hasattr(input,'read'): # file objects or StringIO objects 571 file_contents=input.read() 572 if hasattr(input,"name") and input.name: 573 self.filename=input.name 574 else: 575 self.filename='Unknown_nexus_file' 576 else: 577 print input.strip()[:50] 578 raise NexusError, 'Unrecognized input: %s ...' % input[:100] 579 file_contents=file_contents.strip() 580 if file_contents.startswith('#NEXUS'): 581 file_contents=file_contents[6:] 582 if C: 583 decommented=cnexus.scanfile(file_contents) 584 #check for unmatched parentheses 585 if decommented=='[' or decommented==']': 586 raise NexusError, 'Unmatched %s' % decommented 587 # cnexus can't return lists, so in analogy we separate commandlines with chr(7) 588 # (a character that shoudn't be part of a nexus file under normal circumstances) 589 commandlines=_adjust_lines(decommented.split(chr(7))) 590 else: 591 commandlines=_adjust_lines(_kill_comments_and_break_lines(file_contents)) 592 # get rid of stupid 'NEXUS token' 593 try: 594 if commandlines[0][:6].upper()=='#NEXUS': 595 commandlines[0]=commandlines[0][6:].strip() 596 except: 597 pass 598 # now loop through blocks (we parse only data in known blocks, thus ignoring non-block commands 599 nexus_block_gen = self._get_nexus_block(commandlines) 600 while 1: 601 try: 602 title, contents = nexus_block_gen.next() 603 except StopIteration: 604 break 605 if title in KNOWN_NEXUS_BLOCKS: 606 self._parse_nexus_block(title, contents) 607 else: 608 self._unknown_nexus_block(title, contents)
609
610 - def _get_nexus_block(self,file_contents):
611 """Generator for looping through Nexus blocks.""" 612 inblock=False 613 blocklines=[] 614 while file_contents: 615 cl=file_contents.pop(0) 616 if cl.lower().startswith('begin'): 617 if not inblock: 618 inblock=True 619 title=cl.split()[1].lower() 620 else: 621 raise NexusError('Illegal block nesting in block %s' % title) 622 elif cl.lower().startswith('end'): 623 if inblock: 624 inblock=False 625 yield title,blocklines 626 blocklines=[] 627 else: 628 raise NexusError('Unmatched \'end\'.') 629 elif inblock: 630 blocklines.append(cl)
631
632 - def _unknown_nexus_block(self,title, contents):
633 block = Block() 634 block.commandlines.append(contents) 635 block.title = title 636 self.unknown_blocks.append(block)
637
638 - def _parse_nexus_block(self,title, contents):
639 """Parse a known Nexus Block """ 640 # attached the structered block representation 641 self._apply_block_structure(title, contents) 642 #now check for taxa,characters,data blocks. If this stuff is defined more than once 643 #the later occurences will override the previous ones. 644 block=self.structured[-1] 645 for line in block.commandlines: 646 try: 647 getattr(self,'_'+line.command)(line.options) 648 except AttributeError: 649 raise 650 raise NexusError, 'Unknown command: %s ' % line.command
651
652 - def _title(self,options):
653 pass
654
655 - def _dimensions(self,options):
656 if options.has_key('ntax'): 657 self.ntax=eval(options['ntax']) 658 if options.has_key('nchar'): 659 self.nchar=eval(options['nchar'])
660
661 - def _format(self,options):
662 # print options 663 # we first need to test respectcase, then symbols (which depends on respectcase) 664 # then datatype (which, if standard, depends on symbols and respectcase in order to generate 665 # dicts for ambiguous values and alphabet 666 if options.has_key('respectcase'): 667 self.respectcase=True 668 # adjust symbols to for respectcase 669 if options.has_key('symbols'): 670 self.symbols=options['symbols'] 671 if (self.symbols.startswith('"') and self.symbols.endswith('"')) or\ 672 (self.symbold.startswith("'") and self.symbols.endswith("'")): 673 self.symbols=self.symbols[1:-1].replace(' ','') 674 if not self.respectcase: 675 self.symbols=self.symbols.lower()+self.symbols.upper() 676 self.symbols=list(sets.Set(self.symbols)) 677 if options.has_key('datatype'): 678 self.datatype=options['datatype'].lower() 679 if self.datatype=='dna' or self.datatype=='nucleotide': 680 self.alphabet=copy.deepcopy(IUPAC.ambiguous_dna) 681 self.ambiguous_values=copy.deepcopy(IUPACData.ambiguous_dna_values) 682 self.unambiguous_letters=copy.deepcopy(IUPACData.unambiguous_dna_letters) 683 elif self.datatype=='rna': 684 self.alphabet=copy.deepcopy(IUPAC.ambiguous_rna) 685 self.ambiguous_values=copy.deepcopy(IUPACData.ambiguous_rna_values) 686 self.unambiguous_letters=copy.deepcopy(IUPACData.unambiguous_rna_letters) 687 elif self.datatype=='protein': 688 self.alphabet=copy.deepcopy(IUPAC.protein) 689 self.ambiguous_values={'B':'DN','Z':'EQ','X':copy.deepcopy(IUPACData.protein_letters)} # that's how PAUP handles it 690 self.unambiguous_letters=copy.deepcopy(IUPACData.protein_letters)+'*' # stop-codon 691 elif self.datatype=='standard': 692 raise NexusError('Datatype standard is not yet supported.') 693 #self.alphabet=None 694 #self.ambiguous_values={} 695 #if not self.symbols: 696 # self.symbols='01' # if nothing else defined, then 0 and 1 are the default states 697 #self.unambiguous_letters=self.symbols 698 else: 699 raise NexusError, 'Unsupported datatype: '+self.datatype 700 self.valid_characters=''.join(self.ambiguous_values.keys())+self.unambiguous_letters 701 if not self.respectcase: 702 self.valid_characters=self.valid_characters.lower()+self.valid_characters.upper() 703 #we have to sort the reverse ambig coding dict key characters: 704 #to be sure that it's 'ACGT':'N' and not 'GTCA':'N' 705 rev=dict([(i[1],i[0]) for i in self.ambiguous_values.items() if i[0]!='X']) 706 self.rev_ambiguous_values={} 707 for (k,v) in rev.items(): 708 key=[c for c in k] 709 key.sort() 710 self.rev_ambiguous_values[''.join(key)]=v 711 #overwrite symbols for datype rna,dna,nucleotide 712 if self.datatype in ['dna','rna','nucleotide']: 713 self.symbols=self.alphabet.letters 714 if self.missing not in self.ambiguous_values: 715 self.ambiguous_values[self.missing]=self.unambiguous_letters+self.gap 716 self.ambiguous_values[self.gap]=self.gap 717 elif self.datatype=='standard': 718 if not self.symbols: 719 self.symbols=['1','0'] 720 if options.has_key('missing'): 721 self.missing=options['missing'][0] 722 if options.has_key('gap'): 723 self.gap=options['gap'][0] 724 if options.has_key('equate'): 725 self.equate=options['equate'] 726 if options.has_key('matchchar'): 727 self.matchchar=options['matchchar'][0] 728 if options.has_key('labels'): 729 self.labels=options['labels'] 730 if options.has_key('transpose'): 731 raise NexusError, 'TRANSPOSE is not supported!' 732 self.transpose=True 733 if options.has_key('interleave'): 734 if options['interleave']==None or options['interleave'].lower()=='yes': 735 self.interleave=True 736 if options.has_key('tokens'): 737 self.tokens=True 738 if options.has_key('notokens'): 739 self.tokens=False
740 741
742 - def _set(self,options):
743 self.set=options;
744
745 - def _options(self,options):
746 self.options=options;
747
748 - def _eliminate(self,options):
749 self.eliminate=options
750
751 - def _taxlabels(self,options):
752 """Get taxon labels. As the taxon names are already in the matrix, this is superfluous except for transpose matrices, 753 which are currently unsupported anyway. 754 Thus, we ignore the taxlabels command to make handling of duplicate taxon names easier. 755 """ 756 pass
757 #self.taxlabels=[] 758 #opts=CharBuffer(options) 759 #while True: 760 # taxon=quotestrip(opts.next_word()) 761 # if not taxon: 762 # break 763 # self.taxlabels.append(taxon) 764
765 - def _check_taxlabels(self,taxon):
766 """Check for presence of taxon in self.taxlabels.""" 767 # According to NEXUS standard, underscores shall be treated as spaces..., 768 # so checking for identity is more difficult 769 nextaxa=dict([(t.replace(' ','_'),t) for t in self.taxlabels]) 770 nexid=taxon.replace(' ','_') 771 return nextaxa.get(nexid)
772
773 - def _charlabels(self,options):
774 self.charlabels={} 775 opts=CharBuffer(options) 776 while True: 777 try: 778 # get id and state 779 w=opts.next_word() 780 if w is None: # McClade saves and reads charlabel-lists with terminal comma?! 781 break 782 identifier=self._resolve(w,set_type=CHARSET) 783 state=quotestrip(opts.next_word()) 784 self.charlabels[identifier]=state 785 # check for comma or end of command 786 c=opts.next_nonwhitespace() 787 if c is None: 788 break 789 elif c!=',': 790 raise NexusError,'Missing \',\' in line %s.' % options 791 except NexusError: 792 raise 793 except: 794 raise NexusError,'Format error in line %s.' % options
795
796 - def _charstatelabels(self,options):
797 # warning: charstatelabels supports only charlabels-syntax! 798 self._charlabels(options)
799
800 - def _statelabels(self,options):
801 #self.charlabels=options 802 #print 'Command statelabels is not supported and will be ignored.' 803 pass
804
805 - def _matrix(self,options):
806 if not self.ntax or not self.nchar: 807 raise NexusError,'Dimensions must be specified before matrix!' 808 self.matrix={} 809 taxcount=0 810 first_matrix_block=True 811 812 #eliminate empty lines and leading/trailing whitespace 813 lines=[l.strip() for l in options.split('\n') if l.strip()<>''] 814 lineiter=iter(lines) 815 while 1: 816 try: 817 l=lineiter.next() 818 except StopIteration: 819 if taxcount<self.ntax: 820 raise NexusError, 'Not enough taxa in matrix.' 821 elif taxcount>self.ntax: 822 raise NexusError, 'Too many taxa in matrix.' 823 else: 824 break 825 # count the taxa and check for interleaved matrix 826 taxcount+=1 827 ##print taxcount 828 if taxcount>self.ntax: 829 if not self.interleave: 830 raise NexusError, 'Too many taxa in matrix - should matrix be interleaved?' 831 else: 832 taxcount=1 833 first_matrix_block=False 834 #get taxon name and sequence 835 linechars=CharBuffer(l) 836 id=quotestrip(linechars.next_word()) 837 l=linechars.rest().strip() 838 chars='' 839 if self.interleave: 840 #interleaved matrix 841 #print 'In interleave' 842 if l: 843 chars=''.join(l.split()) 844 else: 845 chars=''.join(lineiter.next().split()) 846 else: 847 #non-interleaved matrix 848 chars=''.join(l.split()) 849 while len(chars)<self.nchar: 850 l=lineiter.next() 851 chars+=''.join(l.split()) 852 iupac_seq=Seq(_replace_parenthesized_ambigs(chars,self.rev_ambiguous_values),self.alphabet) 853 #first taxon has the reference sequence if matchhar is used 854 if taxcount==1: 855 refseq=iupac_seq 856 else: 857 if self.matchchar: 858 while 1: 859 p=iupac_seq.tostring().find(self.matchchar) 860 if p==-1: 861 break 862 iupac_seq=Seq(iupac_seq.tostring()[:p]+refseq[p]+iupac_seq.tostring()[p+1:],self.alphabet) 863 #check for invalid characters 864 for i,c in enumerate(iupac_seq.tostring()): 865 if c not in self.valid_characters and c!=self.gap and c!=self.missing: 866 raise NexusError, 'Taxon %s: Illegal character %s in line: %s (check dimensions / interleaving)'\ 867 % (id,c,l[i-10:i+10]) 868 #add sequence to matrix 869 if first_matrix_block: 870 self.unaltered_taxlabels.append(id) 871 id=_unique_label(self.matrix.keys(),id) 872 self.matrix[id]=iupac_seq 873 self.taxlabels.append(id) 874 else: 875 # taxon names need to be in the same order in each interleaved block 876 id=_unique_label(self.taxlabels[:taxcount-1],id) 877 taxon_present=self._check_taxlabels(id) 878 if taxon_present: 879 self.matrix[taxon_present]+=iupac_seq 880 else: 881 raise NexusError, 'Taxon %s not in first block of interleaved matrix. Check matrix dimensions and interleave.' % id 882 #check all sequences for length according to nchar 883 for taxon in self.matrix: 884 if len(self.matrix[taxon])!=self.nchar: 885 raise NexusError,'Matrx Nchar %d does not match data length (%d) for taxon %s' % (self.nchar, len(self.matrix[taxon]),taxon) 886 #check that taxlabels is identical with matrix.keys. If not, it's a problem 887 matrixkeys=self.matrix.keys() 888 matrixkeys.sort() 889 taxlabelssort=self.taxlabels[:] 890 taxlabelssort.sort() 891 assert matrixkeys==taxlabelssort,"ERROR: TAXLABELS must be identical with MATRIX. Please Report this as a bug, and send in data file."
892
893 - def _translate(self,options):
894 self.translate={} 895 opts=CharBuffer(options) 896 while True: 897 try: 898 # get id and state 899 identifier=int(opts.next_word()) 900 label=quotestrip(opts.next_word()) 901 self.translate[identifier]=label 902 # check for comma or end of command 903 c=opts.next_nonwhitespace() 904 if c is None: 905 break 906 elif c!=',': 907 raise NexusError,'Missing \',\' in line %s.' % options 908 except NexusError: 909 raise 910 except: 911 raise NexusError,'Format error in line %s.' % options
912
913 - def _utree(self,options):
914 """Some software (clustalx) uses 'utree' to denote an unrooted tree.""" 915 self._tree(options)
916
917 - def _tree(self,options):
918 opts=CharBuffer(options) 919 name=opts.next_word() 920 if opts.next_nonwhitespace()!='=': 921 raise NexusError,'Syntax error in tree description: %s' % options[:50] 922 rooted=False 923 weight=1.0 924 while opts.peek_nonwhitespace()=='[': 925 open=opts.next_nonwhitespace() 926 symbol=opts.next() 927 if symbol!='&': 928 raise NexusError,'Illegal special comment [%s...] in tree description: %s' % (symbol, options[:50]) 929 special=opts.next() 930 value=opts.next_until(']') 931 closing=opts.next() 932 if special=='R': 933 rooted=True 934 elif special=='U': 935 rooted=False 936 elif special=='W': 937 weight=float(value) 938 tree=Tree(name=name,weight=weight,rooted=rooted,tree=opts.rest().strip()) 939 # if there's an active translation table, translate 940 if self.translate: 941 for n in tree.get_terminals(): 942 try: 943 tree.node(n).data.taxon=safename(self.translate[int(tree.node(n).data.taxon)]) 944 except (ValueError,KeyError): 945 raise NexusError,'Unable to substitue %s using \'translate\' data.' % tree.node(n).data.taxon 946 self.trees.append(tree)
947
948 - def _apply_block_structure(self,title,lines):
949 block=Block('') 950 block.title = title 951 for line in lines: 952 block.commandlines.append(Commandline(line, title)) 953 self.structured.append(block)
954
955 - def _taxset(self, options):
956 name,taxa=self._get_indices(options,set_type=TAXSET) 957 self.taxsets[name]=_make_unique(taxa)
958
959 - def _charset(self, options):
960 name,sites=self._get_indices(options,set_type=CHARSET) 961 self.charsets[name]=_make_unique(sites)
962
963 - def _taxpartition(self, options):
964 taxpartition={} 965 quotelevel=False 966 opts=CharBuffer(options) 967 name=self._name_n_vector(opts) 968 if not name: 969 raise NexusError, 'Formatting error in taxpartition: %s ' % options 970 # now collect thesubbpartitions and parse them 971 # subpartitons separated by commas - which unfortunately could be part of a quoted identifier... 972 # this is rather unelegant, but we have to avoid double-parsing and potential change of special nexus-words 973 sub='' 974 while True: 975 w=opts.next() 976 if w is None or (w==',' and not quotelevel): 977 subname,subindices=self._get_indices(sub,set_type=TAXSET,separator=':') 978 taxpartition[subname]=_make_unique(subindices) 979 sub='' 980 if w is None: 981 break 982 else: 983 if w=="'": 984 quotelevel=not quotelevel 985 sub+=w 986 self.taxpartitions[name]=taxpartition
987
988 - def _codonposset(self,options):
989 """Read codon positions from a codons block as written from McClade. 990 Here codonposset is just a fancy name for a character partition with the name CodonPositions and the partitions N,1,2,3 991 """ 992 993 prev_partitions=self.charpartitions.keys() 994 self._charpartition(options) 995 # mcclade calls it CodonPositions, but you never know... 996 codonname=[n for n in self.charpartitions.keys() if n not in prev_partitions] 997 if codonname==[] or len(codonname)>1: 998 raise NexusError, 'Formatting Error in codonposset: %s ' % options 999 else: 1000 self.codonposset=codonname[0]
1001
1002 - def _codeset(self,options):
1003 pass
1004
1005 - def _charpartition(self, options):
1006 charpartition={} 1007 quotelevel=False 1008 opts=CharBuffer(options) 1009 name=self._name_n_vector(opts) 1010 if not name: 1011 raise NexusError, 'Formatting error in charpartition: %s ' % options 1012 # now collect thesubbpartitions and parse them 1013 # subpartitons separated by commas - which unfortunately could be part of a quoted identifier... 1014 sub='' 1015 while True: 1016 w=opts.next() 1017 if w is None or (w==',' and not quotelevel): 1018 subname,subindices=self._get_indices(sub,set_type=CHARSET,separator=':') 1019 charpartition[subname]=_make_unique(subindices) 1020 sub='' 1021 if w is None: 1022 break 1023 else: 1024 if w=="'": 1025 quotelevel=not quotelevel 1026 sub+=w 1027 self.charpartitions[name]=charpartition
1028
1029 - def _get_indices(self,options,set_type=CHARSET,separator='='):
1030 """Parse the taxset/charset specification 1031 '1 2 3 - 5 dog cat 10 - 20 \\ 3' --> [0,1,2,3,4,'dog','cat',9,12,15,18] 1032 """ 1033 opts=CharBuffer(options) 1034 name=self._name_n_vector(opts,separator=separator) 1035 indices=self._parse_list(opts,set_type=set_type) 1036 if indices is None: 1037 raise NexusError, 'Formatting error in line: %s ' % options 1038 return name,indices
1039
1040 - def _name_n_vector(self,opts,separator='='):
1041 """Extract name and check that it's not in vector format.""" 1042 rest=opts.rest() 1043 name=opts.next_word() 1044 # we ignore * before names 1045 if name=='*': 1046 name=opts.next_word() 1047 if not name: 1048 raise NexusError, 'Formatting error in line: %s ' % rest 1049 name=quotestrip(name) 1050 if opts.peek_nonwhitespace=='(': 1051 open=opts.next_nonwhitespace() 1052 qualifier=open.next_word() 1053 close=opts.next_nonwhitespace() 1054 if qualifier.lower()=='vector': 1055 raise NexusError, 'Unsupported VECTOR format in line %s' % (options) 1056 elif qualifier.lower()!='standard': 1057 raise NexusError, 'Unknown qualifier %s in line %s' % (qualifier,options) 1058 if opts.next_nonwhitespace()!=separator: 1059 raise NexusError, 'Formatting error in line: %s ' % rest 1060 return name
1061
1062 - def _parse_list(self,options_buffer,set_type):
1063 """Parse a NEXUS list: [1, 2, 4-8\\2, dog, cat] --> [1,2,4,6,8,17,21], 1064 (assuming dog is taxon no. 17 and cat is taxon no. 21). 1065 """ 1066 plain_list=[] 1067 if options_buffer.peek_nonwhitespace(): 1068 try: # capture all possible exceptions and treat them as formatting erros, if they are not NexusError 1069 while True: 1070 identifier=options_buffer.next_word() # next list element 1071 if not identifier: # end of list? 1072 break 1073 start=self._resolve(identifier,set_type=set_type) 1074 if options_buffer.peek_nonwhitespace()=='-': # followd by - 1075 end=start 1076 step=1 1077 # get hyphen and end of range 1078 hyphen=options_buffer.next_nonwhitespace() 1079 end=self._resolve(options_buffer.next_word(),set_type=set_type) 1080 if set_type==CHARSET: 1081 if options_buffer.peek_nonwhitespace()=='\\': # followd by \ 1082 backslash=options_buffer.next_nonwhitespace() 1083 step=int(options_buffer.next_word()) # get backslash and step 1084 plain_list.extend(range(start,end+1,step)) 1085 else: 1086 if type(start)==list or type(end)==list: 1087 raise NexusError, 'Name if character sets not allowed in range definition: %s' % identifier 1088 start=self.taxlabels.index(start) 1089 end=self.taxlabels.index(end) 1090 taxrange=self.taxlabels[start:end+1] 1091 plain_list.extend(taxrange) 1092 else: 1093 if type(start)==list: # start was the name of charset or taxset 1094 plain_list.extend(start) 1095 else: # start was an ordinary identifier 1096 plain_list.append(start) 1097 except NexusError: 1098 raise 1099 except: 1100 return None 1101 return plain_list
1102
1103 - def _resolve(self,identifier,set_type=None):
1104 """Translate identifier in list into character/taxon index. 1105 Characters (which are referred to by their index in Nexus.py): 1106 Plain numbers are returned minus 1 (Nexus indices to python indices) 1107 Text identifiers are translaterd into their indices (if plain character indentifiers), 1108 the first hit in charlabels is returned (charlabels don't need to be unique) 1109 or the range of indices is returned (if names of character sets). 1110 Taxa (which are referred to by their unique name in Nexus.py): 1111 Plain numbers are translated in their taxon name, underscores and spaces are considered equal. 1112 Names are returned unchanged (if plain taxon identifiers), or the names in 1113 the corresponding taxon set is returned 1114 """ 1115 identifier=quotestrip(identifier) 1116 if not set_type: 1117 raise NexusError('INTERNAL ERROR: Need type to resolve identifier.') 1118 if set_type==CHARSET: 1119 try: 1120 n=int(identifier) 1121 except ValueError: 1122 if self.charlabels and identifier in self.charlabels.values(): 1123 for k in self.charlabels: 1124 if self.charlabels[k]==identifier: 1125 return k 1126 elif self.charsets and identifier in self.charsets: 1127 return self.charsets[identifier] 1128 else: 1129 raise NexusError, 'Unknown character identifier: %s' % identifier 1130 else: 1131 if n<=self.nchar: 1132 return n-1 1133 else: 1134 raise NexusError, 'Illegal character identifier: %d>nchar (=%d).' % (identifier,self.nchar) 1135 elif set_type==TAXSET: 1136 try: 1137 n=int(identifier) 1138 except ValueError: 1139 taxlabels_id=self._check_taxlabels(identifier) 1140 if taxlabels_id: 1141 return taxlabels_id 1142 elif self.taxsets and identifier in self.taxsets: 1143 return self.taxsets[identifier] 1144 else: 1145 raise NexusError, 'Unknown taxon identifier: %s' % identifier 1146 else: 1147 if n>0 and n<=self.ntax: 1148 return self.taxlabels[n-1] 1149 else: 1150 raise NexusError, 'Illegal taxon identifier: %d>ntax (=%d).' % (identifier,self.ntax) 1151 else: 1152 raise NexusError('Unknown set specification: %s.'% set_type)
1153
1154 - def _stateset(self, options):
1155 #Not implemented 1156 pass
1157
1158 - def _changeset(self, options):
1159 #Not implemented 1160 pass
1161
1162 - def _treeset(self, options):
1163 #Not implemented 1164 pass
1165
1166 - def _treepartition(self, options):
1167 #Not implemented 1168 pass
1169
1170 - def write_nexus_data_partitions(self, matrix=None, filename=None, blocksize=None, interleave=False, 1171 exclude=[], delete=[], charpartition=None, comment='',mrbayes=False):
1172 """Writes a nexus file for each partition in charpartition. 1173 Only non-excluded characters and non-deleted taxa are included, just the data block is written. 1174 """ 1175 1176 if not matrix: 1177 matrix=self.matrix 1178 if not matrix: 1179 return 1180 if not filename: 1181 filename=self.filename 1182 if charpartition: 1183 pfilenames={} 1184 for p in charpartition: 1185 total_exclude=[]+exclude 1186 total_exclude.extend([c for c in range(self.nchar) if c not in charpartition[p]]) 1187 total_exclude=_make_unique(total_exclude) 1188 pcomment=comment+'\nPartition: '+p+'\n' 1189 dot=filename.rfind('.') 1190 if dot>0: 1191 pfilename=filename[:dot]+'_'+p+'.data' 1192 else: 1193 pfilename=filename+'_'+p 1194 pfilenames[p]=pfilename 1195 self.write_nexus_data(filename=pfilename,matrix=matrix,blocksize=blocksize, 1196 interleave=interleave,exclude=total_exclude,delete=delete,comment=pcomment,append_sets=False, 1197 mrbayes=mrbayes) 1198 return pfilenames 1199 else: 1200 fn=self.filename+'.data' 1201 self.write_nexus_data(filename=fn,matrix=matrix,blocksize=blocksize,interleave=interleave, 1202 exclude=exclude,delete=delete,comment=comment,append_sets=False, 1203 mrbayes=mrbayes) 1204 return fn
1205
1206 - def write_nexus_data(self, filename=None, matrix=None, exclude=[], delete=[],\ 1207 blocksize=None, interleave=False, interleave_by_partition=False,\ 1208 comment=None,omit_NEXUS=False,append_sets=True,mrbayes=False,\ 1209 codons_block=True):
1210 """Writes a nexus file with data and sets block to a file or handle. 1211 1212 Character sets and partitions are appended by default, and are 1213 adjusted according to excluded characters (i.e. character sets 1214 still point to the same sites (not necessarily same positions), 1215 without including the deleted characters. 1216 1217 filename - Either a filename as a string (which will be opened, 1218 written to and closed), or a handle object (which will 1219 be written to but NOT closed). 1220 omit_NEXUS - Boolean. If true, the '#NEXUS' line normally at the 1221 start of the file is ommited. 1222 1223 Returns the filename/handle used to write the data. 1224 """ 1225 if not matrix: 1226 matrix=self.matrix 1227 if not matrix: 1228 return 1229 if not filename: 1230 filename=self.filename 1231 if [t for t in delete if not self._check_taxlabels(t)]: 1232 raise NexusError, 'Unknown taxa: %s' % ', '.join(sets.Set(delete).difference(sets.Set(self.taxlabels))) 1233 if interleave_by_partition: 1234 if not interleave_by_partition in self.charpartitions: 1235 raise NexusError, 'Unknown partition: '+interleave_by_partition 1236 else: 1237 partition=self.charpartitions[interleave_by_partition] 1238 # we need to sort the partition names by starting position before we exclude characters 1239 names=_sort_keys_by_values(partition) 1240 newpartition={} 1241 for p in partition: 1242 newpartition[p]=[c for c in partition[p] if c not in exclude] 1243 # how many taxa and how many characters are left? 1244 undelete=[taxon for taxon in self.taxlabels if taxon in matrix and taxon not in delete] 1245 cropped_matrix=_seqmatrix2strmatrix(self.crop_matrix(matrix,exclude=exclude,delete=delete)) 1246 ntax_adjusted=len(undelete) 1247 nchar_adjusted=len(cropped_matrix[undelete[0]]) 1248 if not undelete or (undelete and undelete[0]==''): 1249 return 1250 if isinstance(filename,str): 1251 try: 1252 fh=open(filename,'w') 1253 except IOError: 1254 raise NexusError, 'Could not open %s for writing.' % filename 1255 elif hasattr(file, "write"): 1256 fh=filename 1257 else : 1258 raise ValueError, "Neither a filename nor a handle was supplied" 1259 if not omit_NEXUS: 1260 fh.write('#NEXUS\n') 1261 if comment: 1262 fh.write('['+comment+']\n') 1263 fh.write('begin data;\n') 1264 fh.write('\tdimensions ntax=%d nchar=%d;\n' % (ntax_adjusted, nchar_adjusted)) 1265 fh.write('\tformat datatype='+self.datatype) 1266 if self.respectcase: 1267 fh.write(' respectcase') 1268 if self.missing: 1269 fh.write(' missing='+self.missing) 1270 if self.gap: 1271 fh.write(' gap='+self.gap) 1272 if self.matchchar: 1273 fh.write(' matchchar='+self.matchchar) 1274 if self.labels: 1275 fh.write(' labels='+self.labels) 1276 if self.equate: 1277 fh.write(' equate='+self.equate) 1278 if interleave or interleave_by_partition: 1279 fh.write(' interleave') 1280 fh.write(';\n') 1281 #if self.taxlabels: 1282 # fh.write('taxlabels '+' '.join(self.taxlabels)+';\n') 1283 if self.charlabels: 1284 newcharlabels=self._adjust_charlabels(exclude=exclude) 1285 clkeys=newcharlabels.keys() 1286 clkeys.sort() 1287 fh.write('charlabels '+', '.join(["%s %s" % (k+1,safename(newcharlabels[k])) for k in clkeys])+';\n') 1288 fh.write('matrix\n') 1289 if not blocksize: 1290 if interleave: 1291 blocksize=70 1292 else: 1293 blocksize=self.nchar 1294 # delete deleted taxa and ecxclude excluded characters... 1295 namelength=max([len(safename(t,mrbayes=mrbayes)) for t in undelete]) 1296 if interleave_by_partition: 1297 # interleave by partitions, but adjust partitions with regard to excluded characters 1298 seek=0 1299 for p in names: 1300 fh.write('[%s: %s]\n' % (interleave_by_partition,p)) 1301 if len(newpartition[p])>0: 1302 for taxon in undelete: 1303 fh.write(safename(taxon,mrbayes=mrbayes).ljust(namelength+1)) 1304 fh.write(cropped_matrix[taxon][seek:seek+len(newpartition[p])]+'\n') 1305 fh.write('\n') 1306 else: 1307 fh.write('[empty]\n\n') 1308 seek+=len(newpartition[p]) 1309 elif interleave: 1310 for seek in range(0,nchar_adjusted,blocksize): 1311 for taxon in undelete: 1312 fh.write(safename(taxon,mrbayes=mrbayes).ljust(namelength+1)) 1313 fh.write(cropped_matrix[taxon][seek:seek+blocksize]+'\n') 1314 fh.write('\n') 1315 else: 1316 for taxon in undelete: 1317 if blocksize<nchar_adjusted: 1318 fh.write(safename(taxon,mrbayes=mrbayes)+'\n') 1319 else: 1320 fh.write(safename(taxon,mrbayes=mrbayes).ljust(namelength+1)) 1321 for seek in range(0,nchar_adjusted,blocksize): 1322 fh.write(cropped_matrix[taxon][seek:seek+blocksize]+'\n') 1323 fh.write(';\nend;\n') 1324 if append_sets: 1325 if codons_block: 1326 fh.write(self.append_sets(exclude=exclude,delete=delete,mrbayes=mrbayes,include_codons=False)) 1327 fh.write(self.append_sets(exclude=exclude,delete=delete,mrbayes=mrbayes,codons_only=True)) 1328 else: 1329 fh.write(self.append_sets(exclude=exclude,delete=delete,mrbayes=mrbayes)) 1330 1331 if fh == filename : 1332 #We were given the handle, don't close it. 1333 return filename 1334 else : 1335 #We opened the handle, so we should close it. 1336 fh.close() 1337 return filename
1338
1339 - def append_sets(self,exclude=[],delete=[],mrbayes=False,include_codons=True,codons_only=False):
1340 """Returns a sets block""" 1341 if not self.charsets and not self.taxsets and not self.charpartitions: 1342 return '' 1343 if codons_only: 1344 sets=['\nbegin codons'] 1345 else: 1346 sets=['\nbegin sets'] 1347 # - now if characters have been excluded, the character sets need to be adjusted, 1348 # so that they still point to the right character positions 1349 # calculate a list of offsets: for each deleted character, the following character position 1350 # in the new file will have an additional offset of -1 1351 offset=0 1352 offlist=[] 1353 for c in range(self.nchar): 1354 if c in exclude: 1355 offset+=1 1356 offlist.append(-1) # dummy value as these character positions are excluded 1357 else: 1358 offlist.append(c-offset) 1359 # now adjust each of the character sets 1360 if not codons_only: 1361 for n,ns in self.charsets.items(): 1362 cset=[offlist[c] for c in ns if c not in exclude] 1363 if cset: 1364 sets.append('charset %s = %s' % (safename(n),_compact4nexus(cset))) 1365 for n,s in self.taxsets.items(): 1366 tset=[safename(t,mrbayes=mrbayes) for t in s if t not in delete] 1367 if tset: 1368 sets.append('taxset %s = %s' % (safename(n),' '.join(tset))) 1369 for n,p in self.charpartitions.items(): 1370 if not include_codons and n==CODONPOSITIONS: 1371 continue 1372 elif codons_only and n!=CODONPOSITIONS: 1373 continue 1374 # as characters have been excluded, the partitions must be adjusted 1375 # if a partition is empty, it will be omitted from the charpartition command 1376 # (although paup allows charpartition part=t1:,t2:,t3:1-100) 1377 names=_sort_keys_by_values(p) 1378 newpartition={} 1379 for sn in names: 1380 nsp=[offlist[c] for c in p[sn] if c not in exclude] 1381 if nsp: 1382 newpartition[sn]=nsp 1383 if newpartition: 1384 if include_codons and n==CODONPOSITIONS: 1385 command='codonposset' 1386 else: 1387 command='charpartition' 1388 sets.append('%s %s = %s' % (command,safename(n),\ 1389 ', '.join(['%s: %s' % (sn,_compact4nexus(newpartition[sn])) for sn in names if sn in newpartition]))) 1390 # now write charpartititions, much easier than charpartitions 1391 for n,p in self.taxpartitions.items(): 1392 names=_sort_keys_by_values(p) 1393 newpartition={} 1394 for sn in names: 1395 nsp=[t for t in p[sn] if t not in delete] 1396 if nsp: 1397 newpartition[sn]=nsp 1398 if newpartition: 1399 sets.append('taxpartition %s = %s' % (safename(n),\ 1400 ', '.join(['%s: %s' % (safename(sn),' '.join(map(safename,newpartition[sn]))) for sn in names if sn in newpartition]))) 1401 # add 'end' and return everything 1402 sets.append('end;\n') 1403 if len(sets)==2: # begin and end only 1404 return '' 1405 else: 1406 return ';\n'.join(sets)
1407
1408 - def export_fasta(self, filename=None, width=70):
1409 """Writes matrix into a fasta file: (self, filename=None, width=70).""" 1410 if not filename: 1411 if '.' in filename and self.filename.split('.')[-1].lower() in ['paup','nexus','nex','dat']: 1412 filename='.'.join(self.filename.split('.')[:-1])+'.fas' 1413 else: 1414 filename=self.filename+'.fas' 1415 fh=open(filename,'w') 1416 for taxon in self.taxlabels: 1417 fh.write('>'+safename(taxon)+'\n') 1418 for i in range(0, len(self.matrix[taxon].tostring()), width): 1419 fh.write(self.matrix[taxon].tostring()[i:i+width] + '\n') 1420 fh.close()
1421
1422 - def export_phylip(self, filename=None):
1423 """Writes matrix into a fasta file: (self, filename=None, width=70).""" 1424 if not filename: 1425 if '.' in filename and self.filename.split('.')[-1].lower() in ['paup','nexus','nex','dat']: 1426 filename='.'.join(self.filename.split('.')[:-1])+'.phy' 1427 else: 1428 filename=self.filename+'.phy' 1429 fh=open(filename,'w') 1430 fh.write('%d %d\n' % (self.ntax,self.nchar)) 1431 for taxon in self.taxlabels: 1432 fh.write('%s %s\n' % (safename(taxon),self.matrix[taxon].tostring())) 1433 fh.close()
1434
1435 - def constant(self,matrix=None,delete=[],exclude=[]):
1436 """Return a list with all constant characters.""" 1437 if not matrix: 1438 matrix=self.matrix 1439 undelete=[t for t in self.taxlabels if t in matrix and t not in delete] 1440 if not undelete: 1441 return None 1442 elif len(undelete)==1: 1443 return [x for x in range(len(matrix[undelete[0]])) if x not in exclude] 1444 # get the first sequence and expand all ambiguous values 1445 constant=[(x,self.ambiguous_values.get(n.upper(),n.upper())) for 1446 x,n in enumerate(matrix[undelete[0]].tostring()) if x not in exclude] 1447 for taxon in undelete[1:]: 1448 newconstant=[] 1449 for site in constant: 1450 #print '%d (paup=%d)' % (site[0],site[0]+1), 1451 seqsite=matrix[taxon][site[0]].upper() 1452 #print seqsite,'checked against',site[1],'\t', 1453 if seqsite==self.missing or (seqsite==self.gap and self.options['gapmode'].lower()=='missing') or seqsite==site[1]: 1454 # missing or same as before -> ok 1455 newconstant.append(site) 1456 elif seqsite in site[1] or site[1]==self.missing or (self.options['gapmode'].lower()=='missing' and site[1]==self.gap): 1457 # subset of an ambig or only missing in previous -> take subset 1458 newconstant.append((site[0],self.ambiguous_values.get(seqsite,seqsite))) 1459 elif seqsite in self.ambiguous_values: # is it an ambig: check the intersection with prev. values 1460 intersect=sets.Set(self.ambiguous_values[seqsite]).intersection(sets.Set(site[1])) 1461 if intersect: 1462 newconstant.append((site[0],''.join(intersect))) 1463 # print 'ok' 1464 #else: 1465 # print 'failed' 1466 #else: 1467 # print 'failed' 1468 constant=newconstant 1469 cpos=[s[0] for s in constant] 1470 return constant
1471 # return [x[0] for x in constant] 1472
1473 - def cstatus(self,site,delete=[],narrow=True):
1474 """Summarize character. 1475 narrow=True: paup-mode (a c ? --> ac; ? ? ? --> ?) 1476 narrow=false: (a c ? --> a c g t -; ? ? ? --> a c g t -) 1477 """ 1478 undelete=[t for t in self.taxlabels if t not in delete] 1479 if not undelete: 1480 return None 1481 cstatus=[] 1482 for t in undelete: 1483 c=self.matrix[t][site].upper() 1484 if self.options.get('gapmode')=='missing' and c==self.gap: 1485 c=self.missing 1486 if narrow and c==self.missing: 1487 if c not in cstatus: 1488 cstatus.append(c) 1489 else: 1490 cstatus.extend([b for b in self.ambiguous_values[c] if b not in cstatus]) 1491 if self.missing in cstatus and narrow and len(cstatus)>1: 1492 cstatus=[c for c in cstatus if c!=self.missing] 1493 cstatus.sort() 1494 return cstatus
1495
1496 - def weighted_stepmatrix(self,name='your_name_here',exclude=[],delete=[]):
1497 """Calculates a stepmatrix for weighted parsimony. 1498 See Wheeler (1990), Cladistics 6:269-275 and 1499 Felsenstein (1981), Biol. J. Linn. Soc. 16:183-196 1500 """ 1501 m=StepMatrix(self.unambiguous_letters,self.gap) 1502 for site in [s for s in range(self.nchar) if s not in exclude]: 1503 cstatus=self.cstatus(site,delete) 1504 for i,b1 in enumerate(cstatus[:-1]): 1505 for b2 in cstatus[i+1:]: 1506 m.add(b1.upper(),b2.upper(),1) 1507 return m.transformation().weighting().smprint(name=name)
1508
1509 - def crop_matrix(self,matrix=None, delete=[], exclude=[]):
1510 """Return a matrix without deleted taxa and excluded characters.""" 1511 if not matrix: 1512 matrix=self.matrix 1513 if [t for t in delete if not self._check_taxlabels(t)]: 1514 raise NexusError, 'Unknwon taxa: %s' % ', '.join(sets.Set(delete).difference(self.taxlabels)) 1515 if exclude!=[]: 1516 undelete=[t for t in self.taxlabels if t in matrix and t not in delete] 1517 if not undelete: 1518 return {} 1519 m=[matrix[k].tostring() for k in undelete] 1520 zipped_m=zip(*m) 1521 sitesm=[s for i,s in enumerate(zipped_m) if i not in exclude] 1522 if sitesm==[]: 1523 return dict([(t,Seq('',self.alphabet)) for t in undelete]) 1524 else: 1525 zipped_sitesm=zip(*sitesm) 1526 m=[Seq(s,self.alphabet) for s in map(''.join,zipped_sitesm)] 1527 return dict(zip(undelete,m)) 1528 else: 1529 return dict([(t,matrix[t]) for t in self.taxlabels if t in matrix and t not in delete])
1530
1531 - def bootstrap(self,matrix=None,delete=[],exclude=[]):
1532 """Return a bootstrapped matrix.""" 1533 if not matrix: 1534 matrix=self.matrix 1535 seqobjects=isinstance(matrix[matrix.keys()[0]],Seq) # remember if Seq objects 1536 cm=self.crop_matrix(delete=delete,exclude=exclude) # crop data out 1537 if not cm: # everything deleted? 1538 return {} 1539 elif len(cm[cm.keys()[0]])==0: # everything excluded? 1540 return cm 1541 undelete=[t for t in self.taxlabels if t in cm] 1542 if seqobjects: 1543 sitesm=zip(*[cm[t].tostring() for t in undelete]) 1544 alphabet=matrix[matrix.keys()[0]].alphabet 1545 else: 1546 sitesm=zip(*[cm[t] for t in undelete]) 1547 bootstrapsitesm=[sitesm[random.randint(0,len(sitesm)-1)] for i in range(len(sitesm))] 1548 bootstrapseqs=map(''.join,zip(*bootstrapsitesm)) 1549 if seqobjects: 1550 bootstrapseqs=[Seq(s,alphabet) for s in bootstrapseqs] 1551 return dict(zip(undelete,bootstrapseqs))
1552
1553 - def add_sequence(self,name,sequence):
1554 """Adds a sequence to the matrix.""" 1555 1556 if not name: 1557 raise NexusError, 'New sequence must have a name' 1558 1559 diff=self.nchar-len(sequence) 1560 if diff<0: 1561 self.insert_gap(self.nchar,-diff) 1562 elif diff>0: 1563 sequence+=self.missing*diff 1564 1565 if name in self.taxlabels: 1566 unique_name=_unique_label(self.taxlabels,name) 1567 print "WARNING: Sequence name %s is already present. Sequence was added as %s." % (name,unique_name) 1568 else: 1569 unique_name=name 1570 1571 assert unique_name not in self.matrix.keys(), "ERROR. There is a discrepancy between taxlabels and matrix keys. Report this as a bug." 1572 1573 self.matrix[unique_name]=Seq(sequence,self.alphabet) 1574 self.ntax+=1 1575 self.taxlabels.append(unique_name) 1576 self.unaltered_taxlabels.append(name)
1577
1578 - def insert_gap(self,pos,n=1,leftgreedy=False):
1579 """Add a gap into the matrix and adjust charsets and partitions. 1580 1581 pos=0: first position 1582 pos=nchar: last position 1583 """ 1584 1585 def _adjust(set,x,d,leftgreedy=False): 1586 """Adjusts chartacter sets if gaps are inserted, taking care of 1587 new gaps within a coherent character set.""" 1588 # if 3 gaps are inserted at pos. 9 in a set that looks like 1 2 3 8 9 10 11 13 14 15 1589 # then the adjusted set will be 1 2 3 8 9 10 11 12 13 14 15 16 17 18 1590 # but inserting into position 8 it will stay like 1 2 3 11 12 13 14 15 16 17 18 1591 set.sort() 1592 addpos=0 1593 for i,c in enumerate(set): 1594 if c>=x: 1595 set[i]=c+d 1596 # if we add gaps within a group of characters, we want the gap position included in this group 1597 if c==x: 1598 if leftgreedy or (i>0 and set[i-1]==c-1): 1599 addpos=i 1600 if addpos>0: 1601 set[addpos:addpos]=range(x,x+d) 1602 return set
1603 1604 if pos<0 or pos>self.nchar: 1605 raise NexusError('Illegal gap position: %d' % pos) 1606 if n==0: 1607 return 1608 sitesm=zip(*[self.matrix[t].tostring() for t in self.taxlabels]) 1609 sitesm[pos:pos]=[['-']*len(self.taxlabels)]*n 1610 # #self.matrix=dict([(taxon,Seq(map(''.join,zip(*sitesm))[i],self.alphabet)) for\ 1611 # i,taxon in enumerate(self.taxlabels)]) 1612 zipped=zip(*sitesm) 1613 mapped=map(''.join,zipped) 1614 listed=[(taxon,Seq(mapped[i],self.alphabet)) for i,taxon in enumerate(self.taxlabels)] 1615 self.matrix=dict(listed) 1616 self.nchar+=n 1617 # now adjust character sets 1618 for i,s in self.charsets.items(): 1619 self.charsets[i]=_adjust(s,pos,n,leftgreedy=leftgreedy) 1620 for p in self.charpartitions: 1621 for sp,s in self.charpartitions[p].items(): 1622 self.charpartitions[p][sp]=_adjust(s,pos,n,leftgreedy=leftgreedy) 1623 # now adjust character state labels 1624 self.charlabels=self._adjust_charlabels(insert=[pos]*n) 1625 return self.charlabels 1626
1627 - def _adjust_charlabels(self,exclude=None,insert=None):
1628 """Return adjusted indices of self.charlabels if characters are excluded or inserted.""" 1629 if exclude and insert: 1630 raise NexusError, 'Can\'t exclude and insert at the same time' 1631 if not self.charlabels: 1632 return None 1633 labels=self.charlabels.keys() 1634 labels.sort() 1635 newcharlabels={} 1636 if exclude: 1637 exclude.sort() 1638 exclude.append(sys.maxint) 1639 excount=0 1640 for c in labels: 1641 if not c in exclude: 1642 while c>exclude[excount]: 1643 excount+=1 1644 newcharlabels[c-excount]=self.charlabels[c] 1645 elif insert: 1646 insert.sort() 1647 insert.append(sys.maxint) 1648 icount=0 1649 for c in labels: 1650 while c>=insert[icount]: 1651 icount+=1 1652 newcharlabels[c+icount]=self.charlabels[c] 1653 else: 1654 return self.charlabels 1655 return newcharlabels
1656
1657 - def invert(self,charlist):
1658 """Returns all character indices that are not in charlist.""" 1659 return [c for c in range(self.nchar) if c not in charlist]
1660
1661 - def gaponly(self,include_missing=False):
1662 """Return gap-only sites.""" 1663 gap=sets.Set(self.gap) 1664 if include_missing: 1665 gap.add(self.missing) 1666 sitesm=zip(*[self.matrix[t].tostring() for t in self.taxlabels]) 1667 gaponly=[i for i,site in enumerate(sitesm) if sets.Set(site).issubset(gap)] 1668 return gaponly
1669
1670 - def terminal_gap_to_missing(self,missing=None,skip_n=True):
1671 """Replaces all terminal gaps with missing character. 1672 1673 Mixtures like ???------??------- are properly resolved.""" 1674 1675 if not missing: 1676 missing=self.missing 1677 replace=[self.missing,self.gap] 1678 if not skip_n: 1679 replace.extend(['n','N']) 1680 for taxon in self.taxlabels: 1681 sequence=self.matrix[taxon].tostring() 1682 length=len(sequence) 1683 start,end=get_start_end(sequence,skiplist=replace) 1684 if start==-1 and end==-1: 1685 sequence=missing*length 1686 else: 1687 sequence=sequence[:end+1]+missing*(length-end-1) 1688 sequence=start*missing+sequence[start:] 1689 assert length==len(sequence), 'Illegal sequence manipulation in Nexus.termial_gap_to_missing in taxon %s' % taxon 1690 self.matrix[taxon]=Seq(sequence,self.alphabet)
1691