Package Bio :: Package Nexus :: Module Nexus
[hide private]
[frames] | no frames]

Source Code for Module Bio.Nexus.Nexus

   1  # Copyright 2005-2008 by Frank Kauff & Cymon J. Cox. All rights reserved. 
   2  # This code is part of the Biopython distribution and governed by its 
   3  # license. Please see the LICENSE file that should have been included 
   4  # as part of this package. 
   5  # 
   6  # Bug reports welcome: fkauff@biologie.uni-kl.de or on Biopython's bugzilla. 
   7  """Nexus class. Parse the contents of a NEXUS file. 
   8   
   9  Based upon 'NEXUS: An extensible file format for systematic information' 
  10  Maddison, Swofford, Maddison. 1997. Syst. Biol. 46(4):590-621 
  11  """ 
  12   
  13  import os,sys, math, random, copy 
  14   
  15  from Bio.Alphabet import IUPAC 
  16  from Bio.Data import IUPACData 
  17  from Bio.Seq import Seq 
  18   
  19  from Trees import Tree,NodeData 
  20   
  21  INTERLEAVE=70 
  22  SPECIAL_COMMANDS=['charstatelabels','charlabels','taxlabels', 'taxset', 'charset','charpartition','taxpartition',\ 
  23          'matrix','tree', 'utree','translate','codonposset','title'] 
  24  KNOWN_NEXUS_BLOCKS = ['trees','data', 'characters', 'taxa', 'sets','codons'] 
  25  PUNCTUATION='()[]{}/\,;:=*\'"`+-<>' 
  26  MRBAYESSAFE='abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ1234567890_' 
  27  WHITESPACE=' \t\n' 
  28  #SPECIALCOMMENTS=['!','&','%','/','\\','@'] #original list of special comments 
  29  SPECIALCOMMENTS=['&'] # supported special comment ('tree' command), all others are ignored 
  30  CHARSET='chars' 
  31  TAXSET='taxa' 
  32  CODONPOSITIONS='codonpositions' 
  33  DEFAULTNEXUS='#NEXUS\nbegin data; dimensions ntax=0 nchar=0; format datatype=dna; end; ' 
34 -class NexusError(Exception): pass
35
36 -class CharBuffer(object):
37 """Helps reading NEXUS-words and characters from a buffer."""
38 - def __init__(self,string):
39 if string: 40 self.buffer=list(string) 41 else: 42 self.buffer=[]
43
44 - def peek(self):
45 if self.buffer: 46 return self.buffer[0] 47 else: 48 return None
49
50 - def peek_nonwhitespace(self):
51 b=''.join(self.buffer).strip() 52 if b: 53 return b[0] 54 else: 55 return None
56
57 - def next(self):
58 if self.buffer: 59 return self.buffer.pop(0) 60 else: 61 return None
62
63 - def next_nonwhitespace(self):
64 while True: 65 p=self.next() 66 if p is None: 67 break 68 if p not in WHITESPACE: 69 return p 70 return None
71
72 - def skip_whitespace(self):
73 while self.buffer[0] in WHITESPACE: 74 self.buffer=self.buffer[1:]
75
76 - def next_until(self,target):
77 for t in target: 78 try: 79 pos=self.buffer.index(t) 80 except ValueError: 81 pass 82 else: 83 found=''.join(self.buffer[:pos]) 84 self.buffer=self.buffer[pos:] 85 return found 86 else: 87 return None
88
89 - def peek_word(self,word):
90 return ''.join(self.buffer[:len(word)])==word
91
92 - def next_word(self):
93 """Return the next NEXUS word from a string. 94 95 This deals with single and double quotes, whitespace and punctuation. 96 """ 97 98 word=[] 99 quoted=False 100 first=self.next_nonwhitespace() # get first character 101 if not first: # return empty if only whitespace left 102 return None 103 word.append(first) 104 if first=="'": # word starts with a quote 105 quoted="'" 106 elif first=='"': 107 quoted='"' 108 elif first in PUNCTUATION: # if it's punctuation, return immediately 109 return first 110 while True: 111 c=self.peek() 112 if c==quoted: # a quote? 113 word.append(self.next()) # store quote 114 if self.peek()==quoted: # double quote 115 skip=self.next() # skip second quote 116 elif quoted: # second single quote ends word 117 break 118 elif quoted: 119 word.append(self.next()) # if quoted, then add anything 120 elif not c or c in PUNCTUATION or c in WHITESPACE: # if not quoted and special character, stop 121 break 122 else: 123 word.append(self.next()) # standard character 124 return ''.join(word)
125
126 - def rest(self):
127 """Return the rest of the string without parsing.""" 128 return ''.join(self.buffer)
129
130 -class StepMatrix(object):
131 """Calculate a stepmatrix for weighted parsimony. 132 133 See Wheeler (1990), Cladistics 6:269-275. 134 """ 135
136 - def __init__(self,symbols,gap):
137 self.data={} 138 self.symbols=[s for s in symbols] 139 self.symbols.sort() 140 if gap: 141 self.symbols.append(gap) 142 for x in self.symbols: 143 for y in [s for s in self.symbols if s!=x]: 144 self.set(x,y,0)
145
146 - def set(self,x,y,value):
147 if x>y: 148 x,y=y,x 149 self.data[x+y]=value
150
151 - def add(self,x,y,value):
152 if x>y: 153 x,y=y,x 154 self.data[x+y]+=value
155
156 - def sum(self):
157 return reduce(lambda x,y:x+y,self.data.values())
158
159 - def transformation(self):
160 total=self.sum() 161 if total!=0: 162 for k in self.data: 163 self.data[k]=self.data[k]/float(total) 164 return self
165
166 - def weighting(self):
167 for k in self.data: 168 if self.data[k]!=0: 169 self.data[k]=-math.log(self.data[k]) 170 return self
171
172 - def smprint(self,name='your_name_here'):
173 matrix='usertype %s stepmatrix=%d\n' % (name,len(self.symbols)) 174 matrix+=' %s\n' % ' '.join(self.symbols) 175 for x in self.symbols: 176 matrix+='[%s]'.ljust(8) % x 177 for y in self.symbols: 178 if x==y: 179 matrix+=' . ' 180 else: 181 if x>y: 182 x1,y1=y,x 183 else: 184 x1,y1=x,y 185 if self.data[x1+y1]==0: 186 matrix+='inf. ' 187 else: 188 matrix+='%2.2f'.ljust(10) % (self.data[x1+y1]) 189 matrix+='\n' 190 matrix+=';\n' 191 return matrix
192
193 -def safename(name,mrbayes=False):
194 """Return a taxon identifier according to NEXUS standard. 195 196 Wrap quotes around names with punctuation or whitespace, and double 197 single quotes. 198 199 mrbayes=True: write names without quotes, whitespace or punctuation 200 for the mrbayes software package. 201 """ 202 if mrbayes: 203 safe=name.replace(' ','_') 204 safe=''.join([c for c in safe if c in MRBAYESSAFE]) 205 else: 206 safe=name.replace("'","''") 207 if set(safe).intersection(set(WHITESPACE+PUNCTUATION)): 208 safe="'"+safe+"'" 209 return safe
210
211 -def quotestrip(word):
212 """Remove quotes and/or double quotes around identifiers.""" 213 if not word: 214 return None 215 while (word.startswith("'") and word.endswith("'")) or (word.startswith('"') and word.endswith('"')): 216 word=word[1:-1] 217 return word
218
219 -def get_start_end(sequence, skiplist=['-','?']):
220 """Return position of first and last character which is not in skiplist. 221 222 Skiplist defaults to ['-','?']).""" 223 224 length=len(sequence) 225 if length==0: 226 return None,None 227 end=length-1 228 while end>=0 and (sequence[end] in skiplist): 229 end-=1 230 start=0 231 while start<length and (sequence[start] in skiplist): 232 start+=1 233 if start==length and end==-1: # empty sequence 234 return -1,-1 235 else: 236 return start,end
237
238 -def _sort_keys_by_values(p):
239 """Returns a sorted list of keys of p sorted by values of p.""" 240 startpos=[(p[pn],pn) for pn in p if p[pn]] 241 startpos.sort() 242 # parenthisis added because of py3k 243 return (zip(*startpos))[1]
244
245 -def _make_unique(l):
246 """Check that all values in list are unique and return a pruned and sorted list.""" 247 l=list(set(l)) 248 l.sort() 249 return l
250
251 -def _unique_label(previous_labels,label):
252 """Returns a unique name if label is already in previous_labels.""" 253 while label in previous_labels: 254 if label.split('.')[-1].startswith('copy'): 255 label='.'.join(label.split('.')[:-1])+'.copy'+str(eval('0'+label.split('.')[-1][4:])+1) 256 else: 257 label+='.copy' 258 return label
259
260 -def _seqmatrix2strmatrix(matrix):
261 """Converts a Seq-object matrix to a plain sequence-string matrix.""" 262 return dict([(t,matrix[t].tostring()) for t in matrix])
263
264 -def _compact4nexus(orig_list):
265 """Transform [1 2 3 5 6 7 8 12 15 18 20] (baseindex 0, used in the Nexus class) 266 into '2-4 6-9 13-19\\3 21' (baseindex 1, used in programs like Paup or MrBayes.). 267 """ 268 269 if not orig_list: 270 return '' 271 orig_list=list(set(orig_list)) 272 orig_list.sort() 273 shortlist=[] 274 clist=orig_list[:] 275 clist.append(clist[-1]+.5) # dummy value makes it easier 276 while len(clist)>1: 277 step=1 278 for i,x in enumerate(clist): 279 if x==clist[0]+i*step: # are we still in the right step? 280 continue 281 elif i==1 and len(clist)>3 and clist[i+1]-x==x-clist[0]: 282 # second element, and possibly at least 3 elements to link, 283 # and the next one is in the right step 284 step=x-clist[0] 285 else: # pattern broke, add all values before current position to new list 286 sub=clist[:i] 287 if len(sub)==1: 288 shortlist.append(str(sub[0]+1)) 289 else: 290 if step==1: 291 shortlist.append('%d-%d' % (sub[0]+1,sub[-1]+1)) 292 else: 293 shortlist.append('%d-%d\\%d' % (sub[0]+1,sub[-1]+1,step)) 294 clist=clist[i:] 295 break 296 return ' '.join(shortlist)
297
298 -def combine(matrices):
299 """Combine matrices in [(name,nexus-instance),...] and return new nexus instance. 300 301 combined_matrix=combine([(name1,nexus_instance1),(name2,nexus_instance2),...] 302 Character sets, character partitions and taxon sets are prefixed, readjusted 303 and present in the combined matrix. 304 """ 305 306 if not matrices: 307 return None 308 name=matrices[0][0] 309 combined=copy.deepcopy(matrices[0][1]) # initiate with copy of first matrix 310 mixed_datatypes=(len(set([n[1].datatype for n in matrices]))>1) 311 if mixed_datatypes: 312 combined.datatype='None' # dealing with mixed matrices is application specific. You take care of that yourself! 313 # raise NexusError('Matrices must be of same datatype') 314 combined.charlabels=None 315 combined.statelabels=None 316 combined.interleave=False 317 combined.translate=None 318 319 # rename taxon sets and character sets and name them with prefix 320 for cn,cs in combined.charsets.iteritems(): 321 combined.charsets['%s.%s' % (name,cn)]=cs 322 del combined.charsets[cn] 323 for tn,ts in combined.taxsets.iteritems(): 324 combined.taxsets['%s.%s' % (name,tn)]=ts 325 del combined.taxsets[tn] 326 # previous partitions usually don't make much sense in combined matrix 327 # just initiate one new partition parted by single matrices 328 combined.charpartitions={'combined':{name:range(combined.nchar)}} 329 for n,m in matrices[1:]: # add all other matrices 330 both=[t for t in combined.taxlabels if t in m.taxlabels] 331 combined_only=[t for t in combined.taxlabels if t not in both] 332 m_only=[t for t in m.taxlabels if t not in both] 333 for t in both: 334 # concatenate sequences and unify gap and missing character symbols 335 combined.matrix[t]+=Seq(m.matrix[t].tostring().replace(m.gap,combined.gap).replace(m.missing,combined.missing),combined.alphabet) 336 # replace date of missing taxa with symbol for missing data 337 for t in combined_only: 338 combined.matrix[t]+=Seq(combined.missing*m.nchar,combined.alphabet) 339 for t in m_only: 340 combined.matrix[t]=Seq(combined.missing*combined.nchar,combined.alphabet)+\ 341 Seq(m.matrix[t].tostring().replace(m.gap,combined.gap).replace(m.missing,combined.missing),combined.alphabet) 342 combined.taxlabels.extend(m_only) # new taxon list 343 for cn,cs in m.charsets.iteritems(): # adjust character sets for new matrix 344 combined.charsets['%s.%s' % (n,cn)]=[x+combined.nchar for x in cs] 345 if m.taxsets: 346 if not combined.taxsets: 347 combined.taxsets={} 348 # update taxon sets 349 combined.taxsets.update(dict(('%s.%s' % (n,tn),ts) \ 350 for tn,ts in m.taxsets.iteritems())) 351 # update new charpartition 352 combined.charpartitions['combined'][n]=range(combined.nchar,combined.nchar+m.nchar) 353 # update charlabels 354 if m.charlabels: 355 if not combined.charlabels: 356 combined.charlabels={} 357 combined.charlabels.update(dict((combined.nchar+i,label) \ 358 for (i,label) in m.charlabels.iteritems())) 359 combined.nchar+=m.nchar # update nchar and ntax 360 combined.ntax+=len(m_only) 361 362 # some prefer partitions, some charsets: 363 # make separate charset for ecah initial dataset 364 for c in combined.charpartitions['combined']: 365 combined.charsets[c]=combined.charpartitions['combined'][c] 366 367 return combined
368
369 -def _kill_comments_and_break_lines(text):
370 """Delete []-delimited comments out of a file and break into lines separated by ';'. 371 372 stripped_text=_kill_comments_and_break_lines(text): 373 Nested and multiline comments are allowed. [ and ] symbols within single 374 or double quotes are ignored, newline ends a quote, all symbols with quotes are 375 treated the same (thus not quoting inside comments like [this character ']' ends a comment]) 376 Special [&...] and [\...] comments remain untouched, if not inside standard comment. 377 Quotes inside special [& and [\ are treated as normal characters, 378 but no nesting inside these special comments allowed (like [& [\ ]]). 379 ';' ist deleted from end of line. 380 381 NOTE: this function is very slow for large files, and obsolete when using C extension cnexus 382 """ 383 contents=CharBuffer(text) 384 newtext=[] 385 newline=[] 386 quotelevel='' 387 speciallevel=False 388 commlevel=0 389 while True: 390 #plain=contents.next_until(["'",'"','[',']','\n',';']) # search for next special character 391 #if not plain: 392 # newline.append(contents.rest) # not found, just add the rest 393 # break 394 #newline.append(plain) # add intermediate text 395 t=contents.next() # and get special character 396 if t is None: 397 break 398 if t==quotelevel and not (commlevel or speciallevel): # matching quote ends quotation 399 quotelevel='' 400 elif not quotelevel and not (commlevel or speciallevel) and (t=='"' or t=="'"): # single or double quote starts quotation 401 quotelevel=t 402 elif not quotelevel and t=='[': # opening bracket outside a quote 403 if contents.peek() in SPECIALCOMMENTS and commlevel==0 and not speciallevel: 404 speciallevel=True 405 else: 406 commlevel+=1 407 elif not quotelevel and t==']': # closing bracket ioutside a quote 408 if speciallevel: 409 speciallevel=False 410 else: 411 commlevel-=1 412 if commlevel<0: 413 raise NexusError('Nexus formatting error: unmatched ]') 414 continue 415 if commlevel==0: # copy if we're not in comment 416 if t==';' and not quotelevel: 417 newtext.append(''.join(newline)) 418 newline=[] 419 else: 420 newline.append(t) 421 #level of comments should be 0 at the end of the file 422 if newline: 423 newtext.append('\n'.join(newline)) 424 if commlevel>0: 425 raise NexusError('Nexus formatting error: unmatched [') 426 return newtext
427 428
429 -def _adjust_lines(lines):
430 """Adjust linebreaks to match ';', strip leading/trailing whitespace. 431 432 list_of_commandlines=_adjust_lines(input_text) 433 Lines are adjusted so that no linebreaks occur within a commandline 434 (except matrix command line) 435 """ 436 formatted_lines=[] 437 for l in lines: 438 #Convert line endings 439 l=l.replace('\r\n','\n').replace('\r','\n').strip() 440 if l.lower().startswith('matrix'): 441 formatted_lines.append(l) 442 else: 443 l=l.replace('\n',' ') 444 if l: 445 formatted_lines.append(l) 446 return formatted_lines
447
448 -def _replace_parenthesized_ambigs(seq,rev_ambig_values):
449 """Replaces ambigs in xxx(ACG)xxx format by IUPAC ambiguity code.""" 450 451 opening=seq.find('(') 452 while opening>-1: 453 closing=seq.find(')') 454 if closing<0: 455 raise NexusError('Missing closing parenthesis in: '+seq) 456 elif closing<opening: 457 raise NexusError('Missing opening parenthesis in: '+seq) 458 ambig=[x for x in seq[opening+1:closing]] 459 ambig.sort() 460 ambig=''.join(ambig) 461 ambig_code=rev_ambig_values[ambig.upper()] 462 if ambig!=ambig.upper(): 463 ambig_code=ambig_code.lower() 464 seq=seq[:opening]+ambig_code+seq[closing+1:] 465 opening=seq.find('(') 466 return seq
467
468 -class Commandline(object):
469 """Represent a commandline as command and options.""" 470
471 - def __init__(self, line, title):
472 self.options={} 473 options=[] 474 self.command=None 475 try: 476 #Assume matrix (all other command lines have been stripped of \n) 477 self.command, options = line.strip().split('\n', 1) 478 except ValueError: #Not matrix 479 #self.command,options=line.split(' ',1) #no: could be tab or spaces (translate...) 480 self.command=line.split()[0] 481 options=' '.join(line.split()[1:]) 482 self.command = self.command.strip().lower() 483 if self.command in SPECIAL_COMMANDS: # special command that need newlines and order of options preserved 484 self.options=options.strip() 485 else: 486 if len(options) > 0: 487 try: 488 options = options.replace('=', ' = ').split() 489 valued_indices=[(n-1,n,n+1) for n in range(len(options)) if options[n]=='=' and n!=0 and n!=len((options))] 490 indices = [] 491 for sl in valued_indices: 492 indices.extend(sl) 493 token_indices = [n for n in range(len(options)) if n not in indices] 494 for opt in valued_indices: 495 #self.options[options[opt[0]].lower()] = options[opt[2]].lower() 496 self.options[options[opt[0]].lower()] = options[opt[2]] 497 for token in token_indices: 498 self.options[options[token].lower()] = None 499 except ValueError: 500 raise NexusError('Incorrect formatting in line: %s' % line)
501
502 -class Block(object):
503 """Represent a NEXUS block with block name and list of commandlines."""
504 - def __init__(self,title=None):
505 self.title=title 506 self.commandlines=[]
507
508 -class Nexus(object):
509 510 __slots__=['original_taxon_order','__dict__'] 511
512 - def __init__(self, input=None):
513 self.ntax=0 # number of taxa 514 self.nchar=0 # number of characters 515 self.unaltered_taxlabels=[] # taxlabels as the appear in the input file (incl. duplicates, etc.) 516 self.taxlabels=[] # labels for taxa, ordered by their id 517 self.charlabels=None # ... and for characters 518 self.statelabels=None # ... and for states 519 self.datatype='dna' # (standard), dna, rna, nucleotide, protein 520 self.respectcase=False # case sensitivity 521 self.missing='?' # symbol for missing characters 522 self.gap='-' # symbol for gap 523 self.symbols=None # set of symbols 524 self.equate=None # set of symbol synonyms 525 self.matchchar=None # matching char for matrix representation 526 self.labels=None # left, right, no 527 self.transpose=False # whether matrix is transposed 528 self.interleave=False # whether matrix is interleaved 529 self.tokens=False # unsupported 530 self.eliminate=None # unsupported 531 self.matrix=None # ... 532 self.unknown_blocks=[] # blocks we don't care about 533 self.taxsets={} 534 self.charsets={} 535 self.charpartitions={} 536 self.taxpartitions={} 537 self.trees=[] # list of Trees (instances of Tree class) 538 self.translate=None # Dict to translate taxon <-> taxon numbers 539 self.structured=[] # structured input representation 540 self.set={} # dict of the set command to set various options 541 self.options={} # dict of the options command in the data block 542 self.codonposset=None # name of the charpartition that defines codon positions 543 544 # some defaults 545 self.options['gapmode']='missing' 546 547 if input: 548 self.read(input) 549 else: 550 self.read(DEFAULTNEXUS)
551
552 - def get_original_taxon_order(self):
553 """Included for backwards compatibility (DEPRECATED).""" 554 return self.taxlabels
555 - def set_original_taxon_order(self,value):
556 """Included for backwards compatibility (DEPRECATED).""" 557 self.taxlabels=value
558 original_taxon_order=property(get_original_taxon_order,set_original_taxon_order) 559
560 - def read(self,input):
561 """Read and parse NEXUS imput (a filename, file-handle, or string).""" 562 563 # 1. Assume we have the name of a file in the execution dir 564 # Note we need to add parsing of the path to dir/filename 565 try: 566 file_contents = open(os.path.expanduser(input),'rU').read() 567 self.filename=input 568 except (TypeError,IOError,AttributeError): 569 #2 Assume we have a string from a fh.read() 570 if isinstance(input, str): 571 file_contents = input 572 self.filename='input_string' 573 #3 Assume we have a file object 574 elif hasattr(input,'read'): # file objects or StringIO objects 575 file_contents=input.read() 576 if hasattr(input,"name") and input.name: 577 self.filename=input.name 578 else: 579 self.filename='Unknown_nexus_file' 580 else: 581 print input.strip()[:50] 582 raise NexusError('Unrecognized input: %s ...' % input[:100]) 583 file_contents=file_contents.strip() 584 if file_contents.startswith('#NEXUS'): 585 file_contents=file_contents[6:] 586 commandlines=_get_command_lines(file_contents) 587 # get rid of stupid 'NEXUS token - in merged treefiles, this might appear multiple times' 588 for i,cl in enumerate(commandlines): 589 try: 590 if cl[:6].upper()=='#NEXUS': 591 commandlines[i]=cl[6:].strip() 592 except: 593 pass 594 # now loop through blocks (we parse only data in known blocks, thus ignoring non-block commands 595 nexus_block_gen = self._get_nexus_block(commandlines) 596 while 1: 597 try: 598 title, contents = nexus_block_gen.next() 599 except StopIteration: 600 break 601 if title in KNOWN_NEXUS_BLOCKS: 602 self._parse_nexus_block(title, contents) 603 else: 604 self._unknown_nexus_block(title, contents)
605
606 - def _get_nexus_block(self,file_contents):
607 """Generator for looping through Nexus blocks.""" 608 inblock=False 609 blocklines=[] 610 while file_contents: 611 cl=file_contents.pop(0) 612 if cl.lower().startswith('begin'): 613 if not inblock: 614 inblock=True 615 title=cl.split()[1].lower() 616 else: 617 raise NexusError('Illegal block nesting in block %s' % title) 618 elif cl.lower().startswith('end'): 619 if inblock: 620 inblock=False 621 yield title,blocklines 622 blocklines=[] 623 else: 624 raise NexusError('Unmatched \'end\'.') 625 elif inblock: 626 blocklines.append(cl)
627
628 - def _unknown_nexus_block(self,title, contents):
629 block = Block() 630 block.commandlines.append(contents) 631 block.title = title 632 self.unknown_blocks.append(block)
633
634 - def _parse_nexus_block(self,title, contents):
635 """Parse a known Nexus Block (PRIVATE).""" 636 # attached the structered block representation 637 self._apply_block_structure(title, contents) 638 #now check for taxa,characters,data blocks. If this stuff is defined more than once 639 #the later occurences will override the previous ones. 640 block=self.structured[-1] 641 for line in block.commandlines: 642 try: 643 getattr(self,'_'+line.command)(line.options) 644 except AttributeError: 645 raise 646 raise NexusError('Unknown command: %s ' % line.command)
647
648 - def _title(self,options):
649 pass
650
651 - def _dimensions(self,options):
652 if 'ntax' in options: 653 self.ntax=eval(options['ntax']) 654 if 'nchar' in options: 655 self.nchar=eval(options['nchar'])
656
657 - def _format(self,options):
658 # print options 659 # we first need to test respectcase, then symbols (which depends on respectcase) 660 # then datatype (which, if standard, depends on symbols and respectcase in order to generate 661 # dicts for ambiguous values and alphabet 662 if 'respectcase' in options: 663 self.respectcase=True 664 # adjust symbols to for respectcase 665 if 'symbols' in options: 666 self.symbols=options['symbols'] 667 if (self.symbols.startswith('"') and self.symbols.endswith('"')) or\ 668 (self.symbold.startswith("'") and self.symbols.endswith("'")): 669 self.symbols=self.symbols[1:-1].replace(' ','') 670 if not self.respectcase: 671 self.symbols=self.symbols.lower()+self.symbols.upper() 672 self.symbols=list(set(self.symbols)) 673 if 'datatype' in options: 674 self.datatype=options['datatype'].lower() 675 if self.datatype=='dna' or self.datatype=='nucleotide': 676 self.alphabet=copy.deepcopy(IUPAC.ambiguous_dna) 677 self.ambiguous_values=copy.deepcopy(IUPACData.ambiguous_dna_values) 678 self.unambiguous_letters=copy.deepcopy(IUPACData.unambiguous_dna_letters) 679 elif self.datatype=='rna': 680 self.alphabet=copy.deepcopy(IUPAC.ambiguous_rna) 681 self.ambiguous_values=copy.deepcopy(IUPACData.ambiguous_rna_values) 682 self.unambiguous_letters=copy.deepcopy(IUPACData.unambiguous_rna_letters) 683 elif self.datatype=='protein': 684 self.alphabet=copy.deepcopy(IUPAC.protein) 685 self.ambiguous_values={'B':'DN','Z':'EQ','X':copy.deepcopy(IUPACData.protein_letters)} # that's how PAUP handles it 686 self.unambiguous_letters=copy.deepcopy(IUPACData.protein_letters)+'*' # stop-codon 687 elif self.datatype=='standard': 688 raise NexusError('Datatype standard is not yet supported.') 689 #self.alphabet=None 690 #self.ambiguous_values={} 691 #if not self.symbols: 692 # self.symbols='01' # if nothing else defined, then 0 and 1 are the default states 693 #self.unambiguous_letters=self.symbols 694 else: 695 raise NexusError('Unsupported datatype: '+self.datatype) 696 self.valid_characters=''.join(self.ambiguous_values)+self.unambiguous_letters 697 if not self.respectcase: 698 self.valid_characters=self.valid_characters.lower()+self.valid_characters.upper() 699 #we have to sort the reverse ambig coding dict key characters: 700 #to be sure that it's 'ACGT':'N' and not 'GTCA':'N' 701 rev=dict((i[1],i[0]) for i in self.ambiguous_values.iteritems() if i[0]!='X') 702 self.rev_ambiguous_values={} 703 for (k,v) in rev.iteritems(): 704 key=[c for c in k] 705 key.sort() 706 self.rev_ambiguous_values[''.join(key)]=v 707 #overwrite symbols for datype rna,dna,nucleotide 708 if self.datatype in ['dna','rna','nucleotide']: 709 self.symbols=self.alphabet.letters 710 if self.missing not in self.ambiguous_values: 711 self.ambiguous_values[self.missing]=self.unambiguous_letters+self.gap 712 self.ambiguous_values[self.gap]=self.gap 713 elif self.datatype=='standard': 714 if not self.symbols: 715 self.symbols=['1','0'] 716 if 'missing' in options: 717 self.missing=options['missing'][0] 718 if 'gap' in options: 719 self.gap=options['gap'][0] 720 if 'equate' in options: 721 self.equate=options['equate'] 722 if 'matchchar' in options: 723 self.matchchar=options['matchchar'][0] 724 if 'labels' in options: 725 self.labels=options['labels'] 726 if 'transpose' in options: 727 raise NexusError('TRANSPOSE is not supported!') 728 self.transpose=True 729 if 'interleave' in options: 730 if options['interleave']==None or options['interleave'].lower()=='yes': 731 self.interleave=True 732 if 'tokens' in options: 733 self.tokens=True 734 if 'notokens' in options: 735 self.tokens=False
736 737
738 - def _set(self,options):
739 self.set=options;
740
741 - def _options(self,options):
742 self.options=options;
743
744 - def _eliminate(self,options):
745 self.eliminate=options
746
747 - def _taxlabels(self,options):
748 """Get taxon labels (PRIVATE). 749 750 As the taxon names are already in the matrix, this is superfluous 751 except for transpose matrices, which are currently unsupported anyway. 752 Thus, we ignore the taxlabels command to make handling of duplicate 753 taxon names easier. 754 """ 755 pass
756 #self.taxlabels=[] 757 #opts=CharBuffer(options) 758 #while True: 759 # taxon=quotestrip(opts.next_word()) 760 # if not taxon: 761 # break 762 # self.taxlabels.append(taxon) 763
764 - def _check_taxlabels(self,taxon):
765 """Check for presence of taxon in self.taxlabels.""" 766 # According to NEXUS standard, underscores shall be treated as spaces..., 767 # so checking for identity is more difficult 768 nextaxa=dict([(t.replace(' ','_'),t) for t in self.taxlabels]) 769 nexid=taxon.replace(' ','_') 770 return nextaxa.get(nexid)
771
772 - def _charlabels(self,options):
773 self.charlabels={} 774 opts=CharBuffer(options) 775 while True: 776 try: 777 # get id and state 778 w=opts.next_word() 779 if w is None: # McClade saves and reads charlabel-lists with terminal comma?! 780 break 781 identifier=self._resolve(w,set_type=CHARSET) 782 state=quotestrip(opts.next_word()) 783 self.charlabels[identifier]=state 784 # check for comma or end of command 785 c=opts.next_nonwhitespace() 786 if c is None: 787 break 788 elif c!=',': 789 raise NexusError('Missing \',\' in line %s.' % options) 790 except NexusError: 791 raise 792 except: 793 raise NexusError('Format error in line %s.' % options)
794
795 - def _charstatelabels(self,options):
796 # warning: charstatelabels supports only charlabels-syntax! 797 self._charlabels(options)
798
799 - def _statelabels(self,options):
800 #self.charlabels=options 801 #print 'Command statelabels is not supported and will be ignored.' 802 pass
803
804 - def _matrix(self,options):
805 if not self.ntax or not self.nchar: 806 raise NexusError('Dimensions must be specified before matrix!') 807 self.matrix={} 808 taxcount=0 809 first_matrix_block=True 810 811 #eliminate empty lines and leading/trailing whitespace 812 lines=[l.strip() for l in options.split('\n') if l.strip()!=''] 813 lineiter=iter(lines) 814 while 1: 815 try: 816 l=lineiter.next() 817 except StopIteration: 818 if taxcount<self.ntax: 819 raise NexusError('Not enough taxa in matrix.') 820 elif taxcount>self.ntax: 821 raise NexusError('Too many taxa in matrix.') 822 else: 823 break 824 # count the taxa and check for interleaved matrix 825 taxcount+=1 826 ##print taxcount 827 if taxcount>self.ntax: 828 if not self.interleave: 829 raise NexusError('Too many taxa in matrix - should matrix be interleaved?') 830 else: 831 taxcount=1 832 first_matrix_block=False 833 #get taxon name and sequence 834 linechars=CharBuffer(l) 835 id=quotestrip(linechars.next_word()) 836 l=linechars.rest().strip() 837 chars='' 838 if self.interleave: 839 #interleaved matrix 840 #print 'In interleave' 841 if l: 842 chars=''.join(l.split()) 843 else: 844 chars=''.join(lineiter.next().split()) 845 else: 846 #non-interleaved matrix 847 chars=''.join(l.split()) 848 while len(chars)<self.nchar: 849 l=lineiter.next() 850 chars+=''.join(l.split()) 851 iupac_seq=Seq(_replace_parenthesized_ambigs(chars,self.rev_ambiguous_values),self.alphabet) 852 #first taxon has the reference sequence if matchhar is used 853 if taxcount==1: 854 refseq=iupac_seq 855 else: 856 if self.matchchar: 857 while 1: 858 p=iupac_seq.tostring().find(self.matchchar) 859 if p==-1: 860 break 861 iupac_seq=Seq(iupac_seq.tostring()[:p]+refseq[p]+iupac_seq.tostring()[p+1:],self.alphabet) 862 #check for invalid characters 863 for i,c in enumerate(iupac_seq.tostring()): 864 if c not in self.valid_characters and c!=self.gap and c!=self.missing: 865 raise NexusError( \ 866 ('Taxon %s: Illegal character %s in sequence %s ' + \ 867 '(check dimensions/interleaving)') % (id,c, iupac_seq)) 868 #add sequence to matrix 869 if first_matrix_block: 870 self.unaltered_taxlabels.append(id) 871 id=_unique_label(self.matrix.keys(),id) 872 self.matrix[id]=iupac_seq 873 self.taxlabels.append(id) 874 else: 875 # taxon names need to be in the same order in each interleaved block 876 id=_unique_label(self.taxlabels[:taxcount-1],id) 877 taxon_present=self._check_taxlabels(id) 878 if taxon_present: 879 self.matrix[taxon_present]+=iupac_seq 880 else: 881 raise NexusError('Taxon %s not in first block of interleaved matrix. Check matrix dimensions and interleave.' % id) 882 #check all sequences for length according to nchar 883 for taxon in self.matrix: 884 if len(self.matrix[taxon])!=self.nchar: 885 raise NexusError('Matrx Nchar %d does not match data length (%d) for taxon %s' \ 886 % (self.nchar, len(self.matrix[taxon]),taxon)) 887 #check that taxlabels is identical with matrix.keys. If not, it's a problem 888 matrixkeys=sorted(self.matrix) 889 taxlabelssort=self.taxlabels[:] 890 taxlabelssort.sort() 891 assert matrixkeys==taxlabelssort,"ERROR: TAXLABELS must be identical with MATRIX. Please Report this as a bug, and send in data file."
892
893 - def _translate(self,options):
894 self.translate={} 895 opts=CharBuffer(options) 896 while True: 897 try: 898 # get id and state 899 identifier=int(opts.next_word()) 900 label=quotestrip(opts.next_word()) 901 self.translate[identifier]=label 902 # check for comma or end of command 903 c=opts.next_nonwhitespace() 904 if c is None: 905 break 906 elif c!=',': 907 raise NexusError('Missing \',\' in line %s.' % options) 908 except NexusError: 909 raise 910 except: 911 raise NexusError('Format error in line %s.' % options)
912
913 - def _utree(self,options):
914 """Some software (clustalx) uses 'utree' to denote an unrooted tree.""" 915 self._tree(options)
916
917 - def _tree(self,options):
918 opts=CharBuffer(options) 919 if opts.peek_nonwhitespace()=='*': # a star can be used to make it the default tree in some software packages 920 dummy=opts.next_nonwhitespace() 921 name=opts.next_word() 922 if opts.next_nonwhitespace()!='=': 923 raise NexusError('Syntax error in tree description: %s' \ 924 % options[:50]) 925 rooted=False 926 weight=1.0 927 while opts.peek_nonwhitespace()=='[': 928 open=opts.next_nonwhitespace() 929 symbol=opts.next() 930 if symbol!='&': 931 raise NexusError('Illegal special comment [%s...] in tree description: %s' \ 932 % (symbol, options[:50])) 933 special=opts.next() 934 value=opts.next_until(']') 935 closing=opts.next() 936 if special=='R': 937 rooted=True 938 elif special=='U': 939 rooted=False 940 elif special=='W': 941 weight=float(value) 942 tree=Tree(name=name,weight=weight,rooted=rooted,tree=opts.rest().strip()) 943 # if there's an active translation table, translate 944 if self.translate: 945 for n in tree.get_terminals(): 946 try: 947 tree.node(n).data.taxon=safename(self.translate[int(tree.node(n).data.taxon)]) 948 except (ValueError,KeyError): 949 raise NexusError('Unable to substitue %s using \'translate\' data.' \ 950 % tree.node(n).data.taxon) 951 self.trees.append(tree)
952
953 - def _apply_block_structure(self,title,lines):
954 block=Block('') 955 block.title = title 956 for line in lines: 957 block.commandlines.append(Commandline(line, title)) 958 self.structured.append(block)
959
960 - def _taxset(self, options):
961 name,taxa=self._get_indices(options,set_type=TAXSET) 962 self.taxsets[name]=_make_unique(taxa)
963
964 - def _charset(self, options):
965 name,sites=self._get_indices(options,set_type=CHARSET) 966 self.charsets[name]=_make_unique(sites)
967
968 - def _taxpartition(self, options):
969 taxpartition={} 970 quotelevel=False 971 opts=CharBuffer(options) 972 name=self._name_n_vector(opts) 973 if not name: 974 raise NexusError('Formatting error in taxpartition: %s ' % options) 975 # now collect thesubbpartitions and parse them 976 # subpartitons separated by commas - which unfortunately could be part of a quoted identifier... 977 # this is rather unelegant, but we have to avoid double-parsing and potential change of special nexus-words 978 sub='' 979 while True: 980 w=opts.next() 981 if w is None or (w==',' and not quotelevel): 982 subname,subindices=self._get_indices(sub,set_type=TAXSET,separator=':') 983 taxpartition[subname]=_make_unique(subindices) 984 sub='' 985 if w is None: 986 break 987 else: 988 if w=="'": 989 quotelevel=not quotelevel 990 sub+=w 991 self.taxpartitions[name]=taxpartition
992
993 - def _codonposset(self,options):
994 """Read codon positions from a codons block as written from McClade. 995 996 Here codonposset is just a fancy name for a character partition with 997 the name CodonPositions and the partitions N,1,2,3 998 """ 999 1000 prev_partitions=self.charpartitions 1001 self._charpartition(options) 1002 # mcclade calls it CodonPositions, but you never know... 1003 codonname=[n for n in self.charpartitions if n not in prev_partitions] 1004 if codonname==[] or len(codonname)>1: 1005 raise NexusError('Formatting Error in codonposset: %s ' % options) 1006 else: 1007 self.codonposset=codonname[0]
1008
1009 - def _codeset(self,options):
1010 pass
1011
1012 - def _charpartition(self, options):
1013 charpartition={} 1014 quotelevel=False 1015 opts=CharBuffer(options) 1016 name=self._name_n_vector(opts) 1017 if not name: 1018 raise NexusError('Formatting error in charpartition: %s ' % options) 1019 # now collect thesubbpartitions and parse them 1020 # subpartitons separated by commas - which unfortunately could be part of a quoted identifier... 1021 sub='' 1022 while True: 1023 w=opts.next() 1024 if w is None or (w==',' and not quotelevel): 1025 subname,subindices=self._get_indices(sub,set_type=CHARSET,separator=':') 1026 charpartition[subname]=_make_unique(subindices) 1027 sub='' 1028 if w is None: 1029 break 1030 else: 1031 if w=="'": 1032 quotelevel=not quotelevel 1033 sub+=w 1034 self.charpartitions[name]=charpartition
1035
1036 - def _get_indices(self,options,set_type=CHARSET,separator='='):
1037 """Parse the taxset/charset specification (PRIVATE). 1038 1039 e.g. '1 2 3 - 5 dog cat 10 - 20 \\ 3' 1040 --> [0,1,2,3,4,'dog','cat',9,12,15,18] 1041 """ 1042 opts=CharBuffer(options) 1043 name=self._name_n_vector(opts,separator=separator) 1044 indices=self._parse_list(opts,set_type=set_type) 1045 if indices is None: 1046 raise NexusError('Formatting error in line: %s ' % options) 1047 return name,indices
1048
1049 - def _name_n_vector(self,opts,separator='='):
1050 """Extract name and check that it's not in vector format.""" 1051 rest=opts.rest() 1052 name=opts.next_word() 1053 # we ignore * before names 1054 if name=='*': 1055 name=opts.next_word() 1056 if not name: 1057 raise NexusError('Formatting error in line: %s ' % rest) 1058 name=quotestrip(name) 1059 if opts.peek_nonwhitespace=='(': 1060 open=opts.next_nonwhitespace() 1061 qualifier=open.next_word() 1062 close=opts.next_nonwhitespace() 1063 if qualifier.lower()=='vector': 1064 raise NexusError('Unsupported VECTOR format in line %s' \ 1065 % (opts)) 1066 elif qualifier.lower()!='standard': 1067 raise NexusError('Unknown qualifier %s in line %s' \ 1068 % (qualifier, opts)) 1069 if opts.next_nonwhitespace()!=separator: 1070 raise NexusError('Formatting error in line: %s ' % rest) 1071 return name
1072
1073 - def _parse_list(self,options_buffer,set_type):
1074 """Parse a NEXUS list (PRIVATE). 1075 1076 e.g. [1, 2, 4-8\\2, dog, cat] --> [1,2,4,6,8,17,21], 1077 (assuming dog is taxon no. 17 and cat is taxon no. 21). 1078 """ 1079 plain_list=[] 1080 if options_buffer.peek_nonwhitespace(): 1081 try: # capture all possible exceptions and treat them as formatting erros, if they are not NexusError 1082 while True: 1083 identifier=options_buffer.next_word() # next list element 1084 if not identifier: # end of list? 1085 break 1086 start=self._resolve(identifier,set_type=set_type) 1087 if options_buffer.peek_nonwhitespace()=='-': # followd by - 1088 end=start 1089 step=1 1090 # get hyphen and end of range 1091 hyphen=options_buffer.next_nonwhitespace() 1092 end=self._resolve(options_buffer.next_word(),set_type=set_type) 1093 if set_type==CHARSET: 1094 if options_buffer.peek_nonwhitespace()=='\\': # followd by \ 1095 backslash=options_buffer.next_nonwhitespace() 1096 step=int(options_buffer.next_word()) # get backslash and step 1097 plain_list.extend(range(start,end+1,step)) 1098 else: 1099 if type(start)==list or type(end)==list: 1100 raise NexusError('Name if character sets not allowed in range definition: %s'\ 1101 % identifier) 1102 start=self.taxlabels.index(start) 1103 end=self.taxlabels.index(end) 1104 taxrange=self.taxlabels[start:end+1] 1105 plain_list.extend(taxrange) 1106 else: 1107 if type(start)==list: # start was the name of charset or taxset 1108 plain_list.extend(start) 1109 else: # start was an ordinary identifier 1110 plain_list.append(start) 1111 except NexusError: 1112 raise 1113 except: 1114 return None 1115 return plain_list
1116
1117 - def _resolve(self,identifier,set_type=None):
1118 """Translate identifier in list into character/taxon index. 1119 1120 Characters (which are referred to by their index in Nexus.py): 1121 Plain numbers are returned minus 1 (Nexus indices to python indices) 1122 Text identifiers are translaterd into their indices (if plain character indentifiers), 1123 the first hit in charlabels is returned (charlabels don't need to be unique) 1124 or the range of indices is returned (if names of character sets). 1125 Taxa (which are referred to by their unique name in Nexus.py): 1126 Plain numbers are translated in their taxon name, underscores and spaces are considered equal. 1127 Names are returned unchanged (if plain taxon identifiers), or the names in 1128 the corresponding taxon set is returned 1129 """ 1130 identifier=quotestrip(identifier) 1131 if not set_type: 1132 raise NexusError('INTERNAL ERROR: Need type to resolve identifier.') 1133 if set_type==CHARSET: 1134 try: 1135 n=int(identifier) 1136 except ValueError: 1137 if self.charlabels and identifier in self.charlabels.itervalues(): 1138 for k in self.charlabels: 1139 if self.charlabels[k]==identifier: 1140 return k 1141 elif self.charsets and identifier in self.charsets: 1142 return self.charsets[identifier] 1143 else: 1144 raise NexusError('Unknown character identifier: %s' \ 1145 % identifier) 1146 else: 1147 if n<=self.nchar: 1148 return n-1 1149 else: 1150 raise NexusError('Illegal character identifier: %d>nchar (=%d).' \ 1151 % (identifier,self.nchar)) 1152 elif set_type==TAXSET: 1153 try: 1154 n=int(identifier) 1155 except ValueError: 1156 taxlabels_id=self._check_taxlabels(identifier) 1157 if taxlabels_id: 1158 return taxlabels_id 1159 elif self.taxsets and identifier in self.taxsets: 1160 return self.taxsets[identifier] 1161 else: 1162 raise NexusError('Unknown taxon identifier: %s' \ 1163 % identifier) 1164 else: 1165 if n>0 and n<=self.ntax: 1166 return self.taxlabels[n-1] 1167 else: 1168 raise NexusError('Illegal taxon identifier: %d>ntax (=%d).' \ 1169 % (identifier,self.ntax)) 1170 else: 1171 raise NexusError('Unknown set specification: %s.'% set_type)
1172
1173 - def _stateset(self, options):
1174 #Not implemented 1175 pass
1176
1177 - def _changeset(self, options):
1178 #Not implemented 1179 pass
1180
1181 - def _treeset(self, options):
1182 #Not implemented 1183 pass
1184
1185 - def _treepartition(self, options):
1186 #Not implemented 1187 pass
1188
1189 - def write_nexus_data_partitions(self, matrix=None, filename=None, blocksize=None, interleave=False, 1190 exclude=[], delete=[], charpartition=None, comment='',mrbayes=False):
1191 """Writes a nexus file for each partition in charpartition. 1192 1193 Only non-excluded characters and non-deleted taxa are included, 1194 just the data block is written. 1195 """ 1196 1197 if not matrix: 1198 matrix=self.matrix 1199 if not matrix: 1200 return 1201 if not filename: 1202 filename=self.filename 1203 if charpartition: 1204 pfilenames={} 1205 for p in charpartition: 1206 total_exclude=[]+exclude 1207 total_exclude.extend([c for c in range(self.nchar) if c not in charpartition[p]]) 1208 total_exclude=_make_unique(total_exclude) 1209 pcomment=comment+'\nPartition: '+p+'\n' 1210 dot=filename.rfind('.') 1211 if dot>0: 1212 pfilename=filename[:dot]+'_'+p+'.data' 1213 else: 1214 pfilename=filename+'_'+p 1215 pfilenames[p]=pfilename 1216 self.write_nexus_data(filename=pfilename,matrix=matrix,blocksize=blocksize, 1217 interleave=interleave,exclude=total_exclude,delete=delete,comment=pcomment,append_sets=False, 1218 mrbayes=mrbayes) 1219 return pfilenames 1220 else: 1221 fn=self.filename+'.data' 1222 self.write_nexus_data(filename=fn,matrix=matrix,blocksize=blocksize,interleave=interleave, 1223 exclude=exclude,delete=delete,comment=comment,append_sets=False, 1224 mrbayes=mrbayes) 1225 return fn
1226
1227 - def write_nexus_data(self, filename=None, matrix=None, exclude=[], delete=[],\ 1228 blocksize=None, interleave=False, interleave_by_partition=False,\ 1229 comment=None,omit_NEXUS=False,append_sets=True,mrbayes=False,\ 1230 codons_block=True):
1231 """Writes a nexus file with data and sets block to a file or handle. 1232 1233 Character sets and partitions are appended by default, and are 1234 adjusted according to excluded characters (i.e. character sets 1235 still point to the same sites (not necessarily same positions), 1236 without including the deleted characters. 1237 1238 filename - Either a filename as a string (which will be opened, 1239 written to and closed), or a handle object (which will 1240 be written to but NOT closed). 1241 interleave_by_partition - Optional name of partition (string) 1242 omit_NEXUS - Boolean. If true, the '#NEXUS' line normally at the 1243 start of the file is ommited. 1244 1245 Returns the filename/handle used to write the data. 1246 """ 1247 if not matrix: 1248 matrix=self.matrix 1249 if not matrix: 1250 return 1251 if not filename: 1252 filename=self.filename 1253 if [t for t in delete if not self._check_taxlabels(t)]: 1254 raise NexusError('Unknown taxa: %s' \ 1255 % ', '.join(set(delete).difference(set(self.taxlabels)))) 1256 if interleave_by_partition: 1257 if not interleave_by_partition in self.charpartitions: 1258 raise NexusError('Unknown partition: %r' % interleave_by_partition) 1259 else: 1260 partition=self.charpartitions[interleave_by_partition] 1261 # we need to sort the partition names by starting position before we exclude characters 1262 names=_sort_keys_by_values(partition) 1263 newpartition={} 1264 for p in partition: 1265 newpartition[p]=[c for c in partition[p] if c not in exclude] 1266 # how many taxa and how many characters are left? 1267 undelete=[taxon for taxon in self.taxlabels if taxon in matrix and taxon not in delete] 1268 cropped_matrix=_seqmatrix2strmatrix(self.crop_matrix(matrix,exclude=exclude,delete=delete)) 1269 ntax_adjusted=len(undelete) 1270 nchar_adjusted=len(cropped_matrix[undelete[0]]) 1271 if not undelete or (undelete and undelete[0]==''): 1272 return 1273 if isinstance(filename,basestring): 1274 try: 1275 fh=open(filename,'w',encoding="utf-8") 1276 except IOError: 1277 raise NexusError('Could not open %s for writing.' % filename) 1278 elif hasattr(filename, 'write'): 1279 #e.g. StringIO or a real file handle 1280 fh=filename 1281 else: 1282 raise ValueError("Neither a filename nor a handle was supplied") 1283 if not omit_NEXUS: 1284 fh.write('#NEXUS\n') 1285 if comment: 1286 fh.write('['+comment+']\n') 1287 fh.write('begin data;\n') 1288 fh.write('\tdimensions ntax=%d nchar=%d;\n' % (ntax_adjusted, nchar_adjusted)) 1289 fh.write('\tformat datatype='+self.datatype) 1290 if self.respectcase: 1291 fh.write(' respectcase') 1292 if self.missing: 1293 fh.write(' missing='+self.missing) 1294 if self.gap: 1295 fh.write(' gap='+self.gap) 1296 if self.matchchar: 1297 fh.write(' matchchar='+self.matchchar) 1298 if self.labels: 1299 fh.write(' labels='+self.labels) 1300 if self.equate: 1301 fh.write(' equate='+self.equate) 1302 if interleave or interleave_by_partition: 1303 fh.write(' interleave') 1304 fh.write(';\n') 1305 #if self.taxlabels: 1306 # fh.write('taxlabels '+' '.join(self.taxlabels)+';\n') 1307 if self.charlabels: 1308 newcharlabels=self._adjust_charlabels(exclude=exclude) 1309 clkeys=sorted(newcharlabels) 1310 fh.write('charlabels '+', '.join(["%s %s" % (k+1,safename(newcharlabels[k])) for k in clkeys])+';\n') 1311 fh.write('matrix\n') 1312 if not blocksize: 1313 if interleave: 1314 blocksize=70 1315 else: 1316 blocksize=self.nchar 1317 # delete deleted taxa and ecxclude excluded characters... 1318 namelength=max([len(safename(t,mrbayes=mrbayes)) for t in undelete]) 1319 if interleave_by_partition: 1320 # interleave by partitions, but adjust partitions with regard to excluded characters 1321 seek=0 1322 for p in names: 1323 fh.write('[%s: %s]\n' % (interleave_by_partition,p)) 1324 if len(newpartition[p])>0: 1325 for taxon in undelete: 1326 fh.write(safename(taxon,mrbayes=mrbayes).ljust(namelength+1)) 1327 fh.write(cropped_matrix[taxon][seek:seek+len(newpartition[p])]+'\n') 1328 fh.write('\n') 1329 else: 1330 fh.write('[empty]\n\n') 1331 seek+=len(newpartition[p]) 1332 elif interleave: 1333 for seek in range(0,nchar_adjusted,blocksize): 1334 for taxon in undelete: 1335 fh.write(safename(taxon,mrbayes=mrbayes).ljust(namelength+1)) 1336 fh.write(cropped_matrix[taxon][seek:seek+blocksize]+'\n') 1337 fh.write('\n') 1338 else: 1339 for taxon in undelete: 1340 if blocksize<nchar_adjusted: 1341 fh.write(safename(taxon,mrbayes=mrbayes)+'\n') 1342 else: 1343 fh.write(safename(taxon,mrbayes=mrbayes).ljust(namelength+1)) 1344 for seek in range(0,nchar_adjusted,blocksize): 1345 fh.write(cropped_matrix[taxon][seek:seek+blocksize]+'\n') 1346 fh.write(';\nend;\n') 1347 if append_sets: 1348 if codons_block: 1349 fh.write(self.append_sets(exclude=exclude,delete=delete,mrbayes=mrbayes,include_codons=False)) 1350 fh.write(self.append_sets(exclude=exclude,delete=delete,mrbayes=mrbayes,codons_only=True)) 1351 else: 1352 fh.write(self.append_sets(exclude=exclude,delete=delete,mrbayes=mrbayes)) 1353 1354 if fh == filename: 1355 #We were given the handle, don't close it. 1356 return filename 1357 else: 1358 #We opened the handle, so we should close it. 1359 fh.close() 1360 return filename
1361
1362 - def append_sets(self,exclude=[],delete=[],mrbayes=False,include_codons=True,codons_only=False):
1363 """Returns a sets block.""" 1364 if not self.charsets and not self.taxsets and not self.charpartitions: 1365 return '' 1366 if codons_only: 1367 setsb=['\nbegin codons'] 1368 else: 1369 setsb=['\nbegin sets'] 1370 # - now if characters have been excluded, the character sets need to be adjusted, 1371 # so that they still point to the right character positions 1372 # calculate a list of offsets: for each deleted character, the following character position 1373 # in the new file will have an additional offset of -1 1374 offset=0 1375 offlist=[] 1376 for c in range(self.nchar): 1377 if c in exclude: 1378 offset+=1 1379 offlist.append(-1) # dummy value as these character positions are excluded 1380 else: 1381 offlist.append(c-offset) 1382 # now adjust each of the character sets 1383 if not codons_only: 1384 for n,ns in self.charsets.iteritems(): 1385 cset=[offlist[c] for c in ns if c not in exclude] 1386 if cset: 1387 setsb.append('charset %s = %s' % (safename(n),_compact4nexus(cset))) 1388 for n,s in self.taxsets.iteritems(): 1389 tset=[safename(t,mrbayes=mrbayes) for t in s if t not in delete] 1390 if tset: 1391 setsb.append('taxset %s = %s' % (safename(n),' '.join(tset))) 1392 for n,p in self.charpartitions.iteritems(): 1393 if not include_codons and n==CODONPOSITIONS: 1394 continue 1395 elif codons_only and n!=CODONPOSITIONS: 1396 continue 1397 # as characters have been excluded, the partitions must be adjusted 1398 # if a partition is empty, it will be omitted from the charpartition command 1399 # (although paup allows charpartition part=t1:,t2:,t3:1-100) 1400 names=_sort_keys_by_values(p) 1401 newpartition={} 1402 for sn in names: 1403 nsp=[offlist[c] for c in p[sn] if c not in exclude] 1404 if nsp: 1405 newpartition[sn]=nsp 1406 if newpartition: 1407 if include_codons and n==CODONPOSITIONS: 1408 command='codonposset' 1409 else: 1410 command='charpartition' 1411 setsb.append('%s %s = %s' % (command,safename(n),\ 1412 ', '.join(['%s: %s' % (sn,_compact4nexus(newpartition[sn])) for sn in names if sn in newpartition]))) 1413 # now write charpartititions, much easier than charpartitions 1414 for n,p in self.taxpartitions.iteritems(): 1415 names=_sort_keys_by_values(p) 1416 newpartition={} 1417 for sn in names: 1418 nsp=[t for t in p[sn] if t not in delete] 1419 if nsp: 1420 newpartition[sn]=nsp 1421 if newpartition: 1422 setsb.append('taxpartition %s = %s' % (safename(n),\ 1423 ', '.join(['%s: %s' % (safename(sn),' '.join(map(safename,newpartition[sn]))) for sn in names if sn in newpartition]))) 1424 # add 'end' and return everything 1425 setsb.append('end;\n') 1426 if len(setsb)==2: # begin and end only 1427 return '' 1428 else: 1429 return ';\n'.join(setsb)
1430
1431 - def export_fasta(self, filename=None, width=70):
1432 """Writes matrix into a fasta file: (self, filename=None, width=70).""" 1433 if not filename: 1434 if '.' in self.filename and self.filename.split('.')[-1].lower() in ['paup','nexus','nex','dat']: 1435 filename='.'.join(self.filename.split('.')[:-1])+'.fas' 1436 else: 1437 filename=self.filename+'.fas' 1438 fh=open(filename,'w') 1439 for taxon in self.taxlabels: 1440 fh.write('>'+safename(taxon)+'\n') 1441 for i in range(0, len(self.matrix[taxon].tostring()), width): 1442 fh.write(self.matrix[taxon].tostring()[i:i+width] + '\n') 1443 fh.close() 1444 return filename
1445
1446 - def export_phylip(self, filename=None):
1447 """Writes matrix into a PHYLIP file: (self, filename=None). 1448 1449 Note that this writes a relaxed PHYLIP format file, where the names 1450 are not truncated, nor checked for invalid characters.""" 1451 if not filename: 1452 if '.' in self.filename and self.filename.split('.')[-1].lower() in ['paup','nexus','nex','dat']: 1453 filename='.'.join(self.filename.split('.')[:-1])+'.phy' 1454 else: 1455 filename=self.filename+'.phy' 1456 fh=open(filename,'w') 1457 fh.write('%d %d\n' % (self.ntax,self.nchar)) 1458 for taxon in self.taxlabels: 1459 fh.write('%s %s\n' % (safename(taxon),self.matrix[taxon].tostring())) 1460 fh.close() 1461 return filename
1462
1463 - def constant(self,matrix=None,delete=[],exclude=[]):
1464 """Return a list with all constant characters.""" 1465 if not matrix: 1466 matrix=self.matrix 1467 undelete=[t for t in self.taxlabels if t in matrix and t not in delete] 1468 if not undelete: 1469 return None 1470 elif len(undelete)==1: 1471 return [x for x in range(len(matrix[undelete[0]])) if x not in exclude] 1472 # get the first sequence and expand all ambiguous values 1473 constant=[(x,self.ambiguous_values.get(n.upper(),n.upper())) for 1474 x,n in enumerate(matrix[undelete[0]].tostring()) if x not in exclude] 1475 for taxon in undelete[1:]: 1476 newconstant=[] 1477 for site in constant: 1478 #print '%d (paup=%d)' % (site[0],site[0]+1), 1479 seqsite=matrix[taxon][site[0]].upper() 1480 #print seqsite,'checked against',site[1],'\t', 1481 if seqsite==self.missing or (seqsite==self.gap and self.options['gapmode'].lower()=='missing') or seqsite==site[1]: 1482 # missing or same as before -> ok 1483 newconstant.append(site) 1484 elif seqsite in site[1] or site[1]==self.missing or (self.options['gapmode'].lower()=='missing' and site[1]==self.gap): 1485 # subset of an ambig or only missing in previous -> take subset 1486 newconstant.append((site[0],self.ambiguous_values.get(seqsite,seqsite))) 1487 elif seqsite in self.ambiguous_values: # is it an ambig: check the intersection with prev. values 1488 intersect = set(self.ambiguous_values[seqsite]).intersection(set(site[1])) 1489 if intersect: 1490 newconstant.append((site[0],''.join(intersect))) 1491 # print 'ok' 1492 #else: 1493 # print 'failed' 1494 #else: 1495 # print 'failed' 1496 constant=newconstant 1497 cpos=[s[0] for s in constant] 1498 return cpos
1499
1500 - def cstatus(self,site,delete=[],narrow=True):
1501 """Summarize character. 1502 1503 narrow=True: paup-mode (a c ? --> ac; ? ? ? --> ?) 1504 narrow=false: (a c ? --> a c g t -; ? ? ? --> a c g t -) 1505 """ 1506 undelete=[t for t in self.taxlabels if t not in delete] 1507 if not undelete: 1508 return None 1509 cstatus=[] 1510 for t in undelete: 1511 c=self.matrix[t][site].upper() 1512 if self.options.get('gapmode')=='missing' and c==self.gap: 1513 c=self.missing 1514 if narrow and c==self.missing: 1515 if c not in cstatus: 1516 cstatus.append(c) 1517 else: 1518 cstatus.extend([b for b in self.ambiguous_values[c] if b not in cstatus]) 1519 if self.missing in cstatus and narrow and len(cstatus)>1: 1520 cstatus=[c for c in cstatus if c!=self.missing] 1521 cstatus.sort() 1522 return cstatus
1523
1524 - def weighted_stepmatrix(self,name='your_name_here',exclude=[],delete=[]):
1525 """Calculates a stepmatrix for weighted parsimony. 1526 1527 See Wheeler (1990), Cladistics 6:269-275 and 1528 Felsenstein (1981), Biol. J. Linn. Soc. 16:183-196 1529 """ 1530 m=StepMatrix(self.unambiguous_letters,self.gap) 1531 for site in [s for s in range(self.nchar) if s not in exclude]: 1532 cstatus=self.cstatus(site,delete) 1533 for i,b1 in enumerate(cstatus[:-1]): 1534 for b2 in cstatus[i+1:]: 1535 m.add(b1.upper(),b2.upper(),1) 1536 return m.transformation().weighting().smprint(name=name)
1537
1538 - def crop_matrix(self,matrix=None, delete=[], exclude=[]):
1539 """Return a matrix without deleted taxa and excluded characters.""" 1540 if not matrix: 1541 matrix=self.matrix 1542 if [t for t in delete if not self._check_taxlabels(t)]: 1543 raise NexusError('Unknown taxa: %s' \ 1544 % ', '.join(set(delete).difference(self.taxlabels))) 1545 if exclude!=[]: 1546 undelete=[t for t in self.taxlabels if t in matrix and t not in delete] 1547 if not undelete: 1548 return {} 1549 m=[matrix[k].tostring() for k in undelete] 1550 zipped_m=zip(*m) 1551 sitesm=[s for i,s in enumerate(zipped_m) if i not in exclude] 1552 if sitesm==[]: 1553 return dict([(t,Seq('',self.alphabet)) for t in undelete]) 1554 else: 1555 zipped_sitesm=zip(*sitesm) 1556 m=[Seq(s,self.alphabet) for s in map(''.join,zipped_sitesm)] 1557 return dict(zip(undelete,m)) 1558 else: 1559 return dict([(t,matrix[t]) for t in self.taxlabels if t in matrix and t not in delete])
1560
1561 - def bootstrap(self,matrix=None,delete=[],exclude=[]):
1562 """Return a bootstrapped matrix.""" 1563 if not matrix: 1564 matrix=self.matrix 1565 seqobjects=isinstance(matrix[matrix.keys()[0]],Seq) # remember if Seq objects 1566 cm=self.crop_matrix(delete=delete,exclude=exclude) # crop data out 1567 if not cm: # everything deleted? 1568 return {} 1569 elif len(cm[cm.keys()[0]])==0: # everything excluded? 1570 return cm 1571 undelete=[t for t in self.taxlabels if t in cm] 1572 if seqobjects: 1573 sitesm=zip(*[cm[t].tostring() for t in undelete]) 1574 alphabet=matrix[matrix.keys()[0]].alphabet 1575 else: 1576 sitesm=zip(*[cm[t] for t in undelete]) 1577 bootstrapsitesm=[sitesm[random.randint(0,len(sitesm)-1)] for i in range(len(sitesm))] 1578 bootstrapseqs=map(''.join,zip(*bootstrapsitesm)) 1579 if seqobjects: 1580 bootstrapseqs=[Seq(s,alphabet) for s in bootstrapseqs] 1581 return dict(zip(undelete,bootstrapseqs))
1582
1583 - def add_sequence(self,name,sequence):
1584 """Adds a sequence (string) to the matrix.""" 1585 1586 if not name: 1587 raise NexusError('New sequence must have a name') 1588 1589 diff=self.nchar-len(sequence) 1590 if diff<0: 1591 self.insert_gap(self.nchar,-diff) 1592 elif diff>0: 1593 sequence+=self.missing*diff 1594 1595 if name in self.taxlabels: 1596 unique_name=_unique_label(self.taxlabels,name) 1597 #print "WARNING: Sequence name %s is already present. Sequence was added as %s." % (name,unique_name) 1598 else: 1599 unique_name=name 1600 1601 assert unique_name not in self.matrix, "ERROR. There is a discrepancy between taxlabels and matrix keys. Report this as a bug." 1602 1603 self.matrix[unique_name]=Seq(sequence,self.alphabet) 1604 self.ntax+=1 1605 self.taxlabels.append(unique_name) 1606 self.unaltered_taxlabels.append(name)
1607
1608 - def insert_gap(self,pos,n=1,leftgreedy=False):
1609 """Add a gap into the matrix and adjust charsets and partitions. 1610 1611 pos=0: first position 1612 pos=nchar: last position 1613 """ 1614 1615 def _adjust(set,x,d,leftgreedy=False): 1616 """Adjusts chartacter sets if gaps are inserted, taking care of 1617 new gaps within a coherent character set.""" 1618 # if 3 gaps are inserted at pos. 9 in a set that looks like 1 2 3 8 9 10 11 13 14 15 1619 # then the adjusted set will be 1 2 3 8 9 10 11 12 13 14 15 16 17 18 1620 # but inserting into position 8 it will stay like 1 2 3 11 12 13 14 15 16 17 18 1621 set.sort() 1622 addpos=0 1623 for i,c in enumerate(set): 1624 if c>=x: 1625 set[i]=c+d 1626 # if we add gaps within a group of characters, we want the gap position included in this group 1627 if c==x: 1628 if leftgreedy or (i>0 and set[i-1]==c-1): 1629 addpos=i 1630 if addpos>0: 1631 set[addpos:addpos]=range(x,x+d) 1632 return set
1633 1634 if pos<0 or pos>self.nchar: 1635 raise NexusError('Illegal gap position: %d' % pos) 1636 if n==0: 1637 return 1638 if self.taxlabels: 1639 #python 2.3 does not support zip(*[]) 1640 sitesm=zip(*[self.matrix[t].tostring() for t in self.taxlabels]) 1641 else: 1642 sitesm=[] 1643 sitesm[pos:pos]=[['-']*len(self.taxlabels)]*n 1644 # #self.matrix=dict([(taxon,Seq(map(''.join,zip(*sitesm))[i],self.alphabet)) for\ 1645 # i,taxon in enumerate(self.taxlabels)]) 1646 zipped=zip(*sitesm) 1647 mapped=map(''.join,zipped) 1648 listed=[(taxon,Seq(mapped[i],self.alphabet)) for i,taxon in enumerate(self.taxlabels)] 1649 self.matrix=dict(listed) 1650 self.nchar+=n 1651 # now adjust character sets 1652 for i,s in self.charsets.iteritems(): 1653 self.charsets[i]=_adjust(s,pos,n,leftgreedy=leftgreedy) 1654 for p in self.charpartitions: 1655 for sp,s in self.charpartitions[p].iteritems(): 1656 self.charpartitions[p][sp]=_adjust(s,pos,n,leftgreedy=leftgreedy) 1657 # now adjust character state labels 1658 self.charlabels=self._adjust_charlabels(insert=[pos]*n) 1659 return self.charlabels 1660
1661 - def _adjust_charlabels(self,exclude=None,insert=None):
1662 """Return adjusted indices of self.charlabels if characters are excluded or inserted.""" 1663 if exclude and insert: 1664 raise NexusError('Can\'t exclude and insert at the same time') 1665 if not self.charlabels: 1666 return None 1667 labels=sorted(self.charlabels) 1668 newcharlabels={} 1669 if exclude: 1670 exclude.sort() 1671 exclude.append(sys.maxint) 1672 excount=0 1673 for c in labels: 1674 if not c in exclude: 1675 while c>exclude[excount]: 1676 excount+=1 1677 newcharlabels[c-excount]=self.charlabels[c] 1678 elif insert: 1679 insert.sort() 1680 insert.append(sys.maxint) 1681 icount=0 1682 for c in labels: 1683 while c>=insert[icount]: 1684 icount+=1 1685 newcharlabels[c+icount]=self.charlabels[c] 1686 else: 1687 return self.charlabels 1688 return newcharlabels
1689
1690 - def invert(self,charlist):
1691 """Returns all character indices that are not in charlist.""" 1692 return [c for c in range(self.nchar) if c not in charlist]
1693
1694 - def gaponly(self,include_missing=False):
1695 """Return gap-only sites.""" 1696 gap=set(self.gap) 1697 if include_missing: 1698 gap.add(self.missing) 1699 sitesm=zip(*[self.matrix[t].tostring() for t in self.taxlabels]) 1700 gaponly=[i for i,site in enumerate(sitesm) if set(site).issubset(gap)] 1701 return gaponly
1702
1703 - def terminal_gap_to_missing(self,missing=None,skip_n=True):
1704 """Replaces all terminal gaps with missing character. 1705 1706 Mixtures like ???------??------- are properly resolved.""" 1707 1708 if not missing: 1709 missing=self.missing 1710 replace=[self.missing,self.gap] 1711 if not skip_n: 1712 replace.extend(['n','N']) 1713 for taxon in self.taxlabels: 1714 sequence=self.matrix[taxon].tostring() 1715 length=len(sequence) 1716 start,end=get_start_end(sequence,skiplist=replace) 1717 if start==-1 and end==-1: 1718 sequence=missing*length 1719 else: 1720 sequence=sequence[:end+1]+missing*(length-end-1) 1721 sequence=start*missing+sequence[start:] 1722 assert length==len(sequence), 'Illegal sequence manipulation in Nexus.termial_gap_to_missing in taxon %s' % taxon 1723 self.matrix[taxon]=Seq(sequence,self.alphabet)
1724 1725 1726 try: 1727 import cnexus 1728 except ImportError:
1729 - def _get_command_lines(file_contents):
1730 lines=_kill_comments_and_break_lines(file_contents) 1731 commandlines=_adjust_lines(lines) 1732 return commandlines
1733 else:
1734 - def _get_command_lines(file_contents):
1735 decommented=cnexus.scanfile(file_contents) 1736 #check for unmatched parentheses 1737 if decommented=='[' or decommented==']': 1738 raise NexusError('Unmatched %s' % decommented) 1739 # cnexus can't return lists, so in analogy we separate 1740 # commandlines with chr(7) (a character that shoudn't be part of a 1741 # nexus file under normal circumstances) 1742 commandlines=_adjust_lines(decommented.split(chr(7))) 1743 return commandlines
1744