1
2
3
4
5
6
7 """Nexus class. Parse the contents of a NEXUS file.
8
9 Based upon 'NEXUS: An extensible file format for systematic information'
10 Maddison, Swofford, Maddison. 1997. Syst. Biol. 46(4):590-621
11 """
12
13 import os,sys, math, random, copy
14
15 from Bio.Alphabet import IUPAC
16 from Bio.Data import IUPACData
17 from Bio.Seq import Seq
18
19 from Trees import Tree,NodeData
20
21 INTERLEAVE=70
22 SPECIAL_COMMANDS=['charstatelabels','charlabels','taxlabels', 'taxset', 'charset','charpartition','taxpartition',\
23 'matrix','tree', 'utree','translate','codonposset','title']
24 KNOWN_NEXUS_BLOCKS = ['trees','data', 'characters', 'taxa', 'sets','codons']
25 PUNCTUATION='()[]{}/\,;:=*\'"`+-<>'
26 MRBAYESSAFE='abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ1234567890_'
27 WHITESPACE=' \t\n'
28
29 SPECIALCOMMENTS=['&']
30 CHARSET='chars'
31 TAXSET='taxa'
32 CODONPOSITIONS='codonpositions'
33 DEFAULTNEXUS='#NEXUS\nbegin data; dimensions ntax=0 nchar=0; format datatype=dna; end; '
35
37 """Helps reading NEXUS-words and characters from a buffer."""
39 if string:
40 self.buffer=list(string)
41 else:
42 self.buffer=[]
43
45 if self.buffer:
46 return self.buffer[0]
47 else:
48 return None
49
51 b=''.join(self.buffer).strip()
52 if b:
53 return b[0]
54 else:
55 return None
56
58 if self.buffer:
59 return self.buffer.pop(0)
60 else:
61 return None
62
64 while True:
65 p=self.next()
66 if p is None:
67 break
68 if p not in WHITESPACE:
69 return p
70 return None
71
73 while self.buffer[0] in WHITESPACE:
74 self.buffer=self.buffer[1:]
75
77 for t in target:
78 try:
79 pos=self.buffer.index(t)
80 except ValueError:
81 pass
82 else:
83 found=''.join(self.buffer[:pos])
84 self.buffer=self.buffer[pos:]
85 return found
86 else:
87 return None
88
90 return ''.join(self.buffer[:len(word)])==word
91
93 """Return the next NEXUS word from a string.
94
95 This deals with single and double quotes, whitespace and punctuation.
96 """
97
98 word=[]
99 quoted=False
100 first=self.next_nonwhitespace()
101 if not first:
102 return None
103 word.append(first)
104 if first=="'":
105 quoted="'"
106 elif first=='"':
107 quoted='"'
108 elif first in PUNCTUATION:
109 return first
110 while True:
111 c=self.peek()
112 if c==quoted:
113 word.append(self.next())
114 if self.peek()==quoted:
115 skip=self.next()
116 elif quoted:
117 break
118 elif quoted:
119 word.append(self.next())
120 elif not c or c in PUNCTUATION or c in WHITESPACE:
121 break
122 else:
123 word.append(self.next())
124 return ''.join(word)
125
127 """Return the rest of the string without parsing."""
128 return ''.join(self.buffer)
129
131 """Calculate a stepmatrix for weighted parsimony.
132
133 See Wheeler (1990), Cladistics 6:269-275.
134 """
135
137 self.data={}
138 self.symbols=[s for s in symbols]
139 self.symbols.sort()
140 if gap:
141 self.symbols.append(gap)
142 for x in self.symbols:
143 for y in [s for s in self.symbols if s!=x]:
144 self.set(x,y,0)
145
146 - def set(self,x,y,value):
150
151 - def add(self,x,y,value):
155
158
165
167 for k in self.data:
168 if self.data[k]!=0:
169 self.data[k]=-math.log(self.data[k])
170 return self
171
172 - def smprint(self,name='your_name_here'):
173 matrix='usertype %s stepmatrix=%d\n' % (name,len(self.symbols))
174 matrix+=' %s\n' % ' '.join(self.symbols)
175 for x in self.symbols:
176 matrix+='[%s]'.ljust(8) % x
177 for y in self.symbols:
178 if x==y:
179 matrix+=' . '
180 else:
181 if x>y:
182 x1,y1=y,x
183 else:
184 x1,y1=x,y
185 if self.data[x1+y1]==0:
186 matrix+='inf. '
187 else:
188 matrix+='%2.2f'.ljust(10) % (self.data[x1+y1])
189 matrix+='\n'
190 matrix+=';\n'
191 return matrix
192
194 """Return a taxon identifier according to NEXUS standard.
195
196 Wrap quotes around names with punctuation or whitespace, and double
197 single quotes.
198
199 mrbayes=True: write names without quotes, whitespace or punctuation
200 for the mrbayes software package.
201 """
202 if mrbayes:
203 safe=name.replace(' ','_')
204 safe=''.join([c for c in safe if c in MRBAYESSAFE])
205 else:
206 safe=name.replace("'","''")
207 if set(safe).intersection(set(WHITESPACE+PUNCTUATION)):
208 safe="'"+safe+"'"
209 return safe
210
212 """Remove quotes and/or double quotes around identifiers."""
213 if not word:
214 return None
215 while (word.startswith("'") and word.endswith("'")) or (word.startswith('"') and word.endswith('"')):
216 word=word[1:-1]
217 return word
218
237
239 """Returns a sorted list of keys of p sorted by values of p."""
240 startpos=[(p[pn],pn) for pn in p if p[pn]]
241 startpos.sort()
242
243 return (zip(*startpos))[1]
244
246 """Check that all values in list are unique and return a pruned and sorted list."""
247 l=list(set(l))
248 l.sort()
249 return l
250
252 """Returns a unique name if label is already in previous_labels."""
253 while label in previous_labels:
254 if label.split('.')[-1].startswith('copy'):
255 label='.'.join(label.split('.')[:-1])+'.copy'+str(eval('0'+label.split('.')[-1][4:])+1)
256 else:
257 label+='.copy'
258 return label
259
261 """Converts a Seq-object matrix to a plain sequence-string matrix."""
262 return dict([(t,matrix[t].tostring()) for t in matrix])
263
265 """Transform [1 2 3 5 6 7 8 12 15 18 20] (baseindex 0, used in the Nexus class)
266 into '2-4 6-9 13-19\\3 21' (baseindex 1, used in programs like Paup or MrBayes.).
267 """
268
269 if not orig_list:
270 return ''
271 orig_list=list(set(orig_list))
272 orig_list.sort()
273 shortlist=[]
274 clist=orig_list[:]
275 clist.append(clist[-1]+.5)
276 while len(clist)>1:
277 step=1
278 for i,x in enumerate(clist):
279 if x==clist[0]+i*step:
280 continue
281 elif i==1 and len(clist)>3 and clist[i+1]-x==x-clist[0]:
282
283
284 step=x-clist[0]
285 else:
286 sub=clist[:i]
287 if len(sub)==1:
288 shortlist.append(str(sub[0]+1))
289 else:
290 if step==1:
291 shortlist.append('%d-%d' % (sub[0]+1,sub[-1]+1))
292 else:
293 shortlist.append('%d-%d\\%d' % (sub[0]+1,sub[-1]+1,step))
294 clist=clist[i:]
295 break
296 return ' '.join(shortlist)
297
299 """Combine matrices in [(name,nexus-instance),...] and return new nexus instance.
300
301 combined_matrix=combine([(name1,nexus_instance1),(name2,nexus_instance2),...]
302 Character sets, character partitions and taxon sets are prefixed, readjusted
303 and present in the combined matrix.
304 """
305
306 if not matrices:
307 return None
308 name=matrices[0][0]
309 combined=copy.deepcopy(matrices[0][1])
310 mixed_datatypes=(len(set([n[1].datatype for n in matrices]))>1)
311 if mixed_datatypes:
312 combined.datatype='None'
313
314 combined.charlabels=None
315 combined.statelabels=None
316 combined.interleave=False
317 combined.translate=None
318
319
320 for cn,cs in combined.charsets.iteritems():
321 combined.charsets['%s.%s' % (name,cn)]=cs
322 del combined.charsets[cn]
323 for tn,ts in combined.taxsets.iteritems():
324 combined.taxsets['%s.%s' % (name,tn)]=ts
325 del combined.taxsets[tn]
326
327
328 combined.charpartitions={'combined':{name:range(combined.nchar)}}
329 for n,m in matrices[1:]:
330 both=[t for t in combined.taxlabels if t in m.taxlabels]
331 combined_only=[t for t in combined.taxlabels if t not in both]
332 m_only=[t for t in m.taxlabels if t not in both]
333 for t in both:
334
335 combined.matrix[t]+=Seq(m.matrix[t].tostring().replace(m.gap,combined.gap).replace(m.missing,combined.missing),combined.alphabet)
336
337 for t in combined_only:
338 combined.matrix[t]+=Seq(combined.missing*m.nchar,combined.alphabet)
339 for t in m_only:
340 combined.matrix[t]=Seq(combined.missing*combined.nchar,combined.alphabet)+\
341 Seq(m.matrix[t].tostring().replace(m.gap,combined.gap).replace(m.missing,combined.missing),combined.alphabet)
342 combined.taxlabels.extend(m_only)
343 for cn,cs in m.charsets.iteritems():
344 combined.charsets['%s.%s' % (n,cn)]=[x+combined.nchar for x in cs]
345 if m.taxsets:
346 if not combined.taxsets:
347 combined.taxsets={}
348
349 combined.taxsets.update(dict(('%s.%s' % (n,tn),ts) \
350 for tn,ts in m.taxsets.iteritems()))
351
352 combined.charpartitions['combined'][n]=range(combined.nchar,combined.nchar+m.nchar)
353
354 if m.charlabels:
355 if not combined.charlabels:
356 combined.charlabels={}
357 combined.charlabels.update(dict((combined.nchar+i,label) \
358 for (i,label) in m.charlabels.iteritems()))
359 combined.nchar+=m.nchar
360 combined.ntax+=len(m_only)
361
362
363
364 for c in combined.charpartitions['combined']:
365 combined.charsets[c]=combined.charpartitions['combined'][c]
366
367 return combined
368
427
428
430 """Adjust linebreaks to match ';', strip leading/trailing whitespace.
431
432 list_of_commandlines=_adjust_lines(input_text)
433 Lines are adjusted so that no linebreaks occur within a commandline
434 (except matrix command line)
435 """
436 formatted_lines=[]
437 for l in lines:
438
439 l=l.replace('\r\n','\n').replace('\r','\n').strip()
440 if l.lower().startswith('matrix'):
441 formatted_lines.append(l)
442 else:
443 l=l.replace('\n',' ')
444 if l:
445 formatted_lines.append(l)
446 return formatted_lines
447
449 """Replaces ambigs in xxx(ACG)xxx format by IUPAC ambiguity code."""
450
451 opening=seq.find('(')
452 while opening>-1:
453 closing=seq.find(')')
454 if closing<0:
455 raise NexusError('Missing closing parenthesis in: '+seq)
456 elif closing<opening:
457 raise NexusError('Missing opening parenthesis in: '+seq)
458 ambig=[x for x in seq[opening+1:closing]]
459 ambig.sort()
460 ambig=''.join(ambig)
461 ambig_code=rev_ambig_values[ambig.upper()]
462 if ambig!=ambig.upper():
463 ambig_code=ambig_code.lower()
464 seq=seq[:opening]+ambig_code+seq[closing+1:]
465 opening=seq.find('(')
466 return seq
467
469 """Represent a commandline as command and options."""
470
472 self.options={}
473 options=[]
474 self.command=None
475 try:
476
477 self.command, options = line.strip().split('\n', 1)
478 except ValueError:
479
480 self.command=line.split()[0]
481 options=' '.join(line.split()[1:])
482 self.command = self.command.strip().lower()
483 if self.command in SPECIAL_COMMANDS:
484 self.options=options.strip()
485 else:
486 if len(options) > 0:
487 try:
488 options = options.replace('=', ' = ').split()
489 valued_indices=[(n-1,n,n+1) for n in range(len(options)) if options[n]=='=' and n!=0 and n!=len((options))]
490 indices = []
491 for sl in valued_indices:
492 indices.extend(sl)
493 token_indices = [n for n in range(len(options)) if n not in indices]
494 for opt in valued_indices:
495
496 self.options[options[opt[0]].lower()] = options[opt[2]]
497 for token in token_indices:
498 self.options[options[token].lower()] = None
499 except ValueError:
500 raise NexusError('Incorrect formatting in line: %s' % line)
501
503 """Represent a NEXUS block with block name and list of commandlines."""
507
509
510 __slots__=['original_taxon_order','__dict__']
511
513 self.ntax=0
514 self.nchar=0
515 self.unaltered_taxlabels=[]
516 self.taxlabels=[]
517 self.charlabels=None
518 self.statelabels=None
519 self.datatype='dna'
520 self.respectcase=False
521 self.missing='?'
522 self.gap='-'
523 self.symbols=None
524 self.equate=None
525 self.matchchar=None
526 self.labels=None
527 self.transpose=False
528 self.interleave=False
529 self.tokens=False
530 self.eliminate=None
531 self.matrix=None
532 self.unknown_blocks=[]
533 self.taxsets={}
534 self.charsets={}
535 self.charpartitions={}
536 self.taxpartitions={}
537 self.trees=[]
538 self.translate=None
539 self.structured=[]
540 self.set={}
541 self.options={}
542 self.codonposset=None
543
544
545 self.options['gapmode']='missing'
546
547 if input:
548 self.read(input)
549 else:
550 self.read(DEFAULTNEXUS)
551
553 """Included for backwards compatibility (DEPRECATED)."""
554 return self.taxlabels
556 """Included for backwards compatibility (DEPRECATED)."""
557 self.taxlabels=value
558 original_taxon_order=property(get_original_taxon_order,set_original_taxon_order)
559
560 - def read(self,input):
561 """Read and parse NEXUS imput (a filename, file-handle, or string)."""
562
563
564
565 try:
566 file_contents = open(os.path.expanduser(input),'rU').read()
567 self.filename=input
568 except (TypeError,IOError,AttributeError):
569
570 if isinstance(input, str):
571 file_contents = input
572 self.filename='input_string'
573
574 elif hasattr(input,'read'):
575 file_contents=input.read()
576 if hasattr(input,"name") and input.name:
577 self.filename=input.name
578 else:
579 self.filename='Unknown_nexus_file'
580 else:
581 print input.strip()[:50]
582 raise NexusError('Unrecognized input: %s ...' % input[:100])
583 file_contents=file_contents.strip()
584 if file_contents.startswith('#NEXUS'):
585 file_contents=file_contents[6:]
586 commandlines=_get_command_lines(file_contents)
587
588 for i,cl in enumerate(commandlines):
589 try:
590 if cl[:6].upper()=='#NEXUS':
591 commandlines[i]=cl[6:].strip()
592 except:
593 pass
594
595 nexus_block_gen = self._get_nexus_block(commandlines)
596 while 1:
597 try:
598 title, contents = nexus_block_gen.next()
599 except StopIteration:
600 break
601 if title in KNOWN_NEXUS_BLOCKS:
602 self._parse_nexus_block(title, contents)
603 else:
604 self._unknown_nexus_block(title, contents)
605
607 """Generator for looping through Nexus blocks."""
608 inblock=False
609 blocklines=[]
610 while file_contents:
611 cl=file_contents.pop(0)
612 if cl.lower().startswith('begin'):
613 if not inblock:
614 inblock=True
615 title=cl.split()[1].lower()
616 else:
617 raise NexusError('Illegal block nesting in block %s' % title)
618 elif cl.lower().startswith('end'):
619 if inblock:
620 inblock=False
621 yield title,blocklines
622 blocklines=[]
623 else:
624 raise NexusError('Unmatched \'end\'.')
625 elif inblock:
626 blocklines.append(cl)
627
633
635 """Parse a known Nexus Block (PRIVATE)."""
636
637 self._apply_block_structure(title, contents)
638
639
640 block=self.structured[-1]
641 for line in block.commandlines:
642 try:
643 getattr(self,'_'+line.command)(line.options)
644 except AttributeError:
645 raise
646 raise NexusError('Unknown command: %s ' % line.command)
647
650
652 if 'ntax' in options:
653 self.ntax=eval(options['ntax'])
654 if 'nchar' in options:
655 self.nchar=eval(options['nchar'])
656
736
737
738 - def _set(self,options):
740
742 self.options=options;
743
745 self.eliminate=options
746
748 """Get taxon labels (PRIVATE).
749
750 As the taxon names are already in the matrix, this is superfluous
751 except for transpose matrices, which are currently unsupported anyway.
752 Thus, we ignore the taxlabels command to make handling of duplicate
753 taxon names easier.
754 """
755 pass
756
757
758
759
760
761
762
763
765 """Check for presence of taxon in self.taxlabels."""
766
767
768 nextaxa=dict([(t.replace(' ','_'),t) for t in self.taxlabels])
769 nexid=taxon.replace(' ','_')
770 return nextaxa.get(nexid)
771
794
798
803
805 if not self.ntax or not self.nchar:
806 raise NexusError('Dimensions must be specified before matrix!')
807 self.matrix={}
808 taxcount=0
809 first_matrix_block=True
810
811
812 lines=[l.strip() for l in options.split('\n') if l.strip()!='']
813 lineiter=iter(lines)
814 while 1:
815 try:
816 l=lineiter.next()
817 except StopIteration:
818 if taxcount<self.ntax:
819 raise NexusError('Not enough taxa in matrix.')
820 elif taxcount>self.ntax:
821 raise NexusError('Too many taxa in matrix.')
822 else:
823 break
824
825 taxcount+=1
826
827 if taxcount>self.ntax:
828 if not self.interleave:
829 raise NexusError('Too many taxa in matrix - should matrix be interleaved?')
830 else:
831 taxcount=1
832 first_matrix_block=False
833
834 linechars=CharBuffer(l)
835 id=quotestrip(linechars.next_word())
836 l=linechars.rest().strip()
837 chars=''
838 if self.interleave:
839
840
841 if l:
842 chars=''.join(l.split())
843 else:
844 chars=''.join(lineiter.next().split())
845 else:
846
847 chars=''.join(l.split())
848 while len(chars)<self.nchar:
849 l=lineiter.next()
850 chars+=''.join(l.split())
851 iupac_seq=Seq(_replace_parenthesized_ambigs(chars,self.rev_ambiguous_values),self.alphabet)
852
853 if taxcount==1:
854 refseq=iupac_seq
855 else:
856 if self.matchchar:
857 while 1:
858 p=iupac_seq.tostring().find(self.matchchar)
859 if p==-1:
860 break
861 iupac_seq=Seq(iupac_seq.tostring()[:p]+refseq[p]+iupac_seq.tostring()[p+1:],self.alphabet)
862
863 for i,c in enumerate(iupac_seq.tostring()):
864 if c not in self.valid_characters and c!=self.gap and c!=self.missing:
865 raise NexusError( \
866 ('Taxon %s: Illegal character %s in sequence %s ' + \
867 '(check dimensions/interleaving)') % (id,c, iupac_seq))
868
869 if first_matrix_block:
870 self.unaltered_taxlabels.append(id)
871 id=_unique_label(self.matrix.keys(),id)
872 self.matrix[id]=iupac_seq
873 self.taxlabels.append(id)
874 else:
875
876 id=_unique_label(self.taxlabels[:taxcount-1],id)
877 taxon_present=self._check_taxlabels(id)
878 if taxon_present:
879 self.matrix[taxon_present]+=iupac_seq
880 else:
881 raise NexusError('Taxon %s not in first block of interleaved matrix. Check matrix dimensions and interleave.' % id)
882
883 for taxon in self.matrix:
884 if len(self.matrix[taxon])!=self.nchar:
885 raise NexusError('Matrx Nchar %d does not match data length (%d) for taxon %s' \
886 % (self.nchar, len(self.matrix[taxon]),taxon))
887
888 matrixkeys=sorted(self.matrix)
889 taxlabelssort=self.taxlabels[:]
890 taxlabelssort.sort()
891 assert matrixkeys==taxlabelssort,"ERROR: TAXLABELS must be identical with MATRIX. Please Report this as a bug, and send in data file."
892
912
914 """Some software (clustalx) uses 'utree' to denote an unrooted tree."""
915 self._tree(options)
916
917 - def _tree(self,options):
952
959
963
967
969 taxpartition={}
970 quotelevel=False
971 opts=CharBuffer(options)
972 name=self._name_n_vector(opts)
973 if not name:
974 raise NexusError('Formatting error in taxpartition: %s ' % options)
975
976
977
978 sub=''
979 while True:
980 w=opts.next()
981 if w is None or (w==',' and not quotelevel):
982 subname,subindices=self._get_indices(sub,set_type=TAXSET,separator=':')
983 taxpartition[subname]=_make_unique(subindices)
984 sub=''
985 if w is None:
986 break
987 else:
988 if w=="'":
989 quotelevel=not quotelevel
990 sub+=w
991 self.taxpartitions[name]=taxpartition
992
994 """Read codon positions from a codons block as written from McClade.
995
996 Here codonposset is just a fancy name for a character partition with
997 the name CodonPositions and the partitions N,1,2,3
998 """
999
1000 prev_partitions=self.charpartitions
1001 self._charpartition(options)
1002
1003 codonname=[n for n in self.charpartitions if n not in prev_partitions]
1004 if codonname==[] or len(codonname)>1:
1005 raise NexusError('Formatting Error in codonposset: %s ' % options)
1006 else:
1007 self.codonposset=codonname[0]
1008
1011
1013 charpartition={}
1014 quotelevel=False
1015 opts=CharBuffer(options)
1016 name=self._name_n_vector(opts)
1017 if not name:
1018 raise NexusError('Formatting error in charpartition: %s ' % options)
1019
1020
1021 sub=''
1022 while True:
1023 w=opts.next()
1024 if w is None or (w==',' and not quotelevel):
1025 subname,subindices=self._get_indices(sub,set_type=CHARSET,separator=':')
1026 charpartition[subname]=_make_unique(subindices)
1027 sub=''
1028 if w is None:
1029 break
1030 else:
1031 if w=="'":
1032 quotelevel=not quotelevel
1033 sub+=w
1034 self.charpartitions[name]=charpartition
1035
1037 """Parse the taxset/charset specification (PRIVATE).
1038
1039 e.g. '1 2 3 - 5 dog cat 10 - 20 \\ 3'
1040 --> [0,1,2,3,4,'dog','cat',9,12,15,18]
1041 """
1042 opts=CharBuffer(options)
1043 name=self._name_n_vector(opts,separator=separator)
1044 indices=self._parse_list(opts,set_type=set_type)
1045 if indices is None:
1046 raise NexusError('Formatting error in line: %s ' % options)
1047 return name,indices
1048
1072
1074 """Parse a NEXUS list (PRIVATE).
1075
1076 e.g. [1, 2, 4-8\\2, dog, cat] --> [1,2,4,6,8,17,21],
1077 (assuming dog is taxon no. 17 and cat is taxon no. 21).
1078 """
1079 plain_list=[]
1080 if options_buffer.peek_nonwhitespace():
1081 try:
1082 while True:
1083 identifier=options_buffer.next_word()
1084 if not identifier:
1085 break
1086 start=self._resolve(identifier,set_type=set_type)
1087 if options_buffer.peek_nonwhitespace()=='-':
1088 end=start
1089 step=1
1090
1091 hyphen=options_buffer.next_nonwhitespace()
1092 end=self._resolve(options_buffer.next_word(),set_type=set_type)
1093 if set_type==CHARSET:
1094 if options_buffer.peek_nonwhitespace()=='\\':
1095 backslash=options_buffer.next_nonwhitespace()
1096 step=int(options_buffer.next_word())
1097 plain_list.extend(range(start,end+1,step))
1098 else:
1099 if type(start)==list or type(end)==list:
1100 raise NexusError('Name if character sets not allowed in range definition: %s'\
1101 % identifier)
1102 start=self.taxlabels.index(start)
1103 end=self.taxlabels.index(end)
1104 taxrange=self.taxlabels[start:end+1]
1105 plain_list.extend(taxrange)
1106 else:
1107 if type(start)==list:
1108 plain_list.extend(start)
1109 else:
1110 plain_list.append(start)
1111 except NexusError:
1112 raise
1113 except:
1114 return None
1115 return plain_list
1116
1117 - def _resolve(self,identifier,set_type=None):
1118 """Translate identifier in list into character/taxon index.
1119
1120 Characters (which are referred to by their index in Nexus.py):
1121 Plain numbers are returned minus 1 (Nexus indices to python indices)
1122 Text identifiers are translaterd into their indices (if plain character indentifiers),
1123 the first hit in charlabels is returned (charlabels don't need to be unique)
1124 or the range of indices is returned (if names of character sets).
1125 Taxa (which are referred to by their unique name in Nexus.py):
1126 Plain numbers are translated in their taxon name, underscores and spaces are considered equal.
1127 Names are returned unchanged (if plain taxon identifiers), or the names in
1128 the corresponding taxon set is returned
1129 """
1130 identifier=quotestrip(identifier)
1131 if not set_type:
1132 raise NexusError('INTERNAL ERROR: Need type to resolve identifier.')
1133 if set_type==CHARSET:
1134 try:
1135 n=int(identifier)
1136 except ValueError:
1137 if self.charlabels and identifier in self.charlabels.itervalues():
1138 for k in self.charlabels:
1139 if self.charlabels[k]==identifier:
1140 return k
1141 elif self.charsets and identifier in self.charsets:
1142 return self.charsets[identifier]
1143 else:
1144 raise NexusError('Unknown character identifier: %s' \
1145 % identifier)
1146 else:
1147 if n<=self.nchar:
1148 return n-1
1149 else:
1150 raise NexusError('Illegal character identifier: %d>nchar (=%d).' \
1151 % (identifier,self.nchar))
1152 elif set_type==TAXSET:
1153 try:
1154 n=int(identifier)
1155 except ValueError:
1156 taxlabels_id=self._check_taxlabels(identifier)
1157 if taxlabels_id:
1158 return taxlabels_id
1159 elif self.taxsets and identifier in self.taxsets:
1160 return self.taxsets[identifier]
1161 else:
1162 raise NexusError('Unknown taxon identifier: %s' \
1163 % identifier)
1164 else:
1165 if n>0 and n<=self.ntax:
1166 return self.taxlabels[n-1]
1167 else:
1168 raise NexusError('Illegal taxon identifier: %d>ntax (=%d).' \
1169 % (identifier,self.ntax))
1170 else:
1171 raise NexusError('Unknown set specification: %s.'% set_type)
1172
1176
1180
1184
1188
1189 - def write_nexus_data_partitions(self, matrix=None, filename=None, blocksize=None, interleave=False,
1190 exclude=[], delete=[], charpartition=None, comment='',mrbayes=False):
1191 """Writes a nexus file for each partition in charpartition.
1192
1193 Only non-excluded characters and non-deleted taxa are included,
1194 just the data block is written.
1195 """
1196
1197 if not matrix:
1198 matrix=self.matrix
1199 if not matrix:
1200 return
1201 if not filename:
1202 filename=self.filename
1203 if charpartition:
1204 pfilenames={}
1205 for p in charpartition:
1206 total_exclude=[]+exclude
1207 total_exclude.extend([c for c in range(self.nchar) if c not in charpartition[p]])
1208 total_exclude=_make_unique(total_exclude)
1209 pcomment=comment+'\nPartition: '+p+'\n'
1210 dot=filename.rfind('.')
1211 if dot>0:
1212 pfilename=filename[:dot]+'_'+p+'.data'
1213 else:
1214 pfilename=filename+'_'+p
1215 pfilenames[p]=pfilename
1216 self.write_nexus_data(filename=pfilename,matrix=matrix,blocksize=blocksize,
1217 interleave=interleave,exclude=total_exclude,delete=delete,comment=pcomment,append_sets=False,
1218 mrbayes=mrbayes)
1219 return pfilenames
1220 else:
1221 fn=self.filename+'.data'
1222 self.write_nexus_data(filename=fn,matrix=matrix,blocksize=blocksize,interleave=interleave,
1223 exclude=exclude,delete=delete,comment=comment,append_sets=False,
1224 mrbayes=mrbayes)
1225 return fn
1226
1227 - def write_nexus_data(self, filename=None, matrix=None, exclude=[], delete=[],\
1228 blocksize=None, interleave=False, interleave_by_partition=False,\
1229 comment=None,omit_NEXUS=False,append_sets=True,mrbayes=False,\
1230 codons_block=True):
1231 """Writes a nexus file with data and sets block to a file or handle.
1232
1233 Character sets and partitions are appended by default, and are
1234 adjusted according to excluded characters (i.e. character sets
1235 still point to the same sites (not necessarily same positions),
1236 without including the deleted characters.
1237
1238 filename - Either a filename as a string (which will be opened,
1239 written to and closed), or a handle object (which will
1240 be written to but NOT closed).
1241 interleave_by_partition - Optional name of partition (string)
1242 omit_NEXUS - Boolean. If true, the '#NEXUS' line normally at the
1243 start of the file is ommited.
1244
1245 Returns the filename/handle used to write the data.
1246 """
1247 if not matrix:
1248 matrix=self.matrix
1249 if not matrix:
1250 return
1251 if not filename:
1252 filename=self.filename
1253 if [t for t in delete if not self._check_taxlabels(t)]:
1254 raise NexusError('Unknown taxa: %s' \
1255 % ', '.join(set(delete).difference(set(self.taxlabels))))
1256 if interleave_by_partition:
1257 if not interleave_by_partition in self.charpartitions:
1258 raise NexusError('Unknown partition: %r' % interleave_by_partition)
1259 else:
1260 partition=self.charpartitions[interleave_by_partition]
1261
1262 names=_sort_keys_by_values(partition)
1263 newpartition={}
1264 for p in partition:
1265 newpartition[p]=[c for c in partition[p] if c not in exclude]
1266
1267 undelete=[taxon for taxon in self.taxlabels if taxon in matrix and taxon not in delete]
1268 cropped_matrix=_seqmatrix2strmatrix(self.crop_matrix(matrix,exclude=exclude,delete=delete))
1269 ntax_adjusted=len(undelete)
1270 nchar_adjusted=len(cropped_matrix[undelete[0]])
1271 if not undelete or (undelete and undelete[0]==''):
1272 return
1273 if isinstance(filename,basestring):
1274 try:
1275 fh=open(filename,'w',encoding="utf-8")
1276 except IOError:
1277 raise NexusError('Could not open %s for writing.' % filename)
1278 elif hasattr(filename, 'write'):
1279
1280 fh=filename
1281 else:
1282 raise ValueError("Neither a filename nor a handle was supplied")
1283 if not omit_NEXUS:
1284 fh.write('#NEXUS\n')
1285 if comment:
1286 fh.write('['+comment+']\n')
1287 fh.write('begin data;\n')
1288 fh.write('\tdimensions ntax=%d nchar=%d;\n' % (ntax_adjusted, nchar_adjusted))
1289 fh.write('\tformat datatype='+self.datatype)
1290 if self.respectcase:
1291 fh.write(' respectcase')
1292 if self.missing:
1293 fh.write(' missing='+self.missing)
1294 if self.gap:
1295 fh.write(' gap='+self.gap)
1296 if self.matchchar:
1297 fh.write(' matchchar='+self.matchchar)
1298 if self.labels:
1299 fh.write(' labels='+self.labels)
1300 if self.equate:
1301 fh.write(' equate='+self.equate)
1302 if interleave or interleave_by_partition:
1303 fh.write(' interleave')
1304 fh.write(';\n')
1305
1306
1307 if self.charlabels:
1308 newcharlabels=self._adjust_charlabels(exclude=exclude)
1309 clkeys=sorted(newcharlabels)
1310 fh.write('charlabels '+', '.join(["%s %s" % (k+1,safename(newcharlabels[k])) for k in clkeys])+';\n')
1311 fh.write('matrix\n')
1312 if not blocksize:
1313 if interleave:
1314 blocksize=70
1315 else:
1316 blocksize=self.nchar
1317
1318 namelength=max([len(safename(t,mrbayes=mrbayes)) for t in undelete])
1319 if interleave_by_partition:
1320
1321 seek=0
1322 for p in names:
1323 fh.write('[%s: %s]\n' % (interleave_by_partition,p))
1324 if len(newpartition[p])>0:
1325 for taxon in undelete:
1326 fh.write(safename(taxon,mrbayes=mrbayes).ljust(namelength+1))
1327 fh.write(cropped_matrix[taxon][seek:seek+len(newpartition[p])]+'\n')
1328 fh.write('\n')
1329 else:
1330 fh.write('[empty]\n\n')
1331 seek+=len(newpartition[p])
1332 elif interleave:
1333 for seek in range(0,nchar_adjusted,blocksize):
1334 for taxon in undelete:
1335 fh.write(safename(taxon,mrbayes=mrbayes).ljust(namelength+1))
1336 fh.write(cropped_matrix[taxon][seek:seek+blocksize]+'\n')
1337 fh.write('\n')
1338 else:
1339 for taxon in undelete:
1340 if blocksize<nchar_adjusted:
1341 fh.write(safename(taxon,mrbayes=mrbayes)+'\n')
1342 else:
1343 fh.write(safename(taxon,mrbayes=mrbayes).ljust(namelength+1))
1344 for seek in range(0,nchar_adjusted,blocksize):
1345 fh.write(cropped_matrix[taxon][seek:seek+blocksize]+'\n')
1346 fh.write(';\nend;\n')
1347 if append_sets:
1348 if codons_block:
1349 fh.write(self.append_sets(exclude=exclude,delete=delete,mrbayes=mrbayes,include_codons=False))
1350 fh.write(self.append_sets(exclude=exclude,delete=delete,mrbayes=mrbayes,codons_only=True))
1351 else:
1352 fh.write(self.append_sets(exclude=exclude,delete=delete,mrbayes=mrbayes))
1353
1354 if fh == filename:
1355
1356 return filename
1357 else:
1358
1359 fh.close()
1360 return filename
1361
1362 - def append_sets(self,exclude=[],delete=[],mrbayes=False,include_codons=True,codons_only=False):
1363 """Returns a sets block."""
1364 if not self.charsets and not self.taxsets and not self.charpartitions:
1365 return ''
1366 if codons_only:
1367 setsb=['\nbegin codons']
1368 else:
1369 setsb=['\nbegin sets']
1370
1371
1372
1373
1374 offset=0
1375 offlist=[]
1376 for c in range(self.nchar):
1377 if c in exclude:
1378 offset+=1
1379 offlist.append(-1)
1380 else:
1381 offlist.append(c-offset)
1382
1383 if not codons_only:
1384 for n,ns in self.charsets.iteritems():
1385 cset=[offlist[c] for c in ns if c not in exclude]
1386 if cset:
1387 setsb.append('charset %s = %s' % (safename(n),_compact4nexus(cset)))
1388 for n,s in self.taxsets.iteritems():
1389 tset=[safename(t,mrbayes=mrbayes) for t in s if t not in delete]
1390 if tset:
1391 setsb.append('taxset %s = %s' % (safename(n),' '.join(tset)))
1392 for n,p in self.charpartitions.iteritems():
1393 if not include_codons and n==CODONPOSITIONS:
1394 continue
1395 elif codons_only and n!=CODONPOSITIONS:
1396 continue
1397
1398
1399
1400 names=_sort_keys_by_values(p)
1401 newpartition={}
1402 for sn in names:
1403 nsp=[offlist[c] for c in p[sn] if c not in exclude]
1404 if nsp:
1405 newpartition[sn]=nsp
1406 if newpartition:
1407 if include_codons and n==CODONPOSITIONS:
1408 command='codonposset'
1409 else:
1410 command='charpartition'
1411 setsb.append('%s %s = %s' % (command,safename(n),\
1412 ', '.join(['%s: %s' % (sn,_compact4nexus(newpartition[sn])) for sn in names if sn in newpartition])))
1413
1414 for n,p in self.taxpartitions.iteritems():
1415 names=_sort_keys_by_values(p)
1416 newpartition={}
1417 for sn in names:
1418 nsp=[t for t in p[sn] if t not in delete]
1419 if nsp:
1420 newpartition[sn]=nsp
1421 if newpartition:
1422 setsb.append('taxpartition %s = %s' % (safename(n),\
1423 ', '.join(['%s: %s' % (safename(sn),' '.join(map(safename,newpartition[sn]))) for sn in names if sn in newpartition])))
1424
1425 setsb.append('end;\n')
1426 if len(setsb)==2:
1427 return ''
1428 else:
1429 return ';\n'.join(setsb)
1430
1432 """Writes matrix into a fasta file: (self, filename=None, width=70)."""
1433 if not filename:
1434 if '.' in self.filename and self.filename.split('.')[-1].lower() in ['paup','nexus','nex','dat']:
1435 filename='.'.join(self.filename.split('.')[:-1])+'.fas'
1436 else:
1437 filename=self.filename+'.fas'
1438 fh=open(filename,'w')
1439 for taxon in self.taxlabels:
1440 fh.write('>'+safename(taxon)+'\n')
1441 for i in range(0, len(self.matrix[taxon].tostring()), width):
1442 fh.write(self.matrix[taxon].tostring()[i:i+width] + '\n')
1443 fh.close()
1444 return filename
1445
1447 """Writes matrix into a PHYLIP file: (self, filename=None).
1448
1449 Note that this writes a relaxed PHYLIP format file, where the names
1450 are not truncated, nor checked for invalid characters."""
1451 if not filename:
1452 if '.' in self.filename and self.filename.split('.')[-1].lower() in ['paup','nexus','nex','dat']:
1453 filename='.'.join(self.filename.split('.')[:-1])+'.phy'
1454 else:
1455 filename=self.filename+'.phy'
1456 fh=open(filename,'w')
1457 fh.write('%d %d\n' % (self.ntax,self.nchar))
1458 for taxon in self.taxlabels:
1459 fh.write('%s %s\n' % (safename(taxon),self.matrix[taxon].tostring()))
1460 fh.close()
1461 return filename
1462
1463 - def constant(self,matrix=None,delete=[],exclude=[]):
1464 """Return a list with all constant characters."""
1465 if not matrix:
1466 matrix=self.matrix
1467 undelete=[t for t in self.taxlabels if t in matrix and t not in delete]
1468 if not undelete:
1469 return None
1470 elif len(undelete)==1:
1471 return [x for x in range(len(matrix[undelete[0]])) if x not in exclude]
1472
1473 constant=[(x,self.ambiguous_values.get(n.upper(),n.upper())) for
1474 x,n in enumerate(matrix[undelete[0]].tostring()) if x not in exclude]
1475 for taxon in undelete[1:]:
1476 newconstant=[]
1477 for site in constant:
1478
1479 seqsite=matrix[taxon][site[0]].upper()
1480
1481 if seqsite==self.missing or (seqsite==self.gap and self.options['gapmode'].lower()=='missing') or seqsite==site[1]:
1482
1483 newconstant.append(site)
1484 elif seqsite in site[1] or site[1]==self.missing or (self.options['gapmode'].lower()=='missing' and site[1]==self.gap):
1485
1486 newconstant.append((site[0],self.ambiguous_values.get(seqsite,seqsite)))
1487 elif seqsite in self.ambiguous_values:
1488 intersect = set(self.ambiguous_values[seqsite]).intersection(set(site[1]))
1489 if intersect:
1490 newconstant.append((site[0],''.join(intersect)))
1491
1492
1493
1494
1495
1496 constant=newconstant
1497 cpos=[s[0] for s in constant]
1498 return cpos
1499
1500 - def cstatus(self,site,delete=[],narrow=True):
1501 """Summarize character.
1502
1503 narrow=True: paup-mode (a c ? --> ac; ? ? ? --> ?)
1504 narrow=false: (a c ? --> a c g t -; ? ? ? --> a c g t -)
1505 """
1506 undelete=[t for t in self.taxlabels if t not in delete]
1507 if not undelete:
1508 return None
1509 cstatus=[]
1510 for t in undelete:
1511 c=self.matrix[t][site].upper()
1512 if self.options.get('gapmode')=='missing' and c==self.gap:
1513 c=self.missing
1514 if narrow and c==self.missing:
1515 if c not in cstatus:
1516 cstatus.append(c)
1517 else:
1518 cstatus.extend([b for b in self.ambiguous_values[c] if b not in cstatus])
1519 if self.missing in cstatus and narrow and len(cstatus)>1:
1520 cstatus=[c for c in cstatus if c!=self.missing]
1521 cstatus.sort()
1522 return cstatus
1523
1537
1538 - def crop_matrix(self,matrix=None, delete=[], exclude=[]):
1539 """Return a matrix without deleted taxa and excluded characters."""
1540 if not matrix:
1541 matrix=self.matrix
1542 if [t for t in delete if not self._check_taxlabels(t)]:
1543 raise NexusError('Unknown taxa: %s' \
1544 % ', '.join(set(delete).difference(self.taxlabels)))
1545 if exclude!=[]:
1546 undelete=[t for t in self.taxlabels if t in matrix and t not in delete]
1547 if not undelete:
1548 return {}
1549 m=[matrix[k].tostring() for k in undelete]
1550 zipped_m=zip(*m)
1551 sitesm=[s for i,s in enumerate(zipped_m) if i not in exclude]
1552 if sitesm==[]:
1553 return dict([(t,Seq('',self.alphabet)) for t in undelete])
1554 else:
1555 zipped_sitesm=zip(*sitesm)
1556 m=[Seq(s,self.alphabet) for s in map(''.join,zipped_sitesm)]
1557 return dict(zip(undelete,m))
1558 else:
1559 return dict([(t,matrix[t]) for t in self.taxlabels if t in matrix and t not in delete])
1560
1561 - def bootstrap(self,matrix=None,delete=[],exclude=[]):
1562 """Return a bootstrapped matrix."""
1563 if not matrix:
1564 matrix=self.matrix
1565 seqobjects=isinstance(matrix[matrix.keys()[0]],Seq)
1566 cm=self.crop_matrix(delete=delete,exclude=exclude)
1567 if not cm:
1568 return {}
1569 elif len(cm[cm.keys()[0]])==0:
1570 return cm
1571 undelete=[t for t in self.taxlabels if t in cm]
1572 if seqobjects:
1573 sitesm=zip(*[cm[t].tostring() for t in undelete])
1574 alphabet=matrix[matrix.keys()[0]].alphabet
1575 else:
1576 sitesm=zip(*[cm[t] for t in undelete])
1577 bootstrapsitesm=[sitesm[random.randint(0,len(sitesm)-1)] for i in range(len(sitesm))]
1578 bootstrapseqs=map(''.join,zip(*bootstrapsitesm))
1579 if seqobjects:
1580 bootstrapseqs=[Seq(s,alphabet) for s in bootstrapseqs]
1581 return dict(zip(undelete,bootstrapseqs))
1582
1584 """Adds a sequence (string) to the matrix."""
1585
1586 if not name:
1587 raise NexusError('New sequence must have a name')
1588
1589 diff=self.nchar-len(sequence)
1590 if diff<0:
1591 self.insert_gap(self.nchar,-diff)
1592 elif diff>0:
1593 sequence+=self.missing*diff
1594
1595 if name in self.taxlabels:
1596 unique_name=_unique_label(self.taxlabels,name)
1597
1598 else:
1599 unique_name=name
1600
1601 assert unique_name not in self.matrix, "ERROR. There is a discrepancy between taxlabels and matrix keys. Report this as a bug."
1602
1603 self.matrix[unique_name]=Seq(sequence,self.alphabet)
1604 self.ntax+=1
1605 self.taxlabels.append(unique_name)
1606 self.unaltered_taxlabels.append(name)
1607
1609 """Add a gap into the matrix and adjust charsets and partitions.
1610
1611 pos=0: first position
1612 pos=nchar: last position
1613 """
1614
1615 def _adjust(set,x,d,leftgreedy=False):
1616 """Adjusts chartacter sets if gaps are inserted, taking care of
1617 new gaps within a coherent character set."""
1618
1619
1620
1621 set.sort()
1622 addpos=0
1623 for i,c in enumerate(set):
1624 if c>=x:
1625 set[i]=c+d
1626
1627 if c==x:
1628 if leftgreedy or (i>0 and set[i-1]==c-1):
1629 addpos=i
1630 if addpos>0:
1631 set[addpos:addpos]=range(x,x+d)
1632 return set
1633
1634 if pos<0 or pos>self.nchar:
1635 raise NexusError('Illegal gap position: %d' % pos)
1636 if n==0:
1637 return
1638 if self.taxlabels:
1639
1640 sitesm=zip(*[self.matrix[t].tostring() for t in self.taxlabels])
1641 else:
1642 sitesm=[]
1643 sitesm[pos:pos]=[['-']*len(self.taxlabels)]*n
1644
1645
1646 zipped=zip(*sitesm)
1647 mapped=map(''.join,zipped)
1648 listed=[(taxon,Seq(mapped[i],self.alphabet)) for i,taxon in enumerate(self.taxlabels)]
1649 self.matrix=dict(listed)
1650 self.nchar+=n
1651
1652 for i,s in self.charsets.iteritems():
1653 self.charsets[i]=_adjust(s,pos,n,leftgreedy=leftgreedy)
1654 for p in self.charpartitions:
1655 for sp,s in self.charpartitions[p].iteritems():
1656 self.charpartitions[p][sp]=_adjust(s,pos,n,leftgreedy=leftgreedy)
1657
1658 self.charlabels=self._adjust_charlabels(insert=[pos]*n)
1659 return self.charlabels
1660
1662 """Return adjusted indices of self.charlabels if characters are excluded or inserted."""
1663 if exclude and insert:
1664 raise NexusError('Can\'t exclude and insert at the same time')
1665 if not self.charlabels:
1666 return None
1667 labels=sorted(self.charlabels)
1668 newcharlabels={}
1669 if exclude:
1670 exclude.sort()
1671 exclude.append(sys.maxint)
1672 excount=0
1673 for c in labels:
1674 if not c in exclude:
1675 while c>exclude[excount]:
1676 excount+=1
1677 newcharlabels[c-excount]=self.charlabels[c]
1678 elif insert:
1679 insert.sort()
1680 insert.append(sys.maxint)
1681 icount=0
1682 for c in labels:
1683 while c>=insert[icount]:
1684 icount+=1
1685 newcharlabels[c+icount]=self.charlabels[c]
1686 else:
1687 return self.charlabels
1688 return newcharlabels
1689
1691 """Returns all character indices that are not in charlist."""
1692 return [c for c in range(self.nchar) if c not in charlist]
1693
1694 - def gaponly(self,include_missing=False):
1695 """Return gap-only sites."""
1696 gap=set(self.gap)
1697 if include_missing:
1698 gap.add(self.missing)
1699 sitesm=zip(*[self.matrix[t].tostring() for t in self.taxlabels])
1700 gaponly=[i for i,site in enumerate(sitesm) if set(site).issubset(gap)]
1701 return gaponly
1702
1724
1725
1726 try:
1727 import cnexus
1728 except ImportError:
1733 else:
1735 decommented=cnexus.scanfile(file_contents)
1736
1737 if decommented=='[' or decommented==']':
1738 raise NexusError('Unmatched %s' % decommented)
1739
1740
1741
1742 commandlines=_adjust_lines(decommented.split(chr(7)))
1743 return commandlines
1744