1
2
3
4
5
6
7 """Tree class to handle phylogenetic trees.
8
9 Provides a set of methods to read and write newick-format tree descriptions,
10 get information about trees (monphyly of taxon sets, congruence between trees,
11 common ancestors,...) and to manipulate trees (reroot trees, split terminal
12 nodes).
13 """
14
15 import sys, random, copy
16 import Nodes
17
18 PRECISION_BRANCHLENGTH=6
19 PRECISION_SUPPORT=6
20 NODECOMMENT_START='[&'
21 NODECOMMENT_END=']'
22
24
26 """Stores tree-relevant data associated with nodes (e.g. branches or otus)."""
27 - def __init__(self,taxon=None,branchlength=0.0,support=None,comment=None):
28 self.taxon=taxon
29 self.branchlength=branchlength
30 self.support=support
31 self.comment=comment
32
33 -class Tree(Nodes.Chain):
34 """Represents a tree using a chain of nodes with on predecessor (=ancestor)
35 and multiple successors (=subclades).
36 """
37
38
39
40
41
42
43
44
45
46 - def __init__(self,tree=None,weight=1.0,rooted=False,name='',data=NodeData,values_are_support=False,max_support=1.0):
65
67 """Parses (a,b,c...)[[[xx]:]yy] into subcomponents and travels down recursively."""
68
69
70 tree = tree.strip()
71 if tree.count('(')!=tree.count(')'):
72 raise TreeError('Parentheses do not match in (sub)tree: '+tree)
73 if tree.count('(')==0:
74
75 nodecomment=tree.find(NODECOMMENT_START)
76 colon=tree.find(':')
77 if colon==-1 and nodecomment==-1:
78 return [tree,[None]]
79 elif colon==-1 and nodecomment>-1:
80 return [tree[:nodecomment],self._get_values(tree[nodecomment:])]
81 elif colon>-1 and nodecomment==-1:
82 return [tree[:colon],self._get_values(tree[colon+1:])]
83 elif colon < nodecomment:
84 return [tree[:colon],self._get_values(tree[colon+1:])]
85 else:
86 return [tree[:nodecomment],self._get_values(tree[nodecomment:])]
87 else:
88 closing=tree.rfind(')')
89 val=self._get_values(tree[closing+1:])
90 if not val:
91 val=[None]
92 subtrees=[]
93 plevel=0
94 prev=1
95 for p in range(1,closing):
96 if tree[p]=='(':
97 plevel+=1
98 elif tree[p]==')':
99 plevel-=1
100 elif tree[p]==',' and plevel==0:
101 subtrees.append(tree[prev:p])
102 prev=p+1
103 subtrees.append(tree[prev:closing])
104 subclades=[self._parse(subtree) for subtree in subtrees]
105 return [subclades,val]
106
108 """Adds leaf or tree (in newick format) to a parent_id. (self,parent_id,tree)."""
109 if parent_id is None:
110 raise TreeError('Need node_id to connect to.')
111 for st in tree:
112 nd=self.dataclass()
113 nd = self._add_nodedata(nd, st)
114 if type(st[0])==list:
115 sn=Nodes.Node(nd)
116 self.add(sn,parent_id)
117 self._add_subtree(sn.id,st[0])
118 else:
119 nd.taxon=st[0]
120 leaf=Nodes.Node(nd)
121 self.add(leaf,parent_id)
122
124 """Add data to the node parsed from the comments, taxon and support.
125 """
126 if isinstance(st[1][-1],str) and st[1][-1].startswith(NODECOMMENT_START):
127 nd.comment=st[1].pop(-1)
128
129 elif isinstance(st[1][0], str):
130 nd.taxon = st[1][0]
131 st[1] = st[1][1:]
132 if len(st)>1:
133 if len(st[1])>=2:
134 nd.support=st[1][0]
135 if st[1][1] is not None:
136 nd.branchlength=st[1][1]
137 elif len(st[1])==1:
138 if not self.__values_are_support:
139 if st[1][0] is not None:
140 nd.branchlength=st[1][0]
141 else:
142 nd.support=st[1][0]
143 return nd
144
175
176 - def _walk(self,node=None):
177 """Return all node_ids downwards from a node."""
178
179 if node is None:
180 node=self.root
181 for n in self.node(node).succ:
182 yield n
183 for sn in self._walk(n):
184 yield sn
185
186 - def node(self,node_id):
187 """Return the instance of node_id.
188
189 node = node(self,node_id)
190 """
191 if node_id not in self.chain:
192 raise TreeError('Unknown node_id: %d' % node_id)
193 return self.chain[node_id]
194
195 - def split(self,parent_id=None,n=2,branchlength=1.0):
196 """Speciation: generates n (default two) descendants of a node.
197
198 [new ids] = split(self,parent_id=None,n=2,branchlength=1.0):
199 """
200 if parent_id is None:
201 raise TreeError('Missing node_id.')
202 ids=[]
203 parent_data=self.chain[parent_id].data
204 for i in range(n):
205 node=Nodes.Node()
206 if parent_data:
207 node.data=self.dataclass()
208
209 if parent_data.taxon:
210 node.data.taxon=parent_data.taxon+str(i)
211 node.data.branchlength=branchlength
212 ids.append(self.add(node,parent_id))
213 return ids
214
216 """Returns the first matching taxon in self.data.taxon. Not restricted to terminal nodes.
217
218 node_id = search_taxon(self,taxon)
219 """
220 for id,node in self.chain.iteritems():
221 if node.data.taxon==taxon:
222 return id
223 return None
224
226 """Prunes a terminal taxon from the tree.
227
228 id_of_previous_node = prune(self,taxon)
229 If taxon is from a bifurcation, the connectiong node will be collapsed
230 and its branchlength added to remaining terminal node. This might be no
231 longer a meaningful value'
232 """
233
234 id=self.search_taxon(taxon)
235 if id is None:
236 raise TreeError('Taxon not found: %s' % taxon)
237 elif id not in self.get_terminals():
238 raise TreeError('Not a terminal taxon: %s' % taxon)
239 else:
240 prev=self.unlink(id)
241 self.kill(id)
242 if len(self.node(prev).succ)==1:
243 if prev==self.root:
244 self.root=self.node(self.root).succ[0]
245 self.node(self.root).branchlength=0.0
246 self.kill(prev)
247 else:
248 succ=self.node(prev).succ[0]
249 new_bl=self.node(prev).data.branchlength+self.node(succ).data.branchlength
250 self.collapse(prev)
251 self.node(succ).data.branchlength=new_bl
252 return prev
253
255 """Return a list of all otus downwards from a node (self, node_id).
256
257 nodes = get_taxa(self,node_id=None)
258 """
259
260 if node_id is None:
261 node_id=self.root
262 if node_id not in self.chain:
263 raise TreeError('Unknown node_id: %d.' % node_id)
264 if self.chain[node_id].succ==[]:
265 if self.chain[node_id].data:
266 return [self.chain[node_id].data.taxon]
267 else:
268 return None
269 else:
270 list=[]
271 for succ in self.chain[node_id].succ:
272 list.extend(self.get_taxa(succ))
273 return list
274
276 """Return a list of all terminal nodes."""
277 return [i for i in self.all_ids() if self.node(i).succ==[]]
278
280 """Returns True if node is a terminal node."""
281 return self.node(node).succ==[]
282
284 """Returns True if node is an internal node."""
285 return len(self.node(node).succ)>0
286
288 """Returns True if all successors of a node are terminal ones."""
289 if self.is_terminal(node):
290 return False not in [self.is_terminal(n) for n in self.node(node).succ]
291 else:
292 return False
294 """Counts the number of terminal nodes that are attached to a node."""
295 if node is None:
296 node=self.root
297 return len([n for n in self._walk(node) if self.is_terminal(n)])
298
300 """Collapses all subtrees which belong to the same genus (i.e share the same first word in their taxon name."""
301
302 while True:
303 for n in self._walk():
304 if self.is_terminal(n):
305 continue
306 taxa=self.get_taxa(n)
307 genera=[]
308 for t in taxa:
309 if space_equals_underscore:
310 t=t.replace(' ','_')
311 try:
312 genus=t.split('_',1)[0]
313 except:
314 genus='None'
315 if genus not in genera:
316 genera.append(genus)
317 if len(genera)==1:
318 self.node(n).data.taxon=genera[0]+' <collapsed>'
319
320 nodes2kill=[kn for kn in self._walk(node=n)]
321 for kn in nodes2kill:
322 self.kill(kn)
323 self.node(n).succ=[]
324 break
325 else:
326 break
327
328
330 """Adds up the branchlengths from root (default self.root) to node.
331
332 sum = sum_branchlength(self,root=None,node=None)
333 """
334
335 if root is None:
336 root=self.root
337 if node is None:
338 raise TreeError('Missing node id.')
339 blen=0.0
340 while node is not None and node is not root:
341 blen+=self.node(node).data.branchlength
342 node=self.node(node).prev
343 return blen
344
346 """Return subtree as a set of nested sets.
347
348 sets = set_subtree(self,node)
349 """
350
351 if self.node(node).succ==[]:
352 return self.node(node).data.taxon
353 else:
354 try:
355 return frozenset([self.set_subtree(n) for n in self.node(node).succ])
356 except:
357 print node
358 print self.node(node).succ
359 for n in self.node(node).succ:
360 print n, self.set_subtree(n)
361 print [self.set_subtree(n) for n in self.node(node).succ]
362 raise
363
365 """Compare tree and tree2 for identity.
366
367 result = is_identical(self,tree2)
368 """
369 return self.set_subtree(self.root)==tree2.set_subtree(tree2.root)
370
372 """Compares branches with support>threshold for compatibility.
373
374 result = is_compatible(self,tree2,threshold)
375 """
376
377
378 missing2=set(self.get_taxa())-set(tree2.get_taxa())
379 missing1=set(tree2.get_taxa())-set(self.get_taxa())
380 if strict and (missing1 or missing2):
381 if missing1:
382 print 'Taxon/taxa %s is/are missing in tree %s' % (','.join(missing1) , self.name)
383 if missing2:
384 print 'Taxon/taxa %s is/are missing in tree %s' % (','.join(missing2) , tree2.name)
385 raise TreeError('Can\'t compare trees with different taxon compositions.')
386 t1=[(set(self.get_taxa(n)),self.node(n).data.support) for n in self.all_ids() if \
387 self.node(n).succ and\
388 (self.node(n).data and self.node(n).data.support and self.node(n).data.support>=threshold)]
389 t2=[(set(tree2.get_taxa(n)),tree2.node(n).data.support) for n in tree2.all_ids() if \
390 tree2.node(n).succ and\
391 (tree2.node(n).data and tree2.node(n).data.support and tree2.node(n).data.support>=threshold)]
392 conflict=[]
393 for (st1,sup1) in t1:
394 for (st2,sup2) in t2:
395 if not st1.issubset(st2) and not st2.issubset(st1):
396 intersect,notin1,notin2=st1 & st2, st2-st1, st1-st2
397
398 if intersect and not (notin1.issubset(missing1) or notin2.issubset(missing2)):
399 conflict.append((st1,sup1,st2,sup2,intersect,notin1,notin2))
400 return conflict
401
403 """Return the common ancestor that connects two nodes.
404
405 node_id = common_ancestor(self,node1,node2)
406 """
407
408 l1=[self.root]+self.trace(self.root,node1)
409 l2=[self.root]+self.trace(self.root,node2)
410 return [n for n in l1 if n in l2][-1]
411
412
414 """Add and return the sum of the branchlengths between two nodes.
415 dist = distance(self,node1,node2)
416 """
417
418 ca=self.common_ancestor(node1,node2)
419 return self.sum_branchlength(ca,node1)+self.sum_branchlength(ca,node2)
420
422 """Return node_id of common ancestor if taxon_list is monophyletic, -1 otherwise.
423
424 result = is_monophyletic(self,taxon_list)
425 """
426 if isinstance(taxon_list,str):
427 taxon_set=set([taxon_list])
428 else:
429 taxon_set=set(taxon_list)
430 node_id=self.root
431 while 1:
432 subclade_taxa=set(self.get_taxa(node_id))
433 if subclade_taxa==taxon_set:
434 return node_id
435 else:
436 for subnode in self.chain[node_id].succ:
437 if set(self.get_taxa(subnode)).issuperset(taxon_set):
438 node_id=subnode
439 break
440 else:
441 return -1
442
457
459 """Move values stored in data.branchlength to data.support, and set branchlength to 0.0
460
461 This is necessary when support has been stored as branchlength (e.g. paup), and has thus
462 been read in as branchlength.
463 """
464
465 for n in self.chain:
466 self.node(n).data.support=self.node(n).data.branchlength
467 self.node(n).data.branchlength=0.0
468
470 """Convert absolute support (clade-count) to rel. frequencies.
471
472 Some software (e.g. PHYLIP consense) just calculate how often clades appear, instead of
473 calculating relative frequencies."""
474
475 for n in self._walk():
476 if self.node(n).data.support:
477 self.node(n).data.support/=float(nrep)
478
480 """Returns True if any of the nodes has data.support != None."""
481 for n in self._walk(node):
482 if self.node(n).data.support:
483 return True
484 else:
485 return False
486
487 - def randomize(self,ntax=None,taxon_list=None,branchlength=1.0,branchlength_sd=None,bifurcate=True):
488 """Generates a random tree with ntax taxa and/or taxa from taxlabels.
489
490 new_tree = randomize(self,ntax=None,taxon_list=None,branchlength=1.0,branchlength_sd=None,bifurcate=True)
491 Trees are bifurcating by default. (Polytomies not yet supported).
492 """
493
494 if not ntax and taxon_list:
495 ntax=len(taxon_list)
496 elif not taxon_list and ntax:
497 taxon_list=['taxon'+str(i+1) for i in range(ntax)]
498 elif not ntax and not taxon_list:
499 raise TreeError('Either numer of taxa or list of taxa must be specified.')
500 elif ntax != len(taxon_list):
501 raise TreeError('Length of taxon list must correspond to ntax.')
502
503 self.__init__()
504 terminals=self.get_terminals()
505
506 while len(terminals)<ntax:
507 newsplit=random.choice(terminals)
508 new_terminals=self.split(parent_id=newsplit,branchlength=branchlength)
509
510 if branchlength_sd:
511 for nt in new_terminals:
512 bl=random.gauss(branchlength,branchlength_sd)
513 if bl<0:
514 bl=0
515 self.node(nt).data.branchlength=bl
516 terminals.extend(new_terminals)
517 terminals.remove(newsplit)
518
519 random.shuffle(taxon_list)
520 for (node,name) in zip(terminals,taxon_list):
521 self.node(node).data.taxon=name
522
524 """Quick and dirty lists of all nodes."""
525 table=[('#','taxon','prev','succ','brlen','blen (sum)','support','comment')]
526
527 for i in sorted(self.all_ids()):
528 n=self.node(i)
529 if not n.data:
530 table.append((str(i),'-',str(n.prev),str(n.succ),'-','-','-','-'))
531 else:
532 tx=n.data.taxon
533 if not tx:
534 tx='-'
535 blength="%0.2f" % n.data.branchlength
536 if blength is None:
537 blength='-'
538 sum_blength='-'
539 else:
540 sum_blength="%0.2f" % self.sum_branchlength(node=i)
541 support=n.data.support
542 if support is None:
543 support='-'
544 else:
545 support="%0.2f" % support
546 comment=n.data.comment
547 if comment is None:
548 comment='-'
549 table.append((str(i),tx,str(n.prev),str(n.succ),
550 blength, sum_blength, support, comment))
551 print '\n'.join(['%3s %32s %15s %15s %8s %10s %8s %20s' % l for l in table])
552 print '\nRoot: ',self.root
553
554 - def to_string(self,support_as_branchlengths=False,branchlengths_only=False,plain=True,plain_newick=False,ladderize=None,ignore_comments=True):
555 """Return a paup compatible tree line.
556
557 to_string(self,support_as_branchlengths=False,branchlengths_only=False,plain=True)
558 """
559
560 if support_as_branchlengths or branchlengths_only:
561 plain=False
562 self.support_as_branchlengths=support_as_branchlengths
563 self.branchlengths_only=branchlengths_only
564 self.ignore_comments=ignore_comments
565 self.plain=plain
566
567 def make_info_string(data,terminal=False):
568 """Creates nicely formatted support/branchlengths."""
569
570 if self.plain:
571 info_string= ''
572 elif self.support_as_branchlengths:
573 if terminal:
574 info_string= ':%1.2f' % self.max_support
575 elif data.support:
576 info_string= ':%1.2f' % (data.support)
577 else:
578 info_string=':0.00'
579 elif self.branchlengths_only:
580 info_string= ':%1.5f' % (data.branchlength)
581 else:
582 if terminal:
583 info_string= ':%1.5f' % (data.branchlength)
584 else:
585 if data.branchlength is not None and data.support is not None:
586 info_string= '%1.2f:%1.5f' % (data.support,data.branchlength)
587 elif data.branchlength is not None:
588 info_string= '0.00000:%1.5f' % (data.branchlength)
589 elif data.support is not None:
590 info_string= '%1.2f:0.00000' % (data.support)
591 else:
592 info_string= '0.00:0.00000'
593 if not ignore_comments and hasattr(data,'nodecomment'):
594 info_string=str(data.nodecomment)+info_string
595 return info_string
596
597 def ladderize_nodes(nodes,ladderize=None):
598 """Sorts node numbers according to the number of terminal nodes."""
599 if ladderize in ['left','LEFT','right','RIGHT']:
600 succnode_terminals=[(self.count_terminals(node=n),n) for n in nodes]
601 succnode_terminals.sort()
602 if (ladderize=='right' or ladderize=='RIGHT'):
603 succnode_terminals.reverse()
604 if succnode_terminals:
605 succnodes=zip(*succnode_terminals)[1]
606 else:
607 succnodes=[]
608 else:
609 succnodes=nodes
610 return succnodes
611
612 def newickize(node,ladderize=None):
613 """Convert a node tree to a newick tree recursively."""
614
615 if not self.node(node).succ:
616 return self.node(node).data.taxon+make_info_string(self.node(node).data,terminal=True)
617 else:
618 succnodes=ladderize_nodes(self.node(node).succ,ladderize=ladderize)
619 subtrees=[newickize(sn,ladderize=ladderize) for sn in succnodes]
620 return '(%s)%s' % (','.join(subtrees),make_info_string(self.node(node).data))
621
622 treeline=['tree']
623 if self.name:
624 treeline.append(self.name)
625 else:
626 treeline.append('a_tree')
627 treeline.append('=')
628 if self.weight != 1:
629 treeline.append('[&W%s]' % str(round(float(self.weight),3)))
630 if self.rooted:
631 treeline.append('[&R]')
632 succnodes=ladderize_nodes(self.node(self.root).succ)
633 subtrees=[newickize(sn,ladderize=ladderize) for sn in succnodes]
634 treeline.append('(%s)' % ','.join(subtrees))
635 if plain_newick:
636 return treeline[-1]
637 else:
638 return ' '.join(treeline)+';'
639
641 """Short version of to_string(), gives plain tree"""
642 return self.to_string(plain=True)
643
645 """Defines a unrooted Tree structure, using data of a rooted Tree."""
646
647
648
649 def _get_branches(node):
650 branches=[]
651 for b in self.node(node).succ:
652 branches.append([node,b,self.node(b).data.branchlength,self.node(b).data.support])
653 branches.extend(_get_branches(b))
654 return branches
655
656 self.unrooted=_get_branches(self.root)
657
658 if len(self.node(self.root).succ)==2:
659
660 rootbranches=[b for b in self.unrooted if self.root in b[:2]]
661 b1=self.unrooted.pop(self.unrooted.index(rootbranches[0]))
662 b2=self.unrooted.pop(self.unrooted.index(rootbranches[1]))
663
664
665 newbranch=[b1[1],b2[1],b1[2]+b2[2]]
666 if b1[3] is None:
667 newbranch.append(b2[3])
668 elif b2[3] is None:
669 newbranch.append(b1[3])
670 elif b1[3]==b2[3]:
671 newbranch.append(b1[3])
672 elif b1[3]==0 or b2[3]==0:
673 newbranch.append(b1[3]+b2[3])
674 else:
675 raise TreeError('Support mismatch in bifurcating root: %f, %f' \
676 % (float(b1[3]),float(b2[3])))
677 self.unrooted.append(newbranch)
678
680
681 def _connect_subtree(parent,child):
682 """Hook subtree starting with node child to parent."""
683 for i,branch in enumerate(self.unrooted):
684 if parent in branch[:2] and child in branch[:2]:
685 branch=self.unrooted.pop(i)
686 break
687 else:
688 raise TreeError('Unable to connect nodes for rooting: nodes %d and %d are not connected' \
689 % (parent,child))
690 self.link(parent,child)
691 self.node(child).data.branchlength=branch[2]
692 self.node(child).data.support=branch[3]
693
694 child_branches=[b for b in self.unrooted if child in b[:2]]
695 for b in child_branches:
696 if child==b[0]:
697 succ=b[1]
698 else:
699 succ=b[0]
700 _connect_subtree(child,succ)
701
702
703 if outgroup is None:
704 return self.root
705 outgroup_node=self.is_monophyletic(outgroup)
706 if outgroup_node==-1:
707 return -1
708
709
710 if (len(self.node(self.root).succ)==2 and outgroup_node in self.node(self.root).succ) or outgroup_node==self.root:
711 return self.root
712
713 self.unroot()
714
715
716 for i,b in enumerate(self.unrooted):
717 if outgroup_node in b[:2] and self.node(outgroup_node).prev in b[:2]:
718 root_branch=self.unrooted.pop(i)
719 break
720 else:
721 raise TreeError('Unrooted and rooted Tree do not match')
722 if outgroup_node==root_branch[1]:
723 ingroup_node=root_branch[0]
724 else:
725 ingroup_node=root_branch[1]
726
727 for n in self.all_ids():
728 self.node(n).prev=None
729 self.node(n).succ=[]
730
731 root=Nodes.Node(data=NodeData())
732 self.add(root)
733 self.root=root.id
734 self.unrooted.append([root.id,ingroup_node,root_branch[2],root_branch[3]])
735 self.unrooted.append([root.id,outgroup_node,0.0,0.0])
736 _connect_subtree(root.id,ingroup_node)
737 _connect_subtree(root.id,outgroup_node)
738
739 oldroot=[i for i in self.all_ids() if self.node(i).prev is None and i!=self.root]
740 if len(oldroot)>1:
741 raise TreeError('Isolated nodes in tree description: %s' \
742 % ','.join(oldroot))
743 elif len(oldroot)==1:
744 self.kill(oldroot[0])
745 return self.root
746
748 """Merges clade support (from consensus or list of bootstrap-trees) with phylogeny.
749
750 tree=merge_bootstrap(phylo,bs_tree=<list_of_trees>)
751 or
752 tree=merge_bootstrap(phylo,consree=consensus_tree with clade support)
753 """
754
755 if bstrees and constree:
756 raise TreeError('Specify either list of boostrap trees or consensus tree, not both')
757 if not (bstrees or constree):
758 raise TreeError('Specify either list of boostrap trees or consensus tree.')
759
760 if outgroup is None:
761 try:
762 succnodes=self.node(self.root).succ
763 smallest=min([(len(self.get_taxa(n)),n) for n in succnodes])
764 outgroup=self.get_taxa(smallest[1])
765 except:
766 raise TreeError("Error determining outgroup.")
767 else:
768 self.root_with_outgroup(outgroup)
769
770 if bstrees:
771 constree=consensus(bstrees,threshold=threshold,outgroup=outgroup)
772 else:
773 if not constree.has_support():
774 constree.branchlength2support()
775 constree.root_with_outgroup(outgroup)
776
777 for pnode in self._walk():
778 cnode=constree.is_monophyletic(self.get_taxa(pnode))
779 if cnode>-1:
780 self.node(pnode).data.support=constree.node(cnode).data.support
781
782
783 -def consensus(trees, threshold=0.5,outgroup=None):
858