Package Bio :: Package Phylo :: Module NewickIO
[hide private]
[frames] | no frames]

Source Code for Module Bio.Phylo.NewickIO

  1  # Copyright (C) 2009 by Eric Talevich (eric.talevich@gmail.com) 
  2  # Based on Bio.Nexus, copyright 2005-2008 by Frank Kauff & Cymon J. Cox. 
  3  # All rights reserved. 
  4  # This code is part of the Biopython distribution and governed by its 
  5  # license. Please see the LICENSE file that should have been included 
  6  # as part of this package. 
  7   
  8  """I/O function wrappers for the Newick file format. 
  9   
 10  See: http://evolution.genetics.washington.edu/phylip/newick_doc.html 
 11  """ 
 12  __docformat__ = "restructuredtext en" 
 13   
 14  import warnings 
 15   
 16  from cStringIO import StringIO 
 17   
 18  from Bio import BiopythonDeprecationWarning 
 19  from Bio.Phylo import Newick 
 20   
 21  # Definitions retrieved from Bio.Nexus.Trees 
 22  NODECOMMENT_START = '[&' 
 23  NODECOMMENT_END = ']' 
class NewickError(Exception):
    """Signal that building a Newick object from the input cannot continue.

    Raised by the parser when the tree text is malformed.
    """
29
# ---------------------------------------------------------
# Public API

def parse(handle, **kwargs):
    """Lazily read every tree from a Newick-formatted file handle.

    Any extra keyword arguments are forwarded unchanged to
    ``Parser.parse``.

    :returns: generator of Bio.Phylo.Newick.Tree objects.
    """
    newick_parser = Parser(handle)
    return newick_parser.parse(**kwargs)
40
def write(trees, handle, plain=False, **kwargs):
    """Serialize the given trees in Newick format to a file handle.

    ``plain`` and any extra keyword arguments are forwarded to
    ``Writer.write``.

    :returns: number of trees written.
    """
    newick_writer = Writer(trees)
    return newick_writer.write(handle, plain=plain, **kwargs)
47
# ---------------------------------------------------------
# Input

class Parser(object):
    """Parse a Newick tree given a file handle.

    Based on the parser in `Bio.Nexus.Trees`.
    """

    def __init__(self, handle):
        """Store the handle (any iterable of text lines) to read from."""
        self.handle = handle
        # Defaults mirror those of parse(); setting them here lets the
        # helper methods be called before parse() has been run.
        self.values_are_confidence = False
        self.rooted = False

    @classmethod
    def from_string(cls, treetext):
        """Create a parser that reads from the given Newick string."""
        handle = StringIO(treetext)
        return cls(handle)

    def parse(self, values_are_confidence=False, rooted=False,
              # XXX Deprecated kwarg -- remove after Biopython 1.58
              values_are_support=None):
        """Parse the text stream this object was initialized with.

        This is a generator: one tree is yielded for each ';'-terminated
        chunk of the stream (a tree may span several lines). Trailing text
        that lacks the final ';' is still parsed as a tree.

        :param values_are_confidence: if true, a lone numeric value on a
            node is stored as its confidence instead of its branch length.
        :param rooted: passed to each constructed tree as ``rooted``.
        :param values_are_support: deprecated alias for
            ``values_are_confidence``; emits a deprecation warning.
        """
        # XXX Handling the deprecated kwarg -- remove after Biopython 1.58
        if values_are_support is not None:
            warnings.warn("use the argument values_are_confidence instead",
                          BiopythonDeprecationWarning)
            values_are_confidence = values_are_support
        self.values_are_confidence = values_are_confidence
        self.rooted = rooted    # XXX this attribute is useless
        # Collect stripped lines in a list and join once per tree, rather
        # than growing a string with repeated concatenation.
        pieces = []
        for line in self.handle:
            stripped = line.rstrip()
            pieces.append(stripped)
            if stripped.endswith(';'):
                yield self._parse_tree(''.join(pieces), rooted)
                pieces = []
        remainder = ''.join(pieces)
        if remainder:
            # Last tree is missing a terminal ';' character -- that's OK
            yield self._parse_tree(remainder, rooted)

    def _parse_tree(self, text, rooted):
        """Parse the text representation of one tree into a Tree object.

        ``rooted`` is forwarded to the Tree constructor as given, so this
        method does not depend on parse() having been called first.
        """
        # XXX Pass **kwargs along from Parser.parse?
        return Newick.Tree(root=self._parse_subtree(text), rooted=rooted)

    def _parse_subtree(self, text):
        """Parse ``(a,b,c...)[[[xx]:]yy]`` into subcomponents, recursively.

        :returns: the clade for this (sub)tree, with child clades attached.
        :raises NewickError: if the parentheses in ``text`` do not balance.
        """
        text = text.strip().rstrip(';')
        if text.count('(') != text.count(')'):
            raise NewickError("Parentheses do not match in (sub)tree: " + text)
        # Text is now "(...)..." (balanced parens) or "..." (leaf node)
        if '(' not in text:
            # Leaf/terminal node -- recursion stops here
            return self._parse_tag(text)
        # Handle one layer of the nested subtree
        # XXX what if there's a paren in a comment or other string?
        close_posn = text.rfind(')')
        subtrees = []
        # Locate subtrees by counting nesting levels of parens; only commas
        # at nesting level 0 separate the direct children of this clade.
        plevel = 0
        prev = 1
        for posn in range(1, close_posn):
            char = text[posn]
            if char == '(':
                plevel += 1
            elif char == ')':
                plevel -= 1
            elif char == ',' and plevel == 0:
                subtrees.append(text[prev:posn])
                prev = posn + 1
        subtrees.append(text[prev:close_posn])
        # Construct a new clade from trailing text, then attach subclades
        clade = self._parse_tag(text[close_posn+1:])
        clade.clades = [self._parse_subtree(st) for st in subtrees]
        return clade

    def _parse_tag(self, text):
        """Extract the data for a node from text.

        The first ``[&...]`` comment (if any) is removed and stored as the
        clade's comment. The remaining text is split on ':'; numeric parts
        become confidence and/or branch length, and the one non-numeric
        part becomes the name.

        :returns: Clade instance containing any available data
        :raises NewickError: if a comment is not closed, if more than one
            non-numeric part (name) is present, or if there are more than
            two numeric values.
        """
        # Extract the comment
        comment_start = text.find(NODECOMMENT_START)
        if comment_start != -1:
            body_start = comment_start + len(NODECOMMENT_START)
            # Search for the terminator only after the opener, so that a
            # stray ']' earlier in the tag cannot be mistaken for it.
            comment_end = text.find(NODECOMMENT_END, body_start)
            if comment_end == -1:
                raise NewickError('Error in tree description: '
                                  'Found %s without matching %s'
                                  % (NODECOMMENT_START, NODECOMMENT_END))
            comment = text[body_start:comment_end]
            text = (text[:comment_start]
                    + text[comment_end+len(NODECOMMENT_END):])
        else:
            comment = None
        clade = Newick.Clade(comment=comment)
        # Extract name (taxon), and optionally support, branch length
        # Float values are support and branch length, the string is name/taxon
        values = []
        for part in (t.strip() for t in text.split(':')):
            if part:
                try:
                    values.append(float(part))
                except ValueError:
                    # Raise explicitly instead of asserting: assertions are
                    # removed under -O, which would silently overwrite the
                    # first name with the second.
                    if clade.name is not None:
                        raise NewickError(
                            "Two string taxonomies? "
                            "Tag has more than one name: " + text)
                    clade.name = part
        if len(values) == 1:
            # Real branch length, or support as branch length
            if self.values_are_confidence:
                clade.confidence = values[0]
            else:
                clade.branch_length = values[0]
        elif len(values) == 2:
            # Two non-taxon values: support comes first. (Is that always so?)
            clade.confidence, clade.branch_length = values
        elif len(values) > 2:
            raise NewickError("Too many colons in tag: " + text)
        return clade
162
# ---------------------------------------------------------
# Output

class Writer(object):
    """Based on the writer in Bio.Nexus.Trees (str, to_string)."""

    def __init__(self, trees):
        """Store the iterable of trees to be serialized."""
        self.trees = trees

    def write(self, handle, **kwargs):
        """Write this instance's trees to a file handle.

        Each tree is written on its own line. Keyword arguments are
        forwarded to ``to_strings``.

        :returns: number of trees written.
        """
        count = 0
        for treestr in self.to_strings(**kwargs):
            handle.write(treestr + '\n')
            count += 1
        return count

    def to_strings(self, confidence_as_branch_length=False,
                   branch_length_only=False, plain=False,
                   plain_newick=True, ladderize=None, max_confidence=1.0,
                   format_confidence='%1.2f', format_branch_length='%1.5f',
                   # XXX Deprecated kwargs -- remove after Biopython 1.58
                   support_as_branchlengths=None, branchlengths_only=None,
                   max_support=None):
        """Return an iterable of PAUP-compatible tree lines.

        :param confidence_as_branch_length: write each node's confidence in
            the branch-length position; terminal nodes get
            ``max_confidence``.
        :param branch_length_only: write branch lengths only.
        :param plain: write no node values at all; overridden to False if
            either of the two options above is set.
        :param plain_newick: if true, yield the bare Newick string;
            otherwise prefix it with ``tree <name> =`` plus optional
            ``[&W...]`` (weight) and ``[&R]`` (rooted) markers.
        :param ladderize: 'left'/'LEFT' or 'right'/'RIGHT' to ladderize
            each tree (in place) before writing; other values are ignored.
        :param format_confidence: %-format applied to confidence values.
        :param format_branch_length: %-format applied to branch lengths.

        The ``support_as_branchlengths``, ``branchlengths_only`` and
        ``max_support`` arguments are deprecated aliases that emit a
        deprecation warning.
        """
        # XXX Handling the deprecated kwargs -- remove after Biopython 1.58
        if support_as_branchlengths is not None:
            warnings.warn(
                "use the argument confidence_as_branch_length instead",
                BiopythonDeprecationWarning)
            confidence_as_branch_length = support_as_branchlengths
        if branchlengths_only is not None:
            warnings.warn("use the argument branch_length_only instead",
                          BiopythonDeprecationWarning)
            branch_length_only = branchlengths_only
        if max_support is not None:
            warnings.warn("use the argument max_confidence instead",
                          BiopythonDeprecationWarning)
            max_confidence = max_support
        # If there's a conflict in the arguments, we override plain=True
        if confidence_as_branch_length or branch_length_only:
            plain = False
        make_info_string = self._info_factory(
            plain, confidence_as_branch_length, branch_length_only,
            max_confidence, format_confidence, format_branch_length)

        def newickize(clade):
            """Convert a node tree to a Newick tree string, recursively."""
            if clade.is_terminal():
                return ((clade.name or '')
                        + make_info_string(clade, terminal=True))
            subtrees = (newickize(sub) for sub in clade)
            return '(%s)%s' % (','.join(subtrees),
                               make_info_string(clade))

        # Convert each tree to a string
        for tree in self.trees:
            if ladderize in ('left', 'LEFT', 'right', 'RIGHT'):
                # Nexus compatibility shim, kind of
                tree.ladderize(reverse=(ladderize in ('right', 'RIGHT')))
            rawtree = newickize(tree.root) + ';'
            if plain_newick:
                yield rawtree
                continue
            # Nexus-style (?) notation before the raw Newick tree
            treeline = ['tree', (tree.name or 'a_tree'), '=']
            if tree.weight != 1:
                treeline.append('[&W%s]' % round(float(tree.weight), 3))
            if tree.rooted:
                treeline.append('[&R]')
            treeline.append(rawtree)
            yield ' '.join(treeline)

    def _info_factory(self, plain, confidence_as_branch_length,
                      branch_length_only, max_confidence, format_confidence,
                      format_branch_length):
        """Return a function that creates a nicely formatted node tag.

        The returned function takes ``(clade, terminal=False)`` and returns
        the text that follows the node's name or closing parenthesis. In
        every mode a missing (None) confidence or branch length is written
        as 0.0 rather than raising TypeError from the %-format.
        """
        if plain:
            # Plain tree only. That's easy.
            def make_info_string(clade, terminal=False):
                return ''

        elif confidence_as_branch_length:
            # Support as branchlengths (eg. PAUP), ignore actual branchlengths
            def make_info_string(clade, terminal=False):
                if terminal:
                    # terminal branches have 100% support
                    return (':' + format_confidence) % max_confidence
                # Nodes without a confidence (e.g. the root) are written as
                # 0.0, matching how a missing branch length is handled.
                confidence = getattr(clade, 'confidence', None)
                return (':' + format_confidence) % (confidence or 0.0)

        elif branch_length_only:
            # write only branchlengths, ignore support
            def make_info_string(clade, terminal=False):
                return (':' + format_branch_length
                        ) % (clade.branch_length or 0.0)

        else:
            # write support and branchlengths (e.g. .con tree of mrbayes)
            def make_info_string(clade, terminal=False):
                if (terminal or
                        not hasattr(clade, 'confidence') or
                        clade.confidence is None):
                    return (':' + format_branch_length
                            ) % (clade.branch_length or 0.0)
                else:
                    return (format_confidence + ':' + format_branch_length
                            ) % (clade.confidence, clade.branch_length or 0.0)

        return make_info_string