Package Bio :: Package Phylo :: Module PhyloXML
[hide private]
[frames | no frames]

Source Code for Module Bio.Phylo.PhyloXML

   1  # Copyright (C) 2009 by Eric Talevich (eric.talevich@gmail.com) 
   2  # This code is part of the Biopython distribution and governed by its 
   3  # license. Please see the LICENSE file that should have been included 
   4  # as part of this package. 
   5   
   6  """Classes corresponding to phyloXML elements. 
   7   
   8  See Also 
   9  -------- 
  10  Official specification: 
  11     http://phyloxml.org/  
  12  Journal article: 
  13      Han and Zmasek (2009), doi:10.1186/1471-2105-10-356 
  14  """ 
  15  __docformat__ = "restructuredtext en" 
  16   
  17  import re 
  18  import warnings 
  19   
  20  from Bio import Alphabet 
  21  from Bio.Align import MultipleSeqAlignment 
  22  from Bio.Seq import Seq 
  23  from Bio.SeqFeature import SeqFeature, FeatureLocation 
  24  from Bio.SeqRecord import SeqRecord 
  25  from Bio import BiopythonWarning 
  26   
  27  from Bio.Phylo import BaseTree 
  28   
  29  #TODO - Remove this hack for Python 2.4 
  30  try: 
  31      any 
  32  except NameError: 
33 - def any(iterable):
34 for element in iterable: 35 if element: 36 return True 37 return False
38
class PhyloXMLWarning(BiopythonWarning):
    """Warning for non-compliance with the phyloXML specification.

    This is the category passed to `warnings.warn` by `_check_str` when a
    string value fails its validity test.
    """
42
43 44 -def _check_str(text, testfunc):
45 """Check a string using testfunc, and warn if there's no match.""" 46 if text is not None and not testfunc(text): 47 warnings.warn("String %s doesn't match the given regexp" % text, 48 PhyloXMLWarning, stacklevel=2)
49
# Core elements

class PhyloElement(BaseTree.TreeElement):
    """Common base class for every PhyloXML object defined in this module."""
55
class Phyloxml(PhyloElement):
    """Root node of the PhyloXML document.

    Holds any number of Phylogeny elements, optionally followed by elements
    from other namespaces.

    :Parameters:
        attributes
            (XML namespace definitions)
        phylogenies
            list of phylogenetic trees
        other
            list of arbitrary non-phyloXML elements, if any
    """

    def __init__(self, attributes, phylogenies=None, other=None):
        self.attributes = attributes
        # A missing (or empty) collection becomes a fresh list
        self.phylogenies = phylogenies or []
        self.other = other or []

    def __getitem__(self, index):
        """Look up a phylogeny by integer position, slice, or name.

        A string index returns the first phylogeny whose `name` equals it.
        Raises KeyError for an unsupported index type or an unknown name.
        """
        if isinstance(index, (int, slice)):
            return self.phylogenies[index]
        if not isinstance(index, basestring):
            raise KeyError("can't use %s as an index" % type(index))
        for tree in self.phylogenies:
            if tree.name == index:
                return tree
        # Reached only when no tree carried the requested name
        raise KeyError("no phylogeny found with name " + repr(index))

    def __iter__(self):
        """Iterate over the contained phylogenetic trees."""
        return iter(self.phylogenies)

    def __len__(self):
        """Count the phylogenetic trees held by this object."""
        return len(self.phylogenies)

    def __str__(self):
        tree_strings = [str(tree) for tree in self.phylogenies]
        return '%s([%s])' % (self.__class__.__name__,
                             ',\n'.join(tree_strings))
99
class Other(PhyloElement):
    """Container for non-phyloXML elements in the tree.

    Usually, an Other object will have either a 'value' or a non-empty list
    of 'children', but not both. This is not enforced here, though.

    :Parameters:
        tag : string
            local tag for the XML node
        namespace : string
            XML namespace for the node -- should not be the default phyloXML
            namespace.
        attributes : dict of strings
            attributes on the XML node; a missing value becomes an empty dict
        value : string
            text contained directly within this XML node
        children : list
            child nodes, if any (also `Other` instances)
    """

    def __init__(self, tag, namespace=None, attributes=None, value=None,
                 children=None):
        self.tag = tag
        self.namespace = namespace
        # Normalize a missing mapping to {} so it can always be used as a
        # dict, in the same way `children` is always a list.
        self.attributes = attributes or {}
        self.value = value
        self.children = children or []

    def __iter__(self):
        """Iterate through the children of this object (if any)."""
        return iter(self.children)
131
class Phylogeny(PhyloElement, BaseTree.Tree):
    """A phylogenetic tree.

    :Parameters:
        root : Clade
            the root node/clade of this tree
        rooted : bool
            True if this tree is rooted
        rerootable : bool
            True if this tree is rerootable
        branch_length_unit : string
            unit for branch_length values on clades
        type
            stored unchanged on the instance
        name : string
            identifier for this tree, not required to be unique
        id : Id
            unique identifier for this tree
        description : string
            plain-text description
        date : Date
            date for the root node of this tree
        confidences : list
            Confidence objects for this tree
        clade_relations : list
            CladeRelation objects
        sequence_relations : list
            SequenceRelation objects
        properties : list
            Property objects
        other : list
            non-phyloXML elements (type `Other`)
    """

    def __init__(self, root=None, rooted=True,
                 rerootable=None, branch_length_unit=None, type=None,
                 # Child nodes
                 name=None, id=None, description=None, date=None,
                 # Collections
                 confidences=None, clade_relations=None,
                 sequence_relations=None, properties=None, other=None,
                 ):
        assert isinstance(rooted, bool)
        self.root = root
        self.rooted = rooted
        self.rerootable = rerootable
        self.branch_length_unit = branch_length_unit
        self.type = type
        self.name = name
        self.id = id
        self.description = description
        self.date = date
        self.confidences = confidences or []
        self.clade_relations = clade_relations or []
        self.sequence_relations = sequence_relations or []
        self.properties = properties or []
        self.other = other or []

    @classmethod
    def from_tree(cls, tree, **kwargs):
        """Create a new Phylogeny given a Tree (from Newick/Nexus or BaseTree).

        Keyword arguments are the usual `Phylogeny` constructor parameters.
        They are applied with setattr, so the `confidence` property is
        honored too.
        """
        tree_id = None
        if tree.id is not None:
            tree_id = Id(str(tree.id))
        phy = cls(
            root=Clade.from_clade(tree.root),
            rooted=tree.rooted,
            name=tree.name,
            id=tree_id)
        # Writing straight into __dict__ would be shadowed by class-level
        # properties (e.g. `confidence`), silently dropping those values.
        for key, val in kwargs.items():
            setattr(phy, key, val)
        return phy

    @classmethod
    def from_clade(cls, clade, **kwargs):
        """Create a new Phylogeny given a Newick or BaseTree Clade object.

        The clade is converted with `Clade.from_clade` and wrapped with
        `Clade.to_phylogeny`; keyword arguments are forwarded to the latter
        and end up as attributes of the new Phylogeny.
        """
        return Clade.from_clade(clade).to_phylogeny(**kwargs)

    def as_phyloxml(self):
        """Return this tree, a PhyloXML-compatible Phylogeny object.

        Overrides the `BaseTree` method.
        """
        return self

    def to_phyloxml_container(self, **kwargs):
        """Create a new Phyloxml object containing just this phylogeny.

        The keyword arguments become the `attributes` of the container.
        """
        return Phyloxml(kwargs, phylogenies=[self])

    def to_alignment(self):
        """Construct an alignment from the aligned sequences in this tree.

        Returns an empty MultipleSeqAlignment when no aligned sequence is
        found. Sequence elements without a `mol_seq` are skipped.
        """
        def is_aligned_seq(elem):
            # A Sequence may carry no MolSeq at all (mol_seq is None); such
            # elements cannot be part of the alignment.
            if (isinstance(elem, Sequence)
                    and elem.mol_seq is not None
                    and elem.mol_seq.is_aligned):
                return True
            return False

        # Materialize the search results so the first hit can be inspected
        # without relying on the Python-2-only iterator method `.next()`.
        aligned = list(self._filter_search(is_aligned_seq, 'preorder', True))
        if not aligned:
            # No aligned sequences were found --> empty MSA
            return MultipleSeqAlignment([])
        first_seq = aligned[0]
        msa = MultipleSeqAlignment([first_seq.to_seqrecord()],
                                   first_seq.get_alphabet())
        msa.extend(seq.to_seqrecord() for seq in aligned[1:])
        return msa

    # Singular property for plural attribute
    def _get_confidence(self):
        """Equivalent to self.confidences[0] if there is only 1 value.

        Returns None when there are no confidences and raises AttributeError
        when there is more than one.

        See also: `Clade.confidence`, `Clade.taxonomy`
        """
        if len(self.confidences) == 0:
            return None
        if len(self.confidences) > 1:
            raise AttributeError("more than 1 confidence value available; "
                                 "use Phylogeny.confidences")
        return self.confidences[0]

    def _set_confidence(self, value):
        """Set the single confidence value; None clears the list.

        Plain numbers are wrapped in a Confidence. Raises ValueError for any
        other type, or when several confidences already exist.
        """
        if value is None:
            # Special case: mirror the behavior of _get_confidence
            self.confidences = []
            return
        if isinstance(value, (float, int)):
            value = Confidence(value)
        elif not isinstance(value, Confidence):
            raise ValueError("value must be a number or Confidence instance")
        if len(self.confidences) == 0:
            self.confidences.append(value)
        elif len(self.confidences) == 1:
            self.confidences[0] = value
        else:
            raise ValueError("multiple confidence values already exist; "
                             "use Phylogeny.confidences instead")

    def _del_confidence(self):
        """Remove all confidence values."""
        self.confidences = []

    confidence = property(_get_confidence, _set_confidence, _del_confidence)
class Clade(PhyloElement, BaseTree.Clade):
    """Describes a branch of the current phylogenetic tree.

    Used recursively, describes the topology of a phylogenetic tree.

    Both ``color`` and ``width`` elements should be interpreted by client code
    as applying to the whole clade, including all descendents, unless
    overwritten in sub-clades. This module doesn't automatically assign these
    attributes to sub-clades to achieve this cascade -- and neither should you.

    :Parameters:
        branch_length
            parent branch length of this clade
        id_source
            link other elements to a clade (on the xml-level)
        name : string
            short label for this clade
        confidences : list of Confidence objects
            used to indicate the support for a clade/parent branch.
        width : float
            branch width for this clade (including branch from parent)
        color : BranchColor
            color used for graphical display of this clade
        node_id
            unique identifier for the root node of this clade
        taxonomies : list
            Taxonomy objects
        sequences : list
            Sequence objects
        events : Events
            describe such events as gene-duplications at the root node/parent
            branch of this clade
        binary_characters : BinaryCharacters
            binary characters
        distributions : list of Distribution objects
            distribution(s) of this clade
        date : Date
            a date for the root node of this clade
        references : list
            Reference objects
        properties : list
            Property objects
        clades : list Clade objects
            Sub-clades
        other : list of Other objects
            non-phyloXML objects
    """

    def __init__(self,
                 # Attributes
                 branch_length=None, id_source=None,
                 # Child nodes
                 name=None, width=None, color=None, node_id=None, events=None,
                 binary_characters=None, date=None,
                 # Collections
                 confidences=None, taxonomies=None, sequences=None,
                 distributions=None, references=None, properties=None,
                 clades=None, other=None,
                 ):
        self.branch_length = branch_length
        self.id_source = id_source
        self.name = name
        self.width = width
        # Goes through the `color` property, which validates/converts it
        self.color = color
        self.node_id = node_id
        self.events = events
        self.binary_characters = binary_characters
        self.date = date
        self.confidences = confidences or []
        self.taxonomies = taxonomies or []
        self.sequences = sequences or []
        self.distributions = distributions or []
        self.references = references or []
        self.properties = properties or []
        self.clades = clades or []
        self.other = other or []

    @classmethod
    def from_clade(cls, clade, **kwargs):
        """Create a new PhyloXML Clade from a Newick or BaseTree Clade object.

        Sub-clades are converted recursively. Keyword arguments are the usual
        PhyloXML Clade constructor parameters; they are applied with setattr
        so that the `color`, `confidence` and `taxonomy` properties work.
        """
        new_clade = cls(branch_length=clade.branch_length,
                        name=clade.name)
        new_clade.clades = [cls.from_clade(c) for c in clade]
        new_clade.confidence = clade.confidence
        # Writing straight into __dict__ would be shadowed by the class-level
        # properties, silently dropping e.g. a `color` keyword.
        for key, val in kwargs.items():
            setattr(new_clade, key, val)
        return new_clade

    def to_phylogeny(self, **kwargs):
        """Create a new phylogeny containing just this clade.

        The clade becomes the root and its date is copied to the phylogeny.
        Keyword arguments are set as attributes on the new Phylogeny.
        """
        phy = Phylogeny(root=self, date=self.date)
        # setattr (not __dict__.update) so Phylogeny properties are honored
        for key, val in kwargs.items():
            setattr(phy, key, val)
        return phy

    # Shortcuts for list attributes that are usually only 1 item
    # NB: Duplicated from Phylogeny class
    def _get_confidence(self):
        """Return the only confidence, or None if there is none.

        Raises AttributeError when more than one confidence exists.
        """
        if len(self.confidences) == 0:
            return None
        if len(self.confidences) > 1:
            raise AttributeError("more than 1 confidence value available; "
                                 "use Clade.confidences")
        return self.confidences[0]

    def _set_confidence(self, value):
        """Set the single confidence value; None clears the list.

        Plain numbers are wrapped in a Confidence. Raises ValueError for any
        other type, or when several confidences already exist.
        """
        if value is None:
            # Special case: mirror the behavior of _get_confidence
            self.confidences = []
            return
        if isinstance(value, (float, int)):
            value = Confidence(value)
        elif not isinstance(value, Confidence):
            raise ValueError("value must be a number or Confidence instance")
        if len(self.confidences) == 0:
            self.confidences.append(value)
        elif len(self.confidences) == 1:
            self.confidences[0] = value
        else:
            # Message names this class (it used to point at Phylogeny)
            raise ValueError("multiple confidence values already exist; "
                             "use Clade.confidences instead")

    def _del_confidence(self):
        """Remove all confidence values."""
        self.confidences = []

    confidence = property(_get_confidence, _set_confidence, _del_confidence)

    def _get_taxonomy(self):
        """Return the only taxonomy, or None if there is none.

        Raises AttributeError when more than one taxonomy exists.
        """
        if len(self.taxonomies) == 0:
            return None
        if len(self.taxonomies) > 1:
            raise AttributeError("more than 1 taxonomy value available; "
                                 "use Clade.taxonomies")
        return self.taxonomies[0]

    def _set_taxonomy(self, value):
        """Set the single taxonomy value.

        Raises ValueError if `value` is not a Taxonomy, or when several
        taxonomies already exist.
        """
        if not isinstance(value, Taxonomy):
            raise ValueError("assigned value must be a Taxonomy instance")
        if len(self.taxonomies) == 0:
            self.taxonomies.append(value)
        elif len(self.taxonomies) == 1:
            self.taxonomies[0] = value
        else:
            # Message names this class (it used to point at Phylogeny)
            raise ValueError("multiple taxonomy values already exist; "
                             "use Clade.taxonomies instead")

    taxonomy = property(_get_taxonomy, _set_taxonomy)

    # Syntax sugar for setting the branch color
    def _get_color(self):
        return self._color

    def _set_color(self, arg):
        """Accept None, a BranchColor, a color name, '#rrggbb' or RGB triplet.

        Raises ValueError for anything else.
        """
        if arg is None or isinstance(arg, BranchColor):
            self._color = arg
        elif isinstance(arg, basestring):
            if arg in BranchColor.color_names:
                # Known color name
                self._color = BranchColor.from_name(arg)
            elif arg.startswith('#') and len(arg) == 7:
                # HTML-style hex string
                self._color = BranchColor.from_hex(arg)
            else:
                raise ValueError("invalid color string %s" % arg)
        elif hasattr(arg, '__iter__') and len(arg) == 3:
            # RGB triplet
            self._color = BranchColor(*arg)
        else:
            raise ValueError("invalid color value %s" % arg)

    color = property(_get_color, _set_color, doc="Branch color.")
445
# PhyloXML-specific complex types

class Accession(PhyloElement):
    """Captures the local part in a sequence identifier.

    Example: In ``UniProtKB:P17304``, the Accession instance attribute ``value``
    is 'P17304' and the ``source`` attribute is 'UniProtKB'.

    :Parameters:
        value : string
            the local part of the identifier
        source : string
            the part naming where the identifier comes from
    """

    def __init__(self, value, source):
        self.value = value
        self.source = source

    def __str__(self):
        """Format the accession as 'source:value'."""
        return '%s:%s' % (self.source, self.value)
462
class Annotation(PhyloElement):
    """The annotation of a molecular sequence.

    It is recommended to annotate by using the optional 'ref' attribute.

    :Parameters:
        ref : string
            reference string, e.g. 'GO:0008270',
            'KEGG:Tetrachloroethene degradation', 'EC:1.1.1.1'
        source : string
            plain-text source for this annotation
        evidence : str
            describe evidence as free text (e.g. 'experimental')
        type
            stored unchanged on the instance
        desc : string
            free text description
        confidence : Confidence
            state the type and value of support (type Confidence)
        properties : list
            typed and referenced annotations from external resources
        uri : Uri
            link
    """
    # Expected shape of `ref`: "<prefix>:<local part>". It is applied with
    # match(), so only the beginning of the string is validated.
    re_ref = re.compile(r'[a-zA-Z0-9_]+:[a-zA-Z0-9_\.\-\s]+')

    def __init__(self,
                 # Attributes
                 ref=None, source=None, evidence=None, type=None,
                 # Child nodes
                 desc=None, confidence=None, uri=None,
                 # Collection
                 properties=None):
        # A non-conforming ref only triggers a warning; it is stored anyway
        _check_str(ref, self.re_ref.match)
        self.ref = ref
        self.source = source
        self.evidence = evidence
        self.type = type
        self.desc = desc
        self.confidence = confidence
        self.uri = uri
        self.properties = properties or []
504
class BinaryCharacters(PhyloElement):
    """The names and/or counts of binary characters present, gained, and lost
    at the root of a clade.

    The four ``*_count`` attributes are stored as given; the four name
    collections (`gained`, `lost`, `present`, `absent`) default to empty
    lists.
    """

    def __init__(self,
                 # Attributes
                 type=None, gained_count=None, lost_count=None,
                 present_count=None, absent_count=None,
                 # Child nodes (flattened into collections)
                 gained=None, lost=None, present=None, absent=None):
        self.type = type
        self.gained_count = gained_count
        self.lost_count = lost_count
        self.present_count = present_count
        self.absent_count = absent_count
        self.gained = gained or []
        self.lost = lost or []
        self.present = present or []
        self.absent = absent or []
525
class BranchColor(PhyloElement):
    """Indicates the color of a clade when rendered graphically.

    The color should be interpreted by client code (e.g. visualization
    programs) as applying to the whole clade, unless overwritten by the
    color(s) of sub-clades.

    Color values must be integers from 0 to 255.
    """

    # Maps each supported color name to its (red, green, blue) triplet
    color_names = {
            'red':      (255,   0,   0),
            'r':        (255,   0,   0),
            'yellow':   (255, 255,   0),
            'y':        (255, 255,   0),
            'green':    (  0, 128,   0),
            'g':        (  0, 128,   0),
            'cyan':     (  0, 255, 255),
            'c':        (  0, 255, 255),
            'blue':     (  0,   0, 255),
            'b':        (  0,   0, 255),
            'magenta':  (255,   0, 255),
            'm':        (255,   0, 255),
            'black':    (  0,   0,   0),
            'k':        (  0,   0,   0),
            'white':    (255, 255, 255),
            'w':        (255, 255, 255),
            # Names standardized in HTML/CSS spec
            # http://w3schools.com/html/html_colornames.asp
            'maroon':   (128,   0,   0),
            'olive':    (128, 128,   0),
            'lime':     (  0, 255,   0),
            'aqua':     (  0, 255, 255),
            'teal':     (  0, 128, 128),
            'navy':     (  0,   0, 128),
            'fuchsia':  (255,   0, 255),
            'purple':   (128,   0, 128),
            'silver':   (192, 192, 192),
            'gray':     (128, 128, 128),
            # More definitions from matplotlib/gcolor2
            'grey':     (128, 128, 128),
            'pink':     (255, 192, 203),
            'salmon':   (250, 128, 114),
            'orange':   (255, 165,   0),
            'gold':     (255, 215,   0),
            'tan':      (210, 180, 140),
            'brown':    (165,  42,  42),
            }

    def __init__(self, red, green, blue):
        # Each channel must be an int within the 8-bit range
        for channel in (red, green, blue):
            assert (isinstance(channel, int) and
                    0 <= channel <= 255
                    ), "Color values must be integers between 0 and 255."
        self.red = red
        self.green = green
        self.blue = blue

    @classmethod
    def from_hex(cls, hexstr):
        """Build a BranchColor from an HTML/CSS-style hexadecimal string.

        For instance '#FF8000' gives an RGB value of (255, 128, 0).
        """
        assert (isinstance(hexstr, basestring) and
                hexstr.startswith('#') and
                len(hexstr) == 7
                ), "need a 24-bit hexadecimal string, e.g. #000000"
        # Two hex digits per channel, following the leading '#'
        channels = (hexstr[1:3], hexstr[3:5], hexstr[5:])
        return cls(*[int('0x' + digits, base=16) for digits in channels])

    @classmethod
    def from_name(cls, colorname):
        """Build a BranchColor from one of the keys of `color_names`."""
        return cls(*cls.color_names[colorname])

    def to_hex(self):
        """Return a 24-bit hexadecimal RGB representation of this color.

        The returned string is suitable for use in HTML/CSS, as a color
        parameter in matplotlib, and perhaps other situations.

        Example:

            >>> bc = BranchColor(12, 200, 100)
            >>> bc.to_hex()
            '#0cc864'
        """
        # Pack the three channels into one number: 0xRRGGBB
        packed = self.red * (16**4) + self.green * (16**2) + self.blue
        # Strip the '0x' prefix and left-pad with zeros to six digits
        return '#' + hex(packed)[2:].zfill(6)

    def to_rgb(self):
        """Return a tuple of RGB values (0 to 255) representing this color.

        Example:

            >>> bc = BranchColor(255, 165, 0)
            >>> bc.to_rgb()
            (255, 165, 0)
        """
        return (self.red, self.green, self.blue)

    def __repr__(self):
        """Preserve the standard RGB order when representing this object."""
        fields = (self.__class__.__name__, self.red, self.green, self.blue)
        return u'%s(red=%d, green=%d, blue=%d)' % fields

    def __str__(self):
        """Show the color's RGB values."""
        return "(%d, %d, %d)" % (self.red, self.green, self.blue)
642
class CladeRelation(PhyloElement):
    """Expresses a typed relationship between two clades.

    For example, this could be used to describe multiple parents of a clade.

    :Parameters:
        type : str
            label for the kind of relationship
        id_ref_0 : str
            reference to the first clade
        id_ref_1 : str
            reference to the second clade
        distance : str
            optional distance between the two clades
        confidence : Confidence
            optional support for this relationship
    """

    def __init__(self, type, id_ref_0, id_ref_1,
                 distance=None, confidence=None):
        self.distance = distance
        self.type = type
        self.id_ref_0 = id_ref_0
        self.id_ref_1 = id_ref_1
        self.confidence = confidence
663
class Confidence(PhyloElement):
    """A general purpose confidence element.

    For example, this can be used to express the bootstrap support value of a
    clade (in which case the `type` attribute is 'bootstrap').

    :Parameters:
        value : float
            confidence value
        type : string
            label for the type of confidence, e.g. 'bootstrap'; defaults to
            'unknown'
    """

    def __init__(self, value, type='unknown'):
        self.value = value
        self.type = type

    def __float__(self):
        """Convert the stored value with float()."""
        return float(self.value)

    def __int__(self):
        """Convert the stored value with int()."""
        return int(self.value)
686
class Date(PhyloElement):
    """A date associated with a clade/node.

    Its value can be numerical by using the 'value' element and/or free text
    with the 'desc' element (e.g. 'Silurian'). If a numerical value is used,
    it is recommended to employ the 'unit' attribute.

    :Parameters:
        unit : string
            type of numerical value (e.g. 'mya' for 'million years ago')
        value : float
            the date value
        desc : string
            plain-text description of the date
        minimum : float
            lower bound on the date value
        maximum : float
            upper bound on the date value
    """

    def __init__(self, value=None, unit=None, desc=None,
                 minimum=None, maximum=None):
        self.value = value
        self.unit = unit
        self.desc = desc
        self.minimum = minimum
        self.maximum = maximum

    def __str__(self):
        """Return a human-readable date.

        Preference order: '<value> <unit>' when both are available, then the
        free-text description, and finally the class name.
        """
        if self.unit and self.value is not None:
            text = '%s %s' % (self.value, self.unit)
        elif self.desc is not None:
            text = self.desc
        else:
            text = self.__class__.__name__
        return text
722
class Distribution(PhyloElement):
    """Geographic distribution of the items of a clade (species, sequences).

    Intended for phylogeographic applications.

    :Parameters:
        desc : string
            free-text description of the location
        points : list of `Point` objects
            coordinates (similar to the 'Point' element in Google's KML format)
        polygons : list of `Polygon` objects
            coordinate sets defining geographic regions
    """

    def __init__(self, desc=None, points=None, polygons=None):
        self.desc = desc
        # Missing collections are replaced by fresh empty lists
        self.points = points or []
        self.polygons = polygons or []
741
class DomainArchitecture(PhyloElement):
    """Domain architecture of a protein.

    :Parameters:
        length : int
            total length of the protein sequence
        domains : list ProteinDomain objects
            the domains within this protein; a missing value becomes an
            empty list
    """

    def __init__(self, length=None, domains=None):
        self.length = length
        # Always a list, like every other collection attribute in this
        # module, so the domains can be iterated without a None check.
        self.domains = domains or []
755
class Events(PhyloElement):
    """Events at the root node of a clade (e.g. one gene duplication).

    All attributes are set to None by default, but this object can also be
    treated as a dictionary, in which case None values are treated as missing
    keys and deleting a key resets that attribute's value back to None.

    Only instance attributes count as keys: names of methods or other
    class-level attributes are never reported as present.
    """
    # Accepted values for `type`; anything else only triggers a warning
    ok_type = set(('transfer', 'fusion', 'speciation_or_duplication', 'other',
        'mixed', 'unassigned'))

    def __init__(self, type=None, duplications=None, speciations=None,
                 losses=None, confidence=None):
        _check_str(type, self.ok_type.__contains__)
        self.type = type
        self.duplications = duplications
        self.speciations = speciations
        self.losses = losses
        self.confidence = confidence

    def items(self):
        """List the (name, value) pairs of all attributes that are set."""
        return [(k, v) for k, v in self.__dict__.items() if v is not None]

    def keys(self):
        """List the names of all attributes that are set."""
        return [k for k, v in self.__dict__.items() if v is not None]

    def values(self):
        """List the values of all attributes that are set."""
        return [v for v in self.__dict__.values() if v is not None]

    def __len__(self):
        """Count the attributes that are set (i.e. not None)."""
        return len(self.values())

    def __getitem__(self, key):
        """Return the value stored under `key`.

        Raises KeyError if `key` is not an instance attribute, or if it is
        currently None.
        """
        # Look in the instance dict only, so the result agrees with keys():
        # hasattr()/getattr() would also find methods such as 'items'.
        if key not in self.__dict__:
            raise KeyError(key)
        val = self.__dict__[key]
        if val is None:
            raise KeyError("%s has not been set in this object" % repr(key))
        return val

    def __setitem__(self, key, val):
        """Store `val` as the attribute named `key`."""
        setattr(self, key, val)

    def __delitem__(self, key):
        """Reset the attribute named `key` back to None."""
        setattr(self, key, None)

    def __iter__(self):
        """Iterate over the names of all attributes that are set."""
        return iter(self.keys())

    def __contains__(self, key):
        """True if `key` is an instance attribute whose value is not None."""
        return self.__dict__.get(key) is not None
807
class Id(PhyloElement):
    """A general-purpose identifier element.

    Allows to indicate the provider (or authority) of an identifier, e.g. NCBI,
    along with the value itself.

    :Parameters:
        value
            the identifier itself
        provider
            optional provider/authority of the identifier
    """

    def __init__(self, value, provider=None):
        self.value = value
        self.provider = provider

    def __str__(self):
        """Return 'provider:value', or just the value without a provider."""
        if self.provider is None:
            return self.value
        return '%s:%s' % (self.provider, self.value)
823
class MolSeq(PhyloElement):
    """Store a molecular sequence.

    :Parameters:
        value : string
            the sequence itself
        is_aligned : bool
            True if this sequence is aligned with the others (usually meaning
            all aligned seqs are the same length and gaps may be present)
    """
    # Characters accepted in a sequence. It is applied with match(), so only
    # the beginning of the string is validated.
    re_value = re.compile(r'[a-zA-Z\.\-\?\*_]+')

    def __init__(self, value, is_aligned=None):
        # A non-conforming sequence only triggers a warning
        _check_str(value, self.re_value.match)
        self.value = value
        self.is_aligned = is_aligned

    def __str__(self):
        """Return the stored sequence value itself."""
        return self.value
844
class Point(PhyloElement):
    """Geographic coordinates of a point, with an optional altitude.

    Used by element 'Distribution'.

    :Parameters:
        geodetic_datum : string, required
            the geodetic datum (also called 'map datum'). For example, Google's
            KML uses 'WGS84'.
        lat : numeric
            latitude
        long : numeric
            longitude
        alt : numeric
            altitude
        alt_unit : string
            unit for the altitude (e.g. 'meter')
    """

    def __init__(self, geodetic_datum, lat, long, alt=None, alt_unit=None):
        # All values are stored as given; nothing is validated or converted
        self.geodetic_datum = geodetic_datum
        self.lat = lat
        self.long = long
        self.alt = alt
        self.alt_unit = alt_unit
870
class Polygon(PhyloElement):
    """A polygon defined by a list of 'Points' (used by element 'Distribution').

    :param points: list of 3 or more points representing vertices.
    """

    def __init__(self, points=None):
        # The documented minimum of 3 points is not enforced here
        self.points = points or []

    def __str__(self):
        """Return the class name wrapping the str() of every point."""
        point_strings = [str(point) for point in self.points]
        return '%s([%s])' % (self.__class__.__name__,
                             ',\n'.join(point_strings))
883
class Property(PhyloElement):
    """A typed and referenced property from an external resource.

    Can be attached to `Phylogeny`, `Clade`, and `Annotation` objects.

    :Parameters:
        value : string
            the value of the property
        ref : string
            reference to an external resource, e.g. "NOAA:depth"
        applies_to : string
            indicates the item to which a property applies to (e.g. 'node' for
            the parent node of a clade, 'parent_branch' for the parent branch of
            a clade, or just 'clade').
        datatype : string
            the type of a property; limited to xsd-datatypes
            (e.g. 'xsd:string', 'xsd:boolean', 'xsd:integer', 'xsd:decimal',
            'xsd:float', 'xsd:double', 'xsd:date', 'xsd:anyURI').
        unit : string (optional)
            the unit of the property, e.g. "METRIC:m"
        id_ref : Id (optional)
            allows to attach a property specifically to one element (on the
            xml-level)
    """
    # Expected shape of `ref` and `unit`: "<prefix>:<local part>". It is
    # applied with match(), so only the start of the string is validated.
    re_ref = re.compile(r'[a-zA-Z0-9_]+:[a-zA-Z0-9_\.\-\s]+')
    # Accepted values for `applies_to`
    ok_applies_to = set(('phylogeny', 'clade', 'node', 'annotation',
                         'parent_branch', 'other'))
    # Accepted values for `datatype`
    ok_datatype = set(('xsd:string', 'xsd:boolean', 'xsd:decimal', 'xsd:float',
        'xsd:double', 'xsd:duration', 'xsd:dateTime', 'xsd:time', 'xsd:date',
        'xsd:gYearMonth', 'xsd:gYear', 'xsd:gMonthDay', 'xsd:gDay',
        'xsd:gMonth', 'xsd:hexBinary', 'xsd:base64Binary', 'xsd:anyURI',
        'xsd:normalizedString', 'xsd:token', 'xsd:integer',
        'xsd:nonPositiveInteger', 'xsd:negativeInteger', 'xsd:long', 'xsd:int',
        'xsd:short', 'xsd:byte', 'xsd:nonNegativeInteger', 'xsd:unsignedLong',
        'xsd:unsignedInt', 'xsd:unsignedShort', 'xsd:unsignedByte',
        'xsd:positiveInteger'))

    def __init__(self, value, ref, applies_to, datatype,
                 unit=None, id_ref=None):
        # Each failed check only emits a warning; the value is stored anyway
        _check_str(ref, self.re_ref.match)
        _check_str(applies_to, self.ok_applies_to.__contains__)
        _check_str(datatype, self.ok_datatype.__contains__)
        _check_str(unit, self.re_ref.match)
        self.unit = unit
        self.id_ref = id_ref
        self.value = value
        self.ref = ref
        self.applies_to = applies_to
        self.datatype = datatype
934
class ProteinDomain(PhyloElement):
    """Represents an individual domain in a domain architecture.

    The locations use 0-based indexing, as most Python objects including
    SeqFeature do, rather than the usual biological convention starting at 1.
    This means the start and end attributes can be used directly as slice
    indexes on Seq objects.

    :Parameters:
        value
            used as the `id` of the SeqFeature built by `to_seqfeature`
        start : non-negative integer
            start of the domain on the sequence, using 0-based indexing
        end : non-negative integer
            end of the domain on the sequence
        confidence : float
            can be used to store e.g. E-values
        id : string
            unique identifier/name
    """

    def __init__(self, value, start, end, confidence=None, id=None):
        self.value = value
        self.start = start
        self.end = end
        self.confidence = confidence
        self.id = id

    @classmethod
    def from_seqfeature(cls, feat):
        """Create a domain from a SeqFeature.

        The feature's id becomes `value`, its non-fuzzy location bounds
        become `start`/`end`, and its 'confidence' qualifier (if any) becomes
        `confidence`. The result is an instance of the class this method is
        called on, so subclasses get instances of their own type.
        """
        return cls(feat.id,
                   feat.location.nofuzzy_start,
                   feat.location.nofuzzy_end,
                   confidence=feat.qualifiers.get('confidence'))

    def to_seqfeature(self):
        """Create a SeqFeature spanning start..end with this domain's value
        as its id; the confidence is stored in the feature's qualifiers.
        """
        feat = SeqFeature(location=FeatureLocation(self.start, self.end),
                          id=self.value)
        if hasattr(self, 'confidence'):
            feat.qualifiers['confidence'] = self.confidence
        return feat
975
class Reference(PhyloElement):
    """Literature reference for a clade.

    NB: Whenever possible, use the ``doi`` attribute instead of the free-text
    ``desc`` element.

    :Parameters:
        doi : string
            DOI of the referenced work
        desc : string
            free-text description of the reference
    """
    # Expected shape of a DOI: "<prefix>/<suffix>". It is applied with
    # match(), so only the beginning of the string is validated.
    re_doi = re.compile(r'[a-zA-Z0-9_\.]+/[a-zA-Z0-9_\.]+')

    def __init__(self, doi=None, desc=None):
        # A non-conforming DOI only triggers a warning; it is stored anyway
        _check_str(doi, self.re_doi.match)
        self.doi = doi
        self.desc = desc
989
class Sequence(PhyloElement):
    """A molecular sequence (Protein, DNA, RNA) associated with a node.

    One intended use for ``id_ref`` is to link a sequence to a taxonomy (via the
    taxonomy's ``id_source``) in case of multiple sequences and taxonomies per
    node.

    :Parameters:
        type : {'dna', 'rna', 'protein'}
            type of molecule this sequence represents
        id_ref : string
            reference to another resource
        id_source : string
            source for the reference
        symbol : string
            short symbol of the sequence, e.g. 'ACTM' (max. 10 chars)
        accession : Accession
            accession code for this sequence.
        name : string
            full name of the sequence, e.g. 'muscle Actin'
        location
            location of a sequence on a genome/chromosome.
        mol_seq : MolSeq
            the molecular sequence itself
        uri : Uri
            link
        annotations : list of Annotation objects
            annotations on this sequence
        domain_architecture : DomainArchitecture
            protein domains on this sequence
        other : list of Other objects
            non-phyloXML elements
    """
    # Maps the phyloXML 'type' attribute to the matching Biopython alphabet.
    alphabets = {'dna': Alphabet.generic_dna,
                 'rna': Alphabet.generic_rna,
                 'protein': Alphabet.generic_protein}
    re_symbol = re.compile(r'\S{1,10}')

    def __init__(self,
            # Attributes
            type=None, id_ref=None, id_source=None,
            # Child nodes
            symbol=None, accession=None, name=None, location=None,
            mol_seq=None, uri=None, domain_architecture=None,
            # Collections
            annotations=None, other=None,
            ):
        # Both checks only emit a PhyloXMLWarning; values are stored regardless.
        _check_str(type, self.alphabets.__contains__)
        _check_str(symbol, self.re_symbol.match)
        self.type = type
        self.id_ref = id_ref
        self.id_source = id_source
        self.symbol = symbol
        self.accession = accession
        self.name = name
        self.location = location
        self.mol_seq = mol_seq
        self.uri = uri
        self.domain_architecture = domain_architecture
        # Fresh lists per instance; never share a mutable default.
        self.annotations = annotations or []
        self.other = other or []

    @classmethod
    def from_seqrecord(cls, record, is_aligned=None):
        """Create a new PhyloXML Sequence from a SeqRecord object.

        :Parameters:
            record : SeqRecord
                the record to convert. Its ``annotations`` dictionary is
                unpacked using the layout documented in ``to_seqrecord``, and
                its features (if any) become the domain architecture.
            is_aligned : bool
                whether the sequence is aligned. If None (the default), this
                is inferred from whether the record's alphabet is Gapped.

        Returns an instance of ``cls``, so subclasses get their own type back.
        """
        if is_aligned is None:
            is_aligned = isinstance(record.seq.alphabet, Alphabet.Gapped)
        params = {
                'accession': Accession(record.id, ''),
                'symbol': record.name,
                'name': record.description,
                'mol_seq': MolSeq(str(record.seq), is_aligned),
                }
        # The molecule type is only set when the alphabet identifies it.
        if isinstance(record.seq.alphabet, Alphabet.DNAAlphabet):
            params['type'] = 'dna'
        elif isinstance(record.seq.alphabet, Alphabet.RNAAlphabet):
            params['type'] = 'rna'
        elif isinstance(record.seq.alphabet, Alphabet.ProteinAlphabet):
            params['type'] = 'protein'

        # Unpack record.annotations
        for key in ('id_ref', 'id_source', 'location'):
            if key in record.annotations:
                params[key] = record.annotations[key]
        if isinstance(record.annotations.get('uri'), dict):
            params['uri'] = Uri(**record.annotations['uri'])
        # Build a Sequence.annotation object
        if record.annotations.get('annotations'):
            params['annotations'] = []
            for annot in record.annotations['annotations']:
                ann_args = {}
                for key in ('ref', 'source', 'evidence', 'type', 'desc'):
                    if key in annot:
                        ann_args[key] = annot[key]
                # 'confidence' is packed as a [value, type] list
                if isinstance(annot.get('confidence'), list):
                    ann_args['confidence'] = Confidence(
                                                *annot['confidence'])
                # 'properties' is a list of keyword dictionaries; anything
                # that is not a dictionary is silently skipped
                if isinstance(annot.get('properties'), list):
                    ann_args['properties'] = [Property(**prop)
                                              for prop in annot['properties']
                                              if isinstance(prop, dict)]
                params['annotations'].append(Annotation(**ann_args))

        # Unpack record.features
        if record.features:
            params['domain_architecture'] = DomainArchitecture(
                    length=len(record.seq),
                    domains=[ProteinDomain.from_seqfeature(feat)
                             for feat in record.features])

        # Use cls rather than the hard-coded class name so that subclasses
        # calling this alternate constructor get an instance of themselves.
        return cls(**params)

    def to_seqrecord(self):
        """Create a SeqRecord object from this Sequence instance.

        The SeqRecord's id, name and description are taken from the accession,
        symbol and name of this sequence; any of these that is None is left
        out, so the SeqRecord default applies.

        The seqrecord.annotations dictionary is packed like so::

            { # Sequence attributes with no SeqRecord equivalent:
              'id_ref':     self.id_ref,
              'id_source':  self.id_source,
              'location':   self.location,
              'uri':        { 'value': self.uri.value,
                              'desc': self.uri.desc,
                              'type': self.uri.type },
              # Sequence.annotations attribute (list of Annotations)
              'annotations': [{ 'ref':      ann.ref,
                                'source':   ann.source,
                                'evidence': ann.evidence,
                                'type':     ann.type,
                                'confidence': [ ann.confidence.value,
                                                ann.confidence.type ],
                                'properties': [{ 'value': prop.value,
                                                 'ref': prop.ref,
                                                 'applies_to': prop.applies_to,
                                                 'datatype':   prop.datatype,
                                                 'unit':       prop.unit,
                                                 'id_ref':     prop.id_ref }
                                               for prop in ann.properties],
                              } for ann in self.annotations],
            }

        Items whose value is None are omitted from every dictionary above.
        """
        def clean_dict(dct):
            """Remove None-valued items from a dictionary."""
            # items() rather than iteritems(): works on Python 2 and 3 alike.
            return dict((key, val) for key, val in dct.items()
                        if val is not None)

        seqrec_args = {
                'name': self.symbol,
                'description': self.name,
                # 'dbxrefs': None,
                }
        # Only stringify a real accession: str(None) would produce the
        # literal id 'None', which clean_dict could not filter out.
        if self.accession is not None:
            seqrec_args['id'] = str(self.accession)
        seqrec = SeqRecord(Seq(self.mol_seq.value, self.get_alphabet()),
                           **clean_dict(seqrec_args))
        if self.domain_architecture:
            seqrec.features = [dom.to_seqfeature()
                               for dom in self.domain_architecture.domains]
        # Sequence attributes with no SeqRecord equivalent
        seqrec.annotations = clean_dict({
                'id_ref': self.id_ref,
                'id_source': self.id_source,
                'location': self.location,
                # "x and ..." yields the falsy x itself when x is unset
                'uri': self.uri and clean_dict({
                    'value': self.uri.value,
                    'desc': self.uri.desc,
                    'type': self.uri.type,
                    }),
                'annotations': self.annotations and [
                    clean_dict({
                        'ref': ann.ref,
                        'source': ann.source,
                        'evidence': ann.evidence,
                        'type': ann.type,
                        'confidence': ann.confidence and [
                            ann.confidence.value,
                            ann.confidence.type],
                        'properties': [clean_dict({
                                        'value': prop.value,
                                        'ref': prop.ref,
                                        'applies_to': prop.applies_to,
                                        'datatype': prop.datatype,
                                        'unit': prop.unit,
                                        'id_ref': prop.id_ref })
                                       for prop in ann.properties],
                        }) for ann in self.annotations],
                })
        return seqrec

    def get_alphabet(self):
        """Return the Biopython alphabet matching this sequence.

        The alphabet is looked up from ``self.type``, falling back to the
        generic alphabet for unknown or missing types. It is wrapped in
        ``Alphabet.Gapped`` when ``mol_seq`` is set and marked as aligned.
        """
        alph = self.alphabets.get(self.type, Alphabet.generic_alphabet)
        if self.mol_seq and self.mol_seq.is_aligned:
            return Alphabet.Gapped(alph)
        return alph
1183
class SequenceRelation(PhyloElement):
    """A typed relationship between a pair of sequences.

    A typical use is recording an orthology, in which case the ``type``
    attribute is 'orthology'.

    :Parameters:
        id_ref_0 : Id
            first sequence reference identifier
        id_ref_1 : Id
            second sequence reference identifier
        distance : float
            distance between the two sequences
        type : restricted string
            describe the type of relationship; a PhyloXMLWarning is issued
            for values not listed in ``ok_type``
        confidence : Confidence
            confidence value for this relation
    """
    ok_type = set([
        'orthology', 'one_to_one_orthology', 'super_orthology',
        'paralogy', 'ultra_paralogy', 'xenology', 'unknown', 'other',
    ])

    def __init__(self, type, id_ref_0, id_ref_1,
                 distance=None, confidence=None):
        # Membership test only warns; the given type is stored regardless.
        is_known_type = self.ok_type.__contains__
        _check_str(type, is_known_type)
        self.distance = distance
        self.type = type
        self.id_ref_0 = id_ref_0
        self.id_ref_1 = id_ref_1
        self.confidence = confidence
1214
class Taxonomy(PhyloElement):
    """Taxonomic information describing a clade.

    :Parameters:
        id_source : Id
            link other elements to a taxonomy (on the XML level)
        id : Id
            unique identifier of a taxon, e.g. Id('6500',
            provider='ncbi_taxonomy') for the California sea hare
        code : restricted string
            store UniProt/Swiss-Prot style organism codes, e.g. 'APLCA' for the
            California sea hare 'Aplysia californica'
        scientific_name : string
            the standard scientific name for this organism, e.g. 'Aplysia
            californica' for the California sea hare
        authority : string
            keep the authority, such as 'J. G. Cooper, 1863', associated with
            the 'scientific_name'
        common_names : list of strings
            common names for this organism
        synonyms : list of strings
            synonyms for this taxon?
        rank : restricted string
            taxonomic rank
        uri : Uri
            link
        other : list of Other objects
            non-phyloXML elements
    """
    re_code = re.compile(r'[a-zA-Z0-9_]{2,10}')
    ok_rank = set([
        'domain', 'kingdom', 'subkingdom', 'branch', 'infrakingdom',
        'superphylum', 'phylum', 'subphylum', 'infraphylum', 'microphylum',
        'superdivision', 'division', 'subdivision', 'infradivision',
        'superclass', 'class', 'subclass', 'infraclass', 'superlegion',
        'legion', 'sublegion', 'infralegion', 'supercohort', 'cohort',
        'subcohort', 'infracohort', 'superorder', 'order', 'suborder',
        'superfamily', 'family', 'subfamily', 'supertribe', 'tribe',
        'subtribe', 'infratribe', 'genus', 'subgenus', 'superspecies',
        'species', 'subspecies', 'variety', 'subvariety', 'form', 'subform',
        'cultivar', 'unknown', 'other',
    ])

    def __init__(self,
                 # Attributes
                 id_source=None,
                 # Child nodes
                 id=None, code=None, scientific_name=None, authority=None,
                 rank=None, uri=None,
                 # Collections
                 common_names=None, synonyms=None, other=None,
                 ):
        # Non-conforming code/rank values only trigger a PhyloXMLWarning.
        _check_str(code, self.re_code.match)
        _check_str(rank, self.ok_rank.__contains__)
        self.id_source = id_source
        self.id = id
        self.code = code
        self.scientific_name = scientific_name
        self.authority = authority
        self.rank = rank
        self.uri = uri
        # Each instance gets its own list when none (or an empty one) is given.
        self.common_names = common_names or []
        self.synonyms = synonyms or []
        self.other = other or []

    def __str__(self):
        """Return the first available identifying attribute.

        The code, scientific name and rank are tried in that order and
        returned as-is; next the id, converted with ``str``. If none of them
        is set, the class name is returned instead.
        """
        for label in (self.code, self.scientific_name, self.rank):
            if label is not None:
                return label
        if self.id is not None:
            return str(self.id)
        return self.__class__.__name__
1290
class Uri(PhyloElement):
    """A uniform resource identifier.

    Usually this is a URL, for instance a link to an image on a website; the
    ``type`` attribute might then be 'image' and ``desc`` might be 'image of
    a California sea hare'.

    :Parameters:
        value : string
            the identifier itself
        desc : string
            description of the linked resource
        type : string
            kind of resource being linked
    """
    def __init__(self, value, desc=None, type=None):
        self.value = value
        self.desc = desc
        self.type = type

    def __str__(self):
        """Return the URI value, or the object's repr if the value is empty."""
        return self.value or repr(self)
1308