1
2
3
4
5
6 """Classes corresponding to phyloXML elements.
7
8 See Also
9 --------
10 Official specification:
11 http://phyloxml.org/
12 Journal article:
13 Han and Zmasek (2009), doi:10.1186/1471-2105-10-356
14 """
15 __docformat__ = "restructuredtext en"
16
17 import re
18 import warnings
19
20 from Bio import Alphabet
21 from Bio.Align import MultipleSeqAlignment
22 from Bio.Seq import Seq
23 from Bio.SeqFeature import SeqFeature, FeatureLocation
24 from Bio.SeqRecord import SeqRecord
25 from Bio import BiopythonWarning
26
27 from Bio.Phylo import BaseTree
28
29
30 try:
31 any
32 except NameError:
34 for element in iterable:
35 if element:
36 return True
37 return False
38
40 """Warning for non-compliance with the phyloXML specification."""
41 pass
42
45 """Check a string using testfunc, and warn if there's no match."""
46 if text is not None and not testfunc(text):
47 warnings.warn("String %s doesn't match the given regexp" % text,
48 PhyloXMLWarning, stacklevel=2)
49
54 """Base class for all PhyloXML objects."""
55
58 """Root node of the PhyloXML document.
59
60 Contains an arbitrary number of Phylogeny elements, possibly followed by
61 elements from other namespaces.
62
63 :Parameters:
64 attributes
65 (XML namespace definitions)
66 phylogenies
67 list of phylogenetic trees
68 other
69 list of arbitrary non-phyloXML elements, if any
70 """
def __init__(self, attributes, phylogenies=None, other=None):
    """Store the document attributes, its trees and any foreign elements.

    :Parameters:
        attributes
            stored unchanged as ``self.attributes``
        phylogenies
            trees to hold; a None/empty value becomes a new empty list
        other
            additional elements to hold; a None/empty value becomes a new
            empty list
    """
    # Swap missing collections for fresh lists before storing them.
    if not phylogenies:
        phylogenies = []
    if not other:
        other = []
    self.attributes = attributes
    self.phylogenies = phylogenies
    self.other = other
75
77 """Get a phylogeny by index or name."""
78 if isinstance(index, int) or isinstance(index, slice):
79 return self.phylogenies[index]
80 if not isinstance(index, basestring):
81 raise KeyError("can't use %s as an index" % type(index))
82 for tree in self.phylogenies:
83 if tree.name == index:
84 return tree
85 else:
86 raise KeyError("no phylogeny found with name " + repr(index))
87
89 """Iterate through the phylogenetic trees in this object."""
90 return iter(self.phylogenies)
91
93 """Number of phylogenetic trees in this object."""
94 return len(self.phylogenies)
95
97 return '%s([%s])' % (self.__class__.__name__,
98 ',\n'.join(map(str, self.phylogenies)))
99
100
101 -class Other(PhyloElement):
102 """Container for non-phyloXML elements in the tree.
103
104 Usually, an Other object will have either a 'value' or a non-empty list
105 of 'children', but not both. This is not enforced here, though.
106
107 :Parameters:
108 tag : string
109 local tag for the XML node
110 namespace : string
111 XML namespace for the node -- should not be the default phyloXML
112 namespace.
113 attributes : dict of strings
114 attributes on the XML node
115 value : string
116 text contained directly within this XML node
117 children : list
118 child nodes, if any (also `Other` instances)
119 """
def __init__(self, tag, namespace=None, attributes=None, value=None,
        children=None):
    """Store the tag, namespace, attributes, text and children of a node.

    :Parameters:
        tag : string
            local tag for the XML node
        namespace : string
            XML namespace for the node
        attributes : dict of strings
            attributes on the XML node; None (or any other falsy value) is
            replaced by a new empty dict
        value : string
            text contained directly within this XML node
        children : list
            child nodes, if any; None (or any other falsy value) is
            replaced by a new empty list
    """
    self.tag = tag
    self.namespace = namespace
    # Always keep a dict here (mirroring ``children`` below) so that code
    # reading the attributes never has to special-case None.
    self.attributes = attributes or {}
    self.value = value
    self.children = children or []
127
129 """Iterate through the children of this object (if any)."""
130 return iter(self.children)
131
132
133 -class Phylogeny(PhyloElement, BaseTree.Tree):
134 """A phylogenetic tree.
135
136 :Parameters:
137 root : Clade
138 the root node/clade of this tree
139 rooted : bool
140 True if this tree is rooted
141 rerootable : bool
142 True if this tree is rerootable
143 branch_length_unit : string
144 unit for branch_length values on clades
145 name : string
146 identifier for this tree, not required to be unique
147 id : Id
148 unique identifier for this tree
149 description : string
150 plain-text description
151 date : Date
152 date for the root node of this tree
153 confidences : list
154 Confidence objects for this tree
155 clade_relations : list
156 CladeRelation objects
157 sequence_relations : list
158 SequenceRelation objects
159 properties : list
160 Property objects
161 other : list
162 non-phyloXML elements (type `Other`)
163 """
def __init__(self, root=None, rooted=True,
        rerootable=None, branch_length_unit=None, type=None,
        name=None, id=None, description=None, date=None,
        confidences=None, clade_relations=None, sequence_relations=None,
        properties=None, other=None):
    """Populate the tree's attributes from the given keyword values.

    ``rooted`` must be a bool; this is checked with an assertion before
    anything is stored. The single-valued arguments are stored unchanged.
    Each list-valued argument (``confidences``, ``clade_relations``,
    ``sequence_relations``, ``properties``, ``other``) is replaced by a new
    empty list when it is None or otherwise falsy.
    """
    assert isinstance(rooted, bool)
    # Single-valued attributes, kept exactly as passed in.
    self.root = root
    self.rooted = rooted
    self.rerootable = rerootable
    self.branch_length_unit = branch_length_unit
    self.type = type
    self.name = name
    self.id = id
    self.description = description
    self.date = date
    # List-valued attributes: never leave None behind, always a list.
    for attr_name, items in (
            ('confidences', confidences),
            ('clade_relations', clade_relations),
            ('sequence_relations', sequence_relations),
            ('properties', properties),
            ('other', other)):
        setattr(self, attr_name, items or [])
187
188 @classmethod
190 """Create a new Phylogeny given a Tree (from Newick/Nexus or BaseTree).
191
192 Keyword arguments are the usual `Phylogeny` constructor parameters.
193 """
194 phy = cls(
195 root=Clade.from_clade(tree.root),
196 rooted=tree.rooted,
197 name=tree.name,
198 id=(tree.id is not None) and Id(str(tree.id)) or None)
199 phy.__dict__.update(kwargs)
200 return phy
201
202 @classmethod
204 """Create a new Phylogeny given a Newick or BaseTree Clade object.
205
206 Keyword arguments are the usual `PhyloXML.Clade` constructor parameters.
207 """
208 return Clade.from_clade(clade).to_phylogeny(**kwargs)
209
211 """Return this tree, a PhyloXML-compatible Phylogeny object.
212
213 Overrides the `BaseTree` method.
214 """
215 return self
216
218 """Create a new Phyloxml object containing just this phylogeny."""
219 return Phyloxml(kwargs, phylogenies=[self])
220
222 """Construct an alignment from the aligned sequences in this tree."""
223 def is_aligned_seq(elem):
224 if isinstance(elem, Sequence) and elem.mol_seq.is_aligned:
225 return True
226 return False
227 seqs = self._filter_search(is_aligned_seq, 'preorder', True)
228 try:
229 first_seq = seqs.next()
230 except StopIteration:
231
232 return MultipleSeqAlignment([])
233 msa = MultipleSeqAlignment([first_seq.to_seqrecord()],
234 first_seq.get_alphabet())
235 msa.extend(seq.to_seqrecord() for seq in seqs)
236 return msa
237
238
240 """Equivalent to self.confidences[0] if there is only 1 value.
241
242 See also: `Clade.confidence`, `Clade.taxonomy`
243 """
244 if len(self.confidences) == 0:
245 return None
246 if len(self.confidences) > 1:
247 raise AttributeError("more than 1 confidence value available; "
248 "use Phylogeny.confidences")
249 return self.confidences[0]
250
252 if value is None:
253
254 self.confidences = []
255 return
256 if isinstance(value, float) or isinstance(value, int):
257 value = Confidence(value)
258 elif not isinstance(value, Confidence):
259 raise ValueError("value must be a number or Confidence instance")
260 if len(self.confidences) == 0:
261 self.confidences.append(value)
262 elif len(self.confidences) == 1:
263 self.confidences[0] = value
264 else:
265 raise ValueError("multiple confidence values already exist; "
266 "use Phylogeny.confidences instead")
267
269 self.confidences = []
270
271 confidence = property(_get_confidence, _set_confidence, _del_confidence)
272
273
274 -class Clade(PhyloElement, BaseTree.Clade):
275 """Describes a branch of the current phylogenetic tree.
276
277 Used recursively, describes the topology of a phylogenetic tree.
278
279 Both ``color`` and ``width`` elements should be interpreted by client code
280 as applying to the whole clade, including all descendents, unless
281 overwritten in sub-clades. This module doesn't automatically assign these
282 attributes to sub-clades to achieve this cascade -- and neither should you.
283
284 :Parameters:
285 branch_length
286 parent branch length of this clade
287 id_source
288 link other elements to a clade (on the xml-level)
289 name : string
290 short label for this clade
291 confidences : list of Confidence objects
292 used to indicate the support for a clade/parent branch.
293 width : float
294 branch width for this clade (including branch from parent)
295 color : BranchColor
296 color used for graphical display of this clade
297 node_id
298 unique identifier for the root node of this clade
299 taxonomies : list
300 Taxonomy objects
301 sequences : list
302 Sequence objects
303 events : Events
304 describe such events as gene-duplications at the root node/parent
305 branch of this clade
306 binary_characters : BinaryCharacters
307 binary characters
308 distributions : list of Distribution objects
309 distribution(s) of this clade
310 date : Date
311 a date for the root node of this clade
312 references : list
313 Reference objects
314 properties : list
315 Property objects
316 clades : list Clade objects
317 Sub-clades
318 other : list of Other objects
319 non-phyloXML objects
320 """
321 - def __init__(self,
322
323 branch_length=None, id_source=None,
324
325 name=None, width=None, color=None, node_id=None, events=None,
326 binary_characters=None, date=None,
327
328 confidences=None, taxonomies=None, sequences=None,
329 distributions=None, references=None, properties=None, clades=None,
330 other=None,
331 ):
349
350 @classmethod
362
364 """Create a new phylogeny containing just this clade."""
365 phy = Phylogeny(root=self, date=self.date)
366 phy.__dict__.update(kwargs)
367 return phy
368
369
370
372 if len(self.confidences) == 0:
373 return None
374 if len(self.confidences) > 1:
375 raise AttributeError("more than 1 confidence value available; "
376 "use Clade.confidences")
377 return self.confidences[0]
378
380 if value is None:
381
382 self.confidences = []
383 return
384 if isinstance(value, float) or isinstance(value, int):
385 value = Confidence(value)
386 elif not isinstance(value, Confidence):
387 raise ValueError("value must be a number or Confidence instance")
388 if len(self.confidences) == 0:
389 self.confidences.append(value)
390 elif len(self.confidences) == 1:
391 self.confidences[0] = value
392 else:
393 raise ValueError("multiple confidence values already exist; "
394 "use Phylogeny.confidences instead")
395
397 self.confidences = []
398
399 confidence = property(_get_confidence, _set_confidence, _del_confidence)
400
402 if len(self.taxonomies) == 0:
403 return None
404 if len(self.taxonomies) > 1:
405 raise AttributeError("more than 1 taxonomy value available; "
406 "use Clade.taxonomies")
407 return self.taxonomies[0]
408
410 if not isinstance(value, Taxonomy):
411 raise ValueError("assigned value must be a Taxonomy instance")
412 if len(self.taxonomies) == 0:
413 self.taxonomies.append(value)
414 elif len(self.taxonomies) == 1:
415 self.taxonomies[0] = value
416 else:
417 raise ValueError("multiple taxonomy values already exist; "
418 "use Phylogeny.taxonomies instead")
419
420 taxonomy = property(_get_taxonomy, _set_taxonomy)
421
422
425
427 if arg is None or isinstance(arg, BranchColor):
428 self._color = arg
429 elif isinstance(arg, basestring):
430 if arg in BranchColor.color_names:
431
432 self._color = BranchColor.from_name(arg)
433 elif arg.startswith('#') and len(arg) == 7:
434
435 self._color = BranchColor.from_hex(arg)
436 else:
437 raise ValueError("invalid color string %s" % arg)
438 elif hasattr(arg, '__iter__') and len(arg) == 3:
439
440 self._color = BranchColor(*arg)
441 else:
442 raise ValueError("invalid color value %s" % arg)
443
444 color = property(_get_color, _set_color, doc="Branch color.")
445
450 """Captures the local part in a sequence identifier.
451
452 Example: In ``UniProtKB:P17304``, the Accession instance attribute ``value``
453 is 'P17304' and the ``source`` attribute is 'UniProtKB'.
454 """
458
460 """Show the class name and an identifying attribute."""
461 return '%s:%s' % (self.source, self.value)
462
465 """The annotation of a molecular sequence.
466
467 It is recommended to annotate by using the optional 'ref' attribute.
468
469 :Parameters:
470 ref : string
471 reference string, e.g. 'GO:0008270',
472 'KEGG:Tetrachloroethene degradation', 'EC:1.1.1.1'
473 source : string
474 plain-text source for this annotation
475 evidence : str
476 describe evidence as free text (e.g. 'experimental')
477 desc : string
478 free text description
479 confidence : Confidence
480 state the type and value of support (type Confidence)
481 properties : list
482 typed and referenced annotations from external resources
483 uri : Uri
484 link
485 """
486 re_ref = re.compile(r'[a-zA-Z0-9_]+:[a-zA-Z0-9_\.\-\s]+')
487
488 - def __init__(self,
489
490 ref=None, source=None, evidence=None, type=None,
491
492 desc=None, confidence=None, uri=None,
493
494 properties=None):
504
507 """The names and/or counts of binary characters present, gained, and lost
508 at the root of a clade.
509 """
def __init__(self,
        type=None, gained_count=None, lost_count=None, present_count=None,
        absent_count=None,
        gained=None, lost=None, present=None, absent=None):
    """Store the character type, the four counts and the four name lists.

    ``type`` and the ``*_count`` arguments are stored unchanged. Each of
    ``gained``, ``lost``, ``present`` and ``absent`` is replaced by a new
    empty list when it is None or otherwise falsy.
    """
    # Scalar values are kept exactly as passed in.
    self.type = type
    self.gained_count = gained_count
    self.lost_count = lost_count
    self.present_count = present_count
    self.absent_count = absent_count
    # Collections always end up as lists, never None.
    for attr_name, items in (
            ('gained', gained),
            ('lost', lost),
            ('present', present),
            ('absent', absent)):
        setattr(self, attr_name, items or [])
525
528 """Indicates the color of a clade when rendered graphically.
529
530 The color should be interpreted by client code (e.g. visualization
531 programs) as applying to the whole clade, unless overwritten by the
532 color(s) of sub-clades.
533
534 Color values must be integers from 0 to 255.
535 """
536
537 color_names = {
538 'red': (255, 0, 0),
539 'r': (255, 0, 0),
540 'yellow': (255, 255, 0),
541 'y': (255, 255, 0),
542 'green': ( 0, 128, 0),
543 'g': ( 0, 128, 0),
544 'cyan': ( 0, 255, 255),
545 'c': ( 0, 255, 255),
546 'blue': ( 0, 0, 255),
547 'b': ( 0, 0, 255),
548 'magenta': (255, 0, 255),
549 'm': (255, 0, 255),
550 'black': ( 0, 0, 0),
551 'k': ( 0, 0, 0),
552 'white': (255, 255, 255),
553 'w': (255, 255, 255),
554
555
556 'maroon': (128, 0, 0),
557 'olive': (128, 128, 0),
558 'lime': ( 0, 255, 0),
559 'aqua': ( 0, 255, 255),
560 'teal': ( 0, 128, 128),
561 'navy': ( 0, 0, 128),
562 'fuchsia': (255, 0, 255),
563 'purple': (128, 0, 128),
564 'silver': (192, 192, 192),
565 'gray': (128, 128, 128),
566
567 'grey': (128, 128, 128),
568 'pink': (255, 192, 203),
569 'salmon': (250, 128, 114),
570 'orange': (255, 165, 0),
571 'gold': (255, 215, 0),
572 'tan': (210, 180, 140),
573 'brown': (165, 42, 42),
574 }
575
584
585 @classmethod
587 """Construct a BranchColor object from a hexadecimal string.
588
589 The string format is the same style used in HTML and CSS, such as
590 '#FF8000' for an RGB value of (255, 128, 0).
591 """
592 assert (isinstance(hexstr, basestring) and
593 hexstr.startswith('#') and
594 len(hexstr) == 7
595 ), "need a 24-bit hexadecimal string, e.g. #000000"
596 def unpack(cc):
597 return int('0x'+cc, base=16)
598 RGB = hexstr[1:3], hexstr[3:5], hexstr[5:]
599 return cls(*map(unpack, RGB))
600
601 @classmethod
603 """Construct a BranchColor object by the color's name."""
604 return cls(*cls.color_names[colorname])
605
607 """Return a 24-bit hexadecimal RGB representation of this color.
608
609 The returned string is suitable for use in HTML/CSS, as a color
610 parameter in matplotlib, and perhaps other situations.
611
612 Example:
613
614 >>> bc = BranchColor(12, 200, 100)
615 >>> bc.to_hex()
616 '#0cc864'
617 """
618 return '#' + hex(
619 self.red * (16**4)
620 + self.green * (16**2)
621 + self.blue)[2:].zfill(6)
622
624 """Return a tuple of RGB values (0 to 255) representing this color.
625
626 Example:
627
628 >>> bc = BranchColor(255, 165, 0)
629 >>> bc.to_rgb()
630 (255, 165, 0)
631 """
632 return (self.red, self.green, self.blue)
633
635 """Preserve the standard RGB order when representing this object."""
636 return (u'%s(red=%d, green=%d, blue=%d)'
637 % (self.__class__.__name__, self.red, self.green, self.blue))
638
640 """Show the color's RGB values."""
641 return "(%d, %d, %d)" % (self.red, self.green, self.blue)
642
645 """Expresses a typed relationship between two clades.
646
647 For example, this could be used to describe multiple parents of a clade.
648
649 @type id_ref_0: str
650 @type id_ref_1: str
651 @type distance: str
652 @type type: str
653
654 @type confidence: Confidence
655 """
656 - def __init__(self, type, id_ref_0, id_ref_1,
657 distance=None, confidence=None):
663
666 """A general purpose confidence element.
667
668 For example, this can be used to express the bootstrap support value of a
669 clade (in which case the `type` attribute is 'bootstrap').
670
671 :Parameters:
672 value : float
673 confidence value
674 type : string
675 label for the type of confidence, e.g. 'bootstrap'
676 """
677 - def __init__(self, value, type='unknown'):
680
682 return float(self.value)
683
685 return int(self.value)
686
687
688 -class Date(PhyloElement):
689 """A date associated with a clade/node.
690
691 Its value can be numerical by using the 'value' element and/or free text
692 with the 'desc' element (e.g. 'Silurian'). If a numerical value is used, it
693 is recommended to employ the 'unit' attribute.
694
695 :Parameters:
696 unit : string
697 type of numerical value (e.g. 'mya' for 'million years ago')
698 value : float
699 the date value
700 desc : string
701 plain-text description of the date
702 minimum : float
703 lower bound on the date value
704 maximum : float
705 upper bound on the date value
706 """
707 - def __init__(self, value=None, unit=None, desc=None,
708 minimum=None, maximum=None):
714
716 """Show the class name and the human-readable date."""
717 if self.unit and self.value is not None:
718 return '%s %s' % (self.value, self.unit)
719 if self.desc is not None:
720 return self.desc
721 return self.__class__.__name__
722
725 """Geographic distribution of the items of a clade (species, sequences).
726
727 Intended for phylogeographic applications.
728
729 :Parameters:
730 desc : string
731 free-text description of the location
732 points : list of `Point` objects
733 coordinates (similar to the 'Point' element in Google's KML format)
734 polygons : list of `Polygon` objects
735 coordinate sets defining geographic regions
736 """
def __init__(self, desc=None, points=None, polygons=None):
    """Store the description plus the point and polygon collections.

    ``desc`` is stored unchanged; ``points`` and ``polygons`` are each
    replaced by a new empty list when None or otherwise falsy.
    """
    if not points:
        points = []
    if not polygons:
        polygons = []
    self.desc = desc
    self.points = points
    self.polygons = polygons
741
742
class DomainArchitecture(PhyloElement):
    """Domain architecture of a protein.

    :Parameters:
        length : int
            total length of the protein sequence
        domains : list ProteinDomain objects
            the domains within this protein; None (or any other falsy value)
            is stored as a new empty list
    """
    def __init__(self, length=None, domains=None):
        """Store the sequence length and the list of domains."""
        self.length = length
        # Normalize a missing value to an empty list so that code iterating
        # over ``domains`` does not fail with a TypeError on None.
        self.domains = domains or []
755
756
757 -class Events(PhyloElement):
758 """Events at the root node of a clade (e.g. one gene duplication).
759
760 All attributes are set to None by default, but this object can also be
761 treated as a dictionary, in which case None values are treated as missing
762 keys and deleting a key resets that attribute's value back to None.
763 """
764 ok_type = set(('transfer', 'fusion', 'speciation_or_duplication', 'other',
765 'mixed', 'unassigned'))
766
767 - def __init__(self, type=None, duplications=None, speciations=None,
768 losses=None, confidence=None):
775
777 return [(k, v) for k, v in self.__dict__.iteritems() if v is not None]
778
780 return [k for k, v in self.__dict__.iteritems() if v is not None]
781
783 return [v for v in self.__dict__.itervalues() if v is not None]
784
787
789 if not hasattr(self, key):
790 raise KeyError(key)
791 val = getattr(self, key)
792 if val is None:
793 raise KeyError("%s has not been set in this object" % repr(key))
794 return val
795
797 setattr(self, key, val)
798
800 setattr(self, key, None)
801
803 return iter(self.keys())
804
806 return (hasattr(self, key) and getattr(self, key) is not None)
807
808
809 -class Id(PhyloElement):
810 """A general-purpose identifier element.
811
812 Allows to indicate the provider (or authority) of an identifier, e.g. NCBI,
813 along with the value itself.
814 """
def __init__(self, value, provider=None):
    """Store the identifier value and, optionally, its provider.

    Both arguments are kept exactly as given; ``provider`` stays None when
    it is not supplied.
    """
    self.value, self.provider = value, provider
818
820 if self.provider is not None:
821 return '%s:%s' % (self.provider, self.value)
822 return self.value
823
824
825 -class MolSeq(PhyloElement):
826 """Store a molecular sequence.
827
828 :Parameters:
829 value : string
830 the sequence itself
831 is_aligned : bool
832 True if this sequence is aligned with the others (usually meaning
833 all aligned seqs are the same length and gaps may be present)
834 """
835 re_value = re.compile(r'[a-zA-Z\.\-\?\*_]+')
836
837 - def __init__(self, value, is_aligned=None):
841
844
845
class Point(PhyloElement):
    """A geographic point: latitude and longitude plus an optional altitude.

    Used by element 'Distribution'.

    :Parameters:
        geodetic_datum : string, required
            the geodetic datum (also called 'map datum'), for example
            'WGS84' as used by Google's KML
        lat : numeric
            latitude
        long : numeric
            longitude
        alt : numeric
            altitude
        alt_unit : string
            unit for the altitude (e.g. 'meter')
    """
    def __init__(self, geodetic_datum, lat, long, alt=None, alt_unit=None):
        """Store the datum, coordinates and altitude exactly as given."""
        self.geodetic_datum = geodetic_datum
        # Horizontal position.
        self.lat, self.long = lat, long
        # Vertical position and its unit (both None unless supplied).
        self.alt, self.alt_unit = alt, alt_unit
870
873 """A polygon defined by a list of 'Points' (used by element 'Distribution').
874
875 :param points: list of 3 or more points representing vertices.
876 """
878 self.points = points or []
879
881 return '%s([%s])' % (self.__class__.__name__,
882 ',\n'.join(map(str, self.points)))
883
886 """A typed and referenced property from an external resource.
887
888 Can be attached to `Phylogeny`, `Clade`, and `Annotation` objects.
889
890 :Parameters:
891 value : string
892 the value of the property
893 ref : string
894 reference to an external resource, e.g. "NOAA:depth"
895 applies_to : string
896 indicates the item to which a property applies to (e.g. 'node' for
897 the parent node of a clade, 'parent_branch' for the parent branch of
898 a clade, or just 'clade').
899 datatype : string
900 the type of a property; limited to xsd-datatypes
901 (e.g. 'xsd:string', 'xsd:boolean', 'xsd:integer', 'xsd:decimal',
902 'xsd:float', 'xsd:double', 'xsd:date', 'xsd:anyURI').
903 unit : string (optional)
904 the unit of the property, e.g. "METRIC:m"
905 id_ref : Id (optional)
906 allows a property to be attached specifically to one element (on the
907 xml-level)
908 """
909 re_ref = re.compile(r'[a-zA-Z0-9_]+:[a-zA-Z0-9_\.\-\s]+')
910 ok_applies_to = set(('phylogeny', 'clade', 'node', 'annotation',
911 'parent_branch', 'other'))
912 ok_datatype = set(('xsd:string', 'xsd:boolean', 'xsd:decimal', 'xsd:float',
913 'xsd:double', 'xsd:duration', 'xsd:dateTime', 'xsd:time', 'xsd:date',
914 'xsd:gYearMonth', 'xsd:gYear', 'xsd:gMonthDay', 'xsd:gDay',
915 'xsd:gMonth', 'xsd:hexBinary', 'xsd:base64Binary', 'xsd:anyURI',
916 'xsd:normalizedString', 'xsd:token', 'xsd:integer',
917 'xsd:nonPositiveInteger', 'xsd:negativeInteger', 'xsd:long', 'xsd:int',
918 'xsd:short', 'xsd:byte', 'xsd:nonNegativeInteger', 'xsd:unsignedLong',
919 'xsd:unsignedInt', 'xsd:unsignedShort', 'xsd:unsignedByte',
920 'xsd:positiveInteger'))
921
922 - def __init__(self, value, ref, applies_to, datatype,
923 unit=None, id_ref=None):
934
935
class ProteinDomain(PhyloElement):
    """Represents an individual domain in a domain architecture.

    The locations use 0-based indexing, as most Python objects including
    SeqFeature do, rather than the usual biological convention starting at 1.
    This means the start and end attributes can be used directly as slice
    indexes on Seq objects.

    :Parameters:
        value
            the domain's value; it is exchanged with the ``id`` of a
            SeqFeature by `from_seqfeature` and `to_seqfeature`
        start : non-negative integer
            start of the domain on the sequence, using 0-based indexing
        end : non-negative integer
            end of the domain on the sequence
        confidence : float
            can be used to store e.g. E-values
        id : string
            unique identifier/name
    """

    def __init__(self, value, start, end, confidence=None, id=None):
        """Store the domain's value, location, confidence and id as given."""
        self.value = value
        self.start = start
        self.end = end
        self.confidence = confidence
        self.id = id

    @classmethod
    def from_seqfeature(cls, feat):
        """Create a new domain from a SeqFeature.

        The feature's ``id`` becomes ``value``, ``location.nofuzzy_start``
        and ``location.nofuzzy_end`` become ``start`` and ``end``, and the
        'confidence' qualifier (None if absent) becomes ``confidence``.
        """
        # Build through ``cls`` so that calling this on a subclass yields an
        # instance of that subclass rather than of the base class.
        return cls(feat.id,
                feat.location.nofuzzy_start,
                feat.location.nofuzzy_end,
                confidence=feat.qualifiers.get('confidence'))

    def to_seqfeature(self):
        """Create a SeqFeature spanning ``start``..``end`` with id ``value``.

        The domain's confidence is copied into the feature's 'confidence'
        qualifier.
        """
        feat = SeqFeature(location=FeatureLocation(self.start, self.end),
                id=self.value)
        # ``__init__`` always sets ``confidence``, so the qualifier is
        # written even when the value is None; the check only matters if
        # the attribute was deleted afterwards.
        if hasattr(self, 'confidence'):
            feat.qualifiers['confidence'] = self.confidence
        return feat
975
978 """Literature reference for a clade.
979
980 NB: Whenever possible, use the ``doi`` attribute instead of the free-text
981 ``desc`` element.
982 """
983 re_doi = re.compile(r'[a-zA-Z0-9_\.]+/[a-zA-Z0-9_\.]+')
984
985 - def __init__(self, doi=None, desc=None):
989
992 """A molecular sequence (Protein, DNA, RNA) associated with a node.
993
994 One intended use for ``id_ref`` is to link a sequence to a taxonomy (via the
995 taxonomy's ``id_source``) in case of multiple sequences and taxonomies per
996 node.
997
998 :Parameters:
999 type : {'dna', 'rna', 'protein'}
1000 type of molecule this sequence represents
1001 id_ref : string
1002 reference to another resource
1003 id_source : string
1004 source for the reference
1005 symbol : string
1006 short symbol of the sequence, e.g. 'ACTM' (max. 10 chars)
1007 accession : Accession
1008 accession code for this sequence.
1009 name : string
1010 full name of the sequence, e.g. 'muscle Actin'
1011 location
1012 location of a sequence on a genome/chromosome.
1013 mol_seq : MolSeq
1014 the molecular sequence itself
1015 uri : Uri
1016 link
1017 annotations : list of Annotation objects
1018 annotations on this sequence
1019 domain_architecture : DomainArchitecture
1020 protein domains on this sequence
1021 other : list of Other objects
1022 non-phyloXML elements
1023 """
1024 alphabets = {'dna': Alphabet.generic_dna,
1025 'rna': Alphabet.generic_rna,
1026 'protein': Alphabet.generic_protein}
1027 re_symbol = re.compile(r'\S{1,10}')
1028
1029 - def __init__(self,
1030
1031 type=None, id_ref=None, id_source=None,
1032
1033 symbol=None, accession=None, name=None, location=None,
1034 mol_seq=None, uri=None, domain_architecture=None,
1035
1036 annotations=None, other=None,
1037 ):
1052
1053 @classmethod
1055 """Create a new PhyloXML Sequence from a SeqRecord object."""
1056 if is_aligned == None:
1057 is_aligned = isinstance(record.seq.alphabet, Alphabet.Gapped)
1058 params = {
1059 'accession': Accession(record.id, ''),
1060 'symbol': record.name,
1061 'name': record.description,
1062 'mol_seq': MolSeq(str(record.seq), is_aligned),
1063 }
1064 if isinstance(record.seq.alphabet, Alphabet.DNAAlphabet):
1065 params['type'] = 'dna'
1066 elif isinstance(record.seq.alphabet, Alphabet.RNAAlphabet):
1067 params['type'] = 'rna'
1068 elif isinstance(record.seq.alphabet, Alphabet.ProteinAlphabet):
1069 params['type'] = 'protein'
1070
1071
1072 for key in ('id_ref', 'id_source', 'location'):
1073 if key in record.annotations:
1074 params[key] = record.annotations[key]
1075 if isinstance(record.annotations.get('uri'), dict):
1076 params['uri'] = Uri(**record.annotations['uri'])
1077
1078 if record.annotations.get('annotations'):
1079 params['annotations'] = []
1080 for annot in record.annotations['annotations']:
1081 ann_args = {}
1082 for key in ('ref', 'source', 'evidence', 'type', 'desc'):
1083 if key in annot:
1084 ann_args[key] = annot[key]
1085 if isinstance(annot.get('confidence'), list):
1086 ann_args['confidence'] = Confidence(
1087 *annot['confidence'])
1088 if isinstance(annot.get('properties'), list):
1089 ann_args['properties'] = [Property(**prop)
1090 for prop in annot['properties']
1091 if isinstance(prop, dict)]
1092 params['annotations'].append(Annotation(**ann_args))
1093
1094
1095 if record.features:
1096 params['domain_architecture'] = DomainArchitecture(
1097 length=len(record.seq),
1098 domains=[ProteinDomain.from_seqfeature(feat)
1099 for feat in record.features])
1100
1101 return Sequence(**params)
1102
1104 """Create a SeqRecord object from this Sequence instance.
1105
1106 The seqrecord.annotations dictionary is packed like so::
1107
1108 { # Sequence attributes with no SeqRecord equivalent:
1109 'id_ref': self.id_ref,
1110 'id_source': self.id_source,
1111 'location': self.location,
1112 'uri': { 'value': self.uri.value,
1113 'desc': self.uri.desc,
1114 'type': self.uri.type },
1115 # Sequence.annotations attribute (list of Annotations)
1116 'annotations': [{ 'ref': ann.ref,
1117 'source': ann.source,
1118 'evidence': ann.evidence,
1119 'type': ann.type,
1120 'confidence': [ ann.confidence.value,
1121 ann.confidence.type ],
1122 'properties': [{ 'value': prop.value,
1123 'ref': prop.ref,
1124 'applies_to': prop.applies_to,
1125 'datatype': prop.datatype,
1126 'unit': prop.unit,
1127 'id_ref': prop.id_ref }
1128 for prop in ann.properties],
1129 } for ann in self.annotations],
1130 }
1131 """
1132 def clean_dict(dct):
1133 """Remove None-valued items from a dictionary."""
1134 return dict((key, val) for key, val in dct.iteritems()
1135 if val is not None)
1136
1137 seqrec = SeqRecord(Seq(self.mol_seq.value, self.get_alphabet()),
1138 **clean_dict({
1139 'id': str(self.accession),
1140 'name': self.symbol,
1141 'description': self.name,
1142
1143 }))
1144 if self.domain_architecture:
1145 seqrec.features = [dom.to_seqfeature()
1146 for dom in self.domain_architecture.domains]
1147
1148 seqrec.annotations = clean_dict({
1149 'id_ref': self.id_ref,
1150 'id_source': self.id_source,
1151 'location': self.location,
1152 'uri': self.uri and clean_dict({
1153 'value': self.uri.value,
1154 'desc': self.uri.desc,
1155 'type': self.uri.type,
1156 }),
1157 'annotations': self.annotations and [
1158 clean_dict({
1159 'ref': ann.ref,
1160 'source': ann.source,
1161 'evidence': ann.evidence,
1162 'type': ann.type,
1163 'confidence': ann.confidence and [
1164 ann.confidence.value,
1165 ann.confidence.type],
1166 'properties': [clean_dict({
1167 'value': prop.value,
1168 'ref': prop.ref,
1169 'applies_to': prop.applies_to,
1170 'datatype': prop.datatype,
1171 'unit': prop.unit,
1172 'id_ref': prop.id_ref })
1173 for prop in ann.properties],
1174 }) for ann in self.annotations],
1175 })
1176 return seqrec
1177
1183
1186 """Express a typed relationship between two sequences.
1187
1188 For example, this could be used to describe an orthology (in which case
1189 attribute 'type' is 'orthology').
1190
1191 :Parameters:
1192 id_ref_0 : Id
1193 first sequence reference identifier
1194 id_ref_1 : Id
1195 second sequence reference identifier
1196 distance : float
1197 distance between the two sequences
1198 type : restricted string
1199 describe the type of relationship
1200 confidence : Confidence
1201 confidence value for this relation
1202 """
1203 ok_type = set(('orthology', 'one_to_one_orthology', 'super_orthology',
1204 'paralogy', 'ultra_paralogy', 'xenology', 'unknown', 'other'))
1205
1206 - def __init__(self, type, id_ref_0, id_ref_1,
1207 distance=None, confidence=None):
1214
1217 """Describe taxonomic information for a clade.
1218
1219 :Parameters:
1220 id_source : Id
1221 link other elements to a taxonomy (on the XML level)
1222 id : Id
1223 unique identifier of a taxon, e.g. Id('6500',
1224 provider='ncbi_taxonomy') for the California sea hare
1225 code : restricted string
1226 store UniProt/Swiss-Prot style organism codes, e.g. 'APLCA' for the
1227 California sea hare 'Aplysia californica'
1228 scientific_name : string
1229 the standard scientific name for this organism, e.g. 'Aplysia
1230 californica' for the California sea hare
1231 authority : string
1232 keep the authority, such as 'J. G. Cooper, 1863', associated with
1233 the 'scientific_name'
1234 common_names : list of strings
1235 common names for this organism
1236 synonyms : list of strings
1237 synonyms for this taxon?
1238 rank : restricted string
1239 taxonomic rank
1240 uri : Uri
1241 link
1242 other : list of Other objects
1243 non-phyloXML elements
1244 """
1245 re_code = re.compile(r'[a-zA-Z0-9_]{2,10}')
1246 ok_rank = set(('domain', 'kingdom', 'subkingdom', 'branch', 'infrakingdom',
1247 'superphylum', 'phylum', 'subphylum', 'infraphylum', 'microphylum',
1248 'superdivision', 'division', 'subdivision', 'infradivision',
1249 'superclass', 'class', 'subclass', 'infraclass', 'superlegion',
1250 'legion', 'sublegion', 'infralegion', 'supercohort', 'cohort',
1251 'subcohort', 'infracohort', 'superorder', 'order', 'suborder',
1252 'superfamily', 'family', 'subfamily', 'supertribe', 'tribe', 'subtribe',
1253 'infratribe', 'genus', 'subgenus', 'superspecies', 'species',
1254 'subspecies', 'variety', 'subvariety', 'form', 'subform', 'cultivar',
1255 'unknown', 'other'))
1256
1257 - def __init__(self,
1258
1259 id_source=None,
1260
1261 id=None, code=None, scientific_name=None, authority=None,
1262 rank=None, uri=None,
1263
1264 common_names=None, synonyms=None, other=None,
1265 ):
1278
1280 """Show the class name and an identifying attribute."""
1281 if self.code is not None:
1282 return self.code
1283 if self.scientific_name is not None:
1284 return self.scientific_name
1285 if self.rank is not None:
1286 return self.rank
1287 if self.id is not None:
1288 return str(self.id)
1289 return self.__class__.__name__
1290
1291
1292 -class Uri(PhyloElement):
1293 """A uniform resource identifier.
1294
1295 In general, this is expected to be an URL (for example, to link to an image
1296 on a website, in which case the ``type`` attribute might be 'image' and
1297 ``desc`` might be 'image of a California sea hare').
1298 """
1299 - def __init__(self, value, desc=None, type=None):
1303
1305 if self.value:
1306 return self.value
1307 return repr(self)
1308