1
2
3
4
5 """
6 AlignIO support for the "phylip" format used in Joe Felsenstein's PHYLIP tools.
7
8 You are expected to use this module via the Bio.AlignIO functions (or the
9 Bio.SeqIO functions if you want to work directly with the gapped sequences).
10
11 Support for "relaxed phylip" format is also provided. Relaxed phylip differs
12 from standard phylip format in the following ways:
13
14 * No whitespace is allowed in the sequence ID.
15 * No truncation is performed. Instead, sequence IDs are padded to the longest
16 ID length, rather than 10 characters. A space separates the sequence
17 identifier from the sequence.
18
19 Relaxed phylip is supported by RAxML and PHYML.
20
21 Note
22 ====
23 In TREE_PUZZLE (Schmidt et al. 2003) and PHYML (Guindon and Gascuel 2003)
24 a dot/period (".") in a sequence is interpreted as meaning the same
25 character as in the first sequence. The PHYLIP documentation from 3.3 to 3.69
26 http://evolution.genetics.washington.edu/phylip/doc/sequence.html says:
27
28 "a period was also previously allowed but it is no longer allowed,
29 because it sometimes is used in different senses in other programs"
30
31 Biopython 1.58 or later treats dots/periods in the sequence as invalid, both
32 for reading and writing. Older versions did nothing special with a dot/period.
33 """
34 import string
35
36 from Bio.Seq import Seq
37 from Bio.SeqRecord import SeqRecord
38 from Bio.Align import MultipleSeqAlignment
39 from Interfaces import AlignmentIterator, SequentialAlignmentWriter
40
41 try:
42 any
43 except NameError:
44
46 for element in iterable:
47 if element:
48 return True
49 return False
50
51 _PHYLIP_ID_WIDTH = 10
52
53
55 """Phylip alignment writer."""
56
58 """Use this to write (another) single alignment to an open file.
59
60 This code will write interlaced alignments (when the sequences are
61 longer than 50 characters).
62
63 Note that record identifiers are strictly truncated to id_width,
64 defaulting to the value required to comply with the PHYLIP standard.
65
66 For more information on the file format, please see:
67 http://evolution.genetics.washington.edu/phylip/doc/sequence.html
68 http://evolution.genetics.washington.edu/phylip/doc/main.html#inputfiles
69 """
70 handle = self.handle
71
72 if len(alignment)==0:
73 raise ValueError("Must have at least one sequence")
74 length_of_seqs = alignment.get_alignment_length()
75 for record in alignment:
76 if length_of_seqs != len(record.seq):
77 raise ValueError("Sequences must all be the same length")
78 if length_of_seqs <= 0:
79 raise ValueError("Non-empty sequences are required")
80
81
82
83 names = []
84 for record in alignment:
85 """
86 Quoting the PHYLIP version 3.6 documentation:
87
88 The name should be ten characters in length, filled out to
89 the full ten characters by blanks if shorter. Any printable
90 ASCII/ISO character is allowed in the name, except for
91 parentheses ("(" and ")"), square brackets ("[" and "]"),
92 colon (":"), semicolon (";") and comma (","). If you forget
93 to extend the names to ten characters in length by blanks,
94 the program [i.e. PHYLIP] will get out of synchronization
95 with the contents of the data file, and an error message will
96 result.
97
98 Note that Tab characters count as only one character in the
99 species names. Their inclusion can cause trouble.
100 """
101 name = record.id.strip()
102
103
104 for char in "[](),":
105 name = name.replace(char,"")
106 for char in ":;":
107 name = name.replace(char,"|")
108 name = name[:id_width]
109 if name in names:
110 raise ValueError("Repeated name %r (originally %r), "
111 "possibly due to truncation" \
112 % (name, record.id))
113 names.append(name)
114
115
116
117
118
119
120 handle.write(" %i %s\n" % (len(alignment), length_of_seqs))
121 block=0
122 while True:
123 for name, record in zip(names, alignment):
124 if block==0:
125
126
127 handle.write(name[:id_width].ljust(id_width))
128 else:
129
130 handle.write(" " * id_width)
131
132 sequence = str(record.seq)
133 if "." in sequence:
134 raise ValueError("PHYLIP format no longer allows dots in "
135 "sequence")
136 for chunk in range(0,5):
137 i = block*50 + chunk*10
138 seq_segment = sequence[i:i+10]
139
140
141
142 handle.write(" %s" % seq_segment)
143 if i+10 > length_of_seqs : break
144 handle.write("\n")
145 block=block+1
146 if block*50 > length_of_seqs : break
147 handle.write("\n")
148
150 """Reads a Phylip alignment file returning a MultipleSeqAlignment iterator.
151
152 Record identifiers are limited to at most 10 characters.
153
154 It only copes with interlaced phylip files! Sequential files won't work
155 where the sequences are split over multiple lines.
156
157 For more information on the file format, please see:
158 http://evolution.genetics.washington.edu/phylip/doc/sequence.html
159 http://evolution.genetics.washington.edu/phylip/doc/main.html#inputfiles
160 """
161
162
163 id_width = _PHYLIP_ID_WIDTH
164
166 line = line.strip()
167 parts = filter(None, line.split())
168 if len(parts)!=2:
169 return False
170 try:
171 number_of_seqs = int(parts[0])
172 length_of_seqs = int(parts[1])
173 return True
174 except ValueError:
175 return False
176
178 """
179 Extracts the sequence ID from a Phylip line, returning a tuple
180 containing:
181
182 (sequence_id, sequence_residues)
183
184 The first 10 characters in the line are are the sequence id, the
185 remainder are sequence data.
186 """
187 seq_id = line[:self.id_width].strip()
188 seq = line[self.id_width:].strip().replace(' ', '')
189 return seq_id, seq
190
192 handle = self.handle
193
194 try:
195
196
197 line = self._header
198 del self._header
199 except AttributeError:
200 line = handle.readline()
201
202 if not line:
203 raise StopIteration
204 line = line.strip()
205 parts = filter(None, line.split())
206 if len(parts)!=2:
207 raise ValueError("First line should have two integers")
208 try:
209 number_of_seqs = int(parts[0])
210 length_of_seqs = int(parts[1])
211 except ValueError:
212 raise ValueError("First line should have two integers")
213
214 assert self._is_header(line)
215
216 if self.records_per_alignment is not None \
217 and self.records_per_alignment != number_of_seqs:
218 raise ValueError("Found %i records in this alignment, told to expect %i" \
219 % (number_of_seqs, self.records_per_alignment))
220
221 ids = []
222 seqs = []
223
224
225
226 for i in xrange(number_of_seqs):
227 line = handle.readline().rstrip()
228 sequence_id, s = self._split_id(line)
229 ids.append(sequence_id)
230 if "." in s:
231 raise ValueError("PHYLIP format no longer allows dots in sequence")
232 seqs.append([s])
233
234
235 line=""
236 while True:
237
238 while ""==line.strip():
239 line = handle.readline()
240 if not line : break
241 if not line : break
242
243 if self._is_header(line):
244
245 self._header = line
246 break
247
248
249 for i in xrange(number_of_seqs):
250 s = line.strip().replace(" ","")
251 if "." in s:
252 raise ValueError("PHYLIP format no longer allows dots in sequence")
253 seqs[i].append(s)
254 line = handle.readline()
255 if (not line) and i+1 < number_of_seqs:
256 raise ValueError("End of file mid-block")
257 if not line : break
258
259 records = (SeqRecord(Seq("".join(s), self.alphabet), \
260 id=i, name=i, description=i) \
261 for (i,s) in zip(ids, seqs))
262 return MultipleSeqAlignment(records, self.alphabet)
263
264
266 """
267 Relaxed Phylip format writer
268 """
269
271 """
272 Write a relaxed phylip alignment
273 """
274
275 for name in (s.id.strip() for s in alignment):
276 if any(c in name for c in string.whitespace):
277 raise ValueError("Whitespace not allowed in identifier: %s"
278 % name)
279
280
281
282
283
284 if len(alignment) == 0:
285 id_width = 1
286 else:
287 id_width = max((len(s.id.strip()) for s in alignment)) + 1
288 super(RelaxedPhylipWriter, self).write_alignment(alignment, id_width)
289
290
292 """
293 Relaxed Phylip format Iterator
294 """
295
297 """Returns the ID, sequence data from a line
298 Extracts the sequence ID from a Phylip line, returning a tuple
299 containing:
300
301 (sequence_id, sequence_residues)
302
303 For relaxed format - split at the first whitespace character
304 """
305 seq_id, sequence = line.split(None, 1)
306 sequence = sequence.strip().replace(" ", "")
307 return seq_id, sequence
308
309
310 if __name__=="__main__":
311 print "Running short mini-test"
312
313 phylip_text=""" 8 286
314 V_Harveyi_ --MKNWIKVA VAAIA--LSA A--------- ---------T VQAATEVKVG
315 B_subtilis MKMKKWTVLV VAALLAVLSA CG-------- ----NGNSSS KEDDNVLHVG
316 B_subtilis MKKALLALFM VVSIAALAAC GAGNDNQSKD NAKDGDLWAS IKKKGVLTVG
317 YA80_HAEIN MKKLLFTTAL LTGAIAFSTF ---------- -SHAGEIADR VEKTKTLLVG
318 FLIY_ECOLI MKLAHLGRQA LMGVMAVALV AG---MSVKS FADEG-LLNK VKERGTLLVG
319 E_coli_Gln --MKSVLKVS LAALTLAFAV S--------- ---------S HAADKKLVVA
320 Deinococcu -MKKSLLSLK LSGLLVPSVL ALS------- -LSACSSPSS TLNQGTLKIA
321 HISJ_E_COL MKKLVLSLSL VLAFSSATAA F--------- ---------- AAIPQNIRIG
322
323 MSGRYFPFTF VKQ--DKLQG FEVDMWDEIG KRNDYKIEYV TANFSGLFGL
324 ATGQSYPFAY KEN--GKLTG FDVEVMEAVA KKIDMKLDWK LLEFSGLMGE
325 TEGTYEPFTY HDKDTDKLTG YDVEVITEVA KRLGLKVDFK ETQWGSMFAG
326 TEGTYAPFTF HDK-SGKLTG FDVEVIRKVA EKLGLKVEFK ETQWDAMYAG
327 LEGTYPPFSF QGD-DGKLTG FEVEFAQQLA KHLGVEASLK PTKWDGMLAS
328 TDTAFVPFEF KQG--DKYVG FDVDLWAAIA KELKLDYELK PMDFSGIIPA
329 MEGTYPPFTS KNE-QGELVG FDVDIAKAVA QKLNLKPEFV LTEWSGILAG
330 TDPTYAPFES KNS-QGELVG FDIDLAKELC KRINTQCTFV ENPLDALIPS
331
332 LETGRIDTIS NQITMTDARK AKYLFADPYV VDG-AQITVR KGNDSIQGVE
333 LQTGKLDTIS NQVAVTDERK ETYNFTKPYA YAG-TQIVVK KDNTDIKSVD
334 LNSKRFDVVA NQVG-KTDRE DKYDFSDKYT TSR-AVVVTK KDNNDIKSEA
335 LNAKRFDVIA NQTNPSPERL KKYSFTTPYN YSG-GVIVTK SSDNSIKSFE
336 LDSKRIDVVI NQVTISDERK KKYDFSTPYT ISGIQALVKK GNEGTIKTAD
337 LQTKNVDLAL AGITITDERK KAIDFSDGYY KSG-LLVMVK ANNNDVKSVK
338 LQANKYDVIV NQVGITPERQ NSIGFSQPYA YSRPEIIVAK NNTFNPQSLA
339 LKAKKIDAIM SSLSITEKRQ QEIAFTDKLY AADSRLVVAK NSDIQP-TVE
340
341 DLAGKTVAVN LGSNFEQLLR DYDKDGKINI KTYDT--GIE HDVALGRADA
342 DLKGKTVAAV LGSNHAKNLE SKDPDKKINI KTYETQEGTL KDVAYGRVDA
343 DVKGKTSAQS LTSNYNKLAT N----AGAKV EGVEGMAQAL QMIQQARVDM
344 DLKGRKSAQS ATSNWGKDAK A----AGAQI LVVDGLAQSL ELIKQGRAEA
345 DLKGKKVGVG LGTNYEEWLR QNV--QGVDV RTYDDDPTKY QDLRVGRIDA
346 DLDGKVVAVK SGTGSVDYAK AN--IKTKDL RQFPNIDNAY MELGTNRADA
347 DLKGKRVGST LGSNYEKQLI DTG---DIKI VTYPGAPEIL ADLVAGRIDA
348 SLKGKRVGVL QGTTQETFGN EHWAPKGIEI VSYQGQDNIY SDLTAGRIDA
349
350 FIMDRLSALE -LIKKT-GLP LQLAGEPFET I-----QNAW PFVDNEKGRK
351 YVNSRTVLIA -QIKKT-GLP LKLAGDPIVY E-----QVAF PFAKDDAHDK
352 TYNDKLAVLN -YLKTSGNKN VKIAFETGEP Q-----STYF TFRKGS--GE
353 TINDKLAVLD -YFKQHPNSG LKIAYDRGDK T-----PTAF AFLQGE--DA
354 ILVDRLAALD -LVKKT-NDT LAVTGEAFSR Q-----ESGV ALRKGN--ED
355 VLHDTPNILY -FIKTAGNGQ FKAVGDSLEA Q-----QYGI AFPKGS--DE
356 AYNDRLVVNY -IINDQ-KLP VRGAGQIGDA A-----PVGI ALKKGN--SA
357 AFQDEVAASE GFLKQPVGKD YKFGGPSVKD EKLFGVGTGM GLRKED--NE
358
359 LQAEVNKALA EMRADGTVEK ISVKWFGADI TK----
360 LRKKVNKALD ELRKDGTLKK LSEKYFNEDI TVEQKH
361 VVDQVNKALK EMKEDGTLSK ISKKWFGEDV SK----
362 LITKFNQVLE ALRQDGTLKQ ISIEWFGYDI TQ----
363 LLKAVNDAIA EMQKDGTLQA LSEKWFGADV TK----
364 LRDKVNGALK TLRENGTYNE IYKKWFGTEP K-----
365 LKDQIDKALT EMRSDGTFEK ISQKWFGQDV GQP---
366 LREALNKAFA EMRADGTYEK LAKKYFDFDV YGG---
367 """
368
369 from cStringIO import StringIO
370 handle = StringIO(phylip_text)
371 count=0
372 for alignment in PhylipIterator(handle):
373 for record in alignment:
374 count=count+1
375 print record.id
376
377 assert count == 8
378
379 expected="""mkklvlslsl vlafssataa faaipqniri gtdptyapfe sknsqgelvg
380 fdidlakelc krintqctfv enpldalips lkakkidaim sslsitekrq qeiaftdkly
381 aadsrlvvak nsdiqptves lkgkrvgvlq gttqetfgne hwapkgieiv syqgqdniys
382 dltagridaafqdevaaseg flkqpvgkdy kfggpsvkde klfgvgtgmg lrkednelre
383 alnkafaemradgtyeklak kyfdfdvygg""".replace(" ","").replace("\n","").upper()
384 assert record.seq.tostring().replace("-","") == expected
385
386
387
388 phylip_text2="""5 60
389 Tax1 CCATCTCACGGTCGGTACGATACACCTGCTTTTGGCAG
390 Tax2 CCATCTCACGGTCAGTAAGATACACCTGCTTTTGGCGG
391 Tax3 CCATCTCCCGCTCAGTAAGATACCCCTGCTGTTGGCGG
392 Tax4 TCATCTCATGGTCAATAAGATACTCCTGCTTTTGGCGG
393 Tax5 CCATCTCACGGTCGGTAAGATACACCTGCTTTTGGCGG
394
395 GAAATGGTCAATATTACAAGGT
396 GAAATGGTCAACATTAAAAGAT
397 GAAATCGTCAATATTAAAAGGT
398 GAAATGGTCAATCTTAAAAGGT
399 GAAATGGTCAATATTAAAAGGT"""
400
401 phylip_text3="""5 60
402 Tax1 CCATCTCACGGTCGGTACGATACACCTGCTTTTGGCAGGAAATGGTCAATATTACAAGGT
403 Tax2 CCATCTCACGGTCAGTAAGATACACCTGCTTTTGGCGGGAAATGGTCAACATTAAAAGAT
404 Tax3 CCATCTCCCGCTCAGTAAGATACCCCTGCTGTTGGCGGGAAATCGTCAATATTAAAAGGT
405 Tax4 TCATCTCATGGTCAATAAGATACTCCTGCTTTTGGCGGGAAATGGTCAATCTTAAAAGGT
406 Tax5 CCATCTCACGGTCGGTAAGATACACCTGCTTTTGGCGGGAAATGGTCAATATTAAAAGGT"""
407
408 handle = StringIO(phylip_text2)
409 list2 = list(PhylipIterator(handle))
410 handle.close()
411 assert len(list2)==1
412 assert len(list2[0])==5
413
414 handle = StringIO(phylip_text3)
415 list3 = list(PhylipIterator(handle))
416 handle.close()
417 assert len(list3)==1
418 assert len(list3[0])==5
419
420 for i in range(0,5):
421 list2[0][i].id == list3[0][i].id
422 list2[0][i].seq.tostring() == list3[0][i].seq.tostring()
423
424
425
426
427 phylip_text4=""" 5 42
428 Turkey AAGCTNGGGC ATTTCAGGGT
429 Salmo gairAAGCCTTGGC AGTGCAGGGT
430 H. SapiensACCGGTTGGC CGTTCAGGGT
431 Chimp AAACCCTTGC CGTTACGCTT
432 Gorilla AAACCCTTGC CGGTACGCTT
433
434 GAGCCCGGGC AATACAGGGT AT
435 GAGCCGTGGC CGGGCACGGT AT
436 ACAGGTTGGC CGTTCAGGGT AA
437 AAACCGAGGC CGGGACACTC AT
438 AAACCATTGC CGGTACGCTT AA"""
439
440
441
442 phylip_text5=""" 5 42
443 Turkey AAGCTNGGGC ATTTCAGGGT
444 GAGCCCGGGC AATACAGGGT AT
445 Salmo gairAAGCCTTGGC AGTGCAGGGT
446 GAGCCGTGGC CGGGCACGGT AT
447 H. SapiensACCGGTTGGC CGTTCAGGGT
448 ACAGGTTGGC CGTTCAGGGT AA
449 Chimp AAACCCTTGC CGTTACGCTT
450 AAACCGAGGC CGGGACACTC AT
451 Gorilla AAACCCTTGC CGGTACGCTT
452 AAACCATTGC CGGTACGCTT AA"""
453
454 phylip_text5a=""" 5 42
455 Turkey AAGCTNGGGC ATTTCAGGGT GAGCCCGGGC AATACAGGGT AT
456 Salmo gairAAGCCTTGGC AGTGCAGGGT GAGCCGTGGC CGGGCACGGT AT
457 H. SapiensACCGGTTGGC CGTTCAGGGT ACAGGTTGGC CGTTCAGGGT AA
458 Chimp AAACCCTTGC CGTTACGCTT AAACCGAGGC CGGGACACTC AT
459 Gorilla AAACCCTTGC CGGTACGCTT AAACCATTGC CGGTACGCTT AA"""
460
461 handle = StringIO(phylip_text4)
462 list4 = list(PhylipIterator(handle))
463 handle.close()
464 assert len(list4)==1
465 assert len(list4[0])==5
466
467 handle = StringIO(phylip_text5)
468 try:
469 list5 = list(PhylipIterator(handle))
470 assert len(list5)==1
471 assert len(list5[0])==5
472 print "That should have failed..."
473 except ValueError:
474 print "Evil multiline non-interlaced example failed as expected"
475 handle.close()
476
477 handle = StringIO(phylip_text5a)
478 list5 = list(PhylipIterator(handle))
479 handle.close()
480 assert len(list5)==1
481 assert len(list4[0])==5
482
483 print "Concatenation"
484 handle = StringIO(phylip_text4 + "\n" + phylip_text4)
485 assert len(list(PhylipIterator(handle))) == 2
486
487 handle = StringIO(phylip_text3 + "\n" + phylip_text4 + "\n\n\n" + phylip_text)
488 assert len(list(PhylipIterator(handle))) == 3
489
490 print "OK"
491
492 print "Checking write/read"
493 handle = StringIO()
494 PhylipWriter(handle).write_file(list5)
495 handle.seek(0)
496 list6 = list(PhylipIterator(handle))
497 assert len(list5) == len(list6)
498 for a1,a2 in zip(list5, list6):
499 assert len(a1) == len(a2)
500 for r1, r2 in zip(a1, a2):
501 assert r1.id == r2.id
502 assert r1.seq.tostring() == r2.seq.tostring()
503 print "Done"
504