Package Bio :: Package AlignIO :: Module PhylipIO
[hide private]
[frames] | no frames]

Source Code for Module Bio.AlignIO.PhylipIO

  1  # Copyright 2006-2011 by Peter Cock.  All rights reserved. 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license.  Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5  """ 
  6  AlignIO support for the "phylip" format used in Joe Felsenstein's PHYLIP tools. 
  7   
  8  You are expected to use this module via the Bio.AlignIO functions (or the 
  9  Bio.SeqIO functions if you want to work directly with the gapped sequences). 
 10   
 11  Support for "relaxed phylip" format is also provided. Relaxed phylip differs 
 12  from standard phylip format in the following ways: 
 13   
 14   * No whitespace is allowed in the sequence ID. 
 15   * No truncation is performed. Instead, sequence IDs are padded to the longest 
 16     ID length, rather than 10 characters. A space separates the sequence 
 17     identifier from the sequence. 
 18   
 19  Relaxed phylip is supported by RAxML and PHYML. 
 20   
 21  Note 
 22  ==== 
 23  In TREE_PUZZLE (Schmidt et al. 2003) and PHYML (Guindon and Gascuel 2003) 
 24  a dot/period (".") in a sequence is interpreted as meaning the same 
 25  character as in the first sequence.  The PHYLIP documentation from 3.3 to 3.69 
 26  http://evolution.genetics.washington.edu/phylip/doc/sequence.html says: 
 27   
 28     "a period was also previously allowed but it is no longer allowed, 
 29     because it sometimes is used in different senses in other programs" 
 30   
 31  Biopython 1.58 or later treats dots/periods in the sequence as invalid, both 
 32  for reading and writing. Older versions did nothing special with a dot/period. 
 33  """ 
 34  import string 
 35   
 36  from Bio.Seq import Seq 
 37  from Bio.SeqRecord import SeqRecord 
 38  from Bio.Align import MultipleSeqAlignment 
 39  from Interfaces import AlignmentIterator, SequentialAlignmentWriter 
 40   
 41  try: 
 42      any 
 43  except NameError: 
 44      #Hack for Python 2.4 
45 - def any(iterable):
46 for element in iterable: 47 if element: 48 return True 49 return False
50 51 _PHYLIP_ID_WIDTH = 10 52 53
54 -class PhylipWriter(SequentialAlignmentWriter):
55 """Phylip alignment writer.""" 56
57 - def write_alignment(self, alignment, id_width=_PHYLIP_ID_WIDTH):
58 """Use this to write (another) single alignment to an open file. 59 60 This code will write interlaced alignments (when the sequences are 61 longer than 50 characters). 62 63 Note that record identifiers are strictly truncated to id_width, 64 defaulting to the value required to comply with the PHYLIP standard. 65 66 For more information on the file format, please see: 67 http://evolution.genetics.washington.edu/phylip/doc/sequence.html 68 http://evolution.genetics.washington.edu/phylip/doc/main.html#inputfiles 69 """ 70 handle = self.handle 71 72 if len(alignment)==0: 73 raise ValueError("Must have at least one sequence") 74 length_of_seqs = alignment.get_alignment_length() 75 for record in alignment: 76 if length_of_seqs != len(record.seq): 77 raise ValueError("Sequences must all be the same length") 78 if length_of_seqs <= 0: 79 raise ValueError("Non-empty sequences are required") 80 81 # Check for repeated identifiers... 82 # Apply this test *after* cleaning the identifiers 83 names = [] 84 for record in alignment: 85 """ 86 Quoting the PHYLIP version 3.6 documentation: 87 88 The name should be ten characters in length, filled out to 89 the full ten characters by blanks if shorter. Any printable 90 ASCII/ISO character is allowed in the name, except for 91 parentheses ("(" and ")"), square brackets ("[" and "]"), 92 colon (":"), semicolon (";") and comma (","). If you forget 93 to extend the names to ten characters in length by blanks, 94 the program [i.e. PHYLIP] will get out of synchronization 95 with the contents of the data file, and an error message will 96 result. 97 98 Note that Tab characters count as only one character in the 99 species names. Their inclusion can cause trouble. 100 """ 101 name = record.id.strip() 102 #Either remove the banned characters, or map them to something 103 #else like an underscore "_" or pipe "|" character... 104 for char in "[](),": 105 name = name.replace(char,"") 106 for char in ":;": 107 name = name.replace(char,"|") 108 name = name[:id_width] 109 if name in names: 110 raise ValueError("Repeated name %r (originally %r), " 111 "possibly due to truncation" \ 112 % (name, record.id)) 113 names.append(name) 114 115 # From experimentation, the use of tabs is not understood by the 116 # EMBOSS suite. The nature of the expected white space is not 117 # defined in the PHYLIP documentation, simply "These are in free 118 # format, separated by blanks". We'll use spaces to keep EMBOSS 119 # happy. 120 handle.write(" %i %s\n" % (len(alignment), length_of_seqs)) 121 block=0 122 while True: 123 for name, record in zip(names, alignment): 124 if block==0: 125 #Write name (truncated/padded to id_width characters) 126 #Now truncate and right pad to expected length. 127 handle.write(name[:id_width].ljust(id_width)) 128 else: 129 #write indent 130 handle.write(" " * id_width) 131 #Write five chunks of ten letters per line... 132 sequence = str(record.seq) 133 if "." in sequence: 134 raise ValueError("PHYLIP format no longer allows dots in " 135 "sequence") 136 for chunk in range(0,5): 137 i = block*50 + chunk*10 138 seq_segment = sequence[i:i+10] 139 #TODO - Force any gaps to be '-' character? Look at the 140 #alphabet... 141 #TODO - How to cope with '?' or '.' in the sequence? 142 handle.write(" %s" % seq_segment) 143 if i+10 > length_of_seqs : break 144 handle.write("\n") 145 block=block+1 146 if block*50 > length_of_seqs : break 147 handle.write("\n")
148
149 -class PhylipIterator(AlignmentIterator):
150 """Reads a Phylip alignment file returning a MultipleSeqAlignment iterator. 151 152 Record identifiers are limited to at most 10 characters. 153 154 It only copes with interlaced phylip files! Sequential files won't work 155 where the sequences are split over multiple lines. 156 157 For more information on the file format, please see: 158 http://evolution.genetics.washington.edu/phylip/doc/sequence.html 159 http://evolution.genetics.washington.edu/phylip/doc/main.html#inputfiles 160 """ 161 162 # Default truncation length 163 id_width = _PHYLIP_ID_WIDTH 164
165 - def _is_header(self, line):
166 line = line.strip() 167 parts = filter(None, line.split()) 168 if len(parts)!=2: 169 return False # First line should have two integers 170 try: 171 number_of_seqs = int(parts[0]) 172 length_of_seqs = int(parts[1]) 173 return True 174 except ValueError: 175 return False # First line should have two integers
176
177 - def _split_id(self, line):
178 """ 179 Extracts the sequence ID from a Phylip line, returning a tuple 180 containing: 181 182 (sequence_id, sequence_residues) 183 184 The first 10 characters in the line are are the sequence id, the 185 remainder are sequence data. 186 """ 187 seq_id = line[:self.id_width].strip() 188 seq = line[self.id_width:].strip().replace(' ', '') 189 return seq_id, seq
190
191 - def next(self):
192 handle = self.handle 193 194 try: 195 #Header we saved from when we were parsing 196 #the previous alignment. 197 line = self._header 198 del self._header 199 except AttributeError: 200 line = handle.readline() 201 202 if not line: 203 raise StopIteration 204 line = line.strip() 205 parts = filter(None, line.split()) 206 if len(parts)!=2: 207 raise ValueError("First line should have two integers") 208 try: 209 number_of_seqs = int(parts[0]) 210 length_of_seqs = int(parts[1]) 211 except ValueError: 212 raise ValueError("First line should have two integers") 213 214 assert self._is_header(line) 215 216 if self.records_per_alignment is not None \ 217 and self.records_per_alignment != number_of_seqs: 218 raise ValueError("Found %i records in this alignment, told to expect %i" \ 219 % (number_of_seqs, self.records_per_alignment)) 220 221 ids = [] 222 seqs = [] 223 224 # By default, expects STRICT truncation / padding to 10 characters. 225 # Does not require any whitespace between name and seq. 226 for i in xrange(number_of_seqs): 227 line = handle.readline().rstrip() 228 sequence_id, s = self._split_id(line) 229 ids.append(sequence_id) 230 if "." in s: 231 raise ValueError("PHYLIP format no longer allows dots in sequence") 232 seqs.append([s]) 233 234 #Look for further blocks 235 line="" 236 while True: 237 #Skip any blank lines between blocks... 238 while ""==line.strip(): 239 line = handle.readline() 240 if not line : break #end of file 241 if not line : break #end of file 242 243 if self._is_header(line): 244 #Looks like the start of a concatenated alignment 245 self._header = line 246 break 247 248 #print "New block..." 249 for i in xrange(number_of_seqs): 250 s = line.strip().replace(" ","") 251 if "." in s: 252 raise ValueError("PHYLIP format no longer allows dots in sequence") 253 seqs[i].append(s) 254 line = handle.readline() 255 if (not line) and i+1 < number_of_seqs: 256 raise ValueError("End of file mid-block") 257 if not line : break #end of file 258 259 records = (SeqRecord(Seq("".join(s), self.alphabet), \ 260 id=i, name=i, description=i) \ 261 for (i,s) in zip(ids, seqs)) 262 return MultipleSeqAlignment(records, self.alphabet)
263 264 # Relaxed Phylip
265 -class RelaxedPhylipWriter(PhylipWriter):
266 """ 267 Relaxed Phylip format writer 268 """ 269
270 - def write_alignment(self, alignment):
271 """ 272 Write a relaxed phylip alignment 273 """ 274 # Check inputs 275 for name in (s.id.strip() for s in alignment): 276 if any(c in name for c in string.whitespace): 277 raise ValueError("Whitespace not allowed in identifier: %s" 278 % name) 279 280 # Calculate a truncation length - maximum length of sequence ID plus a 281 # single character for padding 282 # If no sequences, set id_width to 1. super(...) call will raise a 283 # ValueError 284 if len(alignment) == 0: 285 id_width = 1 286 else: 287 id_width = max((len(s.id.strip()) for s in alignment)) + 1 288 super(RelaxedPhylipWriter, self).write_alignment(alignment, id_width)
289 290
291 -class RelaxedPhylipIterator(PhylipIterator):
292 """ 293 Relaxed Phylip format Iterator 294 """ 295
296 - def _split_id(self, line):
297 """Returns the ID, sequence data from a line 298 Extracts the sequence ID from a Phylip line, returning a tuple 299 containing: 300 301 (sequence_id, sequence_residues) 302 303 For relaxed format - split at the first whitespace character 304 """ 305 seq_id, sequence = line.split(None, 1) 306 sequence = sequence.strip().replace(" ", "") 307 return seq_id, sequence
308 309 310 if __name__=="__main__": 311 print "Running short mini-test" 312 313 phylip_text=""" 8 286 314 V_Harveyi_ --MKNWIKVA VAAIA--LSA A--------- ---------T VQAATEVKVG 315 B_subtilis MKMKKWTVLV VAALLAVLSA CG-------- ----NGNSSS KEDDNVLHVG 316 B_subtilis MKKALLALFM VVSIAALAAC GAGNDNQSKD NAKDGDLWAS IKKKGVLTVG 317 YA80_HAEIN MKKLLFTTAL LTGAIAFSTF ---------- -SHAGEIADR VEKTKTLLVG 318 FLIY_ECOLI MKLAHLGRQA LMGVMAVALV AG---MSVKS FADEG-LLNK VKERGTLLVG 319 E_coli_Gln --MKSVLKVS LAALTLAFAV S--------- ---------S HAADKKLVVA 320 Deinococcu -MKKSLLSLK LSGLLVPSVL ALS------- -LSACSSPSS TLNQGTLKIA 321 HISJ_E_COL MKKLVLSLSL VLAFSSATAA F--------- ---------- AAIPQNIRIG 322 323 MSGRYFPFTF VKQ--DKLQG FEVDMWDEIG KRNDYKIEYV TANFSGLFGL 324 ATGQSYPFAY KEN--GKLTG FDVEVMEAVA KKIDMKLDWK LLEFSGLMGE 325 TEGTYEPFTY HDKDTDKLTG YDVEVITEVA KRLGLKVDFK ETQWGSMFAG 326 TEGTYAPFTF HDK-SGKLTG FDVEVIRKVA EKLGLKVEFK ETQWDAMYAG 327 LEGTYPPFSF QGD-DGKLTG FEVEFAQQLA KHLGVEASLK PTKWDGMLAS 328 TDTAFVPFEF KQG--DKYVG FDVDLWAAIA KELKLDYELK PMDFSGIIPA 329 MEGTYPPFTS KNE-QGELVG FDVDIAKAVA QKLNLKPEFV LTEWSGILAG 330 TDPTYAPFES KNS-QGELVG FDIDLAKELC KRINTQCTFV ENPLDALIPS 331 332 LETGRIDTIS NQITMTDARK AKYLFADPYV VDG-AQITVR KGNDSIQGVE 333 LQTGKLDTIS NQVAVTDERK ETYNFTKPYA YAG-TQIVVK KDNTDIKSVD 334 LNSKRFDVVA NQVG-KTDRE DKYDFSDKYT TSR-AVVVTK KDNNDIKSEA 335 LNAKRFDVIA NQTNPSPERL KKYSFTTPYN YSG-GVIVTK SSDNSIKSFE 336 LDSKRIDVVI NQVTISDERK KKYDFSTPYT ISGIQALVKK GNEGTIKTAD 337 LQTKNVDLAL AGITITDERK KAIDFSDGYY KSG-LLVMVK ANNNDVKSVK 338 LQANKYDVIV NQVGITPERQ NSIGFSQPYA YSRPEIIVAK NNTFNPQSLA 339 LKAKKIDAIM SSLSITEKRQ QEIAFTDKLY AADSRLVVAK NSDIQP-TVE 340 341 DLAGKTVAVN LGSNFEQLLR DYDKDGKINI KTYDT--GIE HDVALGRADA 342 DLKGKTVAAV LGSNHAKNLE SKDPDKKINI KTYETQEGTL KDVAYGRVDA 343 DVKGKTSAQS LTSNYNKLAT N----AGAKV EGVEGMAQAL QMIQQARVDM 344 DLKGRKSAQS ATSNWGKDAK A----AGAQI LVVDGLAQSL ELIKQGRAEA 345 DLKGKKVGVG LGTNYEEWLR QNV--QGVDV RTYDDDPTKY QDLRVGRIDA 346 DLDGKVVAVK SGTGSVDYAK AN--IKTKDL RQFPNIDNAY MELGTNRADA 347 DLKGKRVGST LGSNYEKQLI DTG---DIKI VTYPGAPEIL ADLVAGRIDA 348 SLKGKRVGVL QGTTQETFGN EHWAPKGIEI VSYQGQDNIY SDLTAGRIDA 349 350 FIMDRLSALE -LIKKT-GLP LQLAGEPFET I-----QNAW PFVDNEKGRK 351 YVNSRTVLIA -QIKKT-GLP LKLAGDPIVY E-----QVAF PFAKDDAHDK 352 TYNDKLAVLN -YLKTSGNKN VKIAFETGEP Q-----STYF TFRKGS--GE 353 TINDKLAVLD -YFKQHPNSG LKIAYDRGDK T-----PTAF AFLQGE--DA 354 ILVDRLAALD -LVKKT-NDT LAVTGEAFSR Q-----ESGV ALRKGN--ED 355 VLHDTPNILY -FIKTAGNGQ FKAVGDSLEA Q-----QYGI AFPKGS--DE 356 AYNDRLVVNY -IINDQ-KLP VRGAGQIGDA A-----PVGI ALKKGN--SA 357 AFQDEVAASE GFLKQPVGKD YKFGGPSVKD EKLFGVGTGM GLRKED--NE 358 359 LQAEVNKALA EMRADGTVEK ISVKWFGADI TK---- 360 LRKKVNKALD ELRKDGTLKK LSEKYFNEDI TVEQKH 361 VVDQVNKALK EMKEDGTLSK ISKKWFGEDV SK---- 362 LITKFNQVLE ALRQDGTLKQ ISIEWFGYDI TQ---- 363 LLKAVNDAIA EMQKDGTLQA LSEKWFGADV TK---- 364 LRDKVNGALK TLRENGTYNE IYKKWFGTEP K----- 365 LKDQIDKALT EMRSDGTFEK ISQKWFGQDV GQP--- 366 LREALNKAFA EMRADGTYEK LAKKYFDFDV YGG--- 367 """ 368 369 from cStringIO import StringIO 370 handle = StringIO(phylip_text) 371 count=0 372 for alignment in PhylipIterator(handle): 373 for record in alignment: 374 count=count+1 375 print record.id 376 #print record.seq.tostring() 377 assert count == 8 378 379 expected="""mkklvlslsl vlafssataa faaipqniri gtdptyapfe sknsqgelvg 380 fdidlakelc krintqctfv enpldalips lkakkidaim sslsitekrq qeiaftdkly 381 aadsrlvvak nsdiqptves lkgkrvgvlq gttqetfgne hwapkgieiv syqgqdniys 382 dltagridaafqdevaaseg flkqpvgkdy kfggpsvkde klfgvgtgmg lrkednelre 383 alnkafaemradgtyeklak kyfdfdvygg""".replace(" ","").replace("\n","").upper() 384 assert record.seq.tostring().replace("-","") == expected 385 386 #From here: 387 #http://atgc.lirmm.fr/phyml/usersguide.html 388 phylip_text2="""5 60 389 Tax1 CCATCTCACGGTCGGTACGATACACCTGCTTTTGGCAG 390 Tax2 CCATCTCACGGTCAGTAAGATACACCTGCTTTTGGCGG 391 Tax3 CCATCTCCCGCTCAGTAAGATACCCCTGCTGTTGGCGG 392 Tax4 TCATCTCATGGTCAATAAGATACTCCTGCTTTTGGCGG 393 Tax5 CCATCTCACGGTCGGTAAGATACACCTGCTTTTGGCGG 394 395 GAAATGGTCAATATTACAAGGT 396 GAAATGGTCAACATTAAAAGAT 397 GAAATCGTCAATATTAAAAGGT 398 GAAATGGTCAATCTTAAAAGGT 399 GAAATGGTCAATATTAAAAGGT""" 400 401 phylip_text3="""5 60 402 Tax1 CCATCTCACGGTCGGTACGATACACCTGCTTTTGGCAGGAAATGGTCAATATTACAAGGT 403 Tax2 CCATCTCACGGTCAGTAAGATACACCTGCTTTTGGCGGGAAATGGTCAACATTAAAAGAT 404 Tax3 CCATCTCCCGCTCAGTAAGATACCCCTGCTGTTGGCGGGAAATCGTCAATATTAAAAGGT 405 Tax4 TCATCTCATGGTCAATAAGATACTCCTGCTTTTGGCGGGAAATGGTCAATCTTAAAAGGT 406 Tax5 CCATCTCACGGTCGGTAAGATACACCTGCTTTTGGCGGGAAATGGTCAATATTAAAAGGT""" 407 408 handle = StringIO(phylip_text2) 409 list2 = list(PhylipIterator(handle)) 410 handle.close() 411 assert len(list2)==1 412 assert len(list2[0])==5 413 414 handle = StringIO(phylip_text3) 415 list3 = list(PhylipIterator(handle)) 416 handle.close() 417 assert len(list3)==1 418 assert len(list3[0])==5 419 420 for i in range(0,5): 421 list2[0][i].id == list3[0][i].id 422 list2[0][i].seq.tostring() == list3[0][i].seq.tostring() 423 424 #From here: 425 #http://evolution.genetics.washington.edu/phylip/doc/sequence.html 426 #Note the lack of any white space between names 2 and 3 and their seqs. 427 phylip_text4=""" 5 42 428 Turkey AAGCTNGGGC ATTTCAGGGT 429 Salmo gairAAGCCTTGGC AGTGCAGGGT 430 H. SapiensACCGGTTGGC CGTTCAGGGT 431 Chimp AAACCCTTGC CGTTACGCTT 432 Gorilla AAACCCTTGC CGGTACGCTT 433 434 GAGCCCGGGC AATACAGGGT AT 435 GAGCCGTGGC CGGGCACGGT AT 436 ACAGGTTGGC CGTTCAGGGT AA 437 AAACCGAGGC CGGGACACTC AT 438 AAACCATTGC CGGTACGCTT AA""" 439 440 #From here: 441 #http://evolution.genetics.washington.edu/phylip/doc/sequence.html 442 phylip_text5=""" 5 42 443 Turkey AAGCTNGGGC ATTTCAGGGT 444 GAGCCCGGGC AATACAGGGT AT 445 Salmo gairAAGCCTTGGC AGTGCAGGGT 446 GAGCCGTGGC CGGGCACGGT AT 447 H. SapiensACCGGTTGGC CGTTCAGGGT 448 ACAGGTTGGC CGTTCAGGGT AA 449 Chimp AAACCCTTGC CGTTACGCTT 450 AAACCGAGGC CGGGACACTC AT 451 Gorilla AAACCCTTGC CGGTACGCTT 452 AAACCATTGC CGGTACGCTT AA""" 453 454 phylip_text5a=""" 5 42 455 Turkey AAGCTNGGGC ATTTCAGGGT GAGCCCGGGC AATACAGGGT AT 456 Salmo gairAAGCCTTGGC AGTGCAGGGT GAGCCGTGGC CGGGCACGGT AT 457 H. SapiensACCGGTTGGC CGTTCAGGGT ACAGGTTGGC CGTTCAGGGT AA 458 Chimp AAACCCTTGC CGTTACGCTT AAACCGAGGC CGGGACACTC AT 459 Gorilla AAACCCTTGC CGGTACGCTT AAACCATTGC CGGTACGCTT AA""" 460 461 handle = StringIO(phylip_text4) 462 list4 = list(PhylipIterator(handle)) 463 handle.close() 464 assert len(list4)==1 465 assert len(list4[0])==5 466 467 handle = StringIO(phylip_text5) 468 try: 469 list5 = list(PhylipIterator(handle)) 470 assert len(list5)==1 471 assert len(list5[0])==5 472 print "That should have failed..." 473 except ValueError: 474 print "Evil multiline non-interlaced example failed as expected" 475 handle.close() 476 477 handle = StringIO(phylip_text5a) 478 list5 = list(PhylipIterator(handle)) 479 handle.close() 480 assert len(list5)==1 481 assert len(list4[0])==5 482 483 print "Concatenation" 484 handle = StringIO(phylip_text4 + "\n" + phylip_text4) 485 assert len(list(PhylipIterator(handle))) == 2 486 487 handle = StringIO(phylip_text3 + "\n" + phylip_text4 + "\n\n\n" + phylip_text) 488 assert len(list(PhylipIterator(handle))) == 3 489 490 print "OK" 491 492 print "Checking write/read" 493 handle = StringIO() 494 PhylipWriter(handle).write_file(list5) 495 handle.seek(0) 496 list6 = list(PhylipIterator(handle)) 497 assert len(list5) == len(list6) 498 for a1,a2 in zip(list5, list6): 499 assert len(a1) == len(a2) 500 for r1, r2 in zip(a1, a2): 501 assert r1.id == r2.id 502 assert r1.seq.tostring() == r2.seq.tostring() 503 print "Done" 504