Package Bio :: Package AlignIO :: Module ClustalIO
[hide private]
[frames] | no frames]

Source Code for Module Bio.AlignIO.ClustalIO

  1  # Copyright 2006-2010 by Peter Cock.  All rights reserved. 
  2  # 
  3  # This code is part of the Biopython distribution and governed by its 
  4  # license.  Please see the LICENSE file that should have been included 
  5  # as part of this package. 
  6  """ 
  7  Bio.AlignIO support for the "clustal" output from CLUSTAL W and other tools. 
  8   
  9  You are expected to use this module via the Bio.AlignIO functions (or the 
 10  Bio.SeqIO functions if you want to work directly with the gapped sequences). 
 11  """ 
 12   
 13  from Bio.Seq import Seq 
 14  from Bio.SeqRecord import SeqRecord 
 15  from Bio.Align import MultipleSeqAlignment 
 16  from Interfaces import AlignmentIterator, SequentialAlignmentWriter 
 17   
18 -class ClustalWriter(SequentialAlignmentWriter):
19 """Clustalw alignment writer."""
20 - def write_alignment(self, alignment):
21 """Use this to write (another) single alignment to an open file.""" 22 23 if len(alignment) == 0: 24 raise ValueError("Must have at least one sequence") 25 if alignment.get_alignment_length() == 0: 26 #This doubles as a check for an alignment object 27 raise ValueError("Non-empty sequences are required") 28 29 #Old versions of the parser in Bio.Clustalw used a ._version property, 30 try: 31 version = str(alignment._version) 32 except AttributeError: 33 version = "" 34 if not version: 35 version = '1.81' 36 if version.startswith("2."): 37 #e.g. 2.0.x 38 output = "CLUSTAL %s multiple sequence alignment\n\n\n" % version 39 else: 40 #e.g. 1.81 or 1.83 41 output = "CLUSTAL X (%s) multiple sequence alignment\n\n\n" % version 42 43 cur_char = 0 44 max_length = len(alignment[0]) 45 46 if max_length <= 0: 47 raise ValueError("Non-empty sequences are required") 48 49 # keep displaying sequences until we reach the end 50 while cur_char != max_length: 51 # calculate the number of sequences to show, which will 52 # be less if we are at the end of the sequence 53 if (cur_char + 50) > max_length: 54 show_num = max_length - cur_char 55 else: 56 show_num = 50 57 58 # go through all of the records and print out the sequences 59 # when we output, we do a nice 80 column output, although this 60 # may result in truncation of the ids. 61 for record in alignment: 62 #Make sure we don't get any spaces in the record 63 #identifier when output in the file by replacing 64 #them with underscores: 65 line = record.id[0:30].replace(" ","_").ljust(36) 66 line += record.seq[cur_char:(cur_char + show_num)].tostring() 67 output += line + "\n" 68 69 # now we need to print out the star info, if we've got it 70 # This was stored by Bio.Clustalw using a ._star_info property. 71 if hasattr(alignment, "_star_info") and alignment._star_info != '': 72 output += (" " * 36) + \ 73 alignment._star_info[cur_char:(cur_char + show_num)] + "\n" 74 75 output += "\n" 76 cur_char += show_num 77 78 # Want a trailing blank new line in case the output is concatenated 79 self.handle.write(output + "\n")
80
81 -class ClustalIterator(AlignmentIterator):
82 """Clustalw alignment iterator.""" 83
84 - def next(self):
85 handle = self.handle 86 try: 87 #Header we saved from when we were parsing 88 #the previous alignment. 89 line = self._header 90 del self._header 91 except AttributeError: 92 line = handle.readline() 93 if not line: 94 raise StopIteration 95 96 #Whitelisted headers we know about 97 known_headers = ['CLUSTAL', 'PROBCONS', 'MUSCLE'] 98 if line.strip().split()[0] not in known_headers: 99 raise ValueError("%s is not a known CLUSTAL header: %s" % \ 100 (line.strip().split()[0], 101 ", ".join(known_headers))) 102 103 # find the clustal version in the header line 104 version = None 105 for word in line.split(): 106 if word[0]=='(' and word[-1]==')': 107 word = word[1:-1] 108 if word[0] in '0123456789': 109 version = word 110 break 111 112 #There should be two blank lines after the header line 113 line = handle.readline() 114 while line.strip() == "": 115 line = handle.readline() 116 117 #If the alignment contains entries with the same sequence 118 #identifier (not a good idea - but seems possible), then this 119 #dictionary based parser will merge their sequences. Fix this? 120 ids = [] 121 seqs = [] 122 consensus = "" 123 seq_cols = None #: Used to extract the consensus 124 125 #Use the first block to get the sequence identifiers 126 while True: 127 if line[0] != " " and line.strip() != "": 128 #Sequences identifier... 129 fields = line.rstrip().split() 130 131 #We expect there to be two fields, there can be an optional 132 #"sequence number" field containing the letter count. 133 if len(fields) < 2 or len(fields) > 3: 134 raise ValueError("Could not parse line:\n%s" % line) 135 136 ids.append(fields[0]) 137 seqs.append(fields[1]) 138 139 #Record the sequence position to get the consensus 140 if seq_cols is None: 141 start = len(fields[0]) + line[len(fields[0]):].find(fields[1]) 142 end = start + len(fields[1]) 143 seq_cols = slice(start, end) 144 del start, end 145 assert fields[1] == line[seq_cols] 146 147 if len(fields) == 3: 148 #This MAY be an old style file with a letter count... 149 try: 150 letters = int(fields[2]) 151 except ValueError: 152 raise ValueError("Could not parse line, bad sequence number:\n%s" % line) 153 if len(fields[1].replace("-","")) != letters: 154 raise ValueError("Could not parse line, invalid sequence number:\n%s" % line) 155 elif line[0] == " ": 156 #Sequence consensus line... 157 assert len(ids) == len(seqs) 158 assert len(ids) > 0 159 assert seq_cols is not None 160 consensus = line[seq_cols] 161 assert not line[:seq_cols.start].strip() 162 assert not line[seq_cols.stop:].strip() 163 #Check for blank line (or end of file) 164 line = handle.readline() 165 assert line.strip() == "" 166 break 167 else: 168 #No consensus 169 break 170 line = handle.readline() 171 if not line : break #end of file 172 173 assert line.strip() == "" 174 assert seq_cols is not None 175 176 #Confirm all same length 177 for s in seqs: 178 assert len(s) == len(seqs[0]) 179 if consensus: 180 assert len(consensus) == len(seqs[0]) 181 182 #Loop over any remaining blocks... 183 done = False 184 while not done: 185 #There should be a blank line between each block. 186 #Also want to ignore any consensus line from the 187 #previous block. 188 while (not line) or line.strip() == "": 189 line = handle.readline() 190 if not line : break # end of file 191 if not line : break # end of file 192 193 if line.split(None,1)[0] in known_headers: 194 #Found concatenated alignment. 195 done = True 196 self._header = line 197 break 198 199 for i in range(len(ids)): 200 assert line[0] != " ", "Unexpected line:\n%s" % repr(line) 201 fields = line.rstrip().split() 202 203 #We expect there to be two fields, there can be an optional 204 #"sequence number" field containing the letter count. 205 if len(fields) < 2 or len(fields) > 3: 206 raise ValueError("Could not parse line:\n%s" % repr(line)) 207 208 if fields[0] != ids[i]: 209 raise ValueError("Identifiers out of order? Got '%s' but expected '%s'" \ 210 % (fields[0], ids[i])) 211 212 if fields[1] != line[seq_cols]: 213 start = len(fields[0]) + line[len(fields[0]):].find(fields[1]) 214 assert start == seq_cols.start, 'Old location %s -> %i:XX' % (seq_cols, start) 215 end = start + len(fields[1]) 216 seq_cols = slice(start, end) 217 del start, end 218 219 #Append the sequence 220 seqs[i] += fields[1] 221 assert len(seqs[i]) == len(seqs[0]) 222 223 if len(fields) == 3: 224 #This MAY be an old style file with a letter count... 225 try: 226 letters = int(fields[2]) 227 except ValueError: 228 raise ValueError("Could not parse line, bad sequence number:\n%s" % line) 229 if len(seqs[i].replace("-","")) != letters: 230 raise ValueError("Could not parse line, invalid sequence number:\n%s" % line) 231 232 #Read in the next line 233 line = handle.readline() 234 #There should now be a consensus line 235 if consensus: 236 assert line[0] == " " 237 assert seq_cols is not None 238 consensus += line[seq_cols] 239 assert len(consensus) == len(seqs[0]) 240 assert not line[:seq_cols.start].strip() 241 assert not line[seq_cols.stop:].strip() 242 #Read in the next line 243 line = handle.readline() 244 245 246 assert len(ids) == len(seqs) 247 if len(seqs) == 0 or len(seqs[0]) == 0: 248 raise StopIteration 249 250 if self.records_per_alignment is not None \ 251 and self.records_per_alignment != len(ids): 252 raise ValueError("Found %i records in this alignment, told to expect %i" \ 253 % (len(ids), self.records_per_alignment)) 254 255 records = (SeqRecord(Seq(s, self.alphabet), id=i, description=i) \ 256 for (i,s) in zip(ids, seqs)) 257 alignment = MultipleSeqAlignment(records, self.alphabet) 258 #TODO - Handle alignment annotation better, for now 259 #mimic the old parser in Bio.Clustalw 260 if version: 261 alignment._version = version 262 if consensus: 263 alignment_length = len(seqs[0]) 264 assert len(consensus) == alignment_length, \ 265 "Alignment length is %i, consensus length is %i, '%s'" \ 266 % (alignment_length, len(consensus), consensus) 267 alignment._star_info = consensus 268 return alignment
269 270 if __name__ == "__main__": 271 print "Running a quick self-test" 272 273 #This is a truncated version of the example in Tests/cw02.aln 274 #Notice the inclusion of sequence numbers (right hand side) 275 aln_example1 = \ 276 """CLUSTAL W (1.81) multiple sequence alignment 277 278 279 gi|4959044|gb|AAD34209.1|AF069 MENSDSNDKGSDQSAAQRRSQMDRLDREEAFYQFVNNLSEEDYRLMRDNN 50 280 gi|671626|emb|CAA85685.1| ---------MSPQTETKASVGFKAGVKEYKLTYYTPEYETKDTDILAAFR 41 281 * *: :: :. :* : :. : . :* :: . 282 283 gi|4959044|gb|AAD34209.1|AF069 LLGTPGESTEEELLRRLQQIKEGPPPQSPDENRAGESSDDVTNSDSIIDW 100 284 gi|671626|emb|CAA85685.1| VTPQPG-----------------VPPEEAGAAVAAESSTGT--------- 65 285 : ** **:... *.*** .. 286 287 gi|4959044|gb|AAD34209.1|AF069 LNSVRQTGNTTRSRQRGNQSWRAVSRTNPNSGDFRFSLEINVNRNNGSQT 150 288 gi|671626|emb|CAA85685.1| WTTVWTDGLTSLDRYKG-----RCYHIEPVPG------------------ 92 289 .:* * *: .* :* : :* .* 290 291 gi|4959044|gb|AAD34209.1|AF069 SENESEPSTRRLSVENMESSSQRQMENSASESASARPSRAERNSTEAVTE 200 292 gi|671626|emb|CAA85685.1| -EKDQCICYVAYPLDLFEEGSVTNMFTSIVGNVFGFKALRALRLEDLRIP 141 293 *::. . .:: :*..* :* .* .. . : . : 294 295 gi|4959044|gb|AAD34209.1|AF069 VPTTRAQRRA 210 296 gi|671626|emb|CAA85685.1| VAYVKTFQGP 151 297 *. .:: : . 298 299 """ 300 301 #This example is a truncated version of the dataset used here: 302 #http://virgil.ruc.dk/kurser/Sekvens/Treedraw.htm 303 #with the last record repeated twice (deliberate toture test) 304 aln_example2 = \ 305 """CLUSTAL X (1.83) multiple sequence alignment 306 307 308 V_Harveyi_PATH --MKNWIKVAVAAIA--LSAA------------------TVQAATEVKVG 309 B_subtilis_YXEM MKMKKWTVLVVAALLAVLSACG------------NGNSSSKEDDNVLHVG 310 B_subtilis_GlnH_homo_YCKK MKKALLALFMVVSIAALAACGAGNDNQSKDNAKDGDLWASIKKKGVLTVG 311 YA80_HAEIN MKKLLFTTALLTGAIAFSTF-----------SHAGEIADRVEKTKTLLVG 312 FLIY_ECOLI MKLAHLGRQALMGVMAVALVAG---MSVKSFADEG-LLNKVKERGTLLVG 313 E_coli_GlnH --MKSVLKVSLAALTLAFAVS------------------SHAADKKLVVA 314 Deinococcus_radiodurans -MKKSLLSLKLSGLLVPSVLALS--------LSACSSPSSTLNQGTLKIA 315 HISJ_E_COLI MKKLVLSLSLVLAFSSATAAF-------------------AAIPQNIRIG 316 HISJ_E_COLI MKKLVLSLSLVLAFSSATAAF-------------------AAIPQNIRIG 317 : . : :. 318 319 V_Harveyi_PATH MSGRYFPFTFVKQ--DKLQGFEVDMWDEIGKRNDYKIEYVTANFSGLFGL 320 B_subtilis_YXEM ATGQSYPFAYKEN--GKLTGFDVEVMEAVAKKIDMKLDWKLLEFSGLMGE 321 B_subtilis_GlnH_homo_YCKK TEGTYEPFTYHDKDTDKLTGYDVEVITEVAKRLGLKVDFKETQWGSMFAG 322 YA80_HAEIN TEGTYAPFTFHDK-SGKLTGFDVEVIRKVAEKLGLKVEFKETQWDAMYAG 323 FLIY_ECOLI LEGTYPPFSFQGD-DGKLTGFEVEFAQQLAKHLGVEASLKPTKWDGMLAS 324 E_coli_GlnH TDTAFVPFEFKQG--DKYVGFDVDLWAAIAKELKLDYELKPMDFSGIIPA 325 Deinococcus_radiodurans MEGTYPPFTSKNE-QGELVGFDVDIAKAVAQKLNLKPEFVLTEWSGILAG 326 HISJ_E_COLI TDPTYAPFESKNS-QGELVGFDIDLAKELCKRINTQCTFVENPLDALIPS 327 HISJ_E_COLI TDPTYAPFESKNS-QGELVGFDIDLAKELCKRINTQCTFVENPLDALIPS 328 ** .: *::::. : :. . ..: 329 330 V_Harveyi_PATH LETGRIDTISNQITMTDARKAKYLFADPYVVDG-AQI 331 B_subtilis_YXEM LQTGKLDTISNQVAVTDERKETYNFTKPYAYAG-TQI 332 B_subtilis_GlnH_homo_YCKK LNSKRFDVVANQVG-KTDREDKYDFSDKYTTSR-AVV 333 YA80_HAEIN LNAKRFDVIANQTNPSPERLKKYSFTTPYNYSG-GVI 334 FLIY_ECOLI LDSKRIDVVINQVTISDERKKKYDFSTPYTISGIQAL 335 E_coli_GlnH LQTKNVDLALAGITITDERKKAIDFSDGYYKSG-LLV 336 Deinococcus_radiodurans LQANKYDVIVNQVGITPERQNSIGFSQPYAYSRPEII 337 HISJ_E_COLI LKAKKIDAIMSSLSITEKRQQEIAFTDKLYAADSRLV 338 HISJ_E_COLI LKAKKIDAIMSSLSITEKRQQEIAFTDKLYAADSRLV 339 *.: . * . * *: : 340 341 """ 342 343 from StringIO import StringIO 344 345 alignments = list(ClustalIterator(StringIO(aln_example1))) 346 assert 1 == len(alignments) 347 assert alignments[0]._version == "1.81" 348 alignment = alignments[0] 349 assert 2 == len(alignment) 350 assert alignment[0].id == "gi|4959044|gb|AAD34209.1|AF069" 351 assert alignment[1].id == "gi|671626|emb|CAA85685.1|" 352 assert alignment[0].seq.tostring() == \ 353 "MENSDSNDKGSDQSAAQRRSQMDRLDREEAFYQFVNNLSEEDYRLMRDNN" + \ 354 "LLGTPGESTEEELLRRLQQIKEGPPPQSPDENRAGESSDDVTNSDSIIDW" + \ 355 "LNSVRQTGNTTRSRQRGNQSWRAVSRTNPNSGDFRFSLEINVNRNNGSQT" + \ 356 "SENESEPSTRRLSVENMESSSQRQMENSASESASARPSRAERNSTEAVTE" + \ 357 "VPTTRAQRRA" 358 359 alignments = list(ClustalIterator(StringIO(aln_example2))) 360 assert 1 == len(alignments) 361 assert alignments[0]._version == "1.83" 362 alignment = alignments[0] 363 assert 9 == len(alignment) 364 assert alignment[-1].id == "HISJ_E_COLI" 365 assert alignment[-1].seq.tostring() == \ 366 "MKKLVLSLSLVLAFSSATAAF-------------------AAIPQNIRIG" + \ 367 "TDPTYAPFESKNS-QGELVGFDIDLAKELCKRINTQCTFVENPLDALIPS" + \ 368 "LKAKKIDAIMSSLSITEKRQQEIAFTDKLYAADSRLV" 369 370 for alignment in ClustalIterator(StringIO(aln_example2 + aln_example1)): 371 print "Alignment with %i records of length %i" \ 372 % (len(alignment), 373 alignment.get_alignment_length()) 374 375 print "Checking empty file..." 376 assert 0 == len(list(ClustalIterator(StringIO("")))) 377 378 print "Checking write/read..." 379 alignments = list(ClustalIterator(StringIO(aln_example1))) \ 380 + list(ClustalIterator(StringIO(aln_example2)))*2 381 handle = StringIO() 382 ClustalWriter(handle).write_file(alignments) 383 handle.seek(0) 384 for i,a in enumerate(ClustalIterator(handle)): 385 assert a.get_alignment_length() == alignments[i].get_alignment_length() 386 handle.seek(0) 387 388 print "Testing write/read when there is only one sequence..." 389 alignment = alignment[0:1] 390 handle = StringIO() 391 ClustalWriter(handle).write_file([alignment]) 392 handle.seek(0) 393 for i,a in enumerate(ClustalIterator(handle)): 394 assert a.get_alignment_length() == alignment.get_alignment_length() 395 assert len(a) == 1 396 397 aln_example3 = \ 398 """CLUSTAL 2.0.9 multiple sequence alignment 399 400 401 Test1seq ------------------------------------------------------------ 402 AT3G20900.1-SEQ ATGAACAAAGTAGCGAGGAAGAACAAAACATCAGGTGAACAAAAAAAAAACTCAATCCAC 403 AT3G20900.1-CDS ------------------------------------------------------------ 404 405 406 Test1seq -----AGTTACAATAACTGACGAAGCTAAGTAGGCTACTAATTAACGTCATCAACCTAAT 407 AT3G20900.1-SEQ ATCAAAGTTACAATAACTGACGAAGCTAAGTAGGCTAGAAATTAAAGTCATCAACCTAAT 408 AT3G20900.1-CDS ------------------------------------------------------------ 409 410 411 Test1seq ACATAGCACTTAGAAAAAAGTGAAGTAAGAAAATATAAAATAATAAAAGGGTGGGTTATC 412 AT3G20900.1-SEQ ACATAGCACTTAGAAAAAAGTGAAGCAAGAAAATATAAAATAATAAAAGGGTGGGTTATC 413 AT3G20900.1-CDS ------------------------------------------------------------ 414 415 416 Test1seq AATTGATAGTGTAAATCATCGTATTCCGGTGATATACCCTACCACAAAAACTCAAACCGA 417 AT3G20900.1-SEQ AATTGATAGTGTAAATCATAGTTGATTTTTGATATACCCTACCACAAAAACTCAAACCGA 418 AT3G20900.1-CDS ------------------------------------------------------------ 419 420 421 Test1seq CTTGATTCAAATCATCTCAATAAATTAGCGCCAAAATAATGAAAAAAATAATAACAAACA 422 AT3G20900.1-SEQ CTTGATTCAAATCATCTCAAAAAACAAGCGCCAAAATAATGAAAAAAATAATAACAAAAA 423 AT3G20900.1-CDS ------------------------------------------------------------ 424 425 426 Test1seq AAAACAAACCAAAATAAGAAAAAACATTACGCAAAACATAATAATTTACTCTTCGTTATT 427 AT3G20900.1-SEQ CAAACAAACCAAAATAAGAAAAAACATTACGCAAAACATAATAATTTACTCTTCGTTATT 428 AT3G20900.1-CDS ------------------------------------------------------------ 429 430 431 Test1seq GTATTAACAAATCAAAGAGCTGAATTTTGATCACCTGCTAATACTACTTTCTGTATTGAT 432 AT3G20900.1-SEQ GTATTAACAAATCAAAGAGATGAATTTTGATCACCTGCTAATACTACTTTCTGTATTGAT 433 AT3G20900.1-CDS ------------------------------------------------------------ 434 435 436 Test1seq CCTATATCAACGTAAACAAAGATACTAATAATTAACTAAAAGTACGTTCATCGATCGTGT 437 AT3G20900.1-SEQ CCTATATCAAAAAAAAAAAAGATACTAATAATTAACTAAAAGTACGTTCATCGATCGTGT 438 AT3G20900.1-CDS ------------------------------------------------------ATGAAC 439 * 440 441 Test1seq TCGTTGACGAAGAAGAGCTCTATCTCCGGCGGAGCAAAGAAAACGATCTGTCTCCGTCGT 442 AT3G20900.1-SEQ GCGTTGACGAAGAAGAGCTCTATCTCCGGCGGAGCAAAGAAAACGATCTGTCTCCGTCGT 443 AT3G20900.1-CDS AAAGTAGCGAGGAAGAACAAAACATC------AGCAAAGAAAACGATCTGTCTCCGTCGT 444 * *** ***** * * ** **************************** 445 446 Test1seq AACACACGGTCGCTAGAGAAACTTTGCTTCTTCGGCGCCGGTGGACACGTCAGCATCTCC 447 AT3G20900.1-SEQ AACACACAGTTTTTCGAGACCCTTTGCTTCTTCGGCGCCGGTGGACACGTCAGCATCTCC 448 AT3G20900.1-CDS AACACACAGTTTTTCGAGACCCTTTGCTTCTTCGGCGCCGGTGGACACGTCAGCATCTCC 449 ******* ** * **** *************************************** 450 451 Test1seq GGTATCCTAGACTTCTTGGCTTTCGGGGTACAACAACCGCGTGGTGACGTCAGCACCGCT 452 AT3G20900.1-SEQ GGTATCCTAGACTTCTTGGCTTTCGGGGTACAACAACCGCCTGGTGACGTCAGCACCGCT 453 AT3G20900.1-CDS GGTATCCTAGACTTCTTGGCTTTCGGGGTACAACAACCGCCTGGTGACGTCAGCACCGCT 454 **************************************** ******************* 455 456 Test1seq GCTGGGGATGGAGAGGGAACAGAGTT- 457 AT3G20900.1-SEQ GCTGGGGATGGAGAGGGAACAGAGTAG 458 AT3G20900.1-CDS GCTGGGGATGGAGAGGGAACAGAGTAG 459 ************************* 460 """ 461 alignments = list(ClustalIterator(StringIO(aln_example3))) 462 assert 1 == len(alignments) 463 assert alignments[0]._version == "2.0.9" 464 465 print "The End" 466