1
2
3
4
5 """
6 Bio.AlignIO support for the "stockholm" format (used in the PFAM database).
7
8 You are expected to use this module via the Bio.AlignIO functions (or the
9 Bio.SeqIO functions if you want to work directly with the gapped sequences).
10
11 For example, consider a Stockholm alignment file containing the following::
12
13 # STOCKHOLM 1.0
14 #=GC SS_cons .................<<<<<<<<...<<<<<<<........>>>>>>>..
15 AP001509.1 UUAAUCGAGCUCAACACUCUUCGUAUAUCCUC-UCAAUAUGG-GAUGAGGGU
16 #=GR AP001509.1 SS -----------------<<<<<<<<---..<<-<<-------->>->>..--
17 AE007476.1 AAAAUUGAAUAUCGUUUUACUUGUUUAU-GUCGUGAAU-UGG-CACGA-CGU
18 #=GR AE007476.1 SS -----------------<<<<<<<<-----<<.<<-------->>.>>----
19
20 #=GC SS_cons ......<<<<<<<.......>>>>>>>..>>>>>>>>...............
21 AP001509.1 CUCUAC-AGGUA-CCGUAAA-UACCUAGCUACGAAAAGAAUGCAGUUAAUGU
22 #=GR AP001509.1 SS -------<<<<<--------->>>>>--->>>>>>>>---------------
23 AE007476.1 UUCUACAAGGUG-CCGG-AA-CACCUAACAAUAAGUAAGUCAGCAGUGAGAU
24 #=GR AE007476.1 SS ------.<<<<<--------->>>>>.-->>>>>>>>---------------
25 //
26
27 This is a single multiple sequence alignment, so you would probably load this
28 using the Bio.AlignIO.read() function:
29
30 >>> from Bio import AlignIO
31 >>> align = AlignIO.read("Stockholm/simple.sth", "stockholm")
32 >>> print align
33 SingleLetterAlphabet() alignment with 2 rows and 104 columns
34 UUAAUCGAGCUCAACACUCUUCGUAUAUCCUC-UCAAUAUGG-G...UGU AP001509.1
35 AAAAUUGAAUAUCGUUUUACUUGUUUAU-GUCGUGAAU-UGG-C...GAU AE007476.1
36 >>> for record in align:
37 ... print record.id, len(record)
38 AP001509.1 104
39 AE007476.1 104
40
41 This example file is clearly using RNA, so you might want the alignment object
42 (and the SeqRecord objects it holds) to reflect this, rather than simple using
43 the default single letter alphabet as shown above. You can do this with an
44 optional argument to the Bio.AlignIO.read() function:
45
46 >>> from Bio import AlignIO
47 >>> from Bio.Alphabet import generic_rna
48 >>> align = AlignIO.read("Stockholm/simple.sth", "stockholm",
49 ... alphabet=generic_rna)
50 >>> print align
51 RNAAlphabet() alignment with 2 rows and 104 columns
52 UUAAUCGAGCUCAACACUCUUCGUAUAUCCUC-UCAAUAUGG-G...UGU AP001509.1
53 AAAAUUGAAUAUCGUUUUACUUGUUUAU-GUCGUGAAU-UGG-C...GAU AE007476.1
54
55 In addition to the sequences themselves, this example alignment also includes
56 some GR lines for the secondary structure of the sequences. These are
57 strings, with one character for each letter in the associated sequence:
58
59 >>> for record in align:
60 ... print record.id
61 ... print record.seq
62 ... print record.letter_annotations['secondary_structure']
63 AP001509.1
64 UUAAUCGAGCUCAACACUCUUCGUAUAUCCUC-UCAAUAUGG-GAUGAGGGUCUCUAC-AGGUA-CCGUAAA-UACCUAGCUACGAAAAGAAUGCAGUUAAUGU
65 -----------------<<<<<<<<---..<<-<<-------->>->>..---------<<<<<--------->>>>>--->>>>>>>>---------------
66 AE007476.1
67 AAAAUUGAAUAUCGUUUUACUUGUUUAU-GUCGUGAAU-UGG-CACGA-CGUUUCUACAAGGUG-CCGG-AA-CACCUAACAAUAAGUAAGUCAGCAGUGAGAU
68 -----------------<<<<<<<<-----<<.<<-------->>.>>----------.<<<<<--------->>>>>.-->>>>>>>>---------------
69
70 Any general annotation for each row is recorded in the SeqRecord's annotations
71 dictionary. You can output this alignment in many different file formats
72 using Bio.AlignIO.write(), or the MultipleSeqAlignment object's format method:
73
74 >>> print align.format("fasta")
75 >AP001509.1
76 UUAAUCGAGCUCAACACUCUUCGUAUAUCCUC-UCAAUAUGG-GAUGAGGGUCUCUAC-A
77 GGUA-CCGUAAA-UACCUAGCUACGAAAAGAAUGCAGUUAAUGU
78 >AE007476.1
79 AAAAUUGAAUAUCGUUUUACUUGUUUAU-GUCGUGAAU-UGG-CACGA-CGUUUCUACAA
80 GGUG-CCGG-AA-CACCUAACAAUAAGUAAGUCAGCAGUGAGAU
81 <BLANKLINE>
82
83 Most output formats won't be able to hold the annotation possible in a
84 Stockholm file:
85
86 >>> print align.format("stockholm")
87 # STOCKHOLM 1.0
88 #=GF SQ 2
89 AP001509.1 UUAAUCGAGCUCAACACUCUUCGUAUAUCCUC-UCAAUAUGG-GAUGAGGGUCUCUAC-AGGUA-CCGUAAA-UACCUAGCUACGAAAAGAAUGCAGUUAAUGU
90 #=GS AP001509.1 AC AP001509.1
91 #=GS AP001509.1 DE AP001509.1
92 #=GR AP001509.1 SS -----------------<<<<<<<<---..<<-<<-------->>->>..---------<<<<<--------->>>>>--->>>>>>>>---------------
93 AE007476.1 AAAAUUGAAUAUCGUUUUACUUGUUUAU-GUCGUGAAU-UGG-CACGA-CGUUUCUACAAGGUG-CCGG-AA-CACCUAACAAUAAGUAAGUCAGCAGUGAGAU
94 #=GS AE007476.1 AC AE007476.1
95 #=GS AE007476.1 DE AE007476.1
96 #=GR AE007476.1 SS -----------------<<<<<<<<-----<<.<<-------->>.>>----------.<<<<<--------->>>>>.-->>>>>>>>---------------
97 //
98 <BLANKLINE>
99
100 Note that when writing Stockholm files, AlignIO does not break long sequences
101 up and interleave them (as in the input file shown above). The standard
102 allows this simpler layout, and it is more likely to be understood by other
103 tools.
104
105 Finally, as an aside, it can sometimes be useful to use Bio.SeqIO.parse() to
106 iterate over the alignment rows as SeqRecord objects - rather than working
107 with Alignnment objects. Again, if you want to you can specify this is RNA:
108
109 >>> from Bio import SeqIO
110 >>> from Bio.Alphabet import generic_rna
111 >>> for record in SeqIO.parse("Stockholm/simple.sth", "stockholm",
112 ... alphabet=generic_rna):
113 ... print record.id
114 ... print record.seq
115 ... print record.letter_annotations['secondary_structure']
116 AP001509.1
117 UUAAUCGAGCUCAACACUCUUCGUAUAUCCUC-UCAAUAUGG-GAUGAGGGUCUCUAC-AGGUA-CCGUAAA-UACCUAGCUACGAAAAGAAUGCAGUUAAUGU
118 -----------------<<<<<<<<---..<<-<<-------->>->>..---------<<<<<--------->>>>>--->>>>>>>>---------------
119 AE007476.1
120 AAAAUUGAAUAUCGUUUUACUUGUUUAU-GUCGUGAAU-UGG-CACGA-CGUUUCUACAAGGUG-CCGG-AA-CACCUAACAAUAAGUAAGUCAGCAGUGAGAU
121 -----------------<<<<<<<<-----<<.<<-------->>.>>----------.<<<<<--------->>>>>.-->>>>>>>>---------------
122
123 Remember that if you slice a SeqRecord, the per-letter-annotions like the
124 secondary structure string here, are also sliced:
125
126 >>> sub_record = record[10:20]
127 >>> print sub_record.seq
128 AUCGUUUUAC
129 >>> print sub_record.letter_annotations['secondary_structure']
130 -------<<<
131 """
132 __docformat__ = "epytext en"
133 from Bio.Seq import Seq
134 from Bio.SeqRecord import SeqRecord
135 from Bio.Align import MultipleSeqAlignment
136 from Interfaces import AlignmentIterator, SequentialAlignmentWriter
137
139 """Stockholm/PFAM alignment writer."""
140
141
142
143 pfam_gr_mapping = {"secondary_structure" : "SS",
144 "surface_accessibility" : "SA",
145 "transmembrane" : "TM",
146 "posterior_probability" : "PP",
147 "ligand_binding" : "LI",
148 "active_site" : "AS",
149 "intron" : "IN"}
150
151 pfam_gs_mapping = {"organism" : "OS",
152 "organism_classification" : "OC",
153 "look" : "LO"}
154
156 """Use this to write (another) single alignment to an open file.
157
158 Note that sequences and their annotation are recorded
159 together (rather than having a block of annotation followed
160 by a block of aligned sequences).
161 """
162 count = len(alignment)
163
164 self._length_of_sequences = alignment.get_alignment_length()
165 self._ids_written = []
166
167
168
169
170 if count == 0:
171 raise ValueError("Must have at least one sequence")
172 if self._length_of_sequences == 0:
173 raise ValueError("Non-empty sequences are required")
174
175 self.handle.write("# STOCKHOLM 1.0\n")
176 self.handle.write("#=GF SQ %i\n" % count)
177 for record in alignment:
178 self._write_record(record)
179 self.handle.write("//\n")
180
182 """Write a single SeqRecord to the file"""
183 if self._length_of_sequences != len(record.seq):
184 raise ValueError("Sequences must all be the same length")
185
186
187 seq_name = record.id
188 if record.name is not None:
189 if "accession" in record.annotations:
190 if record.id == record.annotations["accession"]:
191 seq_name = record.name
192
193
194 seq_name = seq_name.replace(" ","_")
195
196 if "start" in record.annotations \
197 and "end" in record.annotations:
198 suffix = "/%s-%s" % (str(record.annotations["start"]),
199 str(record.annotations["end"]))
200 if seq_name[-len(suffix):] != suffix:
201 seq_name = "%s/%s-%s" % (seq_name,
202 str(record.annotations["start"]),
203 str(record.annotations["end"]))
204
205 if seq_name in self._ids_written:
206 raise ValueError("Duplicate record identifier: %s" % seq_name)
207 self._ids_written.append(seq_name)
208 self.handle.write("%s %s\n" % (seq_name, record.seq.tostring()))
209
210
211
212
213
214
215
216
217
218
219
220
221
222 if "accession" in record.annotations:
223 self.handle.write("#=GS %s AC %s\n" \
224 % (seq_name, self.clean(record.annotations["accession"])))
225 elif record.id:
226 self.handle.write("#=GS %s AC %s\n" \
227 % (seq_name, self.clean(record.id)))
228
229
230 if record.description:
231 self.handle.write("#=GS %s DE %s\n" \
232 % (seq_name, self.clean(record.description)))
233
234
235 for xref in record.dbxrefs:
236 self.handle.write("#=GS %s DR %s\n" \
237 % (seq_name, self.clean(xref)))
238
239
240 for key, value in record.annotations.iteritems():
241 if key in self.pfam_gs_mapping:
242 data = self.clean(str(value))
243 if data:
244 self.handle.write("#=GS %s %s %s\n" \
245 % (seq_name,
246 self.clean(self.pfam_gs_mapping[key]),
247 data))
248 else:
249
250
251 pass
252
253
254 for key, value in record.letter_annotations.iteritems():
255 if key in self.pfam_gr_mapping and len(str(value))==len(record.seq):
256 data = self.clean(str(value))
257 if data:
258 self.handle.write("#=GR %s %s %s\n" \
259 % (seq_name,
260 self.clean(self.pfam_gr_mapping[key]),
261 data))
262 else:
263
264
265 pass
266
268 """Loads a Stockholm file from PFAM into MultipleSeqAlignment objects.
269
270 The file may contain multiple concatenated alignments, which are loaded
271 and returned incrementally.
272
273 This parser will detect if the Stockholm file follows the PFAM
274 conventions for sequence specific meta-data (lines starting #=GS
275 and #=GR) and populates the SeqRecord fields accordingly.
276
277 Any annotation which does not follow the PFAM conventions is currently
278 ignored.
279
280 If an accession is provided for an entry in the meta data, IT WILL NOT
281 be used as the record.id (it will be recorded in the record's
282 annotations). This is because some files have (sub) sequences from
283 different parts of the same accession (differentiated by different
284 start-end positions).
285
286 Wrap-around alignments are not supported - each sequences must be on
287 a single line. However, interlaced sequences should work.
288
289 For more information on the file format, please see:
290 http://www.bioperl.org/wiki/Stockholm_multiple_alignment_format
291 http://www.cgb.ki.se/cgb/groups/sonnhammer/Stockholm.html
292
293 For consistency with BioPerl and EMBOSS we call this the "stockholm"
294 format.
295 """
296
297
298
299 pfam_gr_mapping = {"SS" : "secondary_structure",
300 "SA" : "surface_accessibility",
301 "TM" : "transmembrane",
302 "PP" : "posterior_probability",
303 "LI" : "ligand_binding",
304 "AS" : "active_site",
305 "IN" : "intron"}
306
307 pfam_gs_mapping = {"OS" : "organism",
308 "OC" : "organism_classification",
309 "LO" : "look"}
310
312 try:
313 line = self._header
314 del self._header
315 except AttributeError:
316 line = self.handle.readline()
317 if not line:
318
319 raise StopIteration
320 if not line.strip() == '# STOCKHOLM 1.0':
321 raise ValueError("Did not find STOCKHOLM header")
322
323
324
325
326
327
328
329
330 seqs = {}
331 ids = []
332 gs = {}
333 gr = {}
334 gf = {}
335 passed_end_alignment = False
336 while 1:
337 line = self.handle.readline()
338 if not line: break
339 line = line.strip()
340 if line == '# STOCKHOLM 1.0':
341 self._header = line
342 break
343 elif line == "//":
344
345
346 passed_end_alignment = True
347 elif line == "":
348
349 pass
350 elif line[0] != "#":
351
352
353 assert not passed_end_alignment
354 parts = [x.strip() for x in line.split(" ",1)]
355 if len(parts) != 2:
356
357 raise ValueError("Could not split line into identifier " \
358 + "and sequence:\n" + line)
359 id, seq = parts
360 if id not in ids:
361 ids.append(id)
362 seqs.setdefault(id, '')
363 seqs[id] += seq.replace(".","-")
364 elif len(line) >= 5:
365
366 if line[:5] == "#=GF ":
367
368
369 feature, text = line[5:].strip().split(None,1)
370
371
372 if feature not in gf:
373 gf[feature] = [text]
374 else:
375 gf[feature].append(text)
376 elif line[:5] == '#=GC ':
377
378
379 pass
380 elif line[:5] == '#=GS ':
381
382
383 id, feature, text = line[5:].strip().split(None,2)
384
385
386 if id not in gs:
387 gs[id] = {}
388 if feature not in gs[id]:
389 gs[id][feature] = [text]
390 else:
391 gs[id][feature].append(text)
392 elif line[:5] == "#=GR ":
393
394
395 id, feature, text = line[5:].strip().split(None,2)
396
397
398 if id not in gr:
399 gr[id] = {}
400 if feature not in gr[id]:
401 gr[id][feature] = ""
402 gr[id][feature] += text.strip()
403
404
405
406
407
408
409 assert len(seqs) <= len(ids)
410
411
412
413 self.ids = ids
414 self.sequences = seqs
415 self.seq_annotation = gs
416 self.seq_col_annotation = gr
417
418 if ids and seqs:
419
420 if self.records_per_alignment is not None \
421 and self.records_per_alignment != len(ids):
422 raise ValueError("Found %i records in this alignment, told to expect %i" \
423 % (len(ids), self.records_per_alignment))
424
425 alignment_length = len(seqs.values()[0])
426 records = []
427 for id in ids:
428 seq = seqs[id]
429 if alignment_length != len(seq):
430 raise ValueError("Sequences have different lengths, or repeated identifier")
431 name, start, end = self._identifier_split(id)
432 record = SeqRecord(Seq(seq, self.alphabet),
433 id = id, name = name, description = id,
434 annotations = {"accession":name})
435
436
437 record.annotations["accession"]=name
438
439 if start is not None:
440 record.annotations["start"] = start
441 if end is not None:
442 record.annotations["end"] = end
443
444 self._populate_meta_data(id, record)
445 records.append(record)
446 alignment = MultipleSeqAlignment(records, self.alphabet)
447
448
449
450 alignment._annotations = gr
451
452 return alignment
453 else:
454 raise StopIteration
455
456
458 """Returns (name,start,end) string tuple from an identier."""
459 if identifier.find("/")!=-1:
460 name, start_end = identifier.rsplit("/",1)
461 if start_end.count("-")==1:
462 start, end = map(int, start_end.split("-"))
463 return (name, start, end)
464 return (identifier, None, None)
465
498
530
532 """Run the Bio.SeqIO module's doctests.
533
534 This will try and locate the unit tests directory, and run the doctests
535 from there in order that the relative paths used in the examples work.
536 """
537 import doctest
538 import os
539 if os.path.isdir(os.path.join("..","..","Tests")):
540 print "Runing doctests..."
541 cur_dir = os.path.abspath(os.curdir)
542 os.chdir(os.path.join("..","..","Tests"))
543 assert os.path.isfile("Stockholm/simple.sth")
544 doctest.testmod()
545 os.chdir(cur_dir)
546 del cur_dir
547 print "Done"
548
549 if __name__ == "__main__":
550 _test()
551