1
2
3
4
5
6
7
8 """
9 Contains classes to deal with generic sequence alignment stuff not
10 specific to a particular program or format.
11
12 Classes:
13 - Alignment
14 """
15 __docformat__ = "epytext en"
16
17
18 from Bio.Seq import Seq
19 from Bio.SeqRecord import SeqRecord
20 from Bio import Alphabet
21
23 """Represent a set of alignments (DEPRECATED).
24
25 This is a base class to represent alignments, which can be subclassed
26 to deal with an alignment in a specific format.
27
28 With the introduction of the MultipleSeqAlignment class in Bio.Align,
29 this base class is deprecated and is likely to be removed in future
30 releases of Biopython.
31 """
33 """Initialize a new Alignment object.
34
35 Arguments:
36 - alphabet - The alphabet to use for the sequence objects that are
37 created. This alphabet must be a gapped type.
38
39 e.g.
40
41 >>> from Bio.Alphabet import IUPAC, Gapped
42 >>> align = Alignment(Gapped(IUPAC.unambiguous_dna, "-"))
43 >>> align.add_sequence("Alpha", "ACTGCTAGCTAG")
44 >>> align.add_sequence("Beta", "ACT-CTAGCTAG")
45 >>> align.add_sequence("Gamma", "ACTGCTAGATAG")
46 >>> print align
47 Gapped(IUPACUnambiguousDNA(), '-') alignment with 3 rows and 12 columns
48 ACTGCTAGCTAG Alpha
49 ACT-CTAGCTAG Beta
50 ACTGCTAGATAG Gamma
51 """
52 import warnings
53 import Bio
54 warnings.warn("With the introduction of the MultipleSeqAlignment class in Bio.Align, this base class is deprecated and is likely to be removed in a future release of Biopython.", Bio.BiopythonDeprecationWarning)
55 if not (isinstance(alphabet, Alphabet.Alphabet) \
56 or isinstance(alphabet, Alphabet.AlphabetEncoder)):
57 raise ValueError("Invalid alphabet argument")
58 self._alphabet = alphabet
59
60 self._records = []
61
63 """Returns a truncated string representation of a SeqRecord (PRIVATE).
64
65 This is a PRIVATE function used by the __str__ method.
66 """
67 if len(record.seq) <= 50:
68 return "%s %s" % (record.seq, record.id)
69 else:
70 return "%s...%s %s" \
71 % (record.seq[:44], record.seq[-3:], record.id)
72
74 """Returns a multi-line string summary of the alignment.
75
76 This output is intended to be readable, but large alignments are
77 shown truncated. A maximum of 20 rows (sequences) and 50 columns
78 are shown, with the record identifiers. This should fit nicely on a
79 single screen. e.g.
80
81 >>> from Bio.Alphabet import IUPAC, Gapped
82 >>> align = Alignment(Gapped(IUPAC.unambiguous_dna, "-"))
83 >>> align.add_sequence("Alpha", "ACTGCTAGCTAG")
84 >>> align.add_sequence("Beta", "ACT-CTAGCTAG")
85 >>> align.add_sequence("Gamma", "ACTGCTAGATAG")
86 >>> print align
87 Gapped(IUPACUnambiguousDNA(), '-') alignment with 3 rows and 12 columns
88 ACTGCTAGCTAG Alpha
89 ACT-CTAGCTAG Beta
90 ACTGCTAGATAG Gamma
91
92 See also the alignment's format method.
93 """
94 rows = len(self._records)
95 lines = ["%s alignment with %i rows and %i columns" \
96 % (str(self._alphabet), rows, self.get_alignment_length())]
97 if rows <= 20:
98 lines.extend([self._str_line(rec) for rec in self._records])
99 else:
100 lines.extend([self._str_line(rec) for rec in self._records[:18]])
101 lines.append("...")
102 lines.append(self._str_line(self._records[-1]))
103 return "\n".join(lines)
104
106 """Returns a representation of the object for debugging.
107
108 The representation cannot be used with eval() to recreate the object,
109 which is usually possible with simple python ojects. For example:
110
111 <Bio.Align.Generic.Alignment instance (2 records of length 14,
112 SingleLetterAlphabet()) at a3c184c>
113
114 The hex string is the memory address of the object, see help(id).
115 This provides a simple way to visually distinguish alignments of
116 the same size.
117 """
118
119
120 return "<%s instance (%i records of length %i, %s) at %x>" % \
121 (self.__class__, len(self._records),
122 self.get_alignment_length(), repr(self._alphabet), id(self))
123
124
125
126
127
163
164
181
183 """Return all of the sequences involved in the alignment (DEPRECATED).
184
185 The return value is a list of SeqRecord objects.
186
187 This method is deprecated, as the Alignment object itself now offers
188 much of the functionality of a list of SeqRecord objects (e.g.
189 iteration or slicing to create a sub-alignment). Instead use the
190 Python builtin function list, i.e. my_list = list(my_align)
191 """
192 import warnings
193 import Bio
194 warnings.warn("This method is deprecated, since the alignment object"
195 "now acts more like a list. Instead of calling "
196 "align.get_all_seqs() you can use list(align)",
197 Bio.BiopythonDeprecationWarning)
198 return self._records
199
201 """Iterate over alignment rows as SeqRecord objects.
202
203 e.g.
204
205 >>> from Bio.Alphabet import IUPAC, Gapped
206 >>> align = Alignment(Gapped(IUPAC.unambiguous_dna, "-"))
207 >>> align.add_sequence("Alpha", "ACTGCTAGCTAG")
208 >>> align.add_sequence("Beta", "ACT-CTAGCTAG")
209 >>> align.add_sequence("Gamma", "ACTGCTAGATAG")
210 >>> for record in align:
211 ... print record.id
212 ... print record.seq
213 Alpha
214 ACTGCTAGCTAG
215 Beta
216 ACT-CTAGCTAG
217 Gamma
218 ACTGCTAGATAG
219 """
220 return iter(self._records)
221
223 """Retrieve a sequence by row number (DEPRECATED).
224
225 Returns:
226 - A Seq object for the requested sequence.
227
228 Raises:
229 - IndexError - If the specified number is out of range.
230
231 NOTE: This is a legacy method. In new code where you need to access
232 the rows of the alignment (i.e. the sequences) consider iterating
233 over them or accessing them as SeqRecord objects.
234 """
235 import warnings
236 import Bio
237 warnings.warn("This is a legacy method and is likely to be removed in a future release of Biopython. In new code where you need to access the rows of the alignment (i.e. the sequences) consider iterating over them or accessing them as SeqRecord objects.", Bio.BiopythonDeprecationWarning)
238 return self._records[number].seq
239
241 """Returns the number of sequences in the alignment.
242
243 Use len(alignment) to get the number of sequences (i.e. the number of
244 rows), and alignment.get_alignment_length() to get the length of the
245 longest sequence (i.e. the number of columns).
246
247 This is easy to remember if you think of the alignment as being like a
248 list of SeqRecord objects.
249 """
250 return len(self._records)
251
253 """Return the maximum length of the alignment.
254
255 All objects in the alignment should (hopefully) have the same
256 length. This function will go through and find this length
257 by finding the maximum length of sequences in the alignment.
258
259 >>> from Bio.Alphabet import IUPAC, Gapped
260 >>> align = Alignment(Gapped(IUPAC.unambiguous_dna, "-"))
261 >>> align.add_sequence("Alpha", "ACTGCTAGCTAG")
262 >>> align.add_sequence("Beta", "ACT-CTAGCTAG")
263 >>> align.add_sequence("Gamma", "ACTGCTAGATAG")
264 >>> align.get_alignment_length()
265 12
266
267 If you want to know the number of sequences in the alignment,
268 use len(align) instead:
269
270 >>> len(align)
271 3
272
273 """
274 max_length = 0
275
276 for record in self._records:
277 if len(record.seq) > max_length:
278 max_length = len(record.seq)
279
280 return max_length
281
282 - def add_sequence(self, descriptor, sequence, start = None, end = None,
283 weight = 1.0):
284 """Add a sequence to the alignment.
285
286 This doesn't do any kind of alignment, it just adds in the sequence
287 object, which is assumed to be prealigned with the existing
288 sequences.
289
290 Arguments:
291 - descriptor - The descriptive id of the sequence being added.
292 This will be used as the resulting SeqRecord's
293 .id property (and, for historical compatibility,
294 also the .description property)
295 - sequence - A string with sequence info.
296 - start - You can explicitly set the start point of the sequence.
297 This is useful (at least) for BLAST alignments, which can
298 just be partial alignments of sequences.
299 - end - Specify the end of the sequence, which is important
300 for the same reason as the start.
301 - weight - The weight to place on the sequence in the alignment.
302 By default, all sequences have the same weight. (0.0 =>
303 no weight, 1.0 => highest weight)
304 """
305 new_seq = Seq(sequence, self._alphabet)
306
307
308
309
310
311
312 new_record = SeqRecord(new_seq,
313 id = descriptor,
314 description = descriptor)
315
316
317
318
319
320
321
322
323 if start:
324 new_record.annotations['start'] = start
325 if end:
326 new_record.annotations['end'] = end
327
328
329 new_record.annotations['weight'] = weight
330
331 self._records.append(new_record)
332
334 """Returns a string containing a given column.
335
336 e.g.
337
338 >>> from Bio.Alphabet import IUPAC, Gapped
339 >>> align = Alignment(Gapped(IUPAC.unambiguous_dna, "-"))
340 >>> align.add_sequence("Alpha", "ACTGCTAGCTAG")
341 >>> align.add_sequence("Beta", "ACT-CTAGCTAG")
342 >>> align.add_sequence("Gamma", "ACTGCTAGATAG")
343 >>> align.get_column(0)
344 'AAA'
345 >>> align.get_column(3)
346 'G-G'
347 """
348
349 col_str = ''
350 assert col >= 0 and col <= self.get_alignment_length()
351 for rec in self._records:
352 col_str += rec.seq[col]
353 return col_str
354
356 """Access part of the alignment.
357
358 We'll use the following example alignment here for illustration:
359
360 >>> from Bio.Alphabet import IUPAC, Gapped
361 >>> align = Alignment(Gapped(IUPAC.unambiguous_dna, "-"))
362 >>> align.add_sequence("Alpha", "ACTGCTAGCTAG")
363 >>> align.add_sequence("Beta", "ACT-CTAGCTAG")
364 >>> align.add_sequence("Gamma", "ACTGCTAGATAG")
365 >>> align.add_sequence("Delta", "ACTGCTTGCTAG")
366 >>> align.add_sequence("Epsilon","ACTGCTTGATAG")
367
368 You can access a row of the alignment as a SeqRecord using an integer
369 index (think of the alignment as a list of SeqRecord objects here):
370
371 >>> first_record = align[0]
372 >>> print first_record.id, first_record.seq
373 Alpha ACTGCTAGCTAG
374 >>> last_record = align[-1]
375 >>> print last_record.id, last_record.seq
376 Epsilon ACTGCTTGATAG
377
378 You can also access use python's slice notation to create a sub-alignment
379 containing only some of the SeqRecord objects:
380
381 >>> sub_alignment = align[2:5]
382 >>> print sub_alignment
383 Gapped(IUPACUnambiguousDNA(), '-') alignment with 3 rows and 12 columns
384 ACTGCTAGATAG Gamma
385 ACTGCTTGCTAG Delta
386 ACTGCTTGATAG Epsilon
387
388 This includes support for a step, i.e. align[start:end:step], which
389 can be used to select every second sequence:
390
391 >>> sub_alignment = align[::2]
392 >>> print sub_alignment
393 Gapped(IUPACUnambiguousDNA(), '-') alignment with 3 rows and 12 columns
394 ACTGCTAGCTAG Alpha
395 ACTGCTAGATAG Gamma
396 ACTGCTTGATAG Epsilon
397
398 Or to get a copy of the alignment with the rows in reverse order:
399
400 >>> rev_alignment = align[::-1]
401 >>> print rev_alignment
402 Gapped(IUPACUnambiguousDNA(), '-') alignment with 5 rows and 12 columns
403 ACTGCTTGATAG Epsilon
404 ACTGCTTGCTAG Delta
405 ACTGCTAGATAG Gamma
406 ACT-CTAGCTAG Beta
407 ACTGCTAGCTAG Alpha
408
409 Right now, these are the ONLY indexing operations supported. The use of
410 a second column based index is under discussion for a future update.
411 """
412 if isinstance(index, int):
413
414
415 return self._records[index]
416 elif isinstance(index, slice):
417
418
419
420
421 sub_align = Alignment(self._alphabet)
422 sub_align._records = self._records[index]
423 return sub_align
424 elif len(index)==2:
425 raise TypeError("Row and Column indexing is not currently supported,"\
426 +"but may be in future.")
427 else:
428 raise TypeError("Invalid index type.")
429
431 """Run the Bio.Align.Generic module's doctests."""
432 print "Running doctests..."
433 import doctest
434 doctest.testmod()
435 print "Done"
436
437 if __name__ == "__main__":
438 _test()
439