1
2
3
4
5
6
7
8 """Provides objects to represent biological sequences with alphabets.
9
10 See also U{http://biopython.org/wiki/Seq} and the chapter in our tutorial:
11 - U{http://biopython.org/DIST/docs/tutorial/Tutorial.html}
12 - U{http://biopython.org/DIST/docs/tutorial/Tutorial.pdf}
13 """
14 __docformat__ ="epytext en"
15
16 import string
17 import array
18 import sys
19
20 from Bio import Alphabet
21 from Bio.Alphabet import IUPAC
22 from Bio.Data.IUPACData import ambiguous_dna_complement, ambiguous_rna_complement
23 from Bio.Data import CodonTable
26 """Makes a python string translation table (PRIVATE).
27
28 Arguments:
29 - complement_mapping - a dictionary such as ambiguous_dna_complement
30 and ambiguous_rna_complement from Data.IUPACData.
31
32 Returns a translation table (a string of length 256) for use with the
33 python string's translate method to use in a (reverse) complement.
34
35 Compatible with lower case and upper case sequences.
36
37 For internal use only.
38 """
39 before = ''.join(complement_mapping.keys())
40 after = ''.join(complement_mapping.values())
41 before = before + before.lower()
42 after = after + after.lower()
43 if sys.version_info[0] == 3 :
44 return str.maketrans(before, after)
45 else:
46 return string.maketrans(before, after)
47
48 _dna_complement_table = _maketrans(ambiguous_dna_complement)
49 _rna_complement_table = _maketrans(ambiguous_rna_complement)
50
51 -class Seq(object):
52 """A read-only sequence object (essentially a string with an alphabet).
53
54 Like normal python strings, our basic sequence object is immutable.
55 This prevents you from doing my_seq[5] = "A" for example, but does allow
56 Seq objects to be used as dictionary keys.
57
58 The Seq object provides a number of string like methods (such as count,
59 find, split and strip), which are alphabet aware where appropriate.
60
61 In addition to the string like sequence, the Seq object has an alphabet
62 property. This is an instance of an Alphabet class from Bio.Alphabet,
63 for example generic DNA, or IUPAC DNA. This describes the type of molecule
64 (e.g. RNA, DNA, protein) and may also indicate the expected symbols
65 (letters).
66
67 The Seq object also provides some biological methods, such as complement,
68 reverse_complement, transcribe, back_transcribe and translate (which are
69 not applicable to sequences with a protein alphabet).
70 """
72 """Create a Seq object.
73
74 Arguments:
75 - seq - Sequence, required (string)
76 - alphabet - Optional argument, an Alphabet object from Bio.Alphabet
77
78 You will typically use Bio.SeqIO to read in sequences from files as
79 SeqRecord objects, whose sequence will be exposed as a Seq object via
80 the seq property.
81
82 However, will often want to create your own Seq objects directly:
83
84 >>> from Bio.Seq import Seq
85 >>> from Bio.Alphabet import IUPAC
86 >>> my_seq = Seq("MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF",
87 ... IUPAC.protein)
88 >>> my_seq
89 Seq('MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF', IUPACProtein())
90 >>> print my_seq
91 MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF
92 >>> my_seq.alphabet
93 IUPACProtein()
94
95 """
96
97 if not isinstance(data, basestring):
98 raise TypeError("The sequence data given to a Seq object should "
99 "be a string (not another Seq object etc)")
100 self._data = data
101 self.alphabet = alphabet
102
103
104
105 @property
107 """Sequence as a string (DEPRECATED).
108
109 This is a read only property provided for backwards compatility with
110 older versions of Biopython (as is the tostring() method). We now
111 encourage you to use str(my_seq) instead of my_seq.data or the method
112 my_seq.tostring().
113
114 In recent releases of Biopython it was possible to change a Seq object
115 by updating its data property, but this triggered a deprecation warning.
116 Now the data property is read only, since Seq objects are meant to be
117 immutable:
118
119 >>> from Bio.Seq import Seq
120 >>> from Bio.Alphabet import generic_dna
121 >>> my_seq = Seq("ACGT", generic_dna)
122 >>> str(my_seq) == my_seq.tostring() == "ACGT"
123 True
124 >>> my_seq.data = "AAAA"
125 Traceback (most recent call last):
126 ...
127 AttributeError: can't set attribute
128 """
129 import warnings
130 import Bio
131 warnings.warn("Accessing the .data attribute is deprecated. Please "
132 "use str(my_seq) or my_seq.tostring() instead of "
133 "my_seq.data.", Bio.BiopythonDeprecationWarning)
134 return str(self)
135
137 """Returns a (truncated) representation of the sequence for debugging."""
138 if len(self) > 60:
139
140
141
142 return "%s('%s...%s', %s)" % (self.__class__.__name__,
143 str(self)[:54], str(self)[-3:],
144 repr(self.alphabet))
145 else:
146 return "%s(%s, %s)" % (self.__class__.__name__,
147 repr(self._data),
148 repr(self.alphabet))
150 """Returns the full sequence as a python string, use str(my_seq).
151
152 Note that Biopython 1.44 and earlier would give a truncated
153 version of repr(my_seq) for str(my_seq). If you are writing code
154 which need to be backwards compatible with old Biopython, you
155 should continue to use my_seq.tostring() rather than str(my_seq).
156 """
157 return self._data
158
160 """Hash for comparison.
161
162 See the __cmp__ documentation - we plan to change this!
163 """
164 return id(self)
165
167 """Compare the sequence to another sequence or a string (README).
168
169 Historically comparing Seq objects has done Python object comparison.
170 After considerable discussion (keeping in mind constraints of the
171 Python language, hashes and dictionary support) a future release of
172 Biopython will change this to use simple string comparison. The plan is
173 that comparing incompatible alphabets (e.g. DNA to RNA) will trigger a
174 warning.
175
176 This version of Biopython still does Python object comparison, but with
177 a warning about this future change. During this transition period,
178 please just do explicit comparisons:
179
180 >>> seq1 = Seq("ACGT")
181 >>> seq2 = Seq("ACGT")
182 >>> id(seq1) == id(seq2)
183 False
184 >>> str(seq1) == str(seq2)
185 True
186
187 Note - This method indirectly supports ==, < , etc.
188 """
189 if hasattr(other, "alphabet"):
190
191 import warnings
192 warnings.warn("In future comparing Seq objects will use string "
193 "comparison (not object comparison). Incompatible "
194 "alphabets will trigger a warning (not an exception). "
195 "In the interim please use id(seq1)==id(seq2) or "
196 "str(seq1)==str(seq2) to make your code explicit "
197 "and to avoid this warning.", FutureWarning)
198 return cmp(id(self), id(other))
199
201 """Returns the length of the sequence, use len(my_seq)."""
202 return len(self._data)
203
205 """Returns a subsequence of single letter, use my_seq[index]."""
206
207
208
209 if isinstance(index, int):
210
211 return self._data[index]
212 else:
213
214 return Seq(self._data[index], self.alphabet)
215
217 """Add another sequence or string to this sequence.
218
219 If adding a string to a Seq, the alphabet is preserved:
220
221 >>> from Bio.Seq import Seq
222 >>> from Bio.Alphabet import generic_protein
223 >>> Seq("MELKI", generic_protein) + "LV"
224 Seq('MELKILV', ProteinAlphabet())
225
226 When adding two Seq (like) objects, the alphabets are important.
227 Consider this example:
228
229 >>> from Bio.Seq import Seq
230 >>> from Bio.Alphabet.IUPAC import unambiguous_dna, ambiguous_dna
231 >>> unamb_dna_seq = Seq("ACGT", unambiguous_dna)
232 >>> ambig_dna_seq = Seq("ACRGT", ambiguous_dna)
233 >>> unamb_dna_seq
234 Seq('ACGT', IUPACUnambiguousDNA())
235 >>> ambig_dna_seq
236 Seq('ACRGT', IUPACAmbiguousDNA())
237
238 If we add the ambiguous and unambiguous IUPAC DNA alphabets, we get
239 the more general ambiguous IUPAC DNA alphabet:
240
241 >>> unamb_dna_seq + ambig_dna_seq
242 Seq('ACGTACRGT', IUPACAmbiguousDNA())
243
244 However, if the default generic alphabet is included, the result is
245 a generic alphabet:
246
247 >>> Seq("") + ambig_dna_seq
248 Seq('ACRGT', Alphabet())
249
250 You can't add RNA and DNA sequences:
251
252 >>> from Bio.Alphabet import generic_dna, generic_rna
253 >>> Seq("ACGT", generic_dna) + Seq("ACGU", generic_rna)
254 Traceback (most recent call last):
255 ...
256 TypeError: Incompatible alphabets DNAAlphabet() and RNAAlphabet()
257
258 You can't add nucleotide and protein sequences:
259
260 >>> from Bio.Alphabet import generic_dna, generic_protein
261 >>> Seq("ACGT", generic_dna) + Seq("MELKI", generic_protein)
262 Traceback (most recent call last):
263 ...
264 TypeError: Incompatible alphabets DNAAlphabet() and ProteinAlphabet()
265 """
266 if hasattr(other, "alphabet"):
267
268 if not Alphabet._check_type_compatible([self.alphabet,
269 other.alphabet]):
270 raise TypeError("Incompatible alphabets %s and %s" \
271 % (repr(self.alphabet), repr(other.alphabet)))
272
273 a = Alphabet._consensus_alphabet([self.alphabet, other.alphabet])
274 return self.__class__(str(self) + str(other), a)
275 elif isinstance(other, basestring):
276
277 return self.__class__(str(self) + other, self.alphabet)
278 from Bio.SeqRecord import SeqRecord
279 if isinstance(other, SeqRecord):
280
281 return NotImplemented
282 else :
283 raise TypeError
284
286 """Adding a sequence on the left.
287
288 If adding a string to a Seq, the alphabet is preserved:
289
290 >>> from Bio.Seq import Seq
291 >>> from Bio.Alphabet import generic_protein
292 >>> "LV" + Seq("MELKI", generic_protein)
293 Seq('LVMELKI', ProteinAlphabet())
294
295 Adding two Seq (like) objects is handled via the __add__ method.
296 """
297 if hasattr(other, "alphabet"):
298
299 if not Alphabet._check_type_compatible([self.alphabet,
300 other.alphabet]):
301 raise TypeError("Incompatable alphabets %s and %s" \
302 % (repr(self.alphabet), repr(other.alphabet)))
303
304 a = Alphabet._consensus_alphabet([self.alphabet, other.alphabet])
305 return self.__class__(str(other) + str(self), a)
306 elif isinstance(other, basestring):
307
308 return self.__class__(other + str(self), self.alphabet)
309 else:
310 raise TypeError
311
313 """Returns the full sequence as a python string (semi-obsolete).
314
315 Although not formally deprecated, you are now encouraged to use
316 str(my_seq) instead of my_seq.tostring()."""
317
318
319
320
321
322
323 return str(self)
324
326 """Returns the full sequence as a MutableSeq object.
327
328 >>> from Bio.Seq import Seq
329 >>> from Bio.Alphabet import IUPAC
330 >>> my_seq = Seq("MKQHKAMIVALIVICITAVVAAL",
331 ... IUPAC.protein)
332 >>> my_seq
333 Seq('MKQHKAMIVALIVICITAVVAAL', IUPACProtein())
334 >>> my_seq.tomutable()
335 MutableSeq('MKQHKAMIVALIVICITAVVAAL', IUPACProtein())
336
337 Note that the alphabet is preserved.
338 """
339 return MutableSeq(str(self), self.alphabet)
340
342 """string/Seq/MutableSeq to string, checking alphabet (PRIVATE).
343
344 For a string argument, returns the string.
345
346 For a Seq or MutableSeq, it checks the alphabet is compatible
347 (raising an exception if it isn't), and then returns a string.
348 """
349 try:
350 other_alpha = other_sequence.alphabet
351 except AttributeError:
352
353 return other_sequence
354
355
356 if not Alphabet._check_type_compatible([self.alphabet, other_alpha]):
357 raise TypeError("Incompatable alphabets %s and %s" \
358 % (repr(self.alphabet), repr(other_alpha)))
359
360 return str(other_sequence)
361
363 """Non-overlapping count method, like that of a python string.
364
365 This behaves like the python string method of the same name,
366 which does a non-overlapping count!
367
368 Returns an integer, the number of occurrences of substring
369 argument sub in the (sub)sequence given by [start:end].
370 Optional arguments start and end are interpreted as in slice
371 notation.
372
373 Arguments:
374 - sub - a string or another Seq object to look for
375 - start - optional integer, slice start
376 - end - optional integer, slice end
377
378 e.g.
379
380 >>> from Bio.Seq import Seq
381 >>> my_seq = Seq("AAAATGA")
382 >>> print my_seq.count("A")
383 5
384 >>> print my_seq.count("ATG")
385 1
386 >>> print my_seq.count(Seq("AT"))
387 1
388 >>> print my_seq.count("AT", 2, -1)
389 1
390
391 HOWEVER, please note because python strings and Seq objects (and
392 MutableSeq objects) do a non-overlapping search, this may not give
393 the answer you expect:
394
395 >>> "AAAA".count("AA")
396 2
397 >>> print Seq("AAAA").count("AA")
398 2
399
400 A non-overlapping search would give the answer as three!
401 """
402
403 sub_str = self._get_seq_str_and_check_alphabet(sub)
404 return str(self).count(sub_str, start, end)
405
407 """Implements the 'in' keyword, like a python string.
408
409 e.g.
410
411 >>> from Bio.Seq import Seq
412 >>> from Bio.Alphabet import generic_dna, generic_rna, generic_protein
413 >>> my_dna = Seq("ATATGAAATTTGAAAA", generic_dna)
414 >>> "AAA" in my_dna
415 True
416 >>> Seq("AAA") in my_dna
417 True
418 >>> Seq("AAA", generic_dna) in my_dna
419 True
420
421 Like other Seq methods, this will raise a type error if another Seq
422 (or Seq like) object with an incompatible alphabet is used:
423
424 >>> Seq("AAA", generic_rna) in my_dna
425 Traceback (most recent call last):
426 ...
427 TypeError: Incompatable alphabets DNAAlphabet() and RNAAlphabet()
428 >>> Seq("AAA", generic_protein) in my_dna
429 Traceback (most recent call last):
430 ...
431 TypeError: Incompatable alphabets DNAAlphabet() and ProteinAlphabet()
432 """
433
434 sub_str = self._get_seq_str_and_check_alphabet(char)
435 return sub_str in str(self)
436
438 """Find method, like that of a python string.
439
440 This behaves like the python string method of the same name.
441
442 Returns an integer, the index of the first occurrence of substring
443 argument sub in the (sub)sequence given by [start:end].
444
445 Arguments:
446 - sub - a string or another Seq object to look for
447 - start - optional integer, slice start
448 - end - optional integer, slice end
449
450 Returns -1 if the subsequence is NOT found.
451
452 e.g. Locating the first typical start codon, AUG, in an RNA sequence:
453
454 >>> from Bio.Seq import Seq
455 >>> my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG")
456 >>> my_rna.find("AUG")
457 3
458 """
459
460 sub_str = self._get_seq_str_and_check_alphabet(sub)
461 return str(self).find(sub_str, start, end)
462
464 """Find from right method, like that of a python string.
465
466 This behaves like the python string method of the same name.
467
468 Returns an integer, the index of the last (right most) occurrence of
469 substring argument sub in the (sub)sequence given by [start:end].
470
471 Arguments:
472 - sub - a string or another Seq object to look for
473 - start - optional integer, slice start
474 - end - optional integer, slice end
475
476 Returns -1 if the subsequence is NOT found.
477
478 e.g. Locating the last typical start codon, AUG, in an RNA sequence:
479
480 >>> from Bio.Seq import Seq
481 >>> my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG")
482 >>> my_rna.rfind("AUG")
483 15
484 """
485
486 sub_str = self._get_seq_str_and_check_alphabet(sub)
487 return str(self).rfind(sub_str, start, end)
488
490 """Does the Seq start with the given prefix? Returns True/False.
491
492 This behaves like the python string method of the same name.
493
494 Return True if the sequence starts with the specified prefix
495 (a string or another Seq object), False otherwise.
496 With optional start, test sequence beginning at that position.
497 With optional end, stop comparing sequence at that position.
498 prefix can also be a tuple of strings to try. e.g.
499
500 >>> from Bio.Seq import Seq
501 >>> my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG")
502 >>> my_rna.startswith("GUC")
503 True
504 >>> my_rna.startswith("AUG")
505 False
506 >>> my_rna.startswith("AUG", 3)
507 True
508 >>> my_rna.startswith(("UCC","UCA","UCG"),1)
509 True
510 """
511
512 if isinstance(prefix, tuple):
513
514
515
516 prefix_strings = [self._get_seq_str_and_check_alphabet(p) \
517 for p in prefix]
518 for prefix_str in prefix_strings:
519 if str(self).startswith(prefix_str, start, end):
520 return True
521 return False
522 else:
523 prefix_str = self._get_seq_str_and_check_alphabet(prefix)
524 return str(self).startswith(prefix_str, start, end)
525
527 """Does the Seq end with the given suffix? Returns True/False.
528
529 This behaves like the python string method of the same name.
530
531 Return True if the sequence ends with the specified suffix
532 (a string or another Seq object), False otherwise.
533 With optional start, test sequence beginning at that position.
534 With optional end, stop comparing sequence at that position.
535 suffix can also be a tuple of strings to try. e.g.
536
537 >>> from Bio.Seq import Seq
538 >>> my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG")
539 >>> my_rna.endswith("UUG")
540 True
541 >>> my_rna.endswith("AUG")
542 False
543 >>> my_rna.endswith("AUG", 0, 18)
544 True
545 >>> my_rna.endswith(("UCC","UCA","UUG"))
546 True
547 """
548
549 if isinstance(suffix, tuple):
550
551
552
553 suffix_strings = [self._get_seq_str_and_check_alphabet(p) \
554 for p in suffix]
555 for suffix_str in suffix_strings:
556 if str(self).endswith(suffix_str, start, end):
557 return True
558 return False
559 else:
560 suffix_str = self._get_seq_str_and_check_alphabet(suffix)
561 return str(self).endswith(suffix_str, start, end)
562
563
564 - def split(self, sep=None, maxsplit=-1):
565 """Split method, like that of a python string.
566
567 This behaves like the python string method of the same name.
568
569 Return a list of the 'words' in the string (as Seq objects),
570 using sep as the delimiter string. If maxsplit is given, at
571 most maxsplit splits are done. If maxsplit is ommited, all
572 splits are made.
573
574 Following the python string method, sep will by default be any
575 white space (tabs, spaces, newlines) but this is unlikely to
576 apply to biological sequences.
577
578 e.g.
579
580 >>> from Bio.Seq import Seq
581 >>> my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG")
582 >>> my_aa = my_rna.translate()
583 >>> my_aa
584 Seq('VMAIVMGR*KGAR*L', HasStopCodon(ExtendedIUPACProtein(), '*'))
585 >>> my_aa.split("*")
586 [Seq('VMAIVMGR', HasStopCodon(ExtendedIUPACProtein(), '*')), Seq('KGAR', HasStopCodon(ExtendedIUPACProtein(), '*')), Seq('L', HasStopCodon(ExtendedIUPACProtein(), '*'))]
587 >>> my_aa.split("*",1)
588 [Seq('VMAIVMGR', HasStopCodon(ExtendedIUPACProtein(), '*')), Seq('KGAR*L', HasStopCodon(ExtendedIUPACProtein(), '*'))]
589
590 See also the rsplit method:
591
592 >>> my_aa.rsplit("*",1)
593 [Seq('VMAIVMGR*KGAR', HasStopCodon(ExtendedIUPACProtein(), '*')), Seq('L', HasStopCodon(ExtendedIUPACProtein(), '*'))]
594 """
595
596 sep_str = self._get_seq_str_and_check_alphabet(sep)
597
598
599 return [Seq(part, self.alphabet) \
600 for part in str(self).split(sep_str, maxsplit)]
601
602 - def rsplit(self, sep=None, maxsplit=-1):
603 """Right split method, like that of a python string.
604
605 This behaves like the python string method of the same name.
606
607 Return a list of the 'words' in the string (as Seq objects),
608 using sep as the delimiter string. If maxsplit is given, at
609 most maxsplit splits are done COUNTING FROM THE RIGHT.
610 If maxsplit is ommited, all splits are made.
611
612 Following the python string method, sep will by default be any
613 white space (tabs, spaces, newlines) but this is unlikely to
614 apply to biological sequences.
615
616 e.g. print my_seq.rsplit("*",1)
617
618 See also the split method.
619 """
620
621 sep_str = self._get_seq_str_and_check_alphabet(sep)
622 return [Seq(part, self.alphabet) \
623 for part in str(self).rsplit(sep_str, maxsplit)]
624
625 - def strip(self, chars=None):
626 """Returns a new Seq object with leading and trailing ends stripped.
627
628 This behaves like the python string method of the same name.
629
630 Optional argument chars defines which characters to remove. If
631 ommitted or None (default) then as for the python string method,
632 this defaults to removing any white space.
633
634 e.g. print my_seq.strip("-")
635
636 See also the lstrip and rstrip methods.
637 """
638
639 strip_str = self._get_seq_str_and_check_alphabet(chars)
640 return Seq(str(self).strip(strip_str), self.alphabet)
641
642 - def lstrip(self, chars=None):
643 """Returns a new Seq object with leading (left) end stripped.
644
645 This behaves like the python string method of the same name.
646
647 Optional argument chars defines which characters to remove. If
648 ommitted or None (default) then as for the python string method,
649 this defaults to removing any white space.
650
651 e.g. print my_seq.lstrip("-")
652
653 See also the strip and rstrip methods.
654 """
655
656 strip_str = self._get_seq_str_and_check_alphabet(chars)
657 return Seq(str(self).lstrip(strip_str), self.alphabet)
658
659 - def rstrip(self, chars=None):
660 """Returns a new Seq object with trailing (right) end stripped.
661
662 This behaves like the python string method of the same name.
663
664 Optional argument chars defines which characters to remove. If
665 ommitted or None (default) then as for the python string method,
666 this defaults to removing any white space.
667
668 e.g. Removing a nucleotide sequence's polyadenylation (poly-A tail):
669
670 >>> from Bio.Alphabet import IUPAC
671 >>> from Bio.Seq import Seq
672 >>> my_seq = Seq("CGGTACGCTTATGTCACGTAGAAAAAA", IUPAC.unambiguous_dna)
673 >>> my_seq
674 Seq('CGGTACGCTTATGTCACGTAGAAAAAA', IUPACUnambiguousDNA())
675 >>> my_seq.rstrip("A")
676 Seq('CGGTACGCTTATGTCACGTAG', IUPACUnambiguousDNA())
677
678 See also the strip and lstrip methods.
679 """
680
681 strip_str = self._get_seq_str_and_check_alphabet(chars)
682 return Seq(str(self).rstrip(strip_str), self.alphabet)
683
685 """Returns an upper case copy of the sequence.
686
687 >>> from Bio.Alphabet import HasStopCodon, generic_protein
688 >>> from Bio.Seq import Seq
689 >>> my_seq = Seq("VHLTPeeK*", HasStopCodon(generic_protein))
690 >>> my_seq
691 Seq('VHLTPeeK*', HasStopCodon(ProteinAlphabet(), '*'))
692 >>> my_seq.lower()
693 Seq('vhltpeek*', HasStopCodon(ProteinAlphabet(), '*'))
694 >>> my_seq.upper()
695 Seq('VHLTPEEK*', HasStopCodon(ProteinAlphabet(), '*'))
696
697 This will adjust the alphabet if required. See also the lower method.
698 """
699 return Seq(str(self).upper(), self.alphabet._upper())
700
702 """Returns a lower case copy of the sequence.
703
704 This will adjust the alphabet if required. Note that the IUPAC alphabets
705 are upper case only, and thus a generic alphabet must be substituted.
706
707 >>> from Bio.Alphabet import Gapped, generic_dna
708 >>> from Bio.Alphabet import IUPAC
709 >>> from Bio.Seq import Seq
710 >>> my_seq = Seq("CGGTACGCTTATGTCACGTAG*AAAAAA", Gapped(IUPAC.unambiguous_dna, "*"))
711 >>> my_seq
712 Seq('CGGTACGCTTATGTCACGTAG*AAAAAA', Gapped(IUPACUnambiguousDNA(), '*'))
713 >>> my_seq.lower()
714 Seq('cggtacgcttatgtcacgtag*aaaaaa', Gapped(DNAAlphabet(), '*'))
715
716 See also the upper method.
717 """
718 return Seq(str(self).lower(), self.alphabet._lower())
719
721 """Returns the complement sequence. New Seq object.
722
723 >>> from Bio.Seq import Seq
724 >>> from Bio.Alphabet import IUPAC
725 >>> my_dna = Seq("CCCCCGATAG", IUPAC.unambiguous_dna)
726 >>> my_dna
727 Seq('CCCCCGATAG', IUPACUnambiguousDNA())
728 >>> my_dna.complement()
729 Seq('GGGGGCTATC', IUPACUnambiguousDNA())
730
731 You can of course used mixed case sequences,
732
733 >>> from Bio.Seq import Seq
734 >>> from Bio.Alphabet import generic_dna
735 >>> my_dna = Seq("CCCCCgatA-GD", generic_dna)
736 >>> my_dna
737 Seq('CCCCCgatA-GD', DNAAlphabet())
738 >>> my_dna.complement()
739 Seq('GGGGGctaT-CH', DNAAlphabet())
740
741 Note in the above example, ambiguous character D denotes
742 G, A or T so its complement is H (for C, T or A).
743
744 Trying to complement a protein sequence raises an exception.
745
746 >>> my_protein = Seq("MAIVMGR", IUPAC.protein)
747 >>> my_protein.complement()
748 Traceback (most recent call last):
749 ...
750 ValueError: Proteins do not have complements!
751 """
752 base = Alphabet._get_base_alphabet(self.alphabet)
753 if isinstance(base, Alphabet.ProteinAlphabet):
754 raise ValueError("Proteins do not have complements!")
755 if isinstance(base, Alphabet.DNAAlphabet):
756 ttable = _dna_complement_table
757 elif isinstance(base, Alphabet.RNAAlphabet):
758 ttable = _rna_complement_table
759 elif ('U' in self._data or 'u' in self._data) \
760 and ('T' in self._data or 't' in self._data):
761
762 raise ValueError("Mixed RNA/DNA found")
763 elif 'U' in self._data or 'u' in self._data:
764 ttable = _rna_complement_table
765 else:
766 ttable = _dna_complement_table
767
768
769 return Seq(str(self).translate(ttable), self.alphabet)
770
772 """Returns the reverse complement sequence. New Seq object.
773
774 >>> from Bio.Seq import Seq
775 >>> from Bio.Alphabet import IUPAC
776 >>> my_dna = Seq("CCCCCGATAGNR", IUPAC.ambiguous_dna)
777 >>> my_dna
778 Seq('CCCCCGATAGNR', IUPACAmbiguousDNA())
779 >>> my_dna.reverse_complement()
780 Seq('YNCTATCGGGGG', IUPACAmbiguousDNA())
781
782 Note in the above example, since R = G or A, its complement
783 is Y (which denotes C or T).
784
785 You can of course used mixed case sequences,
786
787 >>> from Bio.Seq import Seq
788 >>> from Bio.Alphabet import generic_dna
789 >>> my_dna = Seq("CCCCCgatA-G", generic_dna)
790 >>> my_dna
791 Seq('CCCCCgatA-G', DNAAlphabet())
792 >>> my_dna.reverse_complement()
793 Seq('C-TatcGGGGG', DNAAlphabet())
794
795 Trying to complement a protein sequence raises an exception:
796
797 >>> my_protein = Seq("MAIVMGR", IUPAC.protein)
798 >>> my_protein.reverse_complement()
799 Traceback (most recent call last):
800 ...
801 ValueError: Proteins do not have complements!
802 """
803
804 return self.complement()[::-1]
805
807 """Returns the RNA sequence from a DNA sequence. New Seq object.
808
809 >>> from Bio.Seq import Seq
810 >>> from Bio.Alphabet import IUPAC
811 >>> coding_dna = Seq("ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG",
812 ... IUPAC.unambiguous_dna)
813 >>> coding_dna
814 Seq('ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG', IUPACUnambiguousDNA())
815 >>> coding_dna.transcribe()
816 Seq('AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG', IUPACUnambiguousRNA())
817
818 Trying to transcribe a protein or RNA sequence raises an exception:
819
820 >>> my_protein = Seq("MAIVMGR", IUPAC.protein)
821 >>> my_protein.transcribe()
822 Traceback (most recent call last):
823 ...
824 ValueError: Proteins cannot be transcribed!
825 """
826 base = Alphabet._get_base_alphabet(self.alphabet)
827 if isinstance(base, Alphabet.ProteinAlphabet):
828 raise ValueError("Proteins cannot be transcribed!")
829 if isinstance(base, Alphabet.RNAAlphabet):
830 raise ValueError("RNA cannot be transcribed!")
831
832 if self.alphabet==IUPAC.unambiguous_dna:
833 alphabet = IUPAC.unambiguous_rna
834 elif self.alphabet==IUPAC.ambiguous_dna:
835 alphabet = IUPAC.ambiguous_rna
836 else:
837 alphabet = Alphabet.generic_rna
838 return Seq(str(self).replace('T','U').replace('t','u'), alphabet)
839
841 """Returns the DNA sequence from an RNA sequence. New Seq object.
842
843 >>> from Bio.Seq import Seq
844 >>> from Bio.Alphabet import IUPAC
845 >>> messenger_rna = Seq("AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG",
846 ... IUPAC.unambiguous_rna)
847 >>> messenger_rna
848 Seq('AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG', IUPACUnambiguousRNA())
849 >>> messenger_rna.back_transcribe()
850 Seq('ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG', IUPACUnambiguousDNA())
851
852 Trying to back-transcribe a protein or DNA sequence raises an
853 exception:
854
855 >>> my_protein = Seq("MAIVMGR", IUPAC.protein)
856 >>> my_protein.back_transcribe()
857 Traceback (most recent call last):
858 ...
859 ValueError: Proteins cannot be back transcribed!
860 """
861 base = Alphabet._get_base_alphabet(self.alphabet)
862 if isinstance(base, Alphabet.ProteinAlphabet):
863 raise ValueError("Proteins cannot be back transcribed!")
864 if isinstance(base, Alphabet.DNAAlphabet):
865 raise ValueError("DNA cannot be back transcribed!")
866
867 if self.alphabet==IUPAC.unambiguous_rna:
868 alphabet = IUPAC.unambiguous_dna
869 elif self.alphabet==IUPAC.ambiguous_rna:
870 alphabet = IUPAC.ambiguous_dna
871 else:
872 alphabet = Alphabet.generic_dna
873 return Seq(str(self).replace("U", "T").replace("u", "t"), alphabet)
874
875 - def translate(self, table="Standard", stop_symbol="*", to_stop=False,
876 cds=False):
877 """Turns a nucleotide sequence into a protein sequence. New Seq object.
878
879 This method will translate DNA or RNA sequences, and those with a
880 nucleotide or generic alphabet. Trying to translate a protein
881 sequence raises an exception.
882
883 Arguments:
884 - table - Which codon table to use? This can be either a name
885 (string), an NCBI identifier (integer), or a CodonTable
886 object (useful for non-standard genetic codes). This
887 defaults to the "Standard" table.
888 - stop_symbol - Single character string, what to use for terminators.
889 This defaults to the asterisk, "*".
890 - to_stop - Boolean, defaults to False meaning do a full translation
891 continuing on past any stop codons (translated as the
892 specified stop_symbol). If True, translation is
893 terminated at the first in frame stop codon (and the
894 stop_symbol is not appended to the returned protein
895 sequence).
896 - cds - Boolean, indicates this is a complete CDS. If True,
897 this checks the sequence starts with a valid alternative start
898 codon (which will be translated as methionine, M), that the
899 sequence length is a multiple of three, and that there is a
900 single in frame stop codon at the end (this will be excluded
901 from the protein sequence, regardless of the to_stop option).
902 If these tests fail, an exception is raised.
903
904 e.g. Using the standard table:
905
906 >>> coding_dna = Seq("GTGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG")
907 >>> coding_dna.translate()
908 Seq('VAIVMGR*KGAR*', HasStopCodon(ExtendedIUPACProtein(), '*'))
909 >>> coding_dna.translate(stop_symbol="@")
910 Seq('VAIVMGR@KGAR@', HasStopCodon(ExtendedIUPACProtein(), '@'))
911 >>> coding_dna.translate(to_stop=True)
912 Seq('VAIVMGR', ExtendedIUPACProtein())
913
914 Now using NCBI table 2, where TGA is not a stop codon:
915
916 >>> coding_dna.translate(table=2)
917 Seq('VAIVMGRWKGAR*', HasStopCodon(ExtendedIUPACProtein(), '*'))
918 >>> coding_dna.translate(table=2, to_stop=True)
919 Seq('VAIVMGRWKGAR', ExtendedIUPACProtein())
920
921 In fact, GTG is an alternative start codon under NCBI table 2, meaning
922 this sequence could be a complete CDS:
923
924 >>> coding_dna.translate(table=2, cds=True)
925 Seq('MAIVMGRWKGAR', ExtendedIUPACProtein())
926
927 It isn't a valid CDS under NCBI table 1, due to both the start codon and
928 also the in frame stop codons:
929
930 >>> coding_dna.translate(table=1, cds=True)
931 Traceback (most recent call last):
932 ...
933 TranslationError: First codon 'GTG' is not a start codon
934
935 If the sequence has no in-frame stop codon, then the to_stop argument
936 has no effect:
937
938 >>> coding_dna2 = Seq("TTGGCCATTGTAATGGGCCGC")
939 >>> coding_dna2.translate()
940 Seq('LAIVMGR', ExtendedIUPACProtein())
941 >>> coding_dna2.translate(to_stop=True)
942 Seq('LAIVMGR', ExtendedIUPACProtein())
943
944 NOTE - Ambiguous codons like "TAN" or "NNN" could be an amino acid
945 or a stop codon. These are translated as "X". Any invalid codon
946 (e.g. "TA?" or "T-A") will throw a TranslationError.
947
948 NOTE - Does NOT support gapped sequences.
949
950 NOTE - This does NOT behave like the python string's translate
951 method. For that use str(my_seq).translate(...) instead.
952 """
953 if isinstance(table, str) and len(table)==256:
954 raise ValueError("The Seq object translate method DOES NOT take " \
955 + "a 256 character string mapping table like " \
956 + "the python string object's translate method. " \
957 + "Use str(my_seq).translate(...) instead.")
958 if isinstance(Alphabet._get_base_alphabet(self.alphabet),
959 Alphabet.ProteinAlphabet):
960 raise ValueError("Proteins cannot be translated!")
961 try:
962 table_id = int(table)
963 except ValueError:
964
965 if self.alphabet==IUPAC.unambiguous_dna:
966
967 codon_table = CodonTable.unambiguous_dna_by_name[table]
968 elif self.alphabet==IUPAC.unambiguous_rna:
969
970 codon_table = CodonTable.unambiguous_rna_by_name[table]
971 else:
972
973
974
975 codon_table = CodonTable.ambiguous_generic_by_name[table]
976 except (AttributeError, TypeError):
977
978 if isinstance(table, CodonTable.CodonTable):
979 codon_table = table
980 else:
981 raise ValueError('Bad table argument')
982 else:
983
984 if self.alphabet==IUPAC.unambiguous_dna:
985
986 codon_table = CodonTable.unambiguous_dna_by_id[table_id]
987 elif self.alphabet==IUPAC.unambiguous_rna:
988
989 codon_table = CodonTable.unambiguous_rna_by_id[table_id]
990 else:
991
992
993
994 codon_table = CodonTable.ambiguous_generic_by_id[table_id]
995 protein = _translate_str(str(self), codon_table, \
996 stop_symbol, to_stop, cds)
997 if stop_symbol in protein:
998 alphabet = Alphabet.HasStopCodon(codon_table.protein_alphabet,
999 stop_symbol = stop_symbol)
1000 else:
1001 alphabet = codon_table.protein_alphabet
1002 return Seq(protein, alphabet)
1003
1004 - def ungap(self, gap=None):
1005 """Return a copy of the sequence without the gap character(s).
1006
1007 The gap character can be specified in two ways - either as an explicit
1008 argument, or via the sequence's alphabet. For example:
1009
1010 >>> from Bio.Seq import Seq
1011 >>> from Bio.Alphabet import generic_dna
1012 >>> my_dna = Seq("-ATA--TGAAAT-TTGAAAA", generic_dna)
1013 >>> my_dna
1014 Seq('-ATA--TGAAAT-TTGAAAA', DNAAlphabet())
1015 >>> my_dna.ungap("-")
1016 Seq('ATATGAAATTTGAAAA', DNAAlphabet())
1017
1018 If the gap character is not given as an argument, it will be taken from
1019 the sequence's alphabet (if defined). Notice that the returned sequence's
1020 alphabet is adjusted since it no longer requires a gapped alphabet:
1021
1022 >>> from Bio.Seq import Seq
1023 >>> from Bio.Alphabet import IUPAC, Gapped, HasStopCodon
1024 >>> my_pro = Seq("MVVLE=AD*", HasStopCodon(Gapped(IUPAC.protein, "=")))
1025 >>> my_pro
1026 Seq('MVVLE=AD*', HasStopCodon(Gapped(IUPACProtein(), '='), '*'))
1027 >>> my_pro.ungap()
1028 Seq('MVVLEAD*', HasStopCodon(IUPACProtein(), '*'))
1029
1030 Or, with a simpler gapped DNA example:
1031
1032 >>> from Bio.Seq import Seq
1033 >>> from Bio.Alphabet import IUPAC, Gapped
1034 >>> my_seq = Seq("CGGGTAG=AAAAAA", Gapped(IUPAC.unambiguous_dna, "="))
1035 >>> my_seq
1036 Seq('CGGGTAG=AAAAAA', Gapped(IUPACUnambiguousDNA(), '='))
1037 >>> my_seq.ungap()
1038 Seq('CGGGTAGAAAAAA', IUPACUnambiguousDNA())
1039
1040 As long as it is consistent with the alphabet, although it is redundant,
1041 you can still supply the gap character as an argument to this method:
1042
1043 >>> my_seq
1044 Seq('CGGGTAG=AAAAAA', Gapped(IUPACUnambiguousDNA(), '='))
1045 >>> my_seq.ungap("=")
1046 Seq('CGGGTAGAAAAAA', IUPACUnambiguousDNA())
1047
1048 However, if the gap character given as the argument disagrees with that
1049 declared in the alphabet, an exception is raised:
1050
1051 >>> my_seq
1052 Seq('CGGGTAG=AAAAAA', Gapped(IUPACUnambiguousDNA(), '='))
1053 >>> my_seq.ungap("-")
1054 Traceback (most recent call last):
1055 ...
1056 ValueError: Gap '-' does not match '=' from alphabet
1057
1058 Finally, if a gap character is not supplied, and the alphabet does not
1059 define one, an exception is raised:
1060
1061 >>> from Bio.Seq import Seq
1062 >>> from Bio.Alphabet import generic_dna
1063 >>> my_dna = Seq("ATA--TGAAAT-TTGAAAA", generic_dna)
1064 >>> my_dna
1065 Seq('ATA--TGAAAT-TTGAAAA', DNAAlphabet())
1066 >>> my_dna.ungap()
1067 Traceback (most recent call last):
1068 ...
1069 ValueError: Gap character not given and not defined in alphabet
1070
1071 """
1072 if hasattr(self.alphabet, "gap_char"):
1073 if not gap:
1074 gap = self.alphabet.gap_char
1075 elif gap != self.alphabet.gap_char:
1076 raise ValueError("Gap %s does not match %s from alphabet" \
1077 % (repr(gap), repr(self.alphabet.gap_char)))
1078 alpha = Alphabet._ungap(self.alphabet)
1079 elif not gap:
1080 raise ValueError("Gap character not given and not defined in alphabet")
1081 else:
1082 alpha = self.alphabet
1083 if len(gap)!=1 or not isinstance(gap, str):
1084 raise ValueError("Unexpected gap character, %s" % repr(gap))
1085 return Seq(str(self).replace(gap, ""), alpha)
1086
1088 """A read-only sequence object of known length but unknown contents.
1089
1090 If you have an unknown sequence, you can represent this with a normal
1091 Seq object, for example:
1092
1093 >>> my_seq = Seq("N"*5)
1094 >>> my_seq
1095 Seq('NNNNN', Alphabet())
1096 >>> len(my_seq)
1097 5
1098 >>> print my_seq
1099 NNNNN
1100
1101 However, this is rather wasteful of memory (especially for large
1102 sequences), which is where this class is most usefull:
1103
1104 >>> unk_five = UnknownSeq(5)
1105 >>> unk_five
1106 UnknownSeq(5, alphabet = Alphabet(), character = '?')
1107 >>> len(unk_five)
1108 5
1109 >>> print(unk_five)
1110 ?????
1111
1112 You can add unknown sequence together, provided their alphabets and
1113 characters are compatible, and get another memory saving UnknownSeq:
1114
1115 >>> unk_four = UnknownSeq(4)
1116 >>> unk_four
1117 UnknownSeq(4, alphabet = Alphabet(), character = '?')
1118 >>> unk_four + unk_five
1119 UnknownSeq(9, alphabet = Alphabet(), character = '?')
1120
1121 If the alphabet or characters don't match up, the addition gives an
1122 ordinary Seq object:
1123
1124 >>> unk_nnnn = UnknownSeq(4, character = "N")
1125 >>> unk_nnnn
1126 UnknownSeq(4, alphabet = Alphabet(), character = 'N')
1127 >>> unk_nnnn + unk_four
1128 Seq('NNNN????', Alphabet())
1129
1130 Combining with a real Seq gives a new Seq object:
1131
1132 >>> known_seq = Seq("ACGT")
1133 >>> unk_four + known_seq
1134 Seq('????ACGT', Alphabet())
1135 >>> known_seq + unk_four
1136 Seq('ACGT????', Alphabet())
1137 """
1139 """Create a new UnknownSeq object.
1140
1141 If character is ommited, it is determed from the alphabet, "N" for
1142 nucleotides, "X" for proteins, and "?" otherwise.
1143 """
1144 self._length = int(length)
1145 if self._length < 0:
1146
1147 raise ValueError("Length must not be negative.")
1148 self.alphabet = alphabet
1149 if character:
1150 if len(character) != 1:
1151 raise ValueError("character argument should be a single letter string.")
1152 self._character = character
1153 else:
1154 base = Alphabet._get_base_alphabet(alphabet)
1155
1156
1157 if isinstance(base, Alphabet.NucleotideAlphabet):
1158 self._character = "N"
1159 elif isinstance(base, Alphabet.ProteinAlphabet):
1160 self._character = "X"
1161 else:
1162 self._character = "?"
1163
1165 """Returns the stated length of the unknown sequence."""
1166 return self._length
1167
1169 """Returns the unknown sequence as full string of the given length."""
1170 return self._character * self._length
1171
1173 return "UnknownSeq(%i, alphabet = %s, character = %s)" \
1174 % (self._length, repr(self.alphabet), repr(self._character))
1175
1177 """Add another sequence or string to this sequence.
1178
1179 Adding two UnknownSeq objects returns another UnknownSeq object
1180 provided the character is the same and the alphabets are compatible.
1181
1182 >>> from Bio.Seq import UnknownSeq
1183 >>> from Bio.Alphabet import generic_protein
1184 >>> UnknownSeq(10, generic_protein) + UnknownSeq(5, generic_protein)
1185 UnknownSeq(15, alphabet = ProteinAlphabet(), character = 'X')
1186
1187 If the characters differ, an UnknownSeq object cannot be used, so a
1188 Seq object is returned:
1189
1190 >>> from Bio.Seq import UnknownSeq
1191 >>> from Bio.Alphabet import generic_protein
1192 >>> UnknownSeq(10, generic_protein) + UnknownSeq(5, generic_protein,
1193 ... character="x")
1194 Seq('XXXXXXXXXXxxxxx', ProteinAlphabet())
1195
1196 If adding a string to an UnknownSeq, a new Seq is returned with the
1197 same alphabet:
1198
1199 >>> from Bio.Seq import UnknownSeq
1200 >>> from Bio.Alphabet import generic_protein
1201 >>> UnknownSeq(5, generic_protein) + "LV"
1202 Seq('XXXXXLV', ProteinAlphabet())
1203 """
1204 if isinstance(other, UnknownSeq) \
1205 and other._character == self._character:
1206
1207 return UnknownSeq(len(self)+len(other),
1208 self.alphabet, self._character)
1209
1210 return Seq(str(self), self.alphabet) + other
1211
1213
1214
1215 return other + Seq(str(self), self.alphabet)
1216
1218 """Get a subsequence from the UnknownSeq object.
1219
1220 >>> unk = UnknownSeq(8, character="N")
1221 >>> print unk[:]
1222 NNNNNNNN
1223 >>> print unk[5:3]
1224 <BLANKLINE>
1225 >>> print unk[1:-1]
1226 NNNNNN
1227 >>> print unk[1:-1:2]
1228 NNN
1229 """
1230 if isinstance(index, int):
1231
1232 return str(self)[index]
1233 old_length = self._length
1234 step = index.step
1235 if step is None or step == 1:
1236
1237 start = index.start
1238 end = index.stop
1239 if start is None:
1240 start = 0
1241 elif start < 0:
1242 start = max(0, old_length + start)
1243 elif start > old_length:
1244 start = old_length
1245 if end is None:
1246 end = old_length
1247 elif end < 0:
1248 end = max(0, old_length + end)
1249 elif end > old_length:
1250 end = old_length
1251 new_length = max(0, end-start)
1252 elif step == 0:
1253 raise ValueError("slice step cannot be zero")
1254 else:
1255
1256 new_length = len(("X"*old_length)[index])
1257
1258
1259
1260 return UnknownSeq(new_length, self.alphabet, self._character)
1261
1263 """Non-overlapping count method, like that of a python string.
1264
1265 This behaves like the python string (and Seq object) method of the
1266 same name, which does a non-overlapping count!
1267
1268 Returns an integer, the number of occurrences of substring
1269 argument sub in the (sub)sequence given by [start:end].
1270 Optional arguments start and end are interpreted as in slice
1271 notation.
1272
1273 Arguments:
1274 - sub - a string or another Seq object to look for
1275 - start - optional integer, slice start
1276 - end - optional integer, slice end
1277
1278 >>> "NNNN".count("N")
1279 4
1280 >>> Seq("NNNN").count("N")
1281 4
1282 >>> UnknownSeq(4, character="N").count("N")
1283 4
1284 >>> UnknownSeq(4, character="N").count("A")
1285 0
1286 >>> UnknownSeq(4, character="N").count("AA")
1287 0
1288
1289 HOWEVER, please note because that python strings and Seq objects (and
1290 MutableSeq objects) do a non-overlapping search, this may not give
1291 the answer you expect:
1292
1293 >>> UnknownSeq(4, character="N").count("NN")
1294 2
1295 >>> UnknownSeq(4, character="N").count("NNN")
1296 1
1297 """
1298 sub_str = self._get_seq_str_and_check_alphabet(sub)
1299 if len(sub_str) == 1:
1300 if str(sub_str) == self._character:
1301 if start==0 and end >= self._length:
1302 return self._length
1303 else:
1304
1305 return str(self).count(sub_str, start, end)
1306 else:
1307 return 0
1308 else:
1309 if set(sub_str) == set(self._character):
1310 if start==0 and end >= self._length:
1311 return self._length // len(sub_str)
1312 else:
1313
1314 return str(self).count(sub_str, start, end)
1315 else:
1316 return 0
1317
1319 """The complement of an unknown nucleotide equals itself.
1320
1321 >>> my_nuc = UnknownSeq(8)
1322 >>> my_nuc
1323 UnknownSeq(8, alphabet = Alphabet(), character = '?')
1324 >>> print my_nuc
1325 ????????
1326 >>> my_nuc.complement()
1327 UnknownSeq(8, alphabet = Alphabet(), character = '?')
1328 >>> print my_nuc.complement()
1329 ????????
1330 """
1331 if isinstance(Alphabet._get_base_alphabet(self.alphabet),
1332 Alphabet.ProteinAlphabet):
1333 raise ValueError("Proteins do not have complements!")
1334 return self
1335
1337 """The reverse complement of an unknown nucleotide equals itself.
1338
1339 >>> my_nuc = UnknownSeq(10)
1340 >>> my_nuc
1341 UnknownSeq(10, alphabet = Alphabet(), character = '?')
1342 >>> print my_nuc
1343 ??????????
1344 >>> my_nuc.reverse_complement()
1345 UnknownSeq(10, alphabet = Alphabet(), character = '?')
1346 >>> print my_nuc.reverse_complement()
1347 ??????????
1348 """
1349 if isinstance(Alphabet._get_base_alphabet(self.alphabet),
1350 Alphabet.ProteinAlphabet):
1351 raise ValueError("Proteins do not have complements!")
1352 return self
1353
1355 """Returns unknown RNA sequence from an unknown DNA sequence.
1356
1357 >>> my_dna = UnknownSeq(10, character="N")
1358 >>> my_dna
1359 UnknownSeq(10, alphabet = Alphabet(), character = 'N')
1360 >>> print my_dna
1361 NNNNNNNNNN
1362 >>> my_rna = my_dna.transcribe()
1363 >>> my_rna
1364 UnknownSeq(10, alphabet = RNAAlphabet(), character = 'N')
1365 >>> print my_rna
1366 NNNNNNNNNN
1367 """
1368
1369 s = Seq(self._character, self.alphabet).transcribe()
1370 return UnknownSeq(self._length, s.alphabet, self._character)
1371
1373 """Returns unknown DNA sequence from an unknown RNA sequence.
1374
1375 >>> my_rna = UnknownSeq(20, character="N")
1376 >>> my_rna
1377 UnknownSeq(20, alphabet = Alphabet(), character = 'N')
1378 >>> print my_rna
1379 NNNNNNNNNNNNNNNNNNNN
1380 >>> my_dna = my_rna.back_transcribe()
1381 >>> my_dna
1382 UnknownSeq(20, alphabet = DNAAlphabet(), character = 'N')
1383 >>> print my_dna
1384 NNNNNNNNNNNNNNNNNNNN
1385 """
1386
1387 s = Seq(self._character, self.alphabet).back_transcribe()
1388 return UnknownSeq(self._length, s.alphabet, self._character)
1389
1391 """Returns an upper case copy of the sequence.
1392
1393 >>> from Bio.Alphabet import generic_dna
1394 >>> from Bio.Seq import UnknownSeq
1395 >>> my_seq = UnknownSeq(20, generic_dna, character="n")
1396 >>> my_seq
1397 UnknownSeq(20, alphabet = DNAAlphabet(), character = 'n')
1398 >>> print my_seq
1399 nnnnnnnnnnnnnnnnnnnn
1400 >>> my_seq.upper()
1401 UnknownSeq(20, alphabet = DNAAlphabet(), character = 'N')
1402 >>> print my_seq.upper()
1403 NNNNNNNNNNNNNNNNNNNN
1404
1405 This will adjust the alphabet if required. See also the lower method.
1406 """
1407 return UnknownSeq(self._length, self.alphabet._upper(), self._character.upper())
1408
1410 """Returns a lower case copy of the sequence.
1411
1412 This will adjust the alphabet if required:
1413
1414 >>> from Bio.Alphabet import IUPAC
1415 >>> from Bio.Seq import UnknownSeq
1416 >>> my_seq = UnknownSeq(20, IUPAC.extended_protein)
1417 >>> my_seq
1418 UnknownSeq(20, alphabet = ExtendedIUPACProtein(), character = 'X')
1419 >>> print my_seq
1420 XXXXXXXXXXXXXXXXXXXX
1421 >>> my_seq.lower()
1422 UnknownSeq(20, alphabet = ProteinAlphabet(), character = 'x')
1423 >>> print my_seq.lower()
1424 xxxxxxxxxxxxxxxxxxxx
1425
1426 See also the upper method.
1427 """
1428 return UnknownSeq(self._length, self.alphabet._lower(), self._character.lower())
1429
1431 """Translate an unknown nucleotide sequence into an unknown protein.
1432
1433 e.g.
1434
1435 >>> my_seq = UnknownSeq(11, character="N")
1436 >>> print my_seq
1437 NNNNNNNNNNN
1438 >>> my_protein = my_seq.translate()
1439 >>> my_protein
1440 UnknownSeq(3, alphabet = ProteinAlphabet(), character = 'X')
1441 >>> print my_protein
1442 XXX
1443
1444 In comparison, using a normal Seq object:
1445
1446 >>> my_seq = Seq("NNNNNNNNNNN")
1447 >>> print my_seq
1448 NNNNNNNNNNN
1449 >>> my_protein = my_seq.translate()
1450 >>> my_protein
1451 Seq('XXX', ExtendedIUPACProtein())
1452 >>> print my_protein
1453 XXX
1454
1455 """
1456 if isinstance(Alphabet._get_base_alphabet(self.alphabet),
1457 Alphabet.ProteinAlphabet):
1458 raise ValueError("Proteins cannot be translated!")
1459 return UnknownSeq(self._length//3, Alphabet.generic_protein, "X")
1460
1461 - def ungap(self, gap=None):
1462 """Return a copy of the sequence without the gap character(s).
1463
1464 The gap character can be specified in two ways - either as an explicit
1465 argument, or via the sequence's alphabet. For example:
1466
1467 >>> from Bio.Seq import UnknownSeq
1468 >>> from Bio.Alphabet import Gapped, generic_dna
1469 >>> my_dna = UnknownSeq(20, Gapped(generic_dna,"-"))
1470 >>> my_dna
1471 UnknownSeq(20, alphabet = Gapped(DNAAlphabet(), '-'), character = 'N')
1472 >>> my_dna.ungap()
1473 UnknownSeq(20, alphabet = DNAAlphabet(), character = 'N')
1474 >>> my_dna.ungap("-")
1475 UnknownSeq(20, alphabet = DNAAlphabet(), character = 'N')
1476
1477 If the UnknownSeq is using the gap character, then an empty Seq is
1478 returned:
1479
1480 >>> my_gap = UnknownSeq(20, Gapped(generic_dna,"-"), character="-")
1481 >>> my_gap
1482 UnknownSeq(20, alphabet = Gapped(DNAAlphabet(), '-'), character = '-')
1483 >>> my_gap.ungap()
1484 Seq('', DNAAlphabet())
1485 >>> my_gap.ungap("-")
1486 Seq('', DNAAlphabet())
1487
1488 Notice that the returned sequence's alphabet is adjusted to remove any
1489 explicit gap character declaration.
1490 """
1491
1492 s = Seq(self._character, self.alphabet).ungap()
1493 if s :
1494 return UnknownSeq(self._length, s.alphabet, self._character)
1495 else :
1496 return Seq("", s.alphabet)
1497
1499 """An editable sequence object (with an alphabet).
1500
1501 Unlike normal python strings and our basic sequence object (the Seq class)
1502 which are immuatable, the MutableSeq lets you edit the sequence in place.
1503 However, this means you cannot use a MutableSeq object as a dictionary key.
1504
1505 >>> from Bio.Seq import MutableSeq
1506 >>> from Bio.Alphabet import generic_dna
1507 >>> my_seq = MutableSeq("ACTCGTCGTCG", generic_dna)
1508 >>> my_seq
1509 MutableSeq('ACTCGTCGTCG', DNAAlphabet())
1510 >>> my_seq[5]
1511 'T'
1512 >>> my_seq[5] = "A"
1513 >>> my_seq
1514 MutableSeq('ACTCGACGTCG', DNAAlphabet())
1515 >>> my_seq[5]
1516 'A'
1517 >>> my_seq[5:8] = "NNN"
1518 >>> my_seq
1519 MutableSeq('ACTCGNNNTCG', DNAAlphabet())
1520 >>> len(my_seq)
1521 11
1522
1523 Note that the MutableSeq object does not support as many string-like
1524 or biological methods as the Seq object.
1525 """
1527 if sys.version_info[0] == 3:
1528 self.array_indicator = "u"
1529 else:
1530 self.array_indicator = "c"
1531 if isinstance(data, str):
1532 self.data = array.array(self.array_indicator, data)
1533 else:
1534 self.data = data
1535 self.alphabet = alphabet
1536
1538 """Returns a (truncated) representation of the sequence for debugging."""
1539 if len(self) > 60:
1540
1541
1542
1543 return "%s('%s...%s', %s)" % (self.__class__.__name__,
1544 str(self[:54]), str(self[-3:]),
1545 repr(self.alphabet))
1546 else:
1547 return "%s('%s', %s)" % (self.__class__.__name__,
1548 str(self),
1549 repr(self.alphabet))
1550
1552 """Returns the full sequence as a python string.
1553
1554 Note that Biopython 1.44 and earlier would give a truncated
1555 version of repr(my_seq) for str(my_seq). If you are writing code
1556 which needs to be backwards compatible with old Biopython, you
1557 should continue to use my_seq.tostring() rather than str(my_seq).
1558 """
1559
1560 return "".join(self.data)
1561
1563 """Compare the sequence to another sequence or a string (README).
1564
1565 Currently if compared to another sequence the alphabets must be
1566 compatible. Comparing DNA to RNA, or Nucleotide to Protein will raise
1567 an exception. Otherwise only the sequence itself is compared, not the
1568 precise alphabet.
1569
1570 A future release of Biopython will change this (and the Seq object etc)
1571 to use simple string comparison. The plan is that comparing sequences
1572 with incompatible alphabets (e.g. DNA to RNA) will trigger a warning
1573 but not an exception.
1574
1575 During this transition period, please just do explicit comparisons:
1576
1577 >>> seq1 = MutableSeq("ACGT")
1578 >>> seq2 = MutableSeq("ACGT")
1579 >>> id(seq1) == id(seq2)
1580 False
1581 >>> str(seq1) == str(seq2)
1582 True
1583
1584 This method indirectly supports ==, < , etc.
1585 """
1586 if hasattr(other, "alphabet"):
1587
1588 import warnings
1589 warnings.warn("In future comparing incompatible alphabets will "
1590 "only trigger a warning (not an exception). In "
1591 "the interim please use id(seq1)==id(seq2) or "
1592 "str(seq1)==str(seq2) to make your code explicit "
1593 "and to avoid this warning.", FutureWarning)
1594 if not Alphabet._check_type_compatible([self.alphabet,
1595 other.alphabet]):
1596 raise TypeError("Incompatable alphabets %s and %s" \
1597 % (repr(self.alphabet), repr(other.alphabet)))
1598
1599 if isinstance(other, MutableSeq):
1600
1601
1602 return cmp(self.data, other.data)
1603 else:
1604 return cmp(str(self), str(other))
1605 elif isinstance(other, basestring):
1606 return cmp(str(self), other)
1607 else:
1608 raise TypeError
1609
1611
1622
1639
1641
1642
1643
1644
1645
1646 del self.data[index]
1647
1649 """Add another sequence or string to this sequence.
1650
1651 Returns a new MutableSeq object."""
1652 if hasattr(other, "alphabet"):
1653
1654 if not Alphabet._check_type_compatible([self.alphabet,
1655 other.alphabet]):
1656 raise TypeError("Incompatable alphabets %s and %s" \
1657 % (repr(self.alphabet), repr(other.alphabet)))
1658
1659 a = Alphabet._consensus_alphabet([self.alphabet, other.alphabet])
1660 if isinstance(other, MutableSeq):
1661
1662
1663 return self.__class__(self.data + other.data, a)
1664 else:
1665 return self.__class__(str(self) + str(other), a)
1666 elif isinstance(other, basestring):
1667
1668 return self.__class__(str(self) + str(other), self.alphabet)
1669 else:
1670 raise TypeError
1671
1673 if hasattr(other, "alphabet"):
1674
1675 if not Alphabet._check_type_compatible([self.alphabet,
1676 other.alphabet]):
1677 raise TypeError("Incompatable alphabets %s and %s" \
1678 % (repr(self.alphabet), repr(other.alphabet)))
1679
1680 a = Alphabet._consensus_alphabet([self.alphabet, other.alphabet])
1681 if isinstance(other, MutableSeq):
1682
1683
1684 return self.__class__(other.data + self.data, a)
1685 else:
1686 return self.__class__(str(other) + str(self), a)
1687 elif isinstance(other, basestring):
1688
1689 return self.__class__(str(other) + str(self), self.alphabet)
1690 else:
1691 raise TypeError
1692
1695
1698
1699 - def pop(self, i = (-1)):
1700 c = self.data[i]
1701 del self.data[i]
1702 return c
1703
1705 for i in range(len(self.data)):
1706 if self.data[i] == item:
1707 del self.data[i]
1708 return
1709 raise ValueError("MutableSeq.remove(x): x not in list")
1710
1712 """Non-overlapping count method, like that of a python string.
1713
1714 This behaves like the python string method of the same name,
1715 which does a non-overlapping count!
1716
1717 Returns an integer, the number of occurrences of substring
1718 argument sub in the (sub)sequence given by [start:end].
1719 Optional arguments start and end are interpreted as in slice
1720 notation.
1721
1722 Arguments:
1723 - sub - a string or another Seq object to look for
1724 - start - optional integer, slice start
1725 - end - optional integer, slice end
1726
1727 e.g.
1728
1729 >>> from Bio.Seq import MutableSeq
1730 >>> my_mseq = MutableSeq("AAAATGA")
1731 >>> print my_mseq.count("A")
1732 5
1733 >>> print my_mseq.count("ATG")
1734 1
1735 >>> print my_mseq.count(Seq("AT"))
1736 1
1737 >>> print my_mseq.count("AT", 2, -1)
1738 1
1739
1740 HOWEVER, please note because that python strings, Seq objects and
1741 MutableSeq objects do a non-overlapping search, this may not give
1742 the answer you expect:
1743
1744 >>> "AAAA".count("AA")
1745 2
1746 >>> print MutableSeq("AAAA").count("AA")
1747 2
1748
1749 A non-overlapping search would give the answer as three!
1750 """
1751 try:
1752
1753 search = sub.tostring()
1754 except AttributeError:
1755 search = sub
1756
1757 if not isinstance(search, basestring):
1758 raise TypeError("expected a string, Seq or MutableSeq")
1759
1760 if len(search) == 1:
1761
1762 count = 0
1763 for c in self.data[start:end]:
1764 if c == search: count += 1
1765 return count
1766 else:
1767
1768 return self.tostring().count(search, start, end)
1769
1771 for i in range(len(self.data)):
1772 if self.data[i] == item:
1773 return i
1774 raise ValueError("MutableSeq.index(x): x not in list")
1775
1777 """Modify the mutable sequence to reverse itself.
1778
1779 No return value.
1780 """
1781 self.data.reverse()
1782
1808
1810 """Modify the mutable sequence to take on its reverse complement.
1811
1812 Trying to reverse complement a protein sequence raises an exception.
1813
1814 No return value.
1815 """
1816 self.complement()
1817 self.data.reverse()
1818
1819
1820
1821
1829
1831 """Returns the full sequence as a python string (semi-obsolete).
1832
1833 Although not formally deprecated, you are now encouraged to use
1834 str(my_seq) instead of my_seq.tostring().
1835
1836 Because str(my_seq) will give you the full sequence as a python string,
1837 there is often no need to make an explicit conversion. For example,
1838
1839 print "ID={%s}, sequence={%s}" % (my_name, my_seq)
1840
1841 On Biopython 1.44 or older you would have to have done this:
1842
1843 print "ID={%s}, sequence={%s}" % (my_name, my_seq.tostring())
1844 """
1845 return "".join(self.data)
1846
1848 """Returns the full sequence as a new immutable Seq object.
1849
1850 >>> from Bio.Seq import Seq
1851 >>> from Bio.Alphabet import IUPAC
1852 >>> my_mseq = MutableSeq("MKQHKAMIVALIVICITAVVAAL",
1853 ... IUPAC.protein)
1854 >>> my_mseq
1855 MutableSeq('MKQHKAMIVALIVICITAVVAAL', IUPACProtein())
1856 >>> my_mseq.toseq()
1857 Seq('MKQHKAMIVALIVICITAVVAAL', IUPACProtein())
1858
1859 Note that the alphabet is preserved.
1860 """
1861 return Seq("".join(self.data), self.alphabet)
1862
1868 """Transcribes a DNA sequence into RNA.
1869
1870 If given a string, returns a new string object.
1871
1872 Given a Seq or MutableSeq, returns a new Seq object with an RNA alphabet.
1873
1874 Trying to transcribe a protein or RNA sequence raises an exception.
1875
1876 e.g.
1877
1878 >>> transcribe("ACTGN")
1879 'ACUGN'
1880 """
1881 if isinstance(dna, Seq):
1882 return dna.transcribe()
1883 elif isinstance(dna, MutableSeq):
1884 return dna.toseq().transcribe()
1885 else:
1886 return dna.replace('T','U').replace('t','u')
1887
1889 """Back-transcribes an RNA sequence into DNA.
1890
1891 If given a string, returns a new string object.
1892
1893 Given a Seq or MutableSeq, returns a new Seq object with an RNA alphabet.
1894
1895 Trying to transcribe a protein or DNA sequence raises an exception.
1896
1897 e.g.
1898
1899 >>> back_transcribe("ACUGN")
1900 'ACTGN'
1901 """
1902 if isinstance(rna, Seq):
1903 return rna.back_transcribe()
1904 elif isinstance(rna, MutableSeq):
1905 return rna.toseq().back_transcribe()
1906 else:
1907 return rna.replace('U','T').replace('u','t')
1908
1909 -def _translate_str(sequence, table, stop_symbol="*", to_stop=False,
1910 cds=False, pos_stop="X"):
1911 """Helper function to translate a nucleotide string (PRIVATE).
1912
1913 Arguments:
1914 - sequence - a string
1915 - table - a CodonTable object (NOT a table name or id number)
1916 - stop_symbol - a single character string, what to use for terminators.
1917 - to_stop - boolean, should translation terminate at the first
1918 in frame stop codon? If there is no in-frame stop codon
1919 then translation continues to the end.
1920 - pos_stop - a single character string for a possible stop codon
1921 (e.g. TAN or NNN)
1922 - cds - Boolean, indicates this is a complete CDS. If True, this
1923 checks the sequence starts with a valid alternative start
1924 codon (which will be translated as methionine, M), that the
1925 sequence length is a multiple of three, and that there is a
1926 single in frame stop codon at the end (this will be excluded
1927 from the protein sequence, regardless of the to_stop option).
1928 If these tests fail, an exception is raised.
1929
1930 Returns a string.
1931
1932 e.g.
1933
1934 >>> from Bio.Data import CodonTable
1935 >>> table = CodonTable.ambiguous_dna_by_id[1]
1936 >>> _translate_str("AAA", table)
1937 'K'
1938 >>> _translate_str("TAR", table)
1939 '*'
1940 >>> _translate_str("TAN", table)
1941 'X'
1942 >>> _translate_str("TAN", table, pos_stop="@")
1943 '@'
1944 >>> _translate_str("TA?", table)
1945 Traceback (most recent call last):
1946 ...
1947 TranslationError: Codon 'TA?' is invalid
1948 >>> _translate_str("ATGCCCTAG", table, cds=True)
1949 'MP'
1950 >>> _translate_str("AAACCCTAG", table, cds=True)
1951 Traceback (most recent call last):
1952 ...
1953 TranslationError: First codon 'AAA' is not a start codon
1954 >>> _translate_str("ATGCCCTAGCCCTAG", table, cds=True)
1955 Traceback (most recent call last):
1956 ...
1957 TranslationError: Extra in frame stop codon found.
1958 """
1959 sequence = sequence.upper()
1960 amino_acids = []
1961 forward_table = table.forward_table
1962 stop_codons = table.stop_codons
1963 if table.nucleotide_alphabet.letters is not None:
1964 valid_letters = set(table.nucleotide_alphabet.letters.upper())
1965 else:
1966
1967 valid_letters = set(IUPAC.ambiguous_dna.letters.upper() + \
1968 IUPAC.ambiguous_rna.letters.upper())
1969 if cds:
1970 if str(sequence[:3]).upper() not in table.start_codons:
1971 raise CodonTable.TranslationError(\
1972 "First codon '%s' is not a start codon" % sequence[:3])
1973 if len(sequence) % 3 != 0:
1974 raise CodonTable.TranslationError(\
1975 "Sequence length %i is not a multiple of three" % len(sequence))
1976 if str(sequence[-3:]).upper() not in stop_codons:
1977 raise CodonTable.TranslationError(\
1978 "Final codon '%s' is not a stop codon" % sequence[-3:])
1979
1980 sequence = sequence[3:-3]
1981 amino_acids = ["M"]
1982 n = len(sequence)
1983 for i in xrange(0,n-n%3,3):
1984 codon = sequence[i:i+3]
1985 try:
1986 amino_acids.append(forward_table[codon])
1987 except (KeyError, CodonTable.TranslationError):
1988
1989 if codon in table.stop_codons:
1990 if cds:
1991 raise CodonTable.TranslationError(\
1992 "Extra in frame stop codon found.")
1993 if to_stop : break
1994 amino_acids.append(stop_symbol)
1995 elif valid_letters.issuperset(set(codon)):
1996
1997 amino_acids.append(pos_stop)
1998 else:
1999 raise CodonTable.TranslationError(\
2000 "Codon '%s' is invalid" % codon)
2001 return "".join(amino_acids)
2002
2003 -def translate(sequence, table="Standard", stop_symbol="*", to_stop=False,
2004 cds=False):
2005 """Translate a nucleotide sequence into amino acids.
2006
2007 If given a string, returns a new string object. Given a Seq or
2008 MutableSeq, returns a Seq object with a protein alphabet.
2009
2010 Arguments:
2011 - table - Which codon table to use? This can be either a name (string),
2012 an NCBI identifier (integer), or a CodonTable object (useful
2013 for non-standard genetic codes). Defaults to the "Standard"
2014 table.
2015 - stop_symbol - Single character string, what to use for any
2016 terminators, defaults to the asterisk, "*".
2017 - to_stop - Boolean, defaults to False meaning do a full
2018 translation continuing on past any stop codons
2019 (translated as the specified stop_symbol). If
2020 True, translation is terminated at the first in
2021 frame stop codon (and the stop_symbol is not
2022 appended to the returned protein sequence).
2023 - cds - Boolean, indicates this is a complete CDS. If True, this
2024 checks the sequence starts with a valid alternative start
2025 codon (which will be translated as methionine, M), that the
2026 sequence length is a multiple of three, and that there is a
2027 single in frame stop codon at the end (this will be excluded
2028 from the protein sequence, regardless of the to_stop option).
2029 If these tests fail, an exception is raised.
2030
2031 A simple string example using the default (standard) genetic code:
2032
2033 >>> coding_dna = "GTGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG"
2034 >>> translate(coding_dna)
2035 'VAIVMGR*KGAR*'
2036 >>> translate(coding_dna, stop_symbol="@")
2037 'VAIVMGR@KGAR@'
2038 >>> translate(coding_dna, to_stop=True)
2039 'VAIVMGR'
2040
2041 Now using NCBI table 2, where TGA is not a stop codon:
2042
2043 >>> translate(coding_dna, table=2)
2044 'VAIVMGRWKGAR*'
2045 >>> translate(coding_dna, table=2, to_stop=True)
2046 'VAIVMGRWKGAR'
2047
2048 In fact this example uses an alternative start codon valid under NCBI table 2,
2049 GTG, which means this example is a complete valid CDS which when translated
2050 should really start with methionine (not valine):
2051
2052 >>> translate(coding_dna, table=2, cds=True)
2053 'MAIVMGRWKGAR'
2054
2055 Note that if the sequence has no in-frame stop codon, then the to_stop
2056 argument has no effect:
2057
2058 >>> coding_dna2 = "GTGGCCATTGTAATGGGCCGC"
2059 >>> translate(coding_dna2)
2060 'VAIVMGR'
2061 >>> translate(coding_dna2, to_stop=True)
2062 'VAIVMGR'
2063
2064 NOTE - Ambiguous codons like "TAN" or "NNN" could be an amino acid
2065 or a stop codon. These are translated as "X". Any invalid codon
2066 (e.g. "TA?" or "T-A") will throw a TranslationError.
2067
2068 NOTE - Does NOT support gapped sequences.
2069
2070 It will however translate either DNA or RNA.
2071 """
2072 if isinstance(sequence, Seq):
2073 return sequence.translate(table, stop_symbol, to_stop, cds)
2074 elif isinstance(sequence, MutableSeq):
2075
2076 return sequence.toseq().translate(table, stop_symbol, to_stop, cds)
2077 else:
2078
2079 try:
2080 codon_table = CodonTable.ambiguous_generic_by_id[int(table)]
2081 except ValueError:
2082 codon_table = CodonTable.ambiguous_generic_by_name[table]
2083 except (AttributeError, TypeError):
2084 if isinstance(table, CodonTable.CodonTable):
2085 codon_table = table
2086 else:
2087 raise ValueError('Bad table argument')
2088 return _translate_str(sequence, codon_table, stop_symbol, to_stop, cds)
2089
2123
2125 """Run the Bio.Seq module's doctests (PRIVATE)."""
2126 if sys.version_info[0:2] == (3,1):
2127 print "Not running Bio.Seq doctest on Python 3.1"
2128 print "See http://bugs.python.org/issue7490"
2129 else:
2130 print "Runing doctests..."
2131 import doctest
2132 doctest.testmod(optionflags=doctest.IGNORE_EXCEPTION_DETAIL)
2133 print "Done"
2134
2135 if __name__ == "__main__":
2136 _test()
2137