1
2
3
4
5
6
7
8 """Represent a Sequence Record, a sequence with annotation."""
9 __docformat__ = "epytext en"
10
11
12
13
14
15
17 """Dict which only allows sequences of given length as values (PRIVATE).
18
19 This simple subclass of the Python dictionary is used in the SeqRecord
20 object for holding per-letter-annotations. This class is intended to
21 prevent simple errors by only allowing python sequences (e.g. lists,
22 strings and tuples) to be stored, and only if their length matches that
23 expected (the length of the SeqRecord's seq object). It cannot however
24 prevent the entries being edited in situ (for example appending entries
25 to a list).
26
27 >>> x = _RestrictedDict(5)
28 >>> x["test"] = "hello"
29 >>> x
30 {'test': 'hello'}
31
32 Adding entries which don't have the expected length are blocked:
33
34 >>> x["test"] = "hello world"
35 Traceback (most recent call last):
36 ...
37 TypeError: We only allow python sequences (lists, tuples or strings) of length 5.
38
39 The expected length is stored as a private attribute,
40
41 >>> x._length
42 5
43
44 In order that the SeqRecord (and other objects using this class) can be
45 pickled, for example for use in the multiprocessing library, we need to
46 be able to pickle the restricted dictionary objects.
47
48 Using the default protocol, which is 0 on Python 2.x,
49
50 >>> import pickle
51 >>> y = pickle.loads(pickle.dumps(x))
52 >>> y
53 {'test': 'hello'}
54 >>> y._length
55 5
56
57 Using the highest protocol, which is 2 on Python 2.x,
58
59 >>> import pickle
60 >>> z = pickle.loads(pickle.dumps(x, pickle.HIGHEST_PROTOCOL))
61 >>> z
62 {'test': 'hello'}
63 >>> z._length
64 5
65 """
66
71
73
74
75 if not hasattr(value,"__len__") or not hasattr(value,"__getitem__") \
76 or (hasattr(self, "_length") and len(value) != self._length):
77 raise TypeError("We only allow python sequences (lists, tuples or "
78 "strings) of length %i." % self._length)
79 dict.__setitem__(self, key, value)
80
85
86
88 """A SeqRecord object holds a sequence and information about it.
89
90 Main attributes:
91 - id - Identifier such as a locus tag (string)
92 - seq - The sequence itself (Seq object or similar)
93
94 Additional attributes:
95 - name - Sequence name, e.g. gene name (string)
96 - description - Additional text (string)
97 - dbxrefs - List of database cross references (list of strings)
98 - features - Any (sub)features defined (list of SeqFeature objects)
99 - annotations - Further information about the whole sequence (dictionary)
100 Most entries are strings, or lists of strings.
101 - letter_annotations - Per letter/symbol annotation (restricted
102 dictionary). This holds Python sequences (lists, strings
103 or tuples) whose length matches that of the sequence.
104 A typical use would be to hold a list of integers
105 representing sequencing quality scores, or a string
106 representing the secondary structure.
107
108 You will typically use Bio.SeqIO to read in sequences from files as
109 SeqRecord objects. However, you may want to create your own SeqRecord
110 objects directly (see the __init__ method for further details):
111
112 >>> from Bio.Seq import Seq
113 >>> from Bio.SeqRecord import SeqRecord
114 >>> from Bio.Alphabet import IUPAC
115 >>> record = SeqRecord(Seq("MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF",
116 ... IUPAC.protein),
117 ... id="YP_025292.1", name="HokC",
118 ... description="toxic membrane protein")
119 >>> print record
120 ID: YP_025292.1
121 Name: HokC
122 Description: toxic membrane protein
123 Number of features: 0
124 Seq('MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF', IUPACProtein())
125
126 If you want to save SeqRecord objects to a sequence file, use Bio.SeqIO
127 for this. For the special case where you want the SeqRecord turned into
128 a string in a particular file format there is a format method which uses
129 Bio.SeqIO internally:
130
131 >>> print record.format("fasta")
132 >YP_025292.1 toxic membrane protein
133 MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF
134 <BLANKLINE>
135
136 You can also do things like slicing a SeqRecord, checking its length, etc
137
138 >>> len(record)
139 44
140 >>> edited = record[:10] + record[11:]
141 >>> print edited.seq
142 MKQHKAMIVAIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF
143 >>> print record.seq
144 MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF
145
146 """
147 - def __init__(self, seq, id = "<unknown id>", name = "<unknown name>",
148 description = "<unknown description>", dbxrefs = None,
149 features = None, annotations = None,
150 letter_annotations = None):
151 """Create a SeqRecord.
152
153 Arguments:
154 - seq - Sequence, required (Seq, MutableSeq or UnknownSeq)
155 - id - Sequence identifier, recommended (string)
156 - name - Sequence name, optional (string)
157 - description - Sequence description, optional (string)
158 - dbxrefs - Database cross references, optional (list of strings)
159 - features - Any (sub)features, optional (list of SeqFeature objects)
160 - annotations - Dictionary of annotations for the whole sequence
161 - letter_annotations - Dictionary of per-letter-annotations, values
162 should be strings, list or tuples of the same
163 length as the full sequence.
164
165 You will typically use Bio.SeqIO to read in sequences from files as
166 SeqRecord objects. However, you may want to create your own SeqRecord
167 objects directly.
168
169 Note that while an id is optional, we strongly recommend you supply a
170 unique id string for each record. This is especially important
171 if you wish to write your sequences to a file.
172
173 If you don't have the actual sequence, but you do know its length,
174 then using the UnknownSeq object from Bio.Seq is appropriate.
175
176 You can create a 'blank' SeqRecord object, and then populate the
177 attributes later.
178 """
179 if id is not None and not isinstance(id, basestring):
180
181 raise TypeError("id argument should be a string")
182 if not isinstance(name, basestring):
183 raise TypeError("name argument should be a string")
184 if not isinstance(description, basestring):
185 raise TypeError("description argument should be a string")
186 self._seq = seq
187 self.id = id
188 self.name = name
189 self.description = description
190
191
192 if dbxrefs is None:
193 dbxrefs = []
194 elif not isinstance(dbxrefs, list):
195 raise TypeError("dbxrefs argument should be a list (of strings)")
196 self.dbxrefs = dbxrefs
197
198
199 if annotations is None:
200 annotations = {}
201 elif not isinstance(annotations, dict):
202 raise TypeError("annotations argument should be a dict")
203 self.annotations = annotations
204
205 if letter_annotations is None:
206
207 if seq is None:
208
209 self._per_letter_annotations = _RestrictedDict(length=0)
210 else:
211 try:
212 self._per_letter_annotations = \
213 _RestrictedDict(length=len(seq))
214 except:
215 raise TypeError("seq argument should be a Seq object or similar")
216 else:
217
218
219
220 self.letter_annotations = letter_annotations
221
222
223 if features is None:
224 features = []
225 elif not isinstance(features, list):
226 raise TypeError("features argument should be a list (of SeqFeature objects)")
227 self.features = features
228
229
231 if not isinstance(value, dict):
232 raise TypeError("The per-letter-annotations should be a "
233 "(restricted) dictionary.")
234
235 try:
236 self._per_letter_annotations = _RestrictedDict(length=len(self.seq))
237 except AttributeError:
238
239 self._per_letter_annotations = _RestrictedDict(length=0)
240 self._per_letter_annotations.update(value)
241 letter_annotations = property( \
242 fget=lambda self : self._per_letter_annotations,
243 fset=_set_per_letter_annotations,
244 doc="""Dictionary of per-letter-annotation for the sequence.
245
246 For example, this can hold quality scores used in FASTQ or QUAL files.
247 Consider this example using Bio.SeqIO to read in an example Solexa
248 variant FASTQ file as a SeqRecord:
249
250 >>> from Bio import SeqIO
251 >>> handle = open("Quality/solexa_faked.fastq", "rU")
252 >>> record = SeqIO.read(handle, "fastq-solexa")
253 >>> handle.close()
254 >>> print record.id, record.seq
255 slxa_0001_1_0001_01 ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTNNNNNN
256 >>> print record.letter_annotations.keys()
257 ['solexa_quality']
258 >>> print record.letter_annotations["solexa_quality"]
259 [40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, -1, -2, -3, -4, -5]
260
261 The letter_annotations get sliced automatically if you slice the
262 parent SeqRecord, for example taking the last ten bases:
263
264 >>> sub_record = record[-10:]
265 >>> print sub_record.id, sub_record.seq
266 slxa_0001_1_0001_01 ACGTNNNNNN
267 >>> print sub_record.letter_annotations["solexa_quality"]
268 [4, 3, 2, 1, 0, -1, -2, -3, -4, -5]
269
270 Any python sequence (i.e. list, tuple or string) can be recorded in
271 the SeqRecord's letter_annotations dictionary as long as the length
272 matches that of the SeqRecord's sequence. e.g.
273
274 >>> len(sub_record.letter_annotations)
275 1
276 >>> sub_record.letter_annotations["dummy"] = "abcdefghij"
277 >>> len(sub_record.letter_annotations)
278 2
279
280 You can delete entries from the letter_annotations dictionary as usual:
281
282 >>> del sub_record.letter_annotations["solexa_quality"]
283 >>> sub_record.letter_annotations
284 {'dummy': 'abcdefghij'}
285
286 You can completely clear the dictionary easily as follows:
287
288 >>> sub_record.letter_annotations = {}
289 >>> sub_record.letter_annotations
290 {}
291 """)
292
294
295 if self._per_letter_annotations:
296
297 raise ValueError("You must empty the letter annotations first!")
298 self._seq = value
299 try:
300 self._per_letter_annotations = _RestrictedDict(length=len(self.seq))
301 except AttributeError:
302
303 self._per_letter_annotations = _RestrictedDict(length=0)
304
305 seq = property(fget=lambda self : self._seq,
306 fset=_set_seq,
307 doc="The sequence itself, as a Seq or MutableSeq object.")
308
310 """Returns a sub-sequence or an individual letter.
311
312 Slicing, e.g. my_record[5:10], returns a new SeqRecord for
313 that sub-sequence with approriate annotation preserved. The
314 name, id and description are kept.
315
316 Any per-letter-annotations are sliced to match the requested
317 sub-sequence. Unless a stride is used, all those features
318 which fall fully within the subsequence are included (with
319 their locations adjusted accordingly).
320
321 However, the annotations dictionary and the dbxrefs list are
322 not used for the new SeqRecord, as in general they may not
323 apply to the subsequence. If you want to preserve them, you
324 must explictly copy them to the new SeqRecord yourself.
325
326 Using an integer index, e.g. my_record[5] is shorthand for
327 extracting that letter from the sequence, my_record.seq[5].
328
329 For example, consider this short protein and its secondary
330 structure as encoded by the PDB (e.g. H for alpha helices),
331 plus a simple feature for its histidine self phosphorylation
332 site:
333
334 >>> from Bio.Seq import Seq
335 >>> from Bio.SeqRecord import SeqRecord
336 >>> from Bio.SeqFeature import SeqFeature, FeatureLocation
337 >>> from Bio.Alphabet import IUPAC
338 >>> rec = SeqRecord(Seq("MAAGVKQLADDRTLLMAGVSHDLRTPLTRIRLAT"
339 ... "EMMSEQDGYLAESINKDIEECNAIIEQFIDYLR",
340 ... IUPAC.protein),
341 ... id="1JOY", name="EnvZ",
342 ... description="Homodimeric domain of EnvZ from E. coli")
343 >>> rec.letter_annotations["secondary_structure"] = " S SSSSSSHHHHHTTTHHHHHHHHHHHHHHHHHHHHHHTHHHHHHHHHHHHHHHHHHHHHTT "
344 >>> rec.features.append(SeqFeature(FeatureLocation(20,21),
345 ... type = "Site"))
346
347 Now let's have a quick look at the full record,
348
349 >>> print rec
350 ID: 1JOY
351 Name: EnvZ
352 Description: Homodimeric domain of EnvZ from E. coli
353 Number of features: 1
354 Per letter annotation for: secondary_structure
355 Seq('MAAGVKQLADDRTLLMAGVSHDLRTPLTRIRLATEMMSEQDGYLAESINKDIEE...YLR', IUPACProtein())
356 >>> print rec.letter_annotations["secondary_structure"]
357 S SSSSSSHHHHHTTTHHHHHHHHHHHHHHHHHHHHHHTHHHHHHHHHHHHHHHHHHHHHTT
358 >>> print rec.features[0].location
359 [20:21]
360
361 Now let's take a sub sequence, here chosen as the first (fractured)
362 alpha helix which includes the histidine phosphorylation site:
363
364 >>> sub = rec[11:41]
365 >>> print sub
366 ID: 1JOY
367 Name: EnvZ
368 Description: Homodimeric domain of EnvZ from E. coli
369 Number of features: 1
370 Per letter annotation for: secondary_structure
371 Seq('RTLLMAGVSHDLRTPLTRIRLATEMMSEQD', IUPACProtein())
372 >>> print sub.letter_annotations["secondary_structure"]
373 HHHHHTTTHHHHHHHHHHHHHHHHHHHHHH
374 >>> print sub.features[0].location
375 [9:10]
376
377 You can also of course omit the start or end values, for
378 example to get the first ten letters only:
379
380 >>> print rec[:10]
381 ID: 1JOY
382 Name: EnvZ
383 Description: Homodimeric domain of EnvZ from E. coli
384 Number of features: 0
385 Per letter annotation for: secondary_structure
386 Seq('MAAGVKQLAD', IUPACProtein())
387
388 Or for the last ten letters:
389
390 >>> print rec[-10:]
391 ID: 1JOY
392 Name: EnvZ
393 Description: Homodimeric domain of EnvZ from E. coli
394 Number of features: 0
395 Per letter annotation for: secondary_structure
396 Seq('IIEQFIDYLR', IUPACProtein())
397
398 If you omit both, then you get a copy of the original record (although
399 lacking the annotations and dbxrefs):
400
401 >>> print rec[:]
402 ID: 1JOY
403 Name: EnvZ
404 Description: Homodimeric domain of EnvZ from E. coli
405 Number of features: 1
406 Per letter annotation for: secondary_structure
407 Seq('MAAGVKQLADDRTLLMAGVSHDLRTPLTRIRLATEMMSEQDGYLAESINKDIEE...YLR', IUPACProtein())
408
409 Finally, indexing with a simple integer is shorthand for pulling out
410 that letter from the sequence directly:
411
412 >>> rec[5]
413 'K'
414 >>> rec.seq[5]
415 'K'
416 """
417 if isinstance(index, int):
418
419
420
421 return self.seq[index]
422 elif isinstance(index, slice):
423 if self.seq is None:
424 raise ValueError("If the sequence is None, we cannot slice it.")
425 parent_length = len(self)
426 answer = self.__class__(self.seq[index],
427 id=self.id,
428 name=self.name,
429 description=self.description)
430
431
432
433
434
435
436
437
438
439
440
441 start, stop, step = index.indices(parent_length)
442 if step == 1:
443
444
445 for f in self.features:
446 if f.ref or f.ref_db:
447
448 import warnings
449 warnings.warn("When slicing SeqRecord objects, any "
450 "SeqFeature referencing other sequences (e.g. "
451 "from segmented GenBank records) are ignored.")
452 continue
453 if start <= f.location.nofuzzy_start \
454 and f.location.nofuzzy_end <= stop:
455 answer.features.append(f._shift(-start))
456
457
458
459 for key, value in self.letter_annotations.iteritems():
460 answer._per_letter_annotations[key] = value[index]
461
462 return answer
463 raise ValueError, "Invalid index"
464
466 """Iterate over the letters in the sequence.
467
468 For example, using Bio.SeqIO to read in a protein FASTA file:
469
470 >>> from Bio import SeqIO
471 >>> record = SeqIO.read(open("Fasta/loveliesbleeding.pro"),"fasta")
472 >>> for amino in record:
473 ... print amino
474 ... if amino == "L": break
475 X
476 A
477 G
478 L
479 >>> print record.seq[3]
480 L
481
482 This is just a shortcut for iterating over the sequence directly:
483
484 >>> for amino in record.seq:
485 ... print amino
486 ... if amino == "L": break
487 X
488 A
489 G
490 L
491 >>> print record.seq[3]
492 L
493
494 Note that this does not facilitate iteration together with any
495 per-letter-annotation. However, you can achieve that using the
496 python zip function on the record (or its sequence) and the relevant
497 per-letter-annotation:
498
499 >>> from Bio import SeqIO
500 >>> rec = SeqIO.read(open("Quality/solexa_faked.fastq", "rU"),
501 ... "fastq-solexa")
502 >>> print rec.id, rec.seq
503 slxa_0001_1_0001_01 ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTNNNNNN
504 >>> print rec.letter_annotations.keys()
505 ['solexa_quality']
506 >>> for nuc, qual in zip(rec,rec.letter_annotations["solexa_quality"]):
507 ... if qual > 35:
508 ... print nuc, qual
509 A 40
510 C 39
511 G 38
512 T 37
513 A 36
514
515 You may agree that using zip(rec.seq, ...) is more explicit than using
516 zip(rec, ...) as shown above.
517 """
518 return iter(self.seq)
519
521 """Implements the 'in' keyword, searches the sequence.
522
523 e.g.
524
525 >>> from Bio import SeqIO
526 >>> record = SeqIO.read(open("Fasta/sweetpea.nu"), "fasta")
527 >>> "GAATTC" in record
528 False
529 >>> "AAA" in record
530 True
531
532 This essentially acts as a proxy for using "in" on the sequence:
533
534 >>> "GAATTC" in record.seq
535 False
536 >>> "AAA" in record.seq
537 True
538
539 Note that you can also use Seq objects as the query,
540
541 >>> from Bio.Seq import Seq
542 >>> from Bio.Alphabet import generic_dna
543 >>> Seq("AAA") in record
544 True
545 >>> Seq("AAA", generic_dna) in record
546 True
547
548 See also the Seq object's __contains__ method.
549 """
550 return char in self.seq
551
552
554 """A human readable summary of the record and its annotation (string).
555
556 The python built in function str works by calling the object's ___str__
557 method. e.g.
558
559 >>> from Bio.Seq import Seq
560 >>> from Bio.SeqRecord import SeqRecord
561 >>> from Bio.Alphabet import IUPAC
562 >>> record = SeqRecord(Seq("MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF",
563 ... IUPAC.protein),
564 ... id="YP_025292.1", name="HokC",
565 ... description="toxic membrane protein, small")
566 >>> print str(record)
567 ID: YP_025292.1
568 Name: HokC
569 Description: toxic membrane protein, small
570 Number of features: 0
571 Seq('MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF', IUPACProtein())
572
573 In this example you don't actually need to call str explicity, as the
574 print command does this automatically:
575
576 >>> print record
577 ID: YP_025292.1
578 Name: HokC
579 Description: toxic membrane protein, small
580 Number of features: 0
581 Seq('MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF', IUPACProtein())
582
583 Note that long sequences are shown truncated.
584 """
585 lines = []
586 if self.id:
587 lines.append("ID: %s" % self.id)
588 if self.name:
589 lines.append("Name: %s" % self.name)
590 if self.description:
591 lines.append("Description: %s" % self.description)
592 if self.dbxrefs:
593 lines.append("Database cross-references: " \
594 + ", ".join(self.dbxrefs))
595 lines.append("Number of features: %i" % len(self.features))
596 for a in self.annotations:
597 lines.append("/%s=%s" % (a, str(self.annotations[a])))
598 if self.letter_annotations:
599 lines.append("Per letter annotation for: " \
600 + ", ".join(self.letter_annotations.keys()))
601
602
603 lines.append(repr(self.seq))
604 return "\n".join(lines)
605
607 """A concise summary of the record for debugging (string).
608
609 The python built in function repr works by calling the object's ___repr__
610 method. e.g.
611
612 >>> from Bio.Seq import Seq
613 >>> from Bio.SeqRecord import SeqRecord
614 >>> from Bio.Alphabet import generic_protein
615 >>> rec = SeqRecord(Seq("MASRGVNKVILVGNLGQDPEVRYMPNGGAVANITLATSESWRDKAT"
616 ... +"GEMKEQTEWHRVVLFGKLAEVASEYLRKGSQVYIEGQLRTRKWTDQ"
617 ... +"SGQDRYTTEVVVNVGGTMQMLGGRQGGGAPAGGNIGGGQPQGGWGQ"
618 ... +"PQQPQGGNQFSGGAQSRPQQSAPAAPSNEPPMDFDDDIPF",
619 ... generic_protein),
620 ... id="NP_418483.1", name="b4059",
621 ... description="ssDNA-binding protein",
622 ... dbxrefs=["ASAP:13298", "GI:16131885", "GeneID:948570"])
623 >>> print repr(rec)
624 SeqRecord(seq=Seq('MASRGVNKVILVGNLGQDPEVRYMPNGGAVANITLATSESWRDKATGEMKEQTE...IPF', ProteinAlphabet()), id='NP_418483.1', name='b4059', description='ssDNA-binding protein', dbxrefs=['ASAP:13298', 'GI:16131885', 'GeneID:948570'])
625
626 At the python prompt you can also use this shorthand:
627
628 >>> rec
629 SeqRecord(seq=Seq('MASRGVNKVILVGNLGQDPEVRYMPNGGAVANITLATSESWRDKATGEMKEQTE...IPF', ProteinAlphabet()), id='NP_418483.1', name='b4059', description='ssDNA-binding protein', dbxrefs=['ASAP:13298', 'GI:16131885', 'GeneID:948570'])
630
631 Note that long sequences are shown truncated. Also note that any
632 annotations, letter_annotations and features are not shown (as they
633 would lead to a very long string).
634 """
635 return self.__class__.__name__ \
636 + "(seq=%s, id=%s, name=%s, description=%s, dbxrefs=%s)" \
637 % tuple(map(repr, (self.seq, self.id, self.name,
638 self.description, self.dbxrefs)))
639
673
701
703 """Returns the length of the sequence.
704
705 For example, using Bio.SeqIO to read in a FASTA nucleotide file:
706
707 >>> from Bio import SeqIO
708 >>> record = SeqIO.read(open("Fasta/sweetpea.nu"),"fasta")
709 >>> len(record)
710 309
711 >>> len(record.seq)
712 309
713 """
714 return len(self.seq)
715
717 """Returns True regardless of the length of the sequence.
718
719 This behaviour is for backwards compatibility, since until the
720 __len__ method was added, a SeqRecord always evaluated as True.
721
722 Note that in comparison, a Seq object will evaluate to False if it
723 has a zero length sequence.
724
725 WARNING: The SeqRecord may in future evaluate to False when its
726 sequence is of zero length (in order to better match the Seq
727 object behaviour)!
728 """
729 return True
730
732 """Add another sequence or string to this sequence.
733
734 The other sequence can be a SeqRecord object, a Seq object (or
735 similar, e.g. a MutableSeq) or a plain Python string. If you add
736 a plain string or a Seq (like) object, the new SeqRecord will simply
737 have this appended to the existing data. However, any per letter
738 annotation will be lost:
739
740 >>> from Bio import SeqIO
741 >>> handle = open("Quality/solexa_faked.fastq", "rU")
742 >>> record = SeqIO.read(handle, "fastq-solexa")
743 >>> handle.close()
744 >>> print record.id, record.seq
745 slxa_0001_1_0001_01 ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTNNNNNN
746 >>> print record.letter_annotations.keys()
747 ['solexa_quality']
748
749 >>> new = record + "ACT"
750 >>> print new.id, new.seq
751 slxa_0001_1_0001_01 ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTNNNNNNACT
752 >>> print new.letter_annotations.keys()
753 []
754
755 The new record will attempt to combine the annotation, but for any
756 ambiguities (e.g. different names) it defaults to omitting that
757 annotation.
758
759 >>> from Bio import SeqIO
760 >>> handle = open("GenBank/pBAD30.gb")
761 >>> plasmid = SeqIO.read(handle, "gb")
762 >>> handle.close()
763 >>> print plasmid.id, len(plasmid)
764 pBAD30 4923
765
766 Now let's cut the plasmid into two pieces, and join them back up the
767 other way round (i.e. shift the starting point on this plasmid, have
768 a look at the annotated features in the original file to see why this
769 particular split point might make sense):
770
771 >>> left = plasmid[:3765]
772 >>> right = plasmid[3765:]
773 >>> new = right + left
774 >>> print new.id, len(new)
775 pBAD30 4923
776 >>> str(new.seq) == str(right.seq + left.seq)
777 True
778 >>> len(new.features) == len(left.features) + len(right.features)
779 True
780
781 When we add the left and right SeqRecord objects, their annotation
782 is all consistent, so it is all conserved in the new SeqRecord:
783
784 >>> new.id == left.id == right.id == plasmid.id
785 True
786 >>> new.name == left.name == right.name == plasmid.name
787 True
788 >>> new.description == plasmid.description
789 True
790 >>> new.annotations == left.annotations == right.annotations
791 True
792 >>> new.letter_annotations == plasmid.letter_annotations
793 True
794 >>> new.dbxrefs == left.dbxrefs == right.dbxrefs
795 True
796
797 However, we should point out that when we sliced the SeqRecord,
798 any annotations dictionary or dbxrefs list entries were lost.
799 You can explicitly copy them like this:
800
801 >>> new.annotations = plasmid.annotations.copy()
802 >>> new.dbxrefs = plasmid.dbxrefs[:]
803 """
804 if not isinstance(other, SeqRecord):
805
806
807 return SeqRecord(self.seq + other,
808 id = self.id, name = self.name,
809 description = self.description,
810 features = self.features[:],
811 annotations = self.annotations.copy(),
812 dbxrefs = self.dbxrefs[:])
813
814 answer = SeqRecord(self.seq + other.seq,
815 features = self.features[:],
816 dbxrefs = self.dbxrefs[:])
817
818 l = len(self)
819 for f in other.features:
820 answer.features.append(f._shift(l))
821 del l
822 for ref in other.dbxrefs:
823 if ref not in answer.dbxrefs:
824 answer.dbxrefs.append(ref)
825
826 if self.id == other.id:
827 answer.id = self.id
828 if self.name == other.name:
829 answer.name = self.name
830 if self.description == other.description:
831 answer.description = self.description
832 for k,v in self.annotations.iteritems():
833 if k in other.annotations and other.annotations[k] == v:
834 answer.annotations[k] = v
835
836 for k,v in self.letter_annotations.iteritems():
837 if k in other.letter_annotations:
838 answer.letter_annotations[k] = v + other.letter_annotations[k]
839 return answer
840
842 """Add another sequence or string to this sequence (from the left).
843
844 This method handles adding a Seq object (or similar, e.g. MutableSeq)
845 or a plain Python string (on the left) to a SeqRecord (on the right).
846 See the __add__ method for more details, but for example:
847
848 >>> from Bio import SeqIO
849 >>> handle = open("Quality/solexa_faked.fastq", "rU")
850 >>> record = SeqIO.read(handle, "fastq-solexa")
851 >>> handle.close()
852 >>> print record.id, record.seq
853 slxa_0001_1_0001_01 ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTNNNNNN
854 >>> print record.letter_annotations.keys()
855 ['solexa_quality']
856
857 >>> new = "ACT" + record
858 >>> print new.id, new.seq
859 slxa_0001_1_0001_01 ACTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTNNNNNN
860 >>> print new.letter_annotations.keys()
861 []
862 """
863 if isinstance(other, SeqRecord):
864 raise RuntimeError("This should have happened via the __add__ of "
865 "the other SeqRecord being added!")
866
867
868 offset = len(other)
869 return SeqRecord(other + self.seq,
870 id = self.id, name = self.name,
871 description = self.description,
872 features = [f._shift(offset) for f in self.features],
873 annotations = self.annotations.copy(),
874 dbxrefs = self.dbxrefs[:])
875
877 """Returns a copy of the record with an upper case sequence.
878
879 All the annotation is preserved unchanged. e.g.
880
881 >>> from Bio.Alphabet import generic_dna
882 >>> from Bio.Seq import Seq
883 >>> from Bio.SeqRecord import SeqRecord
884 >>> record = SeqRecord(Seq("acgtACGT", generic_dna), id="Test",
885 ... description = "Made up for this example")
886 >>> record.letter_annotations["phred_quality"] = [1,2,3,4,5,6,7,8]
887 >>> print record.upper().format("fastq")
888 @Test Made up for this example
889 ACGTACGT
890 +
891 "#$%&'()
892 <BLANKLINE>
893
894 Naturally, there is a matching lower method:
895
896 >>> print record.lower().format("fastq")
897 @Test Made up for this example
898 acgtacgt
899 +
900 "#$%&'()
901 <BLANKLINE>
902 """
903 return SeqRecord(self.seq.upper(),
904 id = self.id, name = self.name,
905 description = self.description,
906 dbxrefs = self.dbxrefs[:],
907 features = self.features[:],
908 annotations = self.annotations.copy(),
909 letter_annotations=self.letter_annotations.copy())
910
912 """Returns a copy of the record with a lower case sequence.
913
914 All the annotation is preserved unchanged. e.g.
915
916 >>> from Bio import SeqIO
917 >>> record = SeqIO.read("Fasta/aster.pro", "fasta")
918 >>> print record.format("fasta")
919 >gi|3298468|dbj|BAA31520.1| SAMIPF
920 GGHVNPAVTFGAFVGGNITLLRGIVYIIAQLLGSTVACLLLKFVTNDMAVGVFSLSAGVG
921 VTNALVFEIVMTFGLVYTVYATAIDPKKGSLGTIAPIAIGFIVGANI
922 <BLANKLINE>
923 >>> print record.lower().format("fasta")
924 >gi|3298468|dbj|BAA31520.1| SAMIPF
925 gghvnpavtfgafvggnitllrgivyiiaqllgstvaclllkfvtndmavgvfslsagvg
926 vtnalvfeivmtfglvytvyataidpkkgslgtiapiaigfivgani
927 <BLANKLINE>
928
929 To take a more annotation rich example,
930
931 >>> from Bio import SeqIO
932 >>> old = SeqIO.read("EMBL/TRBG361.embl", "embl")
933 >>> len(old.features)
934 3
935 >>> new = old.lower()
936 >>> len(old.features) == len(new.features)
937 True
938 >>> old.annotations["organism"] == new.annotations["organism"]
939 True
940 >>> old.dbxrefs == new.dbxrefs
941 True
942 """
943 return SeqRecord(self.seq.lower(),
944 id = self.id, name = self.name,
945 description = self.description,
946 dbxrefs = self.dbxrefs[:],
947 features = self.features[:],
948 annotations = self.annotations.copy(),
949 letter_annotations=self.letter_annotations.copy())
950
951 - def reverse_complement(self, id=False, name=False, description=False,
952 features=True, annotations=False,
953 letter_annotations=True, dbxrefs=False):
954 """Returns new SeqRecord with reverse complement sequence.
955
956 You can specify the returned record's id, name and description as
957 strings, or True to keep that of the parent, or False for a default.
958
959 You can specify the returned record's features with a list of
960 SeqFeature objects, or True to keep that of the parent, or False to
961 omit them. The default is to keep the original features (with the
962 strand and locations adjusted).
963
964 You can also specify both the returned record's annotations and
965 letter_annotations as dictionaries, True to keep that of the parent,
966 or False to omit them. The default is to keep the original
967 annotations (with the letter annotations reversed).
968
969 To show what happens to the pre-letter annotations, consider an
970 example Solexa variant FASTQ file with a single entry, which we'll
971 read in as a SeqRecord:
972
973 >>> from Bio import SeqIO
974 >>> handle = open("Quality/solexa_faked.fastq", "rU")
975 >>> record = SeqIO.read(handle, "fastq-solexa")
976 >>> handle.close()
977 >>> print record.id, record.seq
978 slxa_0001_1_0001_01 ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTNNNNNN
979 >>> print record.letter_annotations.keys()
980 ['solexa_quality']
981 >>> print record.letter_annotations["solexa_quality"]
982 [40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, -1, -2, -3, -4, -5]
983
984 Now take the reverse complement,
985
986 >>> rc_record = record.reverse_complement(id=record.id+"_rc")
987 >>> print rc_record.id, rc_record.seq
988 slxa_0001_1_0001_01_rc NNNNNNACGTACGTACGTACGTACGTACGTACGTACGTACGTACGT
989
990 Notice that the per-letter-annotations have also been reversed,
991 although this may not be appropriate for all cases.
992
993 >>> print rc_record.letter_annotations["solexa_quality"]
994 [-5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40]
995
996 Now for the features, we need a different example. Parsing a GenBank
997 file is probably the easiest way to get an nice example with features
998 in it...
999
1000 >>> from Bio import SeqIO
1001 >>> handle = open("GenBank/pBAD30.gb")
1002 >>> plasmid = SeqIO.read(handle, "gb")
1003 >>> handle.close()
1004 >>> print plasmid.id, len(plasmid)
1005 pBAD30 4923
1006 >>> plasmid.seq
1007 Seq('GCTAGCGGAGTGTATACTGGCTTACTATGTTGGCACTGATGAGGGTGTCAGTGA...ATG', IUPACAmbiguousDNA())
1008 >>> len(plasmid.features)
1009 13
1010
1011 Now, let's take the reverse complement of this whole plasmid:
1012
1013 >>> rc_plasmid = plasmid.reverse_complement(id=plasmid.id+"_rc")
1014 >>> print rc_plasmid.id, len(rc_plasmid)
1015 pBAD30_rc 4923
1016 >>> rc_plasmid.seq
1017 Seq('CATGGGCAAATATTATACGCAAGGCGACAAGGTGCTGATGCCGCTGGCGATTCA...AGC', IUPACAmbiguousDNA())
1018 >>> len(rc_plasmid.features)
1019 13
1020
1021 Let's compare the first CDS feature - it has gone from being the
1022 second feature (index 1) to the second last feature (index -2), its
1023 strand has changed, and the location switched round.
1024
1025 >>> print plasmid.features[1]
1026 type: CDS
1027 location: [1081:1960]
1028 strand: -1
1029 qualifiers:
1030 Key: label, Value: ['araC']
1031 Key: note, Value: ['araC regulator of the arabinose BAD promoter']
1032 Key: vntifkey, Value: ['4']
1033 <BLANKLINE>
1034 >>> print rc_plasmid.features[-2]
1035 type: CDS
1036 location: [2963:3842]
1037 strand: 1
1038 qualifiers:
1039 Key: label, Value: ['araC']
1040 Key: note, Value: ['araC regulator of the arabinose BAD promoter']
1041 Key: vntifkey, Value: ['4']
1042 <BLANKLINE>
1043
1044 You can check this new location, based on the length of the plasmid:
1045
1046 >>> len(plasmid) - 1081
1047 3842
1048 >>> len(plasmid) - 1960
1049 2963
1050
1051 Note that if the SeqFeature annotation includes any strand specific
1052 information (e.g. base changes for a SNP), this information is not
1053 ammended, and would need correction after the reverse complement.
1054
1055 Note trying to reverse complement a protein SeqRecord raises an
1056 exception:
1057
1058 >>> from Bio.SeqRecord import SeqRecord
1059 >>> from Bio.Seq import Seq
1060 >>> from Bio.Alphabet import IUPAC
1061 >>> protein_rec = SeqRecord(Seq("MAIVMGR", IUPAC.protein), id="Test")
1062 >>> protein_rec.reverse_complement()
1063 Traceback (most recent call last):
1064 ...
1065 ValueError: Proteins do not have complements!
1066
1067 Also note you can reverse complement a SeqRecord using a MutableSeq:
1068
1069 >>> from Bio.SeqRecord import SeqRecord
1070 >>> from Bio.Seq import MutableSeq
1071 >>> from Bio.Alphabet import generic_dna
1072 >>> rec = SeqRecord(MutableSeq("ACGT", generic_dna), id="Test")
1073 >>> rec.seq[0] = "T"
1074 >>> print rec.id, rec.seq
1075 Test TCGT
1076 >>> rc = rec.reverse_complement(id=True)
1077 >>> print rc.id, rc.seq
1078 Test ACGA
1079 """
1080 from Bio.Seq import MutableSeq
1081 if isinstance(self.seq, MutableSeq):
1082
1083 answer = SeqRecord(self.seq.toseq().reverse_complement())
1084 else:
1085 answer = SeqRecord(self.seq.reverse_complement())
1086 if isinstance(id, basestring):
1087 answer.id = id
1088 elif id:
1089 answer.id = self.id
1090 if isinstance(name, basestring):
1091 answer.name = name
1092 elif name:
1093 answer.name = self.name
1094 if isinstance(description, basestring):
1095 answer.description = description
1096 elif description:
1097 answer.description = self.description
1098 if isinstance(dbxrefs, list):
1099 answer.dbxrefs = dbxrefs
1100 elif dbxrefs:
1101
1102 answer.dbxrefs = self.dbxrefs[:]
1103 if isinstance(features, list):
1104 answer.features = features
1105 elif features:
1106
1107 l = len(answer)
1108 answer.features = [f._flip(l) for f in self.features]
1109
1110
1111
1112
1113
1114 answer.features.sort(key=lambda x : x.location.start.position)
1115 if isinstance(annotations, dict):
1116 answer.annotations = annotations
1117 elif annotations:
1118
1119 answer.annotations = self.annotations.copy()
1120 if isinstance(letter_annotations, dict):
1121 answer.letter_annotations = letter_annotations
1122 elif letter_annotations:
1123
1124 for key, value in self.letter_annotations.iteritems():
1125 answer._per_letter_annotations[key] = value[::-1]
1126 return answer
1127
1129 """Run the Bio.SeqRecord module's doctests (PRIVATE).
1130
1131 This will try and locate the unit tests directory, and run the doctests
1132 from there in order that the relative paths used in the examples work.
1133 """
1134 import doctest
1135 import os
1136 if os.path.isdir(os.path.join("..","Tests")):
1137 print "Runing doctests..."
1138 cur_dir = os.path.abspath(os.curdir)
1139 os.chdir(os.path.join("..","Tests"))
1140 doctest.testmod()
1141 os.chdir(cur_dir)
1142 del cur_dir
1143 print "Done"
1144 elif os.path.isdir(os.path.join("Tests")):
1145 print "Runing doctests..."
1146 cur_dir = os.path.abspath(os.curdir)
1147 os.chdir(os.path.join("Tests"))
1148 doctest.testmod()
1149 os.chdir(cur_dir)
1150 del cur_dir
1151 print "Done"
1152
1153 if __name__ == "__main__":
1154 _test()
1155