Package Bio :: Module SeqRecord
[hide private]
[frames] | no frames]

Source Code for Module Bio.SeqRecord

   1  # Copyright 2000-2002 Andrew Dalke. 
   2  # Copyright 2002-2004 Brad Chapman. 
   3  # Copyright 2006-2010 by Peter Cock. 
   4  # All rights reserved. 
   5  # This code is part of the Biopython distribution and governed by its 
   6  # license.  Please see the LICENSE file that should have been included 
   7  # as part of this package. 
   8  """Represent a Sequence Record, a sequence with annotation.""" 
   9  __docformat__ = "epytext en" #Simple markup to show doctests nicely 
  10   
  11  # NEEDS TO BE SYNCH WITH THE REST OF BIOPYTHON AND BIOPERL 
  12  # In particular, the SeqRecord and BioSQL.BioSeq.DBSeqRecord classes 
  13  # need to be in sync (this is the BioSQL "Database SeqRecord", see 
  14  # also BioSQL.BioSeq.DBSeq which is the "Database Seq" class) 
  15   
16 -class _RestrictedDict(dict):
17 """Dict which only allows sequences of given length as values (PRIVATE). 18 19 This simple subclass of the Python dictionary is used in the SeqRecord 20 object for holding per-letter-annotations. This class is intended to 21 prevent simple errors by only allowing python sequences (e.g. lists, 22 strings and tuples) to be stored, and only if their length matches that 23 expected (the length of the SeqRecord's seq object). It cannot however 24 prevent the entries being edited in situ (for example appending entries 25 to a list). 26 27 >>> x = _RestrictedDict(5) 28 >>> x["test"] = "hello" 29 >>> x 30 {'test': 'hello'} 31 32 Adding entries which don't have the expected length are blocked: 33 34 >>> x["test"] = "hello world" 35 Traceback (most recent call last): 36 ... 37 TypeError: We only allow python sequences (lists, tuples or strings) of length 5. 38 39 The expected length is stored as a private attribute, 40 41 >>> x._length 42 5 43 44 In order that the SeqRecord (and other objects using this class) can be 45 pickled, for example for use in the multiprocessing library, we need to 46 be able to pickle the restricted dictionary objects. 47 48 Using the default protocol, which is 0 on Python 2.x, 49 50 >>> import pickle 51 >>> y = pickle.loads(pickle.dumps(x)) 52 >>> y 53 {'test': 'hello'} 54 >>> y._length 55 5 56 57 Using the highest protocol, which is 2 on Python 2.x, 58 59 >>> import pickle 60 >>> z = pickle.loads(pickle.dumps(x, pickle.HIGHEST_PROTOCOL)) 61 >>> z 62 {'test': 'hello'} 63 >>> z._length 64 5 65 """ 66
67 - def __init__(self, length):
68 """Create an EMPTY restricted dictionary.""" 69 dict.__init__(self) 70 self._length = int(length)
71
72 - def __setitem__(self, key, value):
73 #The check hasattr(self, "_length") is to cope with pickle protocol 2 74 #I couldn't seem to avoid this with __getstate__ and __setstate__ 75 if not hasattr(value,"__len__") or not hasattr(value,"__getitem__") \ 76 or (hasattr(self, "_length") and len(value) != self._length): 77 raise TypeError("We only allow python sequences (lists, tuples or " 78 "strings) of length %i." % self._length) 79 dict.__setitem__(self, key, value)
80
81 - def update(self, new_dict):
82 #Force this to go via our strict __setitem__ method 83 for (key, value) in new_dict.iteritems(): 84 self[key] = value
85 86
87 -class SeqRecord(object):
88 """A SeqRecord object holds a sequence and information about it. 89 90 Main attributes: 91 - id - Identifier such as a locus tag (string) 92 - seq - The sequence itself (Seq object or similar) 93 94 Additional attributes: 95 - name - Sequence name, e.g. gene name (string) 96 - description - Additional text (string) 97 - dbxrefs - List of database cross references (list of strings) 98 - features - Any (sub)features defined (list of SeqFeature objects) 99 - annotations - Further information about the whole sequence (dictionary) 100 Most entries are strings, or lists of strings. 101 - letter_annotations - Per letter/symbol annotation (restricted 102 dictionary). This holds Python sequences (lists, strings 103 or tuples) whose length matches that of the sequence. 104 A typical use would be to hold a list of integers 105 representing sequencing quality scores, or a string 106 representing the secondary structure. 107 108 You will typically use Bio.SeqIO to read in sequences from files as 109 SeqRecord objects. However, you may want to create your own SeqRecord 110 objects directly (see the __init__ method for further details): 111 112 >>> from Bio.Seq import Seq 113 >>> from Bio.SeqRecord import SeqRecord 114 >>> from Bio.Alphabet import IUPAC 115 >>> record = SeqRecord(Seq("MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF", 116 ... IUPAC.protein), 117 ... id="YP_025292.1", name="HokC", 118 ... description="toxic membrane protein") 119 >>> print record 120 ID: YP_025292.1 121 Name: HokC 122 Description: toxic membrane protein 123 Number of features: 0 124 Seq('MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF', IUPACProtein()) 125 126 If you want to save SeqRecord objects to a sequence file, use Bio.SeqIO 127 for this. For the special case where you want the SeqRecord turned into 128 a string in a particular file format there is a format method which uses 129 Bio.SeqIO internally: 130 131 >>> print record.format("fasta") 132 >YP_025292.1 toxic membrane protein 133 MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF 134 <BLANKLINE> 135 136 You can also do things like slicing a SeqRecord, checking its length, etc 137 138 >>> len(record) 139 44 140 >>> edited = record[:10] + record[11:] 141 >>> print edited.seq 142 MKQHKAMIVAIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF 143 >>> print record.seq 144 MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF 145 146 """
147 - def __init__(self, seq, id = "<unknown id>", name = "<unknown name>", 148 description = "<unknown description>", dbxrefs = None, 149 features = None, annotations = None, 150 letter_annotations = None):
151 """Create a SeqRecord. 152 153 Arguments: 154 - seq - Sequence, required (Seq, MutableSeq or UnknownSeq) 155 - id - Sequence identifier, recommended (string) 156 - name - Sequence name, optional (string) 157 - description - Sequence description, optional (string) 158 - dbxrefs - Database cross references, optional (list of strings) 159 - features - Any (sub)features, optional (list of SeqFeature objects) 160 - annotations - Dictionary of annotations for the whole sequence 161 - letter_annotations - Dictionary of per-letter-annotations, values 162 should be strings, list or tuples of the same 163 length as the full sequence. 164 165 You will typically use Bio.SeqIO to read in sequences from files as 166 SeqRecord objects. However, you may want to create your own SeqRecord 167 objects directly. 168 169 Note that while an id is optional, we strongly recommend you supply a 170 unique id string for each record. This is especially important 171 if you wish to write your sequences to a file. 172 173 If you don't have the actual sequence, but you do know its length, 174 then using the UnknownSeq object from Bio.Seq is appropriate. 175 176 You can create a 'blank' SeqRecord object, and then populate the 177 attributes later. 178 """ 179 if id is not None and not isinstance(id, basestring): 180 #Lots of existing code uses id=None... this may be a bad idea. 181 raise TypeError("id argument should be a string") 182 if not isinstance(name, basestring): 183 raise TypeError("name argument should be a string") 184 if not isinstance(description, basestring): 185 raise TypeError("description argument should be a string") 186 self._seq = seq 187 self.id = id 188 self.name = name 189 self.description = description 190 191 # database cross references (for the whole sequence) 192 if dbxrefs is None: 193 dbxrefs = [] 194 elif not isinstance(dbxrefs, list): 195 raise TypeError("dbxrefs argument should be a list (of strings)") 196 self.dbxrefs = dbxrefs 197 198 # annotations about the whole sequence 199 if annotations is None: 200 annotations = {} 201 elif not isinstance(annotations, dict): 202 raise TypeError("annotations argument should be a dict") 203 self.annotations = annotations 204 205 if letter_annotations is None: 206 # annotations about each letter in the sequence 207 if seq is None: 208 #Should we allow this and use a normal unrestricted dict? 209 self._per_letter_annotations = _RestrictedDict(length=0) 210 else: 211 try: 212 self._per_letter_annotations = \ 213 _RestrictedDict(length=len(seq)) 214 except: 215 raise TypeError("seq argument should be a Seq object or similar") 216 else: 217 #This will be handled via the property set function, which will 218 #turn this into a _RestrictedDict and thus ensure all the values 219 #in the dict are the right length 220 self.letter_annotations = letter_annotations 221 222 # annotations about parts of the sequence 223 if features is None: 224 features = [] 225 elif not isinstance(features, list): 226 raise TypeError("features argument should be a list (of SeqFeature objects)") 227 self.features = features
228 229 #TODO - Just make this a read only property?
230 - def _set_per_letter_annotations(self, value):
231 if not isinstance(value, dict): 232 raise TypeError("The per-letter-annotations should be a " 233 "(restricted) dictionary.") 234 #Turn this into a restricted-dictionary (and check the entries) 235 try: 236 self._per_letter_annotations = _RestrictedDict(length=len(self.seq)) 237 except AttributeError: 238 #e.g. seq is None 239 self._per_letter_annotations = _RestrictedDict(length=0) 240 self._per_letter_annotations.update(value)
241 letter_annotations = property( \ 242 fget=lambda self : self._per_letter_annotations, 243 fset=_set_per_letter_annotations, 244 doc="""Dictionary of per-letter-annotation for the sequence. 245 246 For example, this can hold quality scores used in FASTQ or QUAL files. 247 Consider this example using Bio.SeqIO to read in an example Solexa 248 variant FASTQ file as a SeqRecord: 249 250 >>> from Bio import SeqIO 251 >>> handle = open("Quality/solexa_faked.fastq", "rU") 252 >>> record = SeqIO.read(handle, "fastq-solexa") 253 >>> handle.close() 254 >>> print record.id, record.seq 255 slxa_0001_1_0001_01 ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTNNNNNN 256 >>> print record.letter_annotations.keys() 257 ['solexa_quality'] 258 >>> print record.letter_annotations["solexa_quality"] 259 [40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, -1, -2, -3, -4, -5] 260 261 The letter_annotations get sliced automatically if you slice the 262 parent SeqRecord, for example taking the last ten bases: 263 264 >>> sub_record = record[-10:] 265 >>> print sub_record.id, sub_record.seq 266 slxa_0001_1_0001_01 ACGTNNNNNN 267 >>> print sub_record.letter_annotations["solexa_quality"] 268 [4, 3, 2, 1, 0, -1, -2, -3, -4, -5] 269 270 Any python sequence (i.e. list, tuple or string) can be recorded in 271 the SeqRecord's letter_annotations dictionary as long as the length 272 matches that of the SeqRecord's sequence. e.g. 273 274 >>> len(sub_record.letter_annotations) 275 1 276 >>> sub_record.letter_annotations["dummy"] = "abcdefghij" 277 >>> len(sub_record.letter_annotations) 278 2 279 280 You can delete entries from the letter_annotations dictionary as usual: 281 282 >>> del sub_record.letter_annotations["solexa_quality"] 283 >>> sub_record.letter_annotations 284 {'dummy': 'abcdefghij'} 285 286 You can completely clear the dictionary easily as follows: 287 288 >>> sub_record.letter_annotations = {} 289 >>> sub_record.letter_annotations 290 {} 291 """) 292
293 - def _set_seq(self, value):
294 #TODO - Add a deprecation warning that the seq should be write only? 295 if self._per_letter_annotations: 296 #TODO - Make this a warning? Silently empty the dictionary? 297 raise ValueError("You must empty the letter annotations first!") 298 self._seq = value 299 try: 300 self._per_letter_annotations = _RestrictedDict(length=len(self.seq)) 301 except AttributeError: 302 #e.g. seq is None 303 self._per_letter_annotations = _RestrictedDict(length=0)
304 305 seq = property(fget=lambda self : self._seq, 306 fset=_set_seq, 307 doc="The sequence itself, as a Seq or MutableSeq object.") 308
309 - def __getitem__(self, index):
310 """Returns a sub-sequence or an individual letter. 311 312 Slicing, e.g. my_record[5:10], returns a new SeqRecord for 313 that sub-sequence with approriate annotation preserved. The 314 name, id and description are kept. 315 316 Any per-letter-annotations are sliced to match the requested 317 sub-sequence. Unless a stride is used, all those features 318 which fall fully within the subsequence are included (with 319 their locations adjusted accordingly). 320 321 However, the annotations dictionary and the dbxrefs list are 322 not used for the new SeqRecord, as in general they may not 323 apply to the subsequence. If you want to preserve them, you 324 must explictly copy them to the new SeqRecord yourself. 325 326 Using an integer index, e.g. my_record[5] is shorthand for 327 extracting that letter from the sequence, my_record.seq[5]. 328 329 For example, consider this short protein and its secondary 330 structure as encoded by the PDB (e.g. H for alpha helices), 331 plus a simple feature for its histidine self phosphorylation 332 site: 333 334 >>> from Bio.Seq import Seq 335 >>> from Bio.SeqRecord import SeqRecord 336 >>> from Bio.SeqFeature import SeqFeature, FeatureLocation 337 >>> from Bio.Alphabet import IUPAC 338 >>> rec = SeqRecord(Seq("MAAGVKQLADDRTLLMAGVSHDLRTPLTRIRLAT" 339 ... "EMMSEQDGYLAESINKDIEECNAIIEQFIDYLR", 340 ... IUPAC.protein), 341 ... id="1JOY", name="EnvZ", 342 ... description="Homodimeric domain of EnvZ from E. coli") 343 >>> rec.letter_annotations["secondary_structure"] = " S SSSSSSHHHHHTTTHHHHHHHHHHHHHHHHHHHHHHTHHHHHHHHHHHHHHHHHHHHHTT " 344 >>> rec.features.append(SeqFeature(FeatureLocation(20,21), 345 ... type = "Site")) 346 347 Now let's have a quick look at the full record, 348 349 >>> print rec 350 ID: 1JOY 351 Name: EnvZ 352 Description: Homodimeric domain of EnvZ from E. coli 353 Number of features: 1 354 Per letter annotation for: secondary_structure 355 Seq('MAAGVKQLADDRTLLMAGVSHDLRTPLTRIRLATEMMSEQDGYLAESINKDIEE...YLR', IUPACProtein()) 356 >>> print rec.letter_annotations["secondary_structure"] 357 S SSSSSSHHHHHTTTHHHHHHHHHHHHHHHHHHHHHHTHHHHHHHHHHHHHHHHHHHHHTT 358 >>> print rec.features[0].location 359 [20:21] 360 361 Now let's take a sub sequence, here chosen as the first (fractured) 362 alpha helix which includes the histidine phosphorylation site: 363 364 >>> sub = rec[11:41] 365 >>> print sub 366 ID: 1JOY 367 Name: EnvZ 368 Description: Homodimeric domain of EnvZ from E. coli 369 Number of features: 1 370 Per letter annotation for: secondary_structure 371 Seq('RTLLMAGVSHDLRTPLTRIRLATEMMSEQD', IUPACProtein()) 372 >>> print sub.letter_annotations["secondary_structure"] 373 HHHHHTTTHHHHHHHHHHHHHHHHHHHHHH 374 >>> print sub.features[0].location 375 [9:10] 376 377 You can also of course omit the start or end values, for 378 example to get the first ten letters only: 379 380 >>> print rec[:10] 381 ID: 1JOY 382 Name: EnvZ 383 Description: Homodimeric domain of EnvZ from E. coli 384 Number of features: 0 385 Per letter annotation for: secondary_structure 386 Seq('MAAGVKQLAD', IUPACProtein()) 387 388 Or for the last ten letters: 389 390 >>> print rec[-10:] 391 ID: 1JOY 392 Name: EnvZ 393 Description: Homodimeric domain of EnvZ from E. coli 394 Number of features: 0 395 Per letter annotation for: secondary_structure 396 Seq('IIEQFIDYLR', IUPACProtein()) 397 398 If you omit both, then you get a copy of the original record (although 399 lacking the annotations and dbxrefs): 400 401 >>> print rec[:] 402 ID: 1JOY 403 Name: EnvZ 404 Description: Homodimeric domain of EnvZ from E. coli 405 Number of features: 1 406 Per letter annotation for: secondary_structure 407 Seq('MAAGVKQLADDRTLLMAGVSHDLRTPLTRIRLATEMMSEQDGYLAESINKDIEE...YLR', IUPACProtein()) 408 409 Finally, indexing with a simple integer is shorthand for pulling out 410 that letter from the sequence directly: 411 412 >>> rec[5] 413 'K' 414 >>> rec.seq[5] 415 'K' 416 """ 417 if isinstance(index, int): 418 #NOTE - The sequence level annotation like the id, name, etc 419 #do not really apply to a single character. However, should 420 #we try and expose any per-letter-annotation here? If so how? 421 return self.seq[index] 422 elif isinstance(index, slice): 423 if self.seq is None: 424 raise ValueError("If the sequence is None, we cannot slice it.") 425 parent_length = len(self) 426 answer = self.__class__(self.seq[index], 427 id=self.id, 428 name=self.name, 429 description=self.description) 430 #TODO - The desription may no longer apply. 431 #It would be safer to change it to something 432 #generic like "edited" or the default value. 433 434 #Don't copy the annotation dict and dbxefs list, 435 #they may not apply to a subsequence. 436 #answer.annotations = dict(self.annotations.iteritems()) 437 #answer.dbxrefs = self.dbxrefs[:] 438 #TODO - Review this in light of adding SeqRecord objects? 439 440 #TODO - Cope with strides by generating ambiguous locations? 441 start, stop, step = index.indices(parent_length) 442 if step == 1: 443 #Select relevant features, add them with shifted locations 444 #assert str(self.seq)[index] == str(self.seq)[start:stop] 445 for f in self.features: 446 if f.ref or f.ref_db: 447 #TODO - Implement this (with lots of tests)? 448 import warnings 449 warnings.warn("When slicing SeqRecord objects, any " 450 "SeqFeature referencing other sequences (e.g. " 451 "from segmented GenBank records) are ignored.") 452 continue 453 if start <= f.location.nofuzzy_start \ 454 and f.location.nofuzzy_end <= stop: 455 answer.features.append(f._shift(-start)) 456 457 #Slice all the values to match the sliced sequence 458 #(this should also work with strides, even negative strides): 459 for key, value in self.letter_annotations.iteritems(): 460 answer._per_letter_annotations[key] = value[index] 461 462 return answer 463 raise ValueError, "Invalid index"
464
465 - def __iter__(self):
466 """Iterate over the letters in the sequence. 467 468 For example, using Bio.SeqIO to read in a protein FASTA file: 469 470 >>> from Bio import SeqIO 471 >>> record = SeqIO.read(open("Fasta/loveliesbleeding.pro"),"fasta") 472 >>> for amino in record: 473 ... print amino 474 ... if amino == "L": break 475 X 476 A 477 G 478 L 479 >>> print record.seq[3] 480 L 481 482 This is just a shortcut for iterating over the sequence directly: 483 484 >>> for amino in record.seq: 485 ... print amino 486 ... if amino == "L": break 487 X 488 A 489 G 490 L 491 >>> print record.seq[3] 492 L 493 494 Note that this does not facilitate iteration together with any 495 per-letter-annotation. However, you can achieve that using the 496 python zip function on the record (or its sequence) and the relevant 497 per-letter-annotation: 498 499 >>> from Bio import SeqIO 500 >>> rec = SeqIO.read(open("Quality/solexa_faked.fastq", "rU"), 501 ... "fastq-solexa") 502 >>> print rec.id, rec.seq 503 slxa_0001_1_0001_01 ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTNNNNNN 504 >>> print rec.letter_annotations.keys() 505 ['solexa_quality'] 506 >>> for nuc, qual in zip(rec,rec.letter_annotations["solexa_quality"]): 507 ... if qual > 35: 508 ... print nuc, qual 509 A 40 510 C 39 511 G 38 512 T 37 513 A 36 514 515 You may agree that using zip(rec.seq, ...) is more explicit than using 516 zip(rec, ...) as shown above. 517 """ 518 return iter(self.seq)
519
520 - def __contains__(self, char):
521 """Implements the 'in' keyword, searches the sequence. 522 523 e.g. 524 525 >>> from Bio import SeqIO 526 >>> record = SeqIO.read(open("Fasta/sweetpea.nu"), "fasta") 527 >>> "GAATTC" in record 528 False 529 >>> "AAA" in record 530 True 531 532 This essentially acts as a proxy for using "in" on the sequence: 533 534 >>> "GAATTC" in record.seq 535 False 536 >>> "AAA" in record.seq 537 True 538 539 Note that you can also use Seq objects as the query, 540 541 >>> from Bio.Seq import Seq 542 >>> from Bio.Alphabet import generic_dna 543 >>> Seq("AAA") in record 544 True 545 >>> Seq("AAA", generic_dna) in record 546 True 547 548 See also the Seq object's __contains__ method. 549 """ 550 return char in self.seq
551 552
553 - def __str__(self):
554 """A human readable summary of the record and its annotation (string). 555 556 The python built in function str works by calling the object's ___str__ 557 method. e.g. 558 559 >>> from Bio.Seq import Seq 560 >>> from Bio.SeqRecord import SeqRecord 561 >>> from Bio.Alphabet import IUPAC 562 >>> record = SeqRecord(Seq("MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF", 563 ... IUPAC.protein), 564 ... id="YP_025292.1", name="HokC", 565 ... description="toxic membrane protein, small") 566 >>> print str(record) 567 ID: YP_025292.1 568 Name: HokC 569 Description: toxic membrane protein, small 570 Number of features: 0 571 Seq('MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF', IUPACProtein()) 572 573 In this example you don't actually need to call str explicity, as the 574 print command does this automatically: 575 576 >>> print record 577 ID: YP_025292.1 578 Name: HokC 579 Description: toxic membrane protein, small 580 Number of features: 0 581 Seq('MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF', IUPACProtein()) 582 583 Note that long sequences are shown truncated. 584 """ 585 lines = [] 586 if self.id: 587 lines.append("ID: %s" % self.id) 588 if self.name: 589 lines.append("Name: %s" % self.name) 590 if self.description: 591 lines.append("Description: %s" % self.description) 592 if self.dbxrefs: 593 lines.append("Database cross-references: " \ 594 + ", ".join(self.dbxrefs)) 595 lines.append("Number of features: %i" % len(self.features)) 596 for a in self.annotations: 597 lines.append("/%s=%s" % (a, str(self.annotations[a]))) 598 if self.letter_annotations: 599 lines.append("Per letter annotation for: " \ 600 + ", ".join(self.letter_annotations.keys())) 601 #Don't want to include the entire sequence, 602 #and showing the alphabet is useful: 603 lines.append(repr(self.seq)) 604 return "\n".join(lines)
605
606 - def __repr__(self):
607 """A concise summary of the record for debugging (string). 608 609 The python built in function repr works by calling the object's ___repr__ 610 method. e.g. 611 612 >>> from Bio.Seq import Seq 613 >>> from Bio.SeqRecord import SeqRecord 614 >>> from Bio.Alphabet import generic_protein 615 >>> rec = SeqRecord(Seq("MASRGVNKVILVGNLGQDPEVRYMPNGGAVANITLATSESWRDKAT" 616 ... +"GEMKEQTEWHRVVLFGKLAEVASEYLRKGSQVYIEGQLRTRKWTDQ" 617 ... +"SGQDRYTTEVVVNVGGTMQMLGGRQGGGAPAGGNIGGGQPQGGWGQ" 618 ... +"PQQPQGGNQFSGGAQSRPQQSAPAAPSNEPPMDFDDDIPF", 619 ... generic_protein), 620 ... id="NP_418483.1", name="b4059", 621 ... description="ssDNA-binding protein", 622 ... dbxrefs=["ASAP:13298", "GI:16131885", "GeneID:948570"]) 623 >>> print repr(rec) 624 SeqRecord(seq=Seq('MASRGVNKVILVGNLGQDPEVRYMPNGGAVANITLATSESWRDKATGEMKEQTE...IPF', ProteinAlphabet()), id='NP_418483.1', name='b4059', description='ssDNA-binding protein', dbxrefs=['ASAP:13298', 'GI:16131885', 'GeneID:948570']) 625 626 At the python prompt you can also use this shorthand: 627 628 >>> rec 629 SeqRecord(seq=Seq('MASRGVNKVILVGNLGQDPEVRYMPNGGAVANITLATSESWRDKATGEMKEQTE...IPF', ProteinAlphabet()), id='NP_418483.1', name='b4059', description='ssDNA-binding protein', dbxrefs=['ASAP:13298', 'GI:16131885', 'GeneID:948570']) 630 631 Note that long sequences are shown truncated. Also note that any 632 annotations, letter_annotations and features are not shown (as they 633 would lead to a very long string). 634 """ 635 return self.__class__.__name__ \ 636 + "(seq=%s, id=%s, name=%s, description=%s, dbxrefs=%s)" \ 637 % tuple(map(repr, (self.seq, self.id, self.name, 638 self.description, self.dbxrefs)))
639
640 - def format(self, format):
641 r"""Returns the record as a string in the specified file format. 642 643 The format should be a lower case string supported as an output 644 format by Bio.SeqIO, which is used to turn the SeqRecord into a 645 string. e.g. 646 647 >>> from Bio.Seq import Seq 648 >>> from Bio.SeqRecord import SeqRecord 649 >>> from Bio.Alphabet import IUPAC 650 >>> record = SeqRecord(Seq("MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF", 651 ... IUPAC.protein), 652 ... id="YP_025292.1", name="HokC", 653 ... description="toxic membrane protein") 654 >>> record.format("fasta") 655 '>YP_025292.1 toxic membrane protein\nMKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF\n' 656 >>> print record.format("fasta") 657 >YP_025292.1 toxic membrane protein 658 MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF 659 <BLANKLINE> 660 661 The python print command automatically appends a new line, meaning 662 in this example a blank line is shown. If you look at the string 663 representation you can see there is a trailing new line (shown as 664 slash n) which is important when writing to a file or if 665 concatenating mutliple sequence strings together. 666 667 Note that this method will NOT work on every possible file format 668 supported by Bio.SeqIO (e.g. some are for multiple sequences only). 669 """ 670 #See also the __format__ added for Python 2.6 / 3.0, PEP 3101 671 #See also the Bio.Align.Generic.Alignment class and its format() 672 return self.__format__(format)
673
674 - def __format__(self, format_spec):
675 """Returns the record as a string in the specified file format. 676 677 This method supports the python format() function added in 678 Python 2.6/3.0. The format_spec should be a lower case string 679 supported by Bio.SeqIO as an output file format. See also the 680 SeqRecord's format() method. 681 """ 682 if not format_spec: 683 #Follow python convention and default to using __str__ 684 return str(self) 685 from Bio import SeqIO 686 if format_spec in SeqIO._BinaryFormats: 687 #Return bytes on Python 3 688 try: 689 #This is in Python 2.6+, but we need it on Python 3 690 from io import BytesIO 691 handle = BytesIO() 692 except ImportError: 693 #Must be on Python 2.5 or older 694 from StringIO import StringIO 695 handle = StringIO() 696 else: 697 from StringIO import StringIO 698 handle = StringIO() 699 SeqIO.write(self, handle, format_spec) 700 return handle.getvalue()
701
702 - def __len__(self):
703 """Returns the length of the sequence. 704 705 For example, using Bio.SeqIO to read in a FASTA nucleotide file: 706 707 >>> from Bio import SeqIO 708 >>> record = SeqIO.read(open("Fasta/sweetpea.nu"),"fasta") 709 >>> len(record) 710 309 711 >>> len(record.seq) 712 309 713 """ 714 return len(self.seq)
715
716 - def __nonzero__(self):
717 """Returns True regardless of the length of the sequence. 718 719 This behaviour is for backwards compatibility, since until the 720 __len__ method was added, a SeqRecord always evaluated as True. 721 722 Note that in comparison, a Seq object will evaluate to False if it 723 has a zero length sequence. 724 725 WARNING: The SeqRecord may in future evaluate to False when its 726 sequence is of zero length (in order to better match the Seq 727 object behaviour)! 728 """ 729 return True
730
731 - def __add__(self, other):
732 """Add another sequence or string to this sequence. 733 734 The other sequence can be a SeqRecord object, a Seq object (or 735 similar, e.g. a MutableSeq) or a plain Python string. If you add 736 a plain string or a Seq (like) object, the new SeqRecord will simply 737 have this appended to the existing data. However, any per letter 738 annotation will be lost: 739 740 >>> from Bio import SeqIO 741 >>> handle = open("Quality/solexa_faked.fastq", "rU") 742 >>> record = SeqIO.read(handle, "fastq-solexa") 743 >>> handle.close() 744 >>> print record.id, record.seq 745 slxa_0001_1_0001_01 ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTNNNNNN 746 >>> print record.letter_annotations.keys() 747 ['solexa_quality'] 748 749 >>> new = record + "ACT" 750 >>> print new.id, new.seq 751 slxa_0001_1_0001_01 ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTNNNNNNACT 752 >>> print new.letter_annotations.keys() 753 [] 754 755 The new record will attempt to combine the annotation, but for any 756 ambiguities (e.g. different names) it defaults to omitting that 757 annotation. 758 759 >>> from Bio import SeqIO 760 >>> handle = open("GenBank/pBAD30.gb") 761 >>> plasmid = SeqIO.read(handle, "gb") 762 >>> handle.close() 763 >>> print plasmid.id, len(plasmid) 764 pBAD30 4923 765 766 Now let's cut the plasmid into two pieces, and join them back up the 767 other way round (i.e. shift the starting point on this plasmid, have 768 a look at the annotated features in the original file to see why this 769 particular split point might make sense): 770 771 >>> left = plasmid[:3765] 772 >>> right = plasmid[3765:] 773 >>> new = right + left 774 >>> print new.id, len(new) 775 pBAD30 4923 776 >>> str(new.seq) == str(right.seq + left.seq) 777 True 778 >>> len(new.features) == len(left.features) + len(right.features) 779 True 780 781 When we add the left and right SeqRecord objects, their annotation 782 is all consistent, so it is all conserved in the new SeqRecord: 783 784 >>> new.id == left.id == right.id == plasmid.id 785 True 786 >>> new.name == left.name == right.name == plasmid.name 787 True 788 >>> new.description == plasmid.description 789 True 790 >>> new.annotations == left.annotations == right.annotations 791 True 792 >>> new.letter_annotations == plasmid.letter_annotations 793 True 794 >>> new.dbxrefs == left.dbxrefs == right.dbxrefs 795 True 796 797 However, we should point out that when we sliced the SeqRecord, 798 any annotations dictionary or dbxrefs list entries were lost. 799 You can explicitly copy them like this: 800 801 >>> new.annotations = plasmid.annotations.copy() 802 >>> new.dbxrefs = plasmid.dbxrefs[:] 803 """ 804 if not isinstance(other, SeqRecord): 805 #Assume it is a string or a Seq. 806 #Note can't transfer any per-letter-annotations 807 return SeqRecord(self.seq + other, 808 id = self.id, name = self.name, 809 description = self.description, 810 features = self.features[:], 811 annotations = self.annotations.copy(), 812 dbxrefs = self.dbxrefs[:]) 813 #Adding two SeqRecord objects... must merge annotation. 814 answer = SeqRecord(self.seq + other.seq, 815 features = self.features[:], 816 dbxrefs = self.dbxrefs[:]) 817 #Will take all the features and all the db cross refs, 818 l = len(self) 819 for f in other.features: 820 answer.features.append(f._shift(l)) 821 del l 822 for ref in other.dbxrefs: 823 if ref not in answer.dbxrefs: 824 answer.dbxrefs.append(ref) 825 #Take common id/name/description/annotation 826 if self.id == other.id: 827 answer.id = self.id 828 if self.name == other.name: 829 answer.name = self.name 830 if self.description == other.description: 831 answer.description = self.description 832 for k,v in self.annotations.iteritems(): 833 if k in other.annotations and other.annotations[k] == v: 834 answer.annotations[k] = v 835 #Can append matching per-letter-annotation 836 for k,v in self.letter_annotations.iteritems(): 837 if k in other.letter_annotations: 838 answer.letter_annotations[k] = v + other.letter_annotations[k] 839 return answer
840
841 - def __radd__(self, other):
842 """Add another sequence or string to this sequence (from the left). 843 844 This method handles adding a Seq object (or similar, e.g. MutableSeq) 845 or a plain Python string (on the left) to a SeqRecord (on the right). 846 See the __add__ method for more details, but for example: 847 848 >>> from Bio import SeqIO 849 >>> handle = open("Quality/solexa_faked.fastq", "rU") 850 >>> record = SeqIO.read(handle, "fastq-solexa") 851 >>> handle.close() 852 >>> print record.id, record.seq 853 slxa_0001_1_0001_01 ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTNNNNNN 854 >>> print record.letter_annotations.keys() 855 ['solexa_quality'] 856 857 >>> new = "ACT" + record 858 >>> print new.id, new.seq 859 slxa_0001_1_0001_01 ACTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTNNNNNN 860 >>> print new.letter_annotations.keys() 861 [] 862 """ 863 if isinstance(other, SeqRecord): 864 raise RuntimeError("This should have happened via the __add__ of " 865 "the other SeqRecord being added!") 866 #Assume it is a string or a Seq. 867 #Note can't transfer any per-letter-annotations 868 offset = len(other) 869 return SeqRecord(other + self.seq, 870 id = self.id, name = self.name, 871 description = self.description, 872 features = [f._shift(offset) for f in self.features], 873 annotations = self.annotations.copy(), 874 dbxrefs = self.dbxrefs[:])
875
876 - def upper(self):
877 """Returns a copy of the record with an upper case sequence. 878 879 All the annotation is preserved unchanged. e.g. 880 881 >>> from Bio.Alphabet import generic_dna 882 >>> from Bio.Seq import Seq 883 >>> from Bio.SeqRecord import SeqRecord 884 >>> record = SeqRecord(Seq("acgtACGT", generic_dna), id="Test", 885 ... description = "Made up for this example") 886 >>> record.letter_annotations["phred_quality"] = [1,2,3,4,5,6,7,8] 887 >>> print record.upper().format("fastq") 888 @Test Made up for this example 889 ACGTACGT 890 + 891 "#$%&'() 892 <BLANKLINE> 893 894 Naturally, there is a matching lower method: 895 896 >>> print record.lower().format("fastq") 897 @Test Made up for this example 898 acgtacgt 899 + 900 "#$%&'() 901 <BLANKLINE> 902 """ 903 return SeqRecord(self.seq.upper(), 904 id = self.id, name = self.name, 905 description = self.description, 906 dbxrefs = self.dbxrefs[:], 907 features = self.features[:], 908 annotations = self.annotations.copy(), 909 letter_annotations=self.letter_annotations.copy())
910
911 - def lower(self):
912 """Returns a copy of the record with a lower case sequence. 913 914 All the annotation is preserved unchanged. e.g. 915 916 >>> from Bio import SeqIO 917 >>> record = SeqIO.read("Fasta/aster.pro", "fasta") 918 >>> print record.format("fasta") 919 >gi|3298468|dbj|BAA31520.1| SAMIPF 920 GGHVNPAVTFGAFVGGNITLLRGIVYIIAQLLGSTVACLLLKFVTNDMAVGVFSLSAGVG 921 VTNALVFEIVMTFGLVYTVYATAIDPKKGSLGTIAPIAIGFIVGANI 922 <BLANKLINE> 923 >>> print record.lower().format("fasta") 924 >gi|3298468|dbj|BAA31520.1| SAMIPF 925 gghvnpavtfgafvggnitllrgivyiiaqllgstvaclllkfvtndmavgvfslsagvg 926 vtnalvfeivmtfglvytvyataidpkkgslgtiapiaigfivgani 927 <BLANKLINE> 928 929 To take a more annotation rich example, 930 931 >>> from Bio import SeqIO 932 >>> old = SeqIO.read("EMBL/TRBG361.embl", "embl") 933 >>> len(old.features) 934 3 935 >>> new = old.lower() 936 >>> len(old.features) == len(new.features) 937 True 938 >>> old.annotations["organism"] == new.annotations["organism"] 939 True 940 >>> old.dbxrefs == new.dbxrefs 941 True 942 """ 943 return SeqRecord(self.seq.lower(), 944 id = self.id, name = self.name, 945 description = self.description, 946 dbxrefs = self.dbxrefs[:], 947 features = self.features[:], 948 annotations = self.annotations.copy(), 949 letter_annotations=self.letter_annotations.copy())
950
951 - def reverse_complement(self, id=False, name=False, description=False, 952 features=True, annotations=False, 953 letter_annotations=True, dbxrefs=False):
954 """Returns new SeqRecord with reverse complement sequence. 955 956 You can specify the returned record's id, name and description as 957 strings, or True to keep that of the parent, or False for a default. 958 959 You can specify the returned record's features with a list of 960 SeqFeature objects, or True to keep that of the parent, or False to 961 omit them. The default is to keep the original features (with the 962 strand and locations adjusted). 963 964 You can also specify both the returned record's annotations and 965 letter_annotations as dictionaries, True to keep that of the parent, 966 or False to omit them. The default is to keep the original 967 annotations (with the letter annotations reversed). 968 969 To show what happens to the pre-letter annotations, consider an 970 example Solexa variant FASTQ file with a single entry, which we'll 971 read in as a SeqRecord: 972 973 >>> from Bio import SeqIO 974 >>> handle = open("Quality/solexa_faked.fastq", "rU") 975 >>> record = SeqIO.read(handle, "fastq-solexa") 976 >>> handle.close() 977 >>> print record.id, record.seq 978 slxa_0001_1_0001_01 ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTNNNNNN 979 >>> print record.letter_annotations.keys() 980 ['solexa_quality'] 981 >>> print record.letter_annotations["solexa_quality"] 982 [40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, -1, -2, -3, -4, -5] 983 984 Now take the reverse complement, 985 986 >>> rc_record = record.reverse_complement(id=record.id+"_rc") 987 >>> print rc_record.id, rc_record.seq 988 slxa_0001_1_0001_01_rc NNNNNNACGTACGTACGTACGTACGTACGTACGTACGTACGTACGT 989 990 Notice that the per-letter-annotations have also been reversed, 991 although this may not be appropriate for all cases. 992 993 >>> print rc_record.letter_annotations["solexa_quality"] 994 [-5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40] 995 996 Now for the features, we need a different example. Parsing a GenBank 997 file is probably the easiest way to get an nice example with features 998 in it... 999 1000 >>> from Bio import SeqIO 1001 >>> handle = open("GenBank/pBAD30.gb") 1002 >>> plasmid = SeqIO.read(handle, "gb") 1003 >>> handle.close() 1004 >>> print plasmid.id, len(plasmid) 1005 pBAD30 4923 1006 >>> plasmid.seq 1007 Seq('GCTAGCGGAGTGTATACTGGCTTACTATGTTGGCACTGATGAGGGTGTCAGTGA...ATG', IUPACAmbiguousDNA()) 1008 >>> len(plasmid.features) 1009 13 1010 1011 Now, let's take the reverse complement of this whole plasmid: 1012 1013 >>> rc_plasmid = plasmid.reverse_complement(id=plasmid.id+"_rc") 1014 >>> print rc_plasmid.id, len(rc_plasmid) 1015 pBAD30_rc 4923 1016 >>> rc_plasmid.seq 1017 Seq('CATGGGCAAATATTATACGCAAGGCGACAAGGTGCTGATGCCGCTGGCGATTCA...AGC', IUPACAmbiguousDNA()) 1018 >>> len(rc_plasmid.features) 1019 13 1020 1021 Let's compare the first CDS feature - it has gone from being the 1022 second feature (index 1) to the second last feature (index -2), its 1023 strand has changed, and the location switched round. 1024 1025 >>> print plasmid.features[1] 1026 type: CDS 1027 location: [1081:1960] 1028 strand: -1 1029 qualifiers: 1030 Key: label, Value: ['araC'] 1031 Key: note, Value: ['araC regulator of the arabinose BAD promoter'] 1032 Key: vntifkey, Value: ['4'] 1033 <BLANKLINE> 1034 >>> print rc_plasmid.features[-2] 1035 type: CDS 1036 location: [2963:3842] 1037 strand: 1 1038 qualifiers: 1039 Key: label, Value: ['araC'] 1040 Key: note, Value: ['araC regulator of the arabinose BAD promoter'] 1041 Key: vntifkey, Value: ['4'] 1042 <BLANKLINE> 1043 1044 You can check this new location, based on the length of the plasmid: 1045 1046 >>> len(plasmid) - 1081 1047 3842 1048 >>> len(plasmid) - 1960 1049 2963 1050 1051 Note that if the SeqFeature annotation includes any strand specific 1052 information (e.g. base changes for a SNP), this information is not 1053 ammended, and would need correction after the reverse complement. 1054 1055 Note trying to reverse complement a protein SeqRecord raises an 1056 exception: 1057 1058 >>> from Bio.SeqRecord import SeqRecord 1059 >>> from Bio.Seq import Seq 1060 >>> from Bio.Alphabet import IUPAC 1061 >>> protein_rec = SeqRecord(Seq("MAIVMGR", IUPAC.protein), id="Test") 1062 >>> protein_rec.reverse_complement() 1063 Traceback (most recent call last): 1064 ... 1065 ValueError: Proteins do not have complements! 1066 1067 Also note you can reverse complement a SeqRecord using a MutableSeq: 1068 1069 >>> from Bio.SeqRecord import SeqRecord 1070 >>> from Bio.Seq import MutableSeq 1071 >>> from Bio.Alphabet import generic_dna 1072 >>> rec = SeqRecord(MutableSeq("ACGT", generic_dna), id="Test") 1073 >>> rec.seq[0] = "T" 1074 >>> print rec.id, rec.seq 1075 Test TCGT 1076 >>> rc = rec.reverse_complement(id=True) 1077 >>> print rc.id, rc.seq 1078 Test ACGA 1079 """ 1080 from Bio.Seq import MutableSeq #Lazy to avoid circular imports 1081 if isinstance(self.seq, MutableSeq): 1082 #Currently the MutableSeq reverse complement is in situ 1083 answer = SeqRecord(self.seq.toseq().reverse_complement()) 1084 else: 1085 answer = SeqRecord(self.seq.reverse_complement()) 1086 if isinstance(id, basestring): 1087 answer.id = id 1088 elif id: 1089 answer.id = self.id 1090 if isinstance(name, basestring): 1091 answer.name = name 1092 elif name: 1093 answer.name = self.name 1094 if isinstance(description, basestring): 1095 answer.description = description 1096 elif description: 1097 answer.description = self.description 1098 if isinstance(dbxrefs, list): 1099 answer.dbxrefs = dbxrefs 1100 elif dbxrefs: 1101 #Copy the old dbxrefs 1102 answer.dbxrefs = self.dbxrefs[:] 1103 if isinstance(features, list): 1104 answer.features = features 1105 elif features: 1106 #Copy the old features, adjusting location and string 1107 l = len(answer) 1108 answer.features = [f._flip(l) for f in self.features] 1109 #The old list should have been sorted by start location, 1110 #reversing it will leave it sorted by what is now the end position, 1111 #so we need to resort in case of overlapping features. 1112 #NOTE - In the common case of gene before CDS (and similar) with 1113 #the exact same locations, this will still maintain gene before CDS 1114 answer.features.sort(key=lambda x : x.location.start.position) 1115 if isinstance(annotations, dict): 1116 answer.annotations = annotations 1117 elif annotations: 1118 #Copy the old annotations, 1119 answer.annotations = self.annotations.copy() 1120 if isinstance(letter_annotations, dict): 1121 answer.letter_annotations = letter_annotations 1122 elif letter_annotations: 1123 #Copy the old per letter annotations, reversing them 1124 for key, value in self.letter_annotations.iteritems(): 1125 answer._per_letter_annotations[key] = value[::-1] 1126 return answer
1127
1128 -def _test():
1129 """Run the Bio.SeqRecord module's doctests (PRIVATE). 1130 1131 This will try and locate the unit tests directory, and run the doctests 1132 from there in order that the relative paths used in the examples work. 1133 """ 1134 import doctest 1135 import os 1136 if os.path.isdir(os.path.join("..","Tests")): 1137 print "Runing doctests..." 1138 cur_dir = os.path.abspath(os.curdir) 1139 os.chdir(os.path.join("..","Tests")) 1140 doctest.testmod() 1141 os.chdir(cur_dir) 1142 del cur_dir 1143 print "Done" 1144 elif os.path.isdir(os.path.join("Tests")): 1145 print "Runing doctests..." 1146 cur_dir = os.path.abspath(os.curdir) 1147 os.chdir(os.path.join("Tests")) 1148 doctest.testmod() 1149 os.chdir(cur_dir) 1150 del cur_dir 1151 print "Done"
1152 1153 if __name__ == "__main__": 1154 _test() 1155