1
2
3
4
5
6
7
8
9 """Sequence input/output as SeqRecord objects.
10
11 Bio.SeqIO is also documented at U{http://biopython.org/wiki/SeqIO} and by
12 a whole chapter in our tutorial:
13 - U{http://biopython.org/DIST/docs/tutorial/Tutorial.html}
14 - U{http://biopython.org/DIST/docs/tutorial/Tutorial.pdf}
15
16 Input
17 =====
18 The main function is Bio.SeqIO.parse(...) which takes an input file handle
19 (or in recent versions of Biopython alternatively a filename as a string),
20 and format string. This returns an iterator giving SeqRecord objects:
21
22 >>> from Bio import SeqIO
23 >>> for record in SeqIO.parse("Fasta/f002", "fasta"):
24 ... print record.id, len(record)
25 gi|1348912|gb|G26680|G26680 633
26 gi|1348917|gb|G26685|G26685 413
27 gi|1592936|gb|G29385|G29385 471
28
29 Note that the parse() function will invoke the relevant parser for the
30 format with its default settings. You may want more control, in which case
31 you need to create a format specific sequence iterator directly.
32
33 Input - Single Records
34 ======================
35 If you expect your file to contain one-and-only-one record, then we provide
36 the following 'helper' function which will return a single SeqRecord, or
37 raise an exception if there are no records or more than one record:
38
39 >>> from Bio import SeqIO
40 >>> record = SeqIO.read("Fasta/f001", "fasta")
41 >>> print record.id, len(record)
42 gi|3318709|pdb|1A91| 79
43
44 This style is useful when you expect a single record only (and would
45 consider multiple records an error). For example, when dealing with GenBank
46 files for bacterial genomes or chromosomes, there is normally only a single
47 record. Alternatively, use this with a handle when downloading a single
48 record from the internet.
49
50 However, if you just want the first record from a file containing multiple
51 record, use the iterator's next() method:
52
53 >>> from Bio import SeqIO
54 >>> record = SeqIO.parse("Fasta/f002", "fasta").next()
55 >>> print record.id, len(record)
56 gi|1348912|gb|G26680|G26680 633
57
58 The above code will work as long as the file contains at least one record.
59 Note that if there is more than one record, the remaining records will be
60 silently ignored.
61
62
63 Input - Multiple Records
64 ========================
65 For non-interlaced files (e.g. Fasta, GenBank, EMBL) with multiple records
66 using a sequence iterator can save you a lot of memory (RAM). There is
67 less benefit for interlaced file formats (e.g. most multiple alignment file
68 formats). However, an iterator only lets you access the records one by one.
69
70 If you want random access to the records by number, turn this into a list:
71
72 >>> from Bio import SeqIO
73 >>> records = list(SeqIO.parse("Fasta/f002", "fasta"))
74 >>> len(records)
75 3
76 >>> print records[1].id
77 gi|1348917|gb|G26685|G26685
78
79 If you want random access to the records by a key such as the record id,
80 turn the iterator into a dictionary:
81
82 >>> from Bio import SeqIO
83 >>> record_dict = SeqIO.to_dict(SeqIO.parse("Fasta/f002", "fasta"))
84 >>> len(record_dict)
85 3
86 >>> print len(record_dict["gi|1348917|gb|G26685|G26685"])
87 413
88
89 However, using list() or the to_dict() function will load all the records
90 into memory at once, and therefore is not possible on very large files.
91 Instead, for *some* file formats Bio.SeqIO provides an indexing approach
92 providing dictionary like access to any record. For example,
93
94 >>> from Bio import SeqIO
95 >>> record_dict = SeqIO.index("Fasta/f002", "fasta")
96 >>> len(record_dict)
97 3
98 >>> print len(record_dict["gi|1348917|gb|G26685|G26685"])
99 413
100
101 Many but not all of the supported input file formats can be indexed like
102 this. For example "fasta", "fastq", "qual" and even the binary format "sff"
103 work, but alignment formats like "phylip", "clustalw" and "nexus" will not.
104
105 In most cases you can also use SeqIO.index to get the record from the file
106 as a raw string (not a SeqRecord). This can be useful for example to extract
107 a sub-set of records from a file where SeqIO cannot output the file format
108 (e.g. the plain text SwissProt format, "swiss") or where it is important to
109 keep the output 100% identical to the input). For example,
110
111 >>> from Bio import SeqIO
112 >>> record_dict = SeqIO.index("Fasta/f002", "fasta")
113 >>> len(record_dict)
114 3
115 >>> print record_dict.get_raw("gi|1348917|gb|G26685|G26685").decode()
116 >gi|1348917|gb|G26685|G26685 human STS STS_D11734.
117 CGGAGCCAGCGAGCATATGCTGCATGAGGACCTTTCTATCTTACATTATGGCTGGGAATCTTACTCTTTC
118 ATCTGATACCTTGTTCAGATTTCAAAATAGTTGTAGCCTTATCCTGGTTTTACAGATGTGAAACTTTCAA
119 GAGATTTACTGACTTTCCTAGAATAGTTTCTCTACTGGAAACCTGATGCTTTTATAAGCCATTGTGATTA
120 GGATGACTGTTACAGGCTTAGCTTTGTGTGAAANCCAGTCACCTTTCTCCTAGGTAATGAGTAGTGCTGT
121 TCATATTACTNTAAGTTCTATAGCATACTTGCNATCCTTTANCCATGCTTATCATANGTACCATTTGAGG
122 AATTGNTTTGCCCTTTTGGGTTTNTTNTTGGTAAANNNTTCCCGGGTGGGGGNGGTNNNGAAA
123 <BLANKLINE>
124 >>> print record_dict["gi|1348917|gb|G26685|G26685"].format("fasta")
125 >gi|1348917|gb|G26685|G26685 human STS STS_D11734.
126 CGGAGCCAGCGAGCATATGCTGCATGAGGACCTTTCTATCTTACATTATGGCTGGGAATC
127 TTACTCTTTCATCTGATACCTTGTTCAGATTTCAAAATAGTTGTAGCCTTATCCTGGTTT
128 TACAGATGTGAAACTTTCAAGAGATTTACTGACTTTCCTAGAATAGTTTCTCTACTGGAA
129 ACCTGATGCTTTTATAAGCCATTGTGATTAGGATGACTGTTACAGGCTTAGCTTTGTGTG
130 AAANCCAGTCACCTTTCTCCTAGGTAATGAGTAGTGCTGTTCATATTACTNTAAGTTCTA
131 TAGCATACTTGCNATCCTTTANCCATGCTTATCATANGTACCATTTGAGGAATTGNTTTG
132 CCCTTTTGGGTTTNTTNTTGGTAAANNNTTCCCGGGTGGGGGNGGTNNNGAAA
133 <BLANKLINE>
134
135 Here the original file and what Biopython would output differ in the line
136 wrapping. Also note that under Python 3, the get_raw method will return a
137 bytes string, hence the use of decode to turn it into a (unicode) string.
138 This is uncessary on Python 2.
139
140
141 Input - Alignments
142 ==================
143 You can read in alignment files as alignment objects using Bio.AlignIO.
144 Alternatively, reading in an alignment file format via Bio.SeqIO will give
145 you a SeqRecord for each row of each alignment:
146
147 >>> from Bio import SeqIO
148 >>> for record in SeqIO.parse("Clustalw/hedgehog.aln", "clustal"):
149 ... print record.id, len(record)
150 gi|167877390|gb|EDS40773.1| 447
151 gi|167234445|ref|NP_001107837. 447
152 gi|74100009|gb|AAZ99217.1| 447
153 gi|13990994|dbj|BAA33523.2| 447
154 gi|56122354|gb|AAV74328.1| 447
155
156 Output
157 ======
158 Use the function Bio.SeqIO.write(...), which takes a complete set of
159 SeqRecord objects (either as a list, or an iterator), an output file handle
160 (or in recent versions of Biopython an output filename as a string) and of
161 course the file format::
162
163 from Bio import SeqIO
164 records = ...
165 SeqIO.write(records, "example.faa", "fasta")
166
167 Or, using a handle::
168
169 from Bio import SeqIO
170 records = ...
171 handle = open("example.faa", "w")
172 SeqIO.write(records, handle, "fasta")
173 handle.close()
174
175 You are expected to call this function once (with all your records) and if
176 using a handle, make sure you close it to flush the data to the hard disk.
177
178
179 Output - Advanced
180 =================
181 The effect of calling write() multiple times on a single file will vary
182 depending on the file format, and is best avoided unless you have a strong
183 reason to do so.
184
185 If you give a filename, then each time you call write() the existing file
186 will be overwriten. For sequential files formats (e.g. fasta, genbank) each
187 "record block" holds a single sequence. For these files it would probably
188 be safe to call write() multiple times by re-using the same handle.
189
190
191 However, trying this for certain alignment formats (e.g. phylip, clustal,
192 stockholm) would have the effect of concatenating several multiple sequence
193 alignments together. Such files are created by the PHYLIP suite of programs
194 for bootstrap analysis, but it is clearer to do this via Bio.AlignIO instead.
195
196
197 Conversion
198 ==========
199 The Bio.SeqIO.convert(...) function allows an easy interface for simple
200 file format conversions. Additionally, it may use file format specific
201 optimisations so this should be the fastest way too.
202
203 In general however, you can combine the Bio.SeqIO.parse(...) function with
204 the Bio.SeqIO.write(...) function for sequence file conversion. Using
205 generator expressions or generator functions provides a memory efficient way
206 to perform filtering or other extra operations as part of the process.
207
208
209 File Formats
210 ============
211 When specifying the file format, use lowercase strings. The same format
212 names are also used in Bio.AlignIO and include the following:
213
214 - abif - Applied Biosystem's sequencing trace format
215 - ace - Reads the contig sequences from an ACE assembly file.
216 - embl - The EMBL flat file format. Uses Bio.GenBank internally.
217 - fasta - The generic sequence file format where each record starts with
218 an identifer line starting with a ">" character, followed by
219 lines of sequence.
220 - fastq - A "FASTA like" format used by Sanger which also stores PHRED
221 sequence quality values (with an ASCII offset of 33).
222 - fastq-sanger - An alias for "fastq" for consistency with BioPerl and EMBOSS
223 - fastq-solexa - Original Solexa/Illumnia variant of the FASTQ format which
224 encodes Solexa quality scores (not PHRED quality scores) with an
225 ASCII offset of 64.
226 - fastq-illumina - Solexa/Illumina 1.3 to 1.7 variant of the FASTQ format
227 which encodes PHRED quality scores with an ASCII offset of 64
228 (not 33). Note as of version 1.8 of the CASAVA pipeline Illumina
229 will produce FASTQ files using the standard Sanger encoding.
230 - genbank - The GenBank or GenPept flat file format.
231 - gb - An alias for "genbank", for consistency with NCBI Entrez Utilities
232 - ig - The IntelliGenetics file format, apparently the same as the
233 MASE alignment format.
234 - imgt - An EMBL like format from IMGT where the feature tables are more
235 indented to allow for longer feature types.
236 - phd - Output from PHRED, used by PHRAP and CONSED for input.
237 - pir - A "FASTA like" format introduced by the National Biomedical
238 Research Foundation (NBRF) for the Protein Information Resource
239 (PIR) database, now part of UniProt.
240 - seqxml - SeqXML, simple XML format described in Schmitt et al (2011).
241 - sff - Standard Flowgram Format (SFF), typical output from Roche 454.
242 - sff-trim - Standard Flowgram Format (SFF) with given trimming applied.
243 - swiss - Plain text Swiss-Prot aka UniProt format.
244 - tab - Simple two column tab separated sequence files, where each
245 line holds a record's identifier and sequence. For example,
246 this is used as by Aligent's eArray software when saving
247 microarray probes in a minimal tab delimited text file.
248 - qual - A "FASTA like" format holding PHRED quality values from
249 sequencing DNA, but no actual sequences (usually provided
250 in separate FASTA files).
251 - uniprot-xml - The UniProt XML format (replacement for the SwissProt plain
252 text format which we call "swiss")
253
254 Note that while Bio.SeqIO can read all the above file formats, it cannot
255 write to all of them.
256
257 You can also use any file format supported by Bio.AlignIO, such as "nexus",
258 "phlip" and "stockholm", which gives you access to the individual sequences
259 making up each alignment as SeqRecords.
260 """
261 __docformat__ = "epytext en"
262
263
264
265
266
267
268
269
270
271
272
273
274 """
275 FAO BioPython Developers
276 ========================
277 The way I envision this SeqIO system working as that for any sequence file
278 format we have an iterator that returns SeqRecord objects.
279
280 This also applies to interlaced fileformats (like clustal - although that
281 is now handled via Bio.AlignIO instead) where the file cannot be read record
282 by record. You should still return an iterator, even if the implementation
283 could just as easily return a list.
284
285 These file format specific sequence iterators may be implemented as:
286 * Classes which take a handle for __init__ and provide the __iter__ method
287 * Functions that take a handle, and return an iterator object
288 * Generator functions that take a handle, and yield SeqRecord objects
289
290 It is then trivial to turn this iterator into a list of SeqRecord objects,
291 an in memory dictionary, or a multiple sequence alignment object.
292
293 For building the dictionary by default the id propery of each SeqRecord is
294 used as the key. You should always populate the id property, and it should
295 be unique in most cases. For some file formats the accession number is a good
296 choice. If the file itself contains ambiguous identifiers, don't try and
297 dis-ambiguate them - return them as is.
298
299 When adding a new file format, please use the same lower case format name
300 as BioPerl, or if they have not defined one, try the names used by EMBOSS.
301
302 See also http://biopython.org/wiki/SeqIO_dev
303
304 --Peter
305 """
306
307 from Bio.Seq import Seq
308 from Bio.SeqRecord import SeqRecord
309 from Bio.Align import MultipleSeqAlignment
310 from Bio.Align.Generic import Alignment
311 from Bio.Alphabet import Alphabet, AlphabetEncoder, _get_base_alphabet
312
313 import AbiIO
314 import AceIO
315 import FastaIO
316 import IgIO
317 import InsdcIO
318 import PhdIO
319 import PirIO
320 import SeqXmlIO
321 import SffIO
322 import SwissIO
323 import TabIO
324 import QualityIO
325 import UniprotIO
326
327
328
329
330
331
332
333
334
335
336
337 _FormatToIterator = {"fasta" : FastaIO.FastaIterator,
338 "gb" : InsdcIO.GenBankIterator,
339 "genbank" : InsdcIO.GenBankIterator,
340 "genbank-cds" : InsdcIO.GenBankCdsFeatureIterator,
341 "embl" : InsdcIO.EmblIterator,
342 "embl-cds" : InsdcIO.EmblCdsFeatureIterator,
343 "imgt" : InsdcIO.ImgtIterator,
344 "ig" : IgIO.IgIterator,
345 "swiss" : SwissIO.SwissIterator,
346 "phd" : PhdIO.PhdIterator,
347 "ace" : AceIO.AceIterator,
348 "tab" : TabIO.TabIterator,
349 "pir" : PirIO.PirIterator,
350 "fastq" : QualityIO.FastqPhredIterator,
351 "fastq-sanger" : QualityIO.FastqPhredIterator,
352 "fastq-solexa" : QualityIO.FastqSolexaIterator,
353 "fastq-illumina" : QualityIO.FastqIlluminaIterator,
354 "qual" : QualityIO.QualPhredIterator,
355 "sff": SffIO.SffIterator,
356
357 "sff-trim": SffIO._SffTrimIterator,
358 "uniprot-xml": UniprotIO.UniprotIterator,
359 "seqxml" : SeqXmlIO.SeqXmlIterator,
360 "abi": AbiIO.AbiIterator,
361 "abi-trim": AbiIO._AbiTrimIterator,
362 }
363
364 _FormatToWriter = {"fasta" : FastaIO.FastaWriter,
365 "gb" : InsdcIO.GenBankWriter,
366 "genbank" : InsdcIO.GenBankWriter,
367 "embl" : InsdcIO.EmblWriter,
368 "imgt" : InsdcIO.ImgtWriter,
369 "tab" : TabIO.TabWriter,
370 "fastq" : QualityIO.FastqPhredWriter,
371 "fastq-sanger" : QualityIO.FastqPhredWriter,
372 "fastq-solexa" : QualityIO.FastqSolexaWriter,
373 "fastq-illumina" : QualityIO.FastqIlluminaWriter,
374 "phd" : PhdIO.PhdWriter,
375 "qual" : QualityIO.QualPhredWriter,
376 "sff" : SffIO.SffWriter,
377 "seqxml" : SeqXmlIO.SeqXmlWriter,
378 }
379
380 _BinaryFormats = ["sff", "sff-trim", "abi", "abi-trim"]
381
382 -def write(sequences, handle, format):
383 """Write complete set of sequences to a file.
384
385 - sequences - A list (or iterator) of SeqRecord objects, or (if using
386 Biopython 1.54 or later) a single SeqRecord.
387 - handle - File handle object to write to, or filename as string
388 (note older versions of Biopython only took a handle).
389 - format - lower case string describing the file format to write.
390
391 You should close the handle after calling this function.
392
393 Returns the number of records written (as an integer).
394 """
395 from Bio import AlignIO
396
397
398 if not isinstance(format, basestring):
399 raise TypeError("Need a string for the file format (lower case)")
400 if not format:
401 raise ValueError("Format required (lower case string)")
402 if format != format.lower():
403 raise ValueError("Format string '%s' should be lower case" % format)
404
405 if isinstance(sequences, SeqRecord):
406
407 sequences = [sequences]
408
409 if isinstance(handle, basestring):
410 if format in _BinaryFormats :
411 handle = open(handle, "wb")
412 else :
413 handle = open(handle, "w")
414 handle_close = True
415 else:
416 handle_close = False
417
418
419 if format in _FormatToWriter:
420 writer_class = _FormatToWriter[format]
421 count = writer_class(handle).write_file(sequences)
422 elif format in AlignIO._FormatToWriter:
423
424
425 alignment = MultipleSeqAlignment(sequences)
426 alignment_count = AlignIO.write([alignment], handle, format)
427 assert alignment_count == 1, "Internal error - the underlying writer " \
428 + " should have returned 1, not %s" % repr(alignment_count)
429 count = len(alignment)
430 del alignment_count, alignment
431 elif format in _FormatToIterator or format in AlignIO._FormatToIterator:
432 raise ValueError("Reading format '%s' is supported, but not writing" \
433 % format)
434 else:
435 raise ValueError("Unknown format '%s'" % format)
436
437 assert isinstance(count, int), "Internal error - the underlying %s " \
438 "writer should have returned the record count, not %s" \
439 % (format, repr(count))
440
441 if handle_close:
442 handle.close()
443
444 return count
445
446 -def parse(handle, format, alphabet=None):
447 r"""Turns a sequence file into an iterator returning SeqRecords.
448
449 - handle - handle to the file, or the filename as a string
450 (note older verions of Biopython only took a handle).
451 - format - lower case string describing the file format.
452 - alphabet - optional Alphabet object, useful when the sequence type
453 cannot be automatically inferred from the file itself
454 (e.g. format="fasta" or "tab")
455
456 Typical usage, opening a file to read in, and looping over the record(s):
457
458 >>> from Bio import SeqIO
459 >>> filename = "Fasta/sweetpea.nu"
460 >>> for record in SeqIO.parse(filename, "fasta"):
461 ... print "ID", record.id
462 ... print "Sequence length", len(record)
463 ... print "Sequence alphabet", record.seq.alphabet
464 ID gi|3176602|gb|U78617.1|LOU78617
465 Sequence length 309
466 Sequence alphabet SingleLetterAlphabet()
467
468 For file formats like FASTA where the alphabet cannot be determined, it
469 may be useful to specify the alphabet explicitly:
470
471 >>> from Bio import SeqIO
472 >>> from Bio.Alphabet import generic_dna
473 >>> filename = "Fasta/sweetpea.nu"
474 >>> for record in SeqIO.parse(filename, "fasta", generic_dna):
475 ... print "ID", record.id
476 ... print "Sequence length", len(record)
477 ... print "Sequence alphabet", record.seq.alphabet
478 ID gi|3176602|gb|U78617.1|LOU78617
479 Sequence length 309
480 Sequence alphabet DNAAlphabet()
481
482 If you have a string 'data' containing the file contents, you must
483 first turn this into a handle in order to parse it:
484
485 >>> data = ">Alpha\nACCGGATGTA\n>Beta\nAGGCTCGGTTA\n"
486 >>> from Bio import SeqIO
487 >>> from StringIO import StringIO
488 >>> for record in SeqIO.parse(StringIO(data), "fasta"):
489 ... print record.id, record.seq
490 Alpha ACCGGATGTA
491 Beta AGGCTCGGTTA
492
493 Use the Bio.SeqIO.read(...) function when you expect a single record
494 only.
495 """
496
497
498
499 from Bio import AlignIO
500
501 handle_close = False
502
503 if isinstance(handle, basestring):
504
505 if format in _BinaryFormats :
506 handle = open(handle, "rb")
507 else :
508 handle = open(handle, "rU")
509
510 handle_close = True
511
512
513 if not isinstance(format, basestring):
514 raise TypeError("Need a string for the file format (lower case)")
515 if not format:
516 raise ValueError("Format required (lower case string)")
517 if format != format.lower():
518 raise ValueError("Format string '%s' should be lower case" % format)
519 if alphabet is not None and not (isinstance(alphabet, Alphabet) or \
520 isinstance(alphabet, AlphabetEncoder)):
521 raise ValueError("Invalid alphabet, %s" % repr(alphabet))
522
523
524 if format in _FormatToIterator:
525 iterator_generator = _FormatToIterator[format]
526 if alphabet is None:
527 i = iterator_generator(handle)
528 else:
529 try:
530 i = iterator_generator(handle, alphabet=alphabet)
531 except TypeError:
532 i = _force_alphabet(iterator_generator(handle), alphabet)
533 elif format in AlignIO._FormatToIterator:
534
535
536
537 i = _iterate_via_AlignIO(handle, format, alphabet)
538 else:
539 raise ValueError("Unknown format '%s'" % format)
540
541 for r in i:
542 yield r
543 if handle_close:
544 handle.close()
545
546
553
555 """Iterate over records, over-riding the alphabet (PRIVATE)."""
556
557 given_base_class = _get_base_alphabet(alphabet).__class__
558 for record in record_iterator:
559 if isinstance(_get_base_alphabet(record.seq.alphabet),
560 given_base_class):
561 record.seq.alphabet = alphabet
562 yield record
563 else:
564 raise ValueError("Specified alphabet %s clashes with "\
565 "that determined from the file, %s" \
566 % (repr(alphabet), repr(record.seq.alphabet)))
567
568 -def read(handle, format, alphabet=None):
569 """Turns a sequence file into a single SeqRecord.
570
571 - handle - handle to the file, or the filename as a string
572 (note older verions of Biopython only took a handle).
573 - format - string describing the file format.
574 - alphabet - optional Alphabet object, useful when the sequence type
575 cannot be automatically inferred from the file itself
576 (e.g. format="fasta" or "tab")
577
578 This function is for use parsing sequence files containing
579 exactly one record. For example, reading a GenBank file:
580
581 >>> from Bio import SeqIO
582 >>> record = SeqIO.read("GenBank/arab1.gb", "genbank")
583 >>> print "ID", record.id
584 ID AC007323.5
585 >>> print "Sequence length", len(record)
586 Sequence length 86436
587 >>> print "Sequence alphabet", record.seq.alphabet
588 Sequence alphabet IUPACAmbiguousDNA()
589
590 If the handle contains no records, or more than one record,
591 an exception is raised. For example:
592
593 >>> from Bio import SeqIO
594 >>> record = SeqIO.read("GenBank/cor6_6.gb", "genbank")
595 Traceback (most recent call last):
596 ...
597 ValueError: More than one record found in handle
598
599 If however you want the first record from a file containing
600 multiple records this function would raise an exception (as
601 shown in the example above). Instead use:
602
603 >>> from Bio import SeqIO
604 >>> record = SeqIO.parse("GenBank/cor6_6.gb", "genbank").next()
605 >>> print "First record's ID", record.id
606 First record's ID X55053.1
607
608 Use the Bio.SeqIO.parse(handle, format) function if you want
609 to read multiple records from the handle.
610 """
611 iterator = parse(handle, format, alphabet)
612 try:
613 first = iterator.next()
614 except StopIteration:
615 first = None
616 if first is None:
617 raise ValueError("No records found in handle")
618 try:
619 second = iterator.next()
620 except StopIteration:
621 second = None
622 if second is not None:
623 raise ValueError("More than one record found in handle")
624 return first
625
626 -def to_dict(sequences, key_function=None):
627 """Turns a sequence iterator or list into a dictionary.
628
629 - sequences - An iterator that returns SeqRecord objects,
630 or simply a list of SeqRecord objects.
631 - key_function - Optional callback function which when given a
632 SeqRecord should return a unique key for the dictionary.
633
634 e.g. key_function = lambda rec : rec.name
635 or, key_function = lambda rec : rec.description.split()[0]
636
637 If key_function is ommitted then record.id is used, on the assumption
638 that the records objects returned are SeqRecords with a unique id.
639
640 If there are duplicate keys, an error is raised.
641
642 Example usage, defaulting to using the record.id as key:
643
644 >>> from Bio import SeqIO
645 >>> filename = "GenBank/cor6_6.gb"
646 >>> format = "genbank"
647 >>> id_dict = SeqIO.to_dict(SeqIO.parse(filename, format))
648 >>> print sorted(id_dict)
649 ['AF297471.1', 'AJ237582.1', 'L31939.1', 'M81224.1', 'X55053.1', 'X62281.1']
650 >>> print id_dict["L31939.1"].description
651 Brassica rapa (clone bif72) kin mRNA, complete cds.
652
653 A more complex example, using the key_function argument in order to
654 use a sequence checksum as the dictionary key:
655
656 >>> from Bio import SeqIO
657 >>> from Bio.SeqUtils.CheckSum import seguid
658 >>> filename = "GenBank/cor6_6.gb"
659 >>> format = "genbank"
660 >>> seguid_dict = SeqIO.to_dict(SeqIO.parse(filename, format),
661 ... key_function = lambda rec : seguid(rec.seq))
662 >>> for key, record in sorted(seguid_dict.iteritems()):
663 ... print key, record.id
664 /wQvmrl87QWcm9llO4/efg23Vgg AJ237582.1
665 BUg6YxXSKWEcFFH0L08JzaLGhQs L31939.1
666 SabZaA4V2eLE9/2Fm5FnyYy07J4 X55053.1
667 TtWsXo45S3ZclIBy4X/WJc39+CY M81224.1
668 l7gjJFE6W/S1jJn5+1ASrUKW/FA X62281.1
669 uVEYeAQSV5EDQOnFoeMmVea+Oow AF297471.1
670
671 This approach is not suitable for very large sets of sequences, as all
672 the SeqRecord objects are held in memory. Instead, consider using the
673 Bio.SeqIO.index() function (if it supports your particular file format).
674 """
675 if key_function is None:
676 key_function = lambda rec : rec.id
677
678 d = dict()
679 for record in sequences:
680 key = key_function(record)
681 if key in d:
682 raise ValueError("Duplicate key '%s'" % key)
683 d[key] = record
684 return d
685
686 -def index(filename, format, alphabet=None, key_function=None):
687 """Indexes a sequence file and returns a dictionary like object.
688
689 - filename - string giving name of file to be indexed
690 - format - lower case string describing the file format
691 - alphabet - optional Alphabet object, useful when the sequence type
692 cannot be automatically inferred from the file itself
693 (e.g. format="fasta" or "tab")
694 - key_function - Optional callback function which when given a
695 SeqRecord identifier string should return a unique
696 key for the dictionary.
697
698 This indexing function will return a dictionary like object, giving the
699 SeqRecord objects as values:
700
701 >>> from Bio import SeqIO
702 >>> records = SeqIO.index("Quality/example.fastq", "fastq")
703 >>> len(records)
704 3
705 >>> sorted(records)
706 ['EAS54_6_R1_2_1_413_324', 'EAS54_6_R1_2_1_443_348', 'EAS54_6_R1_2_1_540_792']
707 >>> print records["EAS54_6_R1_2_1_540_792"].format("fasta")
708 >EAS54_6_R1_2_1_540_792
709 TTGGCAGGCCAAGGCCGATGGATCA
710 <BLANKLINE>
711 >>> "EAS54_6_R1_2_1_540_792" in records
712 True
713 >>> print records.get("Missing", None)
714 None
715
716 Note that this psuedo dictionary will not support all the methods of a
717 true Python dictionary, for example values() is not defined since this
718 would require loading all of the records into memory at once.
719
720 When you call the index function, it will scan through the file, noting
721 the location of each record. When you access a particular record via the
722 dictionary methods, the code will jump to the appropriate part of the
723 file and then parse that section into a SeqRecord.
724
725 Note that not all the input formats supported by Bio.SeqIO can be used
726 with this index function. It is designed to work only with sequential
727 file formats (e.g. "fasta", "gb", "fastq") and is not suitable for any
728 interlaced file format (e.g. alignment formats such as "clustal").
729
730 For small files, it may be more efficient to use an in memory Python
731 dictionary, e.g.
732
733 >>> from Bio import SeqIO
734 >>> records = SeqIO.to_dict(SeqIO.parse(open("Quality/example.fastq"), "fastq"))
735 >>> len(records)
736 3
737 >>> sorted(records)
738 ['EAS54_6_R1_2_1_413_324', 'EAS54_6_R1_2_1_443_348', 'EAS54_6_R1_2_1_540_792']
739 >>> print records["EAS54_6_R1_2_1_540_792"].format("fasta")
740 >EAS54_6_R1_2_1_540_792
741 TTGGCAGGCCAAGGCCGATGGATCA
742 <BLANKLINE>
743
744 As with the to_dict() function, by default the id string of each record
745 is used as the key. You can specify a callback function to transform
746 this (the record identifier string) into your prefered key. For example:
747
748 >>> from Bio import SeqIO
749 >>> def make_tuple(identifier):
750 ... parts = identifier.split("_")
751 ... return int(parts[-2]), int(parts[-1])
752 >>> records = SeqIO.index("Quality/example.fastq", "fastq",
753 ... key_function=make_tuple)
754 >>> len(records)
755 3
756 >>> sorted(records)
757 [(413, 324), (443, 348), (540, 792)]
758 >>> print records[(540, 792)].format("fasta")
759 >EAS54_6_R1_2_1_540_792
760 TTGGCAGGCCAAGGCCGATGGATCA
761 <BLANKLINE>
762 >>> (540, 792) in records
763 True
764 >>> "EAS54_6_R1_2_1_540_792" in records
765 False
766 >>> print records.get("Missing", None)
767 None
768
769 Another common use case would be indexing an NCBI style FASTA file,
770 where you might want to extract the GI number from the FASTA identifer
771 to use as the dictionary key.
772
773 Notice that unlike the to_dict() function, here the key_function does
774 not get given the full SeqRecord to use to generate the key. Doing so
775 would impose a severe performance penalty as it would require the file
776 to be completely parsed while building the index. Right now this is
777 usually avoided.
778
779 See also: Bio.SeqIO.index_db() and Bio.SeqIO.to_dict()
780 """
781
782 if not isinstance(filename, basestring):
783 raise TypeError("Need a filename (not a handle)")
784 if not isinstance(format, basestring):
785 raise TypeError("Need a string for the file format (lower case)")
786 if not format:
787 raise ValueError("Format required (lower case string)")
788 if format != format.lower():
789 raise ValueError("Format string '%s' should be lower case" % format)
790 if alphabet is not None and not (isinstance(alphabet, Alphabet) or \
791 isinstance(alphabet, AlphabetEncoder)):
792 raise ValueError("Invalid alphabet, %s" % repr(alphabet))
793
794
795 import _index
796 return _index._IndexedSeqFileDict(filename, format, alphabet, key_function)
797
798 -def index_db(index_filename, filenames=None, format=None, alphabet=None,
799 key_function=None):
800 """Index several sequence files and return a dictionary like object.
801
802 The index is stored in an SQLite database rather than in memory (as in the
803 Bio.SeqIO.index(...) function).
804
805 - index_filename - Where to store the SQLite index
806 - filenames - list of strings specifying file(s) to be indexed, or when
807 indexing a single file this can be given as a string.
808 (optional if reloading an existing index, but must match)
809 - format - lower case string describing the file format
810 (optional if reloading an existing index, but must match)
811 - alphabet - optional Alphabet object, useful when the sequence type
812 cannot be automatically inferred from the file itself
813 (e.g. format="fasta" or "tab")
814 - key_function - Optional callback function which when given a
815 SeqRecord identifier string should return a unique
816 key for the dictionary.
817
818 This indexing function will return a dictionary like object, giving the
819 SeqRecord objects as values:
820
821 >>> from Bio.Alphabet import generic_protein
822 >>> from Bio import SeqIO
823 >>> files = ["GenBank/NC_000932.faa", "GenBank/NC_005816.faa"]
824 >>> def get_gi(name):
825 ... parts = name.split("|")
826 ... i = parts.index("gi")
827 ... assert i != -1
828 ... return parts[i+1]
829 >>> idx_name = ":memory:" #use an in memory SQLite DB for this test
830 >>> records = SeqIO.index_db(idx_name, files, "fasta", generic_protein, get_gi)
831 >>> len(records)
832 95
833 >>> records["7525076"].description
834 'gi|7525076|ref|NP_051101.1| Ycf2 [Arabidopsis thaliana]'
835 >>> records["45478717"].description
836 'gi|45478717|ref|NP_995572.1| pesticin [Yersinia pestis biovar Microtus str. 91001]'
837
838 In this example the two files contain 85 and 10 records respectively.
839
840 See also: Bio.SeqIO.index() and Bio.SeqIO.to_dict()
841 """
842
843 if not isinstance(index_filename, basestring):
844 raise TypeError("Need a string for the index filename")
845 if isinstance(filenames, basestring):
846
847
848 filenames = [filenames]
849 if filenames is not None and not isinstance(filenames, list):
850 raise TypeError("Need a list of filenames (as strings), or one filename")
851 if format is not None and not isinstance(format, basestring):
852 raise TypeError("Need a string for the file format (lower case)")
853 if format and format != format.lower():
854 raise ValueError("Format string '%s' should be lower case" % format)
855 if alphabet is not None and not (isinstance(alphabet, Alphabet) or \
856 isinstance(alphabet, AlphabetEncoder)):
857 raise ValueError("Invalid alphabet, %s" % repr(alphabet))
858
859
860 import _index
861 return _index._SQLiteManySeqFilesDict(index_filename, filenames, format,
862 alphabet, key_function)
863
864
865 -def convert(in_file, in_format, out_file, out_format, alphabet=None):
866 """Convert between two sequence file formats, return number of records.
867
868 - in_file - an input handle or filename
869 - in_format - input file format, lower case string
870 - out_file - an output handle or filename
871 - out_format - output file format, lower case string
872 - alphabet - optional alphabet to assume
873
874 NOTE - If you provide an output filename, it will be opened which will
875 overwrite any existing file without warning. This may happen if even
876 the conversion is aborted (e.g. an invalid out_format name is given).
877
878 For example, going from a filename to a handle:
879
880 >>> from Bio import SeqIO
881 >>> from StringIO import StringIO
882 >>> handle = StringIO("")
883 >>> SeqIO.convert("Quality/example.fastq", "fastq", handle, "fasta")
884 3
885 >>> print handle.getvalue()
886 >EAS54_6_R1_2_1_413_324
887 CCCTTCTTGTCTTCAGCGTTTCTCC
888 >EAS54_6_R1_2_1_540_792
889 TTGGCAGGCCAAGGCCGATGGATCA
890 >EAS54_6_R1_2_1_443_348
891 GTTGCTTCTGGCGTGGGTGGGGGGG
892 <BLANKLINE>
893 """
894 if isinstance(in_file, basestring):
895
896 if in_format in _BinaryFormats :
897 in_handle = open(in_file, "rb")
898 else :
899 in_handle = open(in_file, "rU")
900 in_close = True
901 else:
902 in_handle = in_file
903 in_close = False
904
905 if isinstance(out_file, basestring):
906 if out_format in ["sff", "sff_trim"] :
907 out_handle = open(out_file, "wb")
908 else :
909 out_handle = open(out_file, "w")
910 out_close = True
911 else:
912 out_handle = out_file
913 out_close = False
914
915
916 from _convert import _handle_convert
917 count = _handle_convert(in_handle, in_format,
918 out_handle, out_format,
919 alphabet)
920
921 if in_close:
922 in_handle.close()
923 if out_close:
924 out_handle.close()
925 return count
926
928 """Run the Bio.SeqIO module's doctests.
929
930 This will try and locate the unit tests directory, and run the doctests
931 from there in order that the relative paths used in the examples work.
932 """
933 import doctest
934 import os
935 if os.path.isdir(os.path.join("..", "..", "Tests")):
936 print "Runing doctests..."
937 cur_dir = os.path.abspath(os.curdir)
938 os.chdir(os.path.join("..", "..", "Tests"))
939 doctest.testmod()
940 os.chdir(cur_dir)
941 del cur_dir
942 print "Done"
943 elif os.path.isdir(os.path.join("Tests", "Fasta")):
944 print "Runing doctests..."
945 cur_dir = os.path.abspath(os.curdir)
946 os.chdir(os.path.join("Tests"))
947 doctest.testmod()
948 os.chdir(cur_dir)
949 del cur_dir
950 print "Done"
951
952 if __name__ == "__main__":
953
954 _test()
955