1
2
3
4
5 """Optimised sequence conversion code (PRIVATE).
6
7 You are not expected to access this module, or any of its code, directly. This
8 is all handled internally by the Bio.SeqIO.convert(...) function which is the
9 public interface for this.
10
11 The idea here is rather while doing this will work:
12
13 from Bio import SeqIO
14 records = SeqIO.parse(in_handle, in_format)
15 count = SeqIO.write(records, out_handle, out_format)
16
17 it is shorter to write:
18
19 from Bio import SeqIO
20 count = SeqIO.convert(in_handle, in_format, out_handle, out_format)
21
22 Also, the convert function can take a number of special case optimisations. This
23 means that using Bio.SeqIO.convert() may be faster, as well as more convenient.
24 All these file format specific optimisations are handled by this (private) module.
25 """
26
27 from Bio import SeqIO
28
29
37
38
46
47
62
63
64 -def _fastq_generic2(in_handle, out_handle, mapping, truncate_char, truncate_msg):
65 """FASTQ helper function where there could be data loss by truncation (PRIVATE)."""
66 from Bio.SeqIO.QualityIO import FastqGeneralIterator
67
68 count = 0
69 null = chr(0)
70 for title, seq, old_qual in FastqGeneralIterator(in_handle):
71 count += 1
72
73 qual = old_qual.translate(mapping)
74 if null in qual:
75 raise ValueError("Invalid character in quality string")
76 if truncate_char in qual:
77 qual = qual.replace(truncate_char, chr(126))
78 import warnings
79 warnings.warn(truncate_msg)
80 out_handle.write("@%s\n%s\n+\n%s\n" % (title, seq, qual))
81 return count
82
83
85 """Fast Sanger FASTQ to Sanger FASTQ conversion (PRIVATE).
86
87 Useful for removing line wrapping and the redundant second identifier
88 on the plus lines. Will check also check the quality string is valid.
89
90 Avoids creating SeqRecord and Seq objects in order to speed up this
91 conversion.
92 """
93
94 mapping = "".join([chr(0) for ascii in range(0, 33)] \
95 +[chr(ascii) for ascii in range(33, 127)] \
96 +[chr(0) for ascii in range(127, 256)])
97 assert len(mapping)==256
98 return _fastq_generic(in_handle, out_handle, mapping)
99
100
102 """Fast Solexa FASTQ to Solexa FASTQ conversion (PRIVATE).
103
104 Useful for removing line wrapping and the redundant second identifier
105 on the plus lines. Will check also check the quality string is valid.
106 Avoids creating SeqRecord and Seq objects in order to speed up this
107 conversion.
108 """
109
110 mapping = "".join([chr(0) for ascii in range(0, 59)] \
111 +[chr(ascii) for ascii in range(59, 127)] \
112 +[chr(0) for ascii in range(127, 256)])
113 assert len(mapping)==256
114 return _fastq_generic(in_handle, out_handle, mapping)
115
116
118 """Fast Illumina 1.3+ FASTQ to Illumina 1.3+ FASTQ conversion (PRIVATE).
119
120 Useful for removing line wrapping and the redundant second identifier
121 on the plus lines. Will check also check the quality string is valid.
122 Avoids creating SeqRecord and Seq objects in order to speed up this
123 conversion.
124 """
125
126 mapping = "".join([chr(0) for ascii in range(0, 64)] \
127 +[chr(ascii) for ascii in range(64,127)] \
128 +[chr(0) for ascii in range(127,256)])
129 assert len(mapping)==256
130 return _fastq_generic(in_handle, out_handle, mapping)
131
132
134 """Fast Illumina 1.3+ FASTQ to Sanger FASTQ conversion (PRIVATE).
135
136 Avoids creating SeqRecord and Seq objects in order to speed up this
137 conversion.
138 """
139
140 mapping = "".join([chr(0) for ascii in range(0, 64)] \
141 +[chr(33+q) for q in range(0, 62+1)] \
142 +[chr(0) for ascii in range(127, 256)])
143 assert len(mapping)==256
144 return _fastq_generic(in_handle, out_handle, mapping)
145
146
148 """Fast Sanger FASTQ to Illumina 1.3+ FASTQ conversion (PRIVATE).
149
150 Avoids creating SeqRecord and Seq objects in order to speed up this
151 conversion. Will issue a warning if the scores had to be truncated at 62
152 (maximum possible in the Illumina 1.3+ FASTQ format)
153 """
154
155 trunc_char = chr(1)
156 mapping = "".join([chr(0) for ascii in range(0, 33)] \
157 +[chr(64+q) for q in range(0, 62+1) ] \
158 +[trunc_char for ascii in range(96,127)] \
159 +[chr(0) for ascii in range(127, 256)])
160 assert len(mapping)==256
161 return _fastq_generic2(in_handle, out_handle, mapping, trunc_char,
162 "Data loss - max PHRED quality 62 in Illumina 1.3+ FASTQ")
163
164
166 """Fast Solexa FASTQ to Sanger FASTQ conversion (PRIVATE).
167
168 Avoids creating SeqRecord and Seq objects in order to speed up this
169 conversion.
170 """
171
172 from Bio.SeqIO.QualityIO import phred_quality_from_solexa
173 mapping = "".join([chr(0) for ascii in range(0, 59)] \
174 +[chr(33+int(round(phred_quality_from_solexa(q)))) \
175 for q in range(-5, 62+1)]\
176 +[chr(0) for ascii in range(127, 256)])
177 assert len(mapping)==256
178 return _fastq_generic(in_handle, out_handle, mapping)
179
181 """Fast Sanger FASTQ to Solexa FASTQ conversion (PRIVATE).
182
183 Avoids creating SeqRecord and Seq objects in order to speed up this
184 conversion. Will issue a warning if the scores had to be truncated at 62
185 (maximum possible in the Solexa FASTQ format)
186 """
187
188 from Bio.SeqIO.QualityIO import solexa_quality_from_phred
189 trunc_char = chr(1)
190 mapping = "".join([chr(0) for ascii in range(0, 33)] \
191 +[chr(64+int(round(solexa_quality_from_phred(q)))) \
192 for q in range(0, 62+1)] \
193 +[trunc_char for ascii in range(96, 127)] \
194 +[chr(0) for ascii in range(127, 256)])
195 assert len(mapping)==256
196 return _fastq_generic2(in_handle, out_handle, mapping, trunc_char,
197 "Data loss - max Solexa quality 62 in Solexa FASTQ")
198
199
201 """Fast Solexa FASTQ to Illumina 1.3+ FASTQ conversion (PRIVATE).
202
203 Avoids creating SeqRecord and Seq objects in order to speed up this
204 conversion.
205 """
206
207 from Bio.SeqIO.QualityIO import phred_quality_from_solexa
208 mapping = "".join([chr(0) for ascii in range(0, 59)] \
209 +[chr(64+int(round(phred_quality_from_solexa(q)))) \
210 for q in range(-5, 62+1)]\
211 +[chr(0) for ascii in range(127, 256)])
212 assert len(mapping)==256
213 return _fastq_generic(in_handle, out_handle, mapping)
214
215
217 """Fast Illumina 1.3+ FASTQ to Solexa FASTQ conversion (PRIVATE).
218
219 Avoids creating SeqRecord and Seq objects in order to speed up this
220 conversion.
221 """
222
223 from Bio.SeqIO.QualityIO import solexa_quality_from_phred
224 trunc_char = chr(1)
225 mapping = "".join([chr(0) for ascii in range(0, 64)] \
226 +[chr(64+int(round(solexa_quality_from_phred(q)))) \
227 for q in range(0, 62+1)] \
228 +[chr(0) for ascii in range(127, 256)])
229 assert len(mapping)==256
230 return _fastq_generic(in_handle, out_handle, mapping)
231
232
234 """Fast FASTQ to FASTA conversion (PRIVATE).
235
236 Avoids dealing with the FASTQ quality encoding, and creating SeqRecord and
237 Seq objects in order to speed up this conversion.
238
239 NOTE - This does NOT check the characters used in the FASTQ quality string
240 are valid!
241 """
242 from Bio.SeqIO.QualityIO import FastqGeneralIterator
243
244 count = 0
245 for title, seq, qual in FastqGeneralIterator(in_handle):
246 count += 1
247 out_handle.write(">%s\n" % title)
248
249 for i in range(0, len(seq), 60):
250 out_handle.write(seq[i:i+60] + "\n")
251 return count
252
254 """Fast FASTQ to simple tabbed conversion (PRIVATE).
255
256 Avoids dealing with the FASTQ quality encoding, and creating SeqRecord and
257 Seq objects in order to speed up this conversion.
258
259 NOTE - This does NOT check the characters used in the FASTQ quality string
260 are valid!
261 """
262 from Bio.SeqIO.QualityIO import FastqGeneralIterator
263
264 count = 0
265 for title, seq, qual in FastqGeneralIterator(in_handle):
266 count += 1
267 out_handle.write("%s\t%s\n" % (title.split(None, 1)[0], seq))
268 return count
269
271 """FASTQ helper function for QUAL output (PRIVATE).
272
273 Mapping should be a dictionary mapping expected ASCII characters from the
274 FASTQ quality string to PHRED quality scores (as strings).
275 """
276 from Bio.SeqIO.QualityIO import FastqGeneralIterator
277
278 count = 0
279 for title, seq, qual in FastqGeneralIterator(in_handle):
280 count += 1
281 out_handle.write(">%s\n" % title)
282
283 try:
284 qualities_strs = [mapping[ascii] for ascii in qual]
285 except KeyError:
286 raise ValueError("Invalid character in quality string")
287 data = " ".join(qualities_strs)
288 while True:
289 if len(data) <= 60:
290 out_handle.write(data + "\n")
291 break
292 else:
293
294
295 i = data.rfind(" ", 0, 60)
296 out_handle.write(data[:i] + "\n")
297 data = data[i+1:]
298 return count
299
300
302 """Fast Sanger FASTQ to QUAL conversion (PRIVATE)."""
303 mapping = dict((chr(q+33), str(q)) for q in range(0,93+1))
304 return _fastq_convert_qual(in_handle, out_handle, mapping)
305
306
313
314
316 """Fast Illumina 1.3+ FASTQ to QUAL conversion (PRIVATE)."""
317 mapping = dict((chr(q+64), str(q)) for q in range(0,62+1))
318 return _fastq_convert_qual(in_handle, out_handle, mapping)
319
320
321
322 _converter = {
323 ("genbank", "fasta") : _genbank_convert_fasta,
324 ("gb", "fasta") : _genbank_convert_fasta,
325 ("embl", "fasta") : _embl_convert_fasta,
326 ("fastq", "fasta") : _fastq_convert_fasta,
327 ("fastq-sanger", "fasta") : _fastq_convert_fasta,
328 ("fastq-solexa", "fasta") : _fastq_convert_fasta,
329 ("fastq-illumina", "fasta") : _fastq_convert_fasta,
330 ("fastq", "tab") : _fastq_convert_tab,
331 ("fastq-sanger", "tab") : _fastq_convert_tab,
332 ("fastq-solexa", "tab") : _fastq_convert_tab,
333 ("fastq-illumina", "tab") : _fastq_convert_tab,
334 ("fastq", "fastq") : _fastq_sanger_convert_fastq_sanger,
335 ("fastq-sanger", "fastq") : _fastq_sanger_convert_fastq_sanger,
336 ("fastq-solexa", "fastq") : _fastq_solexa_convert_fastq_sanger,
337 ("fastq-illumina", "fastq") : _fastq_illumina_convert_fastq_sanger,
338 ("fastq", "fastq-sanger") : _fastq_sanger_convert_fastq_sanger,
339 ("fastq-sanger", "fastq-sanger") : _fastq_sanger_convert_fastq_sanger,
340 ("fastq-solexa", "fastq-sanger") : _fastq_solexa_convert_fastq_sanger,
341 ("fastq-illumina", "fastq-sanger") : _fastq_illumina_convert_fastq_sanger,
342 ("fastq", "fastq-solexa") : _fastq_sanger_convert_fastq_solexa,
343 ("fastq-sanger", "fastq-solexa") : _fastq_sanger_convert_fastq_solexa,
344 ("fastq-solexa", "fastq-solexa") : _fastq_solexa_convert_fastq_solexa,
345 ("fastq-illumina", "fastq-solexa") : _fastq_illumina_convert_fastq_solexa,
346 ("fastq", "fastq-illumina") : _fastq_sanger_convert_fastq_illumina,
347 ("fastq-sanger", "fastq-illumina") : _fastq_sanger_convert_fastq_illumina,
348 ("fastq-solexa", "fastq-illumina") : _fastq_solexa_convert_fastq_illumina,
349 ("fastq-illumina", "fastq-illumina") : _fastq_illumina_convert_fastq_illumina,
350 ("fastq", "qual") : _fastq_sanger_convert_qual,
351 ("fastq-sanger", "qual") : _fastq_sanger_convert_qual,
352 ("fastq-solexa", "qual") : _fastq_solexa_convert_qual,
353 ("fastq-illumina", "qual") : _fastq_illumina_convert_qual,
354 }
355
356 -def _handle_convert(in_handle, in_format, out_handle, out_format, alphabet=None):
357 """SeqIO conversion function (PRIVATE)."""
358 try:
359 f = _converter[(in_format, out_format)]
360 except KeyError:
361 f = None
362 if f:
363 return f(in_handle, out_handle, alphabet)
364 else:
365 records = SeqIO.parse(in_handle, in_format, alphabet)
366 return SeqIO.write(records, out_handle, out_format)
367