Package Bio :: Package SeqIO :: Module _convert
[hide private]
[frames] | no frames]

Source Code for Module Bio.SeqIO._convert

  1  # Copyright 2009 by Peter Cock.  All rights reserved. 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license.  Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5  """Optimised sequence conversion code (PRIVATE). 
  6   
  7  You are not expected to access this module, or any of its code, directly. This 
  8  is all handled internally by the Bio.SeqIO.convert(...) function which is the 
  9  public interface for this. 
 10   
 11  The idea here is rather while doing this will work: 
 12   
 13  from Bio import SeqIO 
 14  records = SeqIO.parse(in_handle, in_format) 
 15  count = SeqIO.write(records, out_handle, out_format) 
 16   
 17  it is shorter to write: 
 18   
 19  from Bio import SeqIO 
 20  count = SeqIO.convert(in_handle, in_format, out_handle, out_format) 
 21   
 22  Also, the convert function can take a number of special case optimisations. This 
 23  means that using Bio.SeqIO.convert() may be faster, as well as more convenient. 
 24  All these file format specific optimisations are handled by this (private) module. 
 25  """ 
 26   
 27  from Bio import SeqIO 
 28  #NOTE - Lots of lazy imports further on... 
 29   
30 -def _genbank_convert_fasta(in_handle, out_handle, alphabet=None):
31 """Fast GenBank to FASTA (PRIVATE).""" 32 #We don't need to parse the features... 33 from Bio.GenBank.Scanner import GenBankScanner 34 records = GenBankScanner().parse_records(in_handle, do_features=False) 35 #For FASTA output we can ignore the alphabet too 36 return SeqIO.write(records, out_handle, "fasta")
37 38
39 -def _embl_convert_fasta(in_handle, out_handle, alphabet=None):
40 """Fast EMBL to FASTA (PRIVATE).""" 41 #We don't need to parse the features... 42 from Bio.GenBank.Scanner import EmblScanner 43 records = EmblScanner().parse_records(in_handle, do_features=False) 44 #For FASTA output we can ignore the alphabet too 45 return SeqIO.write(records, out_handle, "fasta")
46 47
48 -def _fastq_generic(in_handle, out_handle, mapping):
49 """FASTQ helper function where can't have data loss by truncation (PRIVATE).""" 50 from Bio.SeqIO.QualityIO import FastqGeneralIterator 51 #For real speed, don't even make SeqRecord and Seq objects! 52 count = 0 53 null = chr(0) 54 for title, seq, old_qual in FastqGeneralIterator(in_handle): 55 count += 1 56 #map the qual... 57 qual = old_qual.translate(mapping) 58 if null in qual: 59 raise ValueError("Invalid character in quality string") 60 out_handle.write("@%s\n%s\n+\n%s\n" % (title, seq, qual)) 61 return count
62 63
64 -def _fastq_generic2(in_handle, out_handle, mapping, truncate_char, truncate_msg):
65 """FASTQ helper function where there could be data loss by truncation (PRIVATE).""" 66 from Bio.SeqIO.QualityIO import FastqGeneralIterator 67 #For real speed, don't even make SeqRecord and Seq objects! 68 count = 0 69 null = chr(0) 70 for title, seq, old_qual in FastqGeneralIterator(in_handle): 71 count += 1 72 #map the qual... 73 qual = old_qual.translate(mapping) 74 if null in qual: 75 raise ValueError("Invalid character in quality string") 76 if truncate_char in qual: 77 qual = qual.replace(truncate_char, chr(126)) 78 import warnings 79 warnings.warn(truncate_msg) 80 out_handle.write("@%s\n%s\n+\n%s\n" % (title, seq, qual)) 81 return count
82 83
84 -def _fastq_sanger_convert_fastq_sanger(in_handle, out_handle, alphabet=None):
85 """Fast Sanger FASTQ to Sanger FASTQ conversion (PRIVATE). 86 87 Useful for removing line wrapping and the redundant second identifier 88 on the plus lines. Will check also check the quality string is valid. 89 90 Avoids creating SeqRecord and Seq objects in order to speed up this 91 conversion. 92 """ 93 #Map unexpected chars to null 94 mapping = "".join([chr(0) for ascii in range(0, 33)] \ 95 +[chr(ascii) for ascii in range(33, 127)] \ 96 +[chr(0) for ascii in range(127, 256)]) 97 assert len(mapping)==256 98 return _fastq_generic(in_handle, out_handle, mapping)
99 100
101 -def _fastq_solexa_convert_fastq_solexa(in_handle, out_handle, alphabet=None):
102 """Fast Solexa FASTQ to Solexa FASTQ conversion (PRIVATE). 103 104 Useful for removing line wrapping and the redundant second identifier 105 on the plus lines. Will check also check the quality string is valid. 106 Avoids creating SeqRecord and Seq objects in order to speed up this 107 conversion. 108 """ 109 #Map unexpected chars to null 110 mapping = "".join([chr(0) for ascii in range(0, 59)] \ 111 +[chr(ascii) for ascii in range(59, 127)] \ 112 +[chr(0) for ascii in range(127, 256)]) 113 assert len(mapping)==256 114 return _fastq_generic(in_handle, out_handle, mapping)
115 116
117 -def _fastq_illumina_convert_fastq_illumina(in_handle, out_handle, alphabet=None):
118 """Fast Illumina 1.3+ FASTQ to Illumina 1.3+ FASTQ conversion (PRIVATE). 119 120 Useful for removing line wrapping and the redundant second identifier 121 on the plus lines. Will check also check the quality string is valid. 122 Avoids creating SeqRecord and Seq objects in order to speed up this 123 conversion. 124 """ 125 #Map unexpected chars to null 126 mapping = "".join([chr(0) for ascii in range(0, 64)] \ 127 +[chr(ascii) for ascii in range(64,127)] \ 128 +[chr(0) for ascii in range(127,256)]) 129 assert len(mapping)==256 130 return _fastq_generic(in_handle, out_handle, mapping)
131 132
133 -def _fastq_illumina_convert_fastq_sanger(in_handle, out_handle, alphabet=None):
134 """Fast Illumina 1.3+ FASTQ to Sanger FASTQ conversion (PRIVATE). 135 136 Avoids creating SeqRecord and Seq objects in order to speed up this 137 conversion. 138 """ 139 #Map unexpected chars to null 140 mapping = "".join([chr(0) for ascii in range(0, 64)] \ 141 +[chr(33+q) for q in range(0, 62+1)] \ 142 +[chr(0) for ascii in range(127, 256)]) 143 assert len(mapping)==256 144 return _fastq_generic(in_handle, out_handle, mapping)
145 146
147 -def _fastq_sanger_convert_fastq_illumina(in_handle, out_handle, alphabet=None):
148 """Fast Sanger FASTQ to Illumina 1.3+ FASTQ conversion (PRIVATE). 149 150 Avoids creating SeqRecord and Seq objects in order to speed up this 151 conversion. Will issue a warning if the scores had to be truncated at 62 152 (maximum possible in the Illumina 1.3+ FASTQ format) 153 """ 154 #Map unexpected chars to null 155 trunc_char = chr(1) 156 mapping = "".join([chr(0) for ascii in range(0, 33)] \ 157 +[chr(64+q) for q in range(0, 62+1) ] \ 158 +[trunc_char for ascii in range(96,127)] \ 159 +[chr(0) for ascii in range(127, 256)]) 160 assert len(mapping)==256 161 return _fastq_generic2(in_handle, out_handle, mapping, trunc_char, 162 "Data loss - max PHRED quality 62 in Illumina 1.3+ FASTQ")
163 164
165 -def _fastq_solexa_convert_fastq_sanger(in_handle, out_handle, alphabet=None):
166 """Fast Solexa FASTQ to Sanger FASTQ conversion (PRIVATE). 167 168 Avoids creating SeqRecord and Seq objects in order to speed up this 169 conversion. 170 """ 171 #Map unexpected chars to null 172 from Bio.SeqIO.QualityIO import phred_quality_from_solexa 173 mapping = "".join([chr(0) for ascii in range(0, 59)] \ 174 +[chr(33+int(round(phred_quality_from_solexa(q)))) \ 175 for q in range(-5, 62+1)]\ 176 +[chr(0) for ascii in range(127, 256)]) 177 assert len(mapping)==256 178 return _fastq_generic(in_handle, out_handle, mapping)
179
180 -def _fastq_sanger_convert_fastq_solexa(in_handle, out_handle, alphabet=None):
181 """Fast Sanger FASTQ to Solexa FASTQ conversion (PRIVATE). 182 183 Avoids creating SeqRecord and Seq objects in order to speed up this 184 conversion. Will issue a warning if the scores had to be truncated at 62 185 (maximum possible in the Solexa FASTQ format) 186 """ 187 #Map unexpected chars to null 188 from Bio.SeqIO.QualityIO import solexa_quality_from_phred 189 trunc_char = chr(1) 190 mapping = "".join([chr(0) for ascii in range(0, 33)] \ 191 +[chr(64+int(round(solexa_quality_from_phred(q)))) \ 192 for q in range(0, 62+1)] \ 193 +[trunc_char for ascii in range(96, 127)] \ 194 +[chr(0) for ascii in range(127, 256)]) 195 assert len(mapping)==256 196 return _fastq_generic2(in_handle, out_handle, mapping, trunc_char, 197 "Data loss - max Solexa quality 62 in Solexa FASTQ")
198 199
200 -def _fastq_solexa_convert_fastq_illumina(in_handle, out_handle, alphabet=None):
201 """Fast Solexa FASTQ to Illumina 1.3+ FASTQ conversion (PRIVATE). 202 203 Avoids creating SeqRecord and Seq objects in order to speed up this 204 conversion. 205 """ 206 #Map unexpected chars to null 207 from Bio.SeqIO.QualityIO import phred_quality_from_solexa 208 mapping = "".join([chr(0) for ascii in range(0, 59)] \ 209 +[chr(64+int(round(phred_quality_from_solexa(q)))) \ 210 for q in range(-5, 62+1)]\ 211 +[chr(0) for ascii in range(127, 256)]) 212 assert len(mapping)==256 213 return _fastq_generic(in_handle, out_handle, mapping)
214 215
216 -def _fastq_illumina_convert_fastq_solexa(in_handle, out_handle, alphabet=None):
217 """Fast Illumina 1.3+ FASTQ to Solexa FASTQ conversion (PRIVATE). 218 219 Avoids creating SeqRecord and Seq objects in order to speed up this 220 conversion. 221 """ 222 #Map unexpected chars to null 223 from Bio.SeqIO.QualityIO import solexa_quality_from_phred 224 trunc_char = chr(1) 225 mapping = "".join([chr(0) for ascii in range(0, 64)] \ 226 +[chr(64+int(round(solexa_quality_from_phred(q)))) \ 227 for q in range(0, 62+1)] \ 228 +[chr(0) for ascii in range(127, 256)]) 229 assert len(mapping)==256 230 return _fastq_generic(in_handle, out_handle, mapping)
231 232
233 -def _fastq_convert_fasta(in_handle, out_handle, alphabet=None):
234 """Fast FASTQ to FASTA conversion (PRIVATE). 235 236 Avoids dealing with the FASTQ quality encoding, and creating SeqRecord and 237 Seq objects in order to speed up this conversion. 238 239 NOTE - This does NOT check the characters used in the FASTQ quality string 240 are valid! 241 """ 242 from Bio.SeqIO.QualityIO import FastqGeneralIterator 243 #For real speed, don't even make SeqRecord and Seq objects! 244 count = 0 245 for title, seq, qual in FastqGeneralIterator(in_handle): 246 count += 1 247 out_handle.write(">%s\n" % title) 248 #Do line wrapping 249 for i in range(0, len(seq), 60): 250 out_handle.write(seq[i:i+60] + "\n") 251 return count
252
253 -def _fastq_convert_tab(in_handle, out_handle, alphabet=None):
254 """Fast FASTQ to simple tabbed conversion (PRIVATE). 255 256 Avoids dealing with the FASTQ quality encoding, and creating SeqRecord and 257 Seq objects in order to speed up this conversion. 258 259 NOTE - This does NOT check the characters used in the FASTQ quality string 260 are valid! 261 """ 262 from Bio.SeqIO.QualityIO import FastqGeneralIterator 263 #For real speed, don't even make SeqRecord and Seq objects! 264 count = 0 265 for title, seq, qual in FastqGeneralIterator(in_handle): 266 count += 1 267 out_handle.write("%s\t%s\n" % (title.split(None, 1)[0], seq)) 268 return count
269
270 -def _fastq_convert_qual(in_handle, out_handle, mapping):
271 """FASTQ helper function for QUAL output (PRIVATE). 272 273 Mapping should be a dictionary mapping expected ASCII characters from the 274 FASTQ quality string to PHRED quality scores (as strings). 275 """ 276 from Bio.SeqIO.QualityIO import FastqGeneralIterator 277 #For real speed, don't even make SeqRecord and Seq objects! 278 count = 0 279 for title, seq, qual in FastqGeneralIterator(in_handle): 280 count += 1 281 out_handle.write(">%s\n" % title) 282 #map the qual... 283 try: 284 qualities_strs = [mapping[ascii] for ascii in qual] 285 except KeyError: 286 raise ValueError("Invalid character in quality string") 287 data = " ".join(qualities_strs) 288 while True: 289 if len(data) <= 60: 290 out_handle.write(data + "\n") 291 break 292 else: 293 #By construction there must be spaces in the first 60 chars 294 #(unless we have 60 digit or higher quality scores!) 295 i = data.rfind(" ", 0, 60) 296 out_handle.write(data[:i] + "\n") 297 data = data[i+1:] 298 return count
299 300
301 -def _fastq_sanger_convert_qual(in_handle, out_handle, alphabet=None):
302 """Fast Sanger FASTQ to QUAL conversion (PRIVATE).""" 303 mapping = dict((chr(q+33), str(q)) for q in range(0,93+1)) 304 return _fastq_convert_qual(in_handle, out_handle, mapping)
305 306
307 -def _fastq_solexa_convert_qual(in_handle, out_handle, alphabet=None):
308 """Fast Solexa FASTQ to QUAL conversion (PRIVATE).""" 309 from Bio.SeqIO.QualityIO import phred_quality_from_solexa 310 mapping = dict((chr(q+64), str(int(round(phred_quality_from_solexa(q))))) \ 311 for q in range(-5,62+1)) 312 return _fastq_convert_qual(in_handle, out_handle, mapping)
313 314
315 -def _fastq_illumina_convert_qual(in_handle, out_handle, alphabet=None):
316 """Fast Illumina 1.3+ FASTQ to QUAL conversion (PRIVATE).""" 317 mapping = dict((chr(q+64), str(q)) for q in range(0,62+1)) 318 return _fastq_convert_qual(in_handle, out_handle, mapping)
319 320 321 #TODO? - Handling aliases explicitly would let us shorten this list: 322 _converter = { 323 ("genbank", "fasta") : _genbank_convert_fasta, 324 ("gb", "fasta") : _genbank_convert_fasta, 325 ("embl", "fasta") : _embl_convert_fasta, 326 ("fastq", "fasta") : _fastq_convert_fasta, 327 ("fastq-sanger", "fasta") : _fastq_convert_fasta, 328 ("fastq-solexa", "fasta") : _fastq_convert_fasta, 329 ("fastq-illumina", "fasta") : _fastq_convert_fasta, 330 ("fastq", "tab") : _fastq_convert_tab, 331 ("fastq-sanger", "tab") : _fastq_convert_tab, 332 ("fastq-solexa", "tab") : _fastq_convert_tab, 333 ("fastq-illumina", "tab") : _fastq_convert_tab, 334 ("fastq", "fastq") : _fastq_sanger_convert_fastq_sanger, 335 ("fastq-sanger", "fastq") : _fastq_sanger_convert_fastq_sanger, 336 ("fastq-solexa", "fastq") : _fastq_solexa_convert_fastq_sanger, 337 ("fastq-illumina", "fastq") : _fastq_illumina_convert_fastq_sanger, 338 ("fastq", "fastq-sanger") : _fastq_sanger_convert_fastq_sanger, 339 ("fastq-sanger", "fastq-sanger") : _fastq_sanger_convert_fastq_sanger, 340 ("fastq-solexa", "fastq-sanger") : _fastq_solexa_convert_fastq_sanger, 341 ("fastq-illumina", "fastq-sanger") : _fastq_illumina_convert_fastq_sanger, 342 ("fastq", "fastq-solexa") : _fastq_sanger_convert_fastq_solexa, 343 ("fastq-sanger", "fastq-solexa") : _fastq_sanger_convert_fastq_solexa, 344 ("fastq-solexa", "fastq-solexa") : _fastq_solexa_convert_fastq_solexa, 345 ("fastq-illumina", "fastq-solexa") : _fastq_illumina_convert_fastq_solexa, 346 ("fastq", "fastq-illumina") : _fastq_sanger_convert_fastq_illumina, 347 ("fastq-sanger", "fastq-illumina") : _fastq_sanger_convert_fastq_illumina, 348 ("fastq-solexa", "fastq-illumina") : _fastq_solexa_convert_fastq_illumina, 349 ("fastq-illumina", "fastq-illumina") : _fastq_illumina_convert_fastq_illumina, 350 ("fastq", "qual") : _fastq_sanger_convert_qual, 351 ("fastq-sanger", "qual") : _fastq_sanger_convert_qual, 352 ("fastq-solexa", "qual") : _fastq_solexa_convert_qual, 353 ("fastq-illumina", "qual") : _fastq_illumina_convert_qual, 354 } 355
356 -def _handle_convert(in_handle, in_format, out_handle, out_format, alphabet=None):
357 """SeqIO conversion function (PRIVATE).""" 358 try: 359 f = _converter[(in_format, out_format)] 360 except KeyError: 361 f = None 362 if f: 363 return f(in_handle, out_handle, alphabet) 364 else: 365 records = SeqIO.parse(in_handle, in_format, alphabet) 366 return SeqIO.write(records, out_handle, out_format)
367