1
2
3
4
5
6
7 """Bio.SeqIO support for the binary Standard Flowgram Format (SFF) file format.
8
9 SFF was designed by 454 Life Sciences (Roche), the Whitehead Institute for
10 Biomedical Research and the Wellcome Trust Sanger Institute. You are expected
11 to use this module via the Bio.SeqIO functions under the format name "sff" (or
12 "sff-trim" as described below).
13
14 For example, to iterate over the records in an SFF file,
15
16 >>> from Bio import SeqIO
17 >>> for record in SeqIO.parse("Roche/E3MFGYR02_random_10_reads.sff", "sff"):
18 ... print record.id, len(record), record.seq[:20]+"..."
19 E3MFGYR02JWQ7T 265 tcagGGTCTACATGTTGGTT...
20 E3MFGYR02JA6IL 271 tcagTTTTTTTTGGAAAGGA...
21 E3MFGYR02JHD4H 310 tcagAAAGACAAGTGGTATC...
22 E3MFGYR02GFKUC 299 tcagCGGCCGGGCCTCTCAT...
23 E3MFGYR02FTGED 281 tcagTGGTAATGGGGGGAAA...
24 E3MFGYR02FR9G7 261 tcagCTCCGTAAGAAGGTGC...
25 E3MFGYR02GAZMS 278 tcagAAAGAAGTAAGGTAAA...
26 E3MFGYR02HHZ8O 221 tcagACTTTCTTCTTTACCG...
27 E3MFGYR02GPGB1 269 tcagAAGCAGTGGTATCAAC...
28 E3MFGYR02F7Z7G 219 tcagAATCATCCACTTTTTA...
29
30 Each SeqRecord object will contain all the annotation from the SFF file,
31 including the PHRED quality scores.
32
33 >>> print record.id, len(record)
34 E3MFGYR02F7Z7G 219
35 >>> print record.seq[:10], "..."
36 tcagAATCAT ...
37 >>> print record.letter_annotations["phred_quality"][:10], "..."
38 [22, 21, 23, 28, 26, 15, 12, 21, 28, 21] ...
39
40 Notice that the sequence is given in mixed case, the central upper case region
41 corresponds to the trimmed sequence. This matches the output of the Roche
42 tools (and the 3rd party tool sff_extract) for SFF to FASTA.
43
44 >>> print record.annotations["clip_qual_left"]
45 4
46 >>> print record.annotations["clip_qual_right"]
47 134
48 >>> print record.seq[:4]
49 tcag
50 >>> print record.seq[4:20], "...", record.seq[120:134]
51 AATCATCCACTTTTTA ... CAAAACACAAACAG
52 >>> print record.seq[134:]
53 atcttatcaacaaaactcaaagttcctaactgagacacgcaacaggggataagacaaggcacacaggggataggnnnnnnnnnnn
54
55 The annotations dictionary also contains any adapter clip positions
56 (usually zero), and information about the flows. e.g.
57
58 >>> print record.annotations["flow_key"]
59 TCAG
60 >>> print record.annotations["flow_values"][:10], "..."
61 (83, 1, 128, 7, 4, 84, 6, 106, 3, 172) ...
62 >>> print len(record.annotations["flow_values"])
63 400
64 >>> print record.annotations["flow_index"][:10], "..."
65 (1, 2, 3, 2, 2, 0, 3, 2, 3, 3) ...
66 >>> print len(record.annotations["flow_index"])
67 219
68
69 Note that to convert from a raw reading in flow_values to the corresponding
70 homopolymer stretch estimate, the value should be rounded to the nearest 100:
71
72 >>> print [int(round(value, -2)) // 100
73 ... for value in record.annotations["flow_values"][:10]], '...'
74 [1, 0, 1, 0, 0, 1, 0, 1, 0, 2] ...
75
76 As a convenience method, you can read the file with SeqIO format name "sff-trim"
77 instead of "sff" to get just the trimmed sequences (without any annotation
78 except for the PHRED quality scores):
79
80 >>> from Bio import SeqIO
81 >>> for record in SeqIO.parse("Roche/E3MFGYR02_random_10_reads.sff", "sff-trim"):
82 ... print record.id, len(record), record.seq[:20]+"..."
83 E3MFGYR02JWQ7T 260 GGTCTACATGTTGGTTAACC...
84 E3MFGYR02JA6IL 265 TTTTTTTTGGAAAGGAAAAC...
85 E3MFGYR02JHD4H 292 AAAGACAAGTGGTATCAACG...
86 E3MFGYR02GFKUC 295 CGGCCGGGCCTCTCATCGGT...
87 E3MFGYR02FTGED 277 TGGTAATGGGGGGAAATTTA...
88 E3MFGYR02FR9G7 256 CTCCGTAAGAAGGTGCTGCC...
89 E3MFGYR02GAZMS 271 AAAGAAGTAAGGTAAATAAC...
90 E3MFGYR02HHZ8O 150 ACTTTCTTCTTTACCGTAAC...
91 E3MFGYR02GPGB1 221 AAGCAGTGGTATCAACGCAG...
92 E3MFGYR02F7Z7G 130 AATCATCCACTTTTTAACGT...
93
94 Looking at the final record in more detail, note how this differs to the
95 example above:
96
97 >>> print record.id, len(record)
98 E3MFGYR02F7Z7G 130
99 >>> print record.seq[:10], "..."
100 AATCATCCAC ...
101 >>> print record.letter_annotations["phred_quality"][:10], "..."
102 [26, 15, 12, 21, 28, 21, 36, 28, 27, 27] ...
103 >>> print record.annotations
104 {}
105
106 You might use the Bio.SeqIO.convert() function to convert the (trimmed) SFF
107 reads into a FASTQ file (or a FASTA file and a QUAL file), e.g.
108
109 >>> from Bio import SeqIO
110 >>> from StringIO import StringIO
111 >>> out_handle = StringIO()
112 >>> count = SeqIO.convert("Roche/E3MFGYR02_random_10_reads.sff", "sff",
113 ... out_handle, "fastq")
114 >>> print "Converted %i records" % count
115 Converted 10 records
116
117 The output FASTQ file would start like this:
118
119 >>> print "%s..." % out_handle.getvalue()[:50]
120 @E3MFGYR02JWQ7T
121 tcagGGTCTACATGTTGGTTAACCCGTACTGATT...
122
123 Bio.SeqIO.index() provides memory efficient random access to the reads in an
124 SFF file by name. SFF files can include an index within the file, which can
125 be read in making this very fast. If the index is missing (or in a format not
126 yet supported in Biopython) the file is indexed by scanning all the reads -
127 which is a little slower. For example,
128
129 >>> from Bio import SeqIO
130 >>> reads = SeqIO.index("Roche/E3MFGYR02_random_10_reads.sff", "sff")
131 >>> record = reads["E3MFGYR02JHD4H"]
132 >>> print record.id, len(record), record.seq[:20]+"..."
133 E3MFGYR02JHD4H 310 tcagAAAGACAAGTGGTATC...
134
135 Or, using the trimmed reads:
136
137 >>> from Bio import SeqIO
138 >>> reads = SeqIO.index("Roche/E3MFGYR02_random_10_reads.sff", "sff-trim")
139 >>> record = reads["E3MFGYR02JHD4H"]
140 >>> print record.id, len(record), record.seq[:20]+"..."
141 E3MFGYR02JHD4H 292 AAAGACAAGTGGTATCAACG...
142
143 You can also use the Bio.SeqIO.write() function with the "sff" format. Note
144 that this requires all the flow information etc, and thus is probably only
145 useful for SeqRecord objects originally from reading another SFF file (and
146 not the trimmed SeqRecord objects from parsing an SFF file as "sff-trim").
147
148 As an example, let's pretend this example SFF file represents some DNA which
149 was pre-amplified with a PCR primers AAAGANNNNN. The following script would
150 produce a sub-file containing all those reads whose post-quality clipping
151 region (i.e. the sequence after trimming) starts with AAAGA exactly (the non-
152 degenerate bit of this pretend primer):
153
154 >>> from Bio import SeqIO
155 >>> records = (record for record in
156 ... SeqIO.parse("Roche/E3MFGYR02_random_10_reads.sff","sff")
157 ... if record.seq[record.annotations["clip_qual_left"]:].startswith("AAAGA"))
158 >>> count = SeqIO.write(records, "temp_filtered.sff", "sff")
159 >>> print "Selected %i records" % count
160 Selected 2 records
161
162 Of course, for an assembly you would probably want to remove these primers.
163 If you want FASTA or FASTQ output, you could just slice the SeqRecord. However,
164 if you want SFF output we have to preserve all the flow information - the trick
165 is just to adjust the left clip position!
166
167 >>> from Bio import SeqIO
168 >>> def filter_and_trim(records, primer):
169 ... for record in records:
170 ... if record.seq[record.annotations["clip_qual_left"]:].startswith(primer):
171 ... record.annotations["clip_qual_left"] += len(primer)
172 ... yield record
173 >>> records = SeqIO.parse("Roche/E3MFGYR02_random_10_reads.sff", "sff")
174 >>> count = SeqIO.write(filter_and_trim(records,"AAAGA"),
175 ... "temp_filtered.sff", "sff")
176 >>> print "Selected %i records" % count
177 Selected 2 records
178
179 We can check the results, note the lower case clipped region now includes the "AAAGA"
180 sequence:
181
182 >>> for record in SeqIO.parse("temp_filtered.sff", "sff"):
183 ... print record.id, len(record), record.seq[:20]+"..."
184 E3MFGYR02JHD4H 310 tcagaaagaCAAGTGGTATC...
185 E3MFGYR02GAZMS 278 tcagaaagaAGTAAGGTAAA...
186 >>> for record in SeqIO.parse("temp_filtered.sff", "sff-trim"):
187 ... print record.id, len(record), record.seq[:20]+"..."
188 E3MFGYR02JHD4H 287 CAAGTGGTATCAACGCAGAG...
189 E3MFGYR02GAZMS 266 AGTAAGGTAAATAACAAACG...
190 >>> import os
191 >>> os.remove("temp_filtered.sff")
192
193 For a description of the file format, please see the Roche manuals and:
194 http://www.ncbi.nlm.nih.gov/Traces/trace.cgi?cmd=show&f=formats&m=doc&s=formats
195
196 """
197 from Bio.SeqIO.Interfaces import SequenceWriter
198 from Bio import Alphabet
199 from Bio.Seq import Seq
200 from Bio.SeqRecord import SeqRecord
201 import struct
202 import sys
203
204 from Bio._py3k import _bytes_to_string, _as_bytes
205 _null = _as_bytes("\0")
206 _sff = _as_bytes(".sff")
207 _hsh = _as_bytes(".hsh")
208 _srt = _as_bytes(".srt")
209 _mft = _as_bytes(".mft")
210
211 try:
212
213 _flag = eval(r'b"\xff"')
214 except SyntaxError:
215
216 _flag = "\xff"
217
219 """Read in an SFF file header (PRIVATE).
220
221 Assumes the handle is at the start of the file, will read forwards
222 though the header and leave the handle pointing at the first record.
223 Returns a tuple of values from the header (header_length, index_offset,
224 index_length, number_of_reads, flows_per_read, flow_chars, key_sequence)
225
226 >>> handle = open("Roche/greek.sff", "rb")
227 >>> values = _sff_file_header(handle)
228 >>> print values[0]
229 840
230 >>> print values[1]
231 65040
232 >>> print values[2]
233 256
234 >>> print values[3]
235 24
236 >>> print values[4]
237 800
238 >>> values[-1]
239 'TCAG'
240
241 """
242 if hasattr(handle,"mode") and "U" in handle.mode.upper():
243 raise ValueError("SFF files must NOT be opened in universal new "
244 "lines mode. Binary mode is recommended (although "
245 "on Unix the default mode is also fine).")
246 elif hasattr(handle,"mode") and "B" not in handle.mode.upper() \
247 and sys.platform == "win32":
248 raise ValueError("SFF files must be opened in binary mode on Windows")
249
250
251
252
253
254
255
256
257
258
259
260
261 fmt = '>4s4BQIIHHHB'
262 assert 31 == struct.calcsize(fmt)
263 data = handle.read(31)
264 if not data:
265 raise ValueError("Empty file.")
266 elif len(data) < 13:
267 raise ValueError("File too small to hold a valid SFF header.")
268 magic_number, ver0, ver1, ver2, ver3, index_offset, index_length, \
269 number_of_reads, header_length, key_length, number_of_flows_per_read, \
270 flowgram_format = struct.unpack(fmt, data)
271 if magic_number in [_hsh, _srt, _mft]:
272
273 raise ValueError("Handle seems to be at SFF index block, not start")
274 if magic_number != _sff:
275 raise ValueError("SFF file did not start '.sff', but %s" \
276 % repr(magic_number))
277 if (ver0, ver1, ver2, ver3) != (0, 0, 0, 1):
278 raise ValueError("Unsupported SFF version in header, %i.%i.%i.%i" \
279 % (ver0, ver1, ver2, ver3))
280 if flowgram_format != 1:
281 raise ValueError("Flowgram format code %i not supported" \
282 % flowgram_format)
283 if (index_offset!=0) ^ (index_length!=0):
284 raise ValueError("Index offset %i but index length %i" \
285 % (index_offset, index_length))
286 flow_chars = _bytes_to_string(handle.read(number_of_flows_per_read))
287 key_sequence = _bytes_to_string(handle.read(key_length))
288
289
290
291
292 assert header_length % 8 == 0
293 padding = header_length - number_of_flows_per_read - key_length - 31
294 assert 0 <= padding < 8, padding
295 if handle.read(padding).count(_null) != padding:
296 raise ValueError("Post header %i byte padding region contained data" \
297 % padding)
298 return header_length, index_offset, index_length, \
299 number_of_reads, number_of_flows_per_read, \
300 flow_chars, key_sequence
301
302
304 """Generates an index by scanning though all the reads in an SFF file (PRIVATE).
305
306 This is a slow but generic approach if we can't parse the provided index
307 (if present).
308
309 Will use the handle seek/tell functions.
310 """
311 handle.seek(0)
312 header_length, index_offset, index_length, number_of_reads, \
313 number_of_flows_per_read, flow_chars, key_sequence \
314 = _sff_file_header(handle)
315
316 read_header_fmt = '>2HI4H'
317 read_header_size = struct.calcsize(read_header_fmt)
318
319 read_flow_fmt = ">%iH" % number_of_flows_per_read
320 read_flow_size = struct.calcsize(read_flow_fmt)
321 assert 1 == struct.calcsize(">B")
322 assert 1 == struct.calcsize(">s")
323 assert 1 == struct.calcsize(">c")
324 assert read_header_size % 8 == 0
325 for read in range(number_of_reads):
326 record_offset = handle.tell()
327 if record_offset == index_offset:
328
329 offset = index_offset + index_length
330 if offset % 8:
331 offset += 8 - (offset % 8)
332 assert offset % 8 == 0
333 handle.seek(offset)
334 record_offset = offset
335
336
337 data = handle.read(read_header_size)
338 read_header_length, name_length, seq_len, clip_qual_left, \
339 clip_qual_right, clip_adapter_left, clip_adapter_right \
340 = struct.unpack(read_header_fmt, data)
341 if read_header_length < 10 or read_header_length % 8 != 0:
342 raise ValueError("Malformed read header, says length is %i:\n%s" \
343 % (read_header_length, repr(data)))
344
345 name = _bytes_to_string(handle.read(name_length))
346 padding = read_header_length - read_header_size - name_length
347 if handle.read(padding).count(_null) != padding:
348 raise ValueError("Post name %i byte padding region contained data" \
349 % padding)
350 assert record_offset + read_header_length == handle.tell()
351
352 size = read_flow_size + 3*seq_len
353 handle.seek(size, 1)
354
355 padding = size % 8
356 if padding:
357 padding = 8 - padding
358 if handle.read(padding).count(_null) != padding:
359 raise ValueError("Post quality %i byte padding region contained data" \
360 % padding)
361
362 yield name, record_offset
363 if handle.tell() % 8 != 0:
364 raise ValueError("After scanning reads, did not end on a multiple of 8")
365
367 """Locate any existing Roche style XML meta data and read index (PRIVATE).
368
369 Makes a number of hard coded assumptions based on reverse engineered SFF
370 files from Roche 454 machines.
371
372 Returns a tuple of read count, SFF "index" offset and size, XML offset
373 and size, and the actual read index offset and size.
374
375 Raises a ValueError for unsupported or non-Roche index blocks.
376 """
377 handle.seek(0)
378 header_length, index_offset, index_length, number_of_reads, \
379 number_of_flows_per_read, flow_chars, key_sequence \
380 = _sff_file_header(handle)
381 assert handle.tell() == header_length
382 if not index_offset or not index_offset:
383 raise ValueError("No index present in this SFF file")
384
385 handle.seek(index_offset)
386 fmt = ">4s4B"
387 fmt_size = struct.calcsize(fmt)
388 data = handle.read(fmt_size)
389 if not data:
390 raise ValueError("Premature end of file? Expected index of size %i at offest %i, found nothing" \
391 % (index_length, index_offset))
392 if len(data) < fmt_size:
393 raise ValueError("Premature end of file? Expected index of size %i at offest %i, found %s" \
394 % (index_length, index_offset, repr(data)))
395 magic_number, ver0, ver1, ver2, ver3 = struct.unpack(fmt, data)
396 if magic_number == _mft:
397
398
399
400 if (ver0, ver1, ver2, ver3) != (49, 46, 48, 48):
401
402 raise ValueError("Unsupported version in .mft index header, %i.%i.%i.%i" \
403 % (ver0, ver1, ver2, ver3))
404 fmt2 = ">LL"
405 fmt2_size = struct.calcsize(fmt2)
406 xml_size, data_size = struct.unpack(fmt2, handle.read(fmt2_size))
407 if index_length != fmt_size + fmt2_size + xml_size + data_size:
408 raise ValueError("Problem understanding .mft index header, %i != %i + %i + %i + %i" \
409 % (index_length, fmt_size, fmt2_size, xml_size, data_size))
410 return number_of_reads, header_length, \
411 index_offset, index_length, \
412 index_offset + fmt_size + fmt2_size, xml_size, \
413 index_offset + fmt_size + fmt2_size + xml_size, data_size
414 elif magic_number == _srt:
415
416
417
418 if (ver0, ver1, ver2, ver3) != (49, 46, 48, 48):
419
420 raise ValueError("Unsupported version in .srt index header, %i.%i.%i.%i" \
421 % (ver0, ver1, ver2, ver3))
422 data = handle.read(4)
423 if data != _null*4:
424 raise ValueError("Did not find expected null four bytes in .srt index")
425 return number_of_reads, header_length, \
426 index_offset, index_length, \
427 0, 0, \
428 index_offset + fmt_size + 4, index_length - fmt_size - 4
429 elif magic_number == _hsh:
430 raise ValueError("Hash table style indexes (.hsh) in SFF files are "
431 "not (yet) supported")
432 else:
433 raise ValueError("Unknown magic number %s in SFF index header:\n%s" \
434 % (repr(magic_number), repr(data)))
435
437 """Reads any existing Roche style XML manifest data in the SFF "index" (PRIVATE, DEPRECATED).
438
439 Will use the handle seek/tell functions. Returns a string.
440
441 This has been replaced by ReadRocheXmlManifest. We would normally just
442 delete an old private function without warning, but I believe some people
443 are using this so we'll handle this with a deprecation warning.
444 """
445 import warnings
446 warnings.warn("Private function _sff_read_roche_index_xml is deprecated. "
447 "Use new public function ReadRocheXmlManifest instead",
448 DeprecationWarning)
449 return ReadRocheXmlManifest(handle)
450
451
453 """Reads any Roche style XML manifest data in the SFF "index".
454
455 The SFF file format allows for multiple different index blocks, and Roche
456 took advantage of this to define their own index block wich also embeds
457 an XML manifest string. This is not a publically documented extension to
458 the SFF file format, this was reverse engineered.
459
460 The handle should be to an SFF file opened in binary mode. This function
461 will use the handle seek/tell functions and leave the handle in an
462 arbitrary location.
463
464 Any XML manifest found is returned as a Python string, which you can then
465 parse as appropriate, or reuse when writing out SFF files with the
466 SffWriter class.
467
468 Returns a string, or raises a ValueError if an Roche manifest could not be
469 found.
470 """
471 number_of_reads, header_length, index_offset, index_length, xml_offset, \
472 xml_size, read_index_offset, read_index_size = _sff_find_roche_index(handle)
473 if not xml_offset or not xml_size:
474 raise ValueError("No XML manifest found")
475 handle.seek(xml_offset)
476 return _bytes_to_string(handle.read(xml_size))
477
478
479
481 """Reads any existing Roche style read index provided in the SFF file (PRIVATE).
482
483 Will use the handle seek/tell functions.
484
485 This works on ".srt1.00" and ".mft1.00" style Roche SFF index blocks.
486
487 Roche SFF indices use base 255 not 256, meaning we see bytes in range the
488 range 0 to 254 only. This appears to be so that byte 0xFF (character 255)
489 can be used as a marker character to separate entries (required if the
490 read name lengths vary).
491
492 Note that since only four bytes are used for the read offset, this is
493 limited to 255^4 bytes (nearly 4GB). If you try to use the Roche sfffile
494 tool to combine SFF files beyound this limit, they issue a warning and
495 omit the index (and manifest).
496 """
497 number_of_reads, header_length, index_offset, index_length, xml_offset, \
498 xml_size, read_index_offset, read_index_size = _sff_find_roche_index(handle)
499
500 handle.seek(read_index_offset)
501 fmt = ">5B"
502 for read in range(number_of_reads):
503
504 data = handle.read(6)
505 while True:
506 more = handle.read(1)
507 if not more:
508 raise ValueError("Premature end of file!")
509 data += more
510 if more == _flag: break
511 assert data[-1:] == _flag, data[-1:]
512 name = _bytes_to_string(data[:-6])
513 off4, off3, off2, off1, off0 = struct.unpack(fmt, data[-6:-1])
514 offset = off0 + 255*off1 + 65025*off2 + 16581375*off3
515 if off4:
516
517
518
519 raise ValueError("Expected a null terminator to the read name.")
520 yield name, offset
521 if handle.tell() != read_index_offset + read_index_size:
522 raise ValueError("Problem with index length? %i vs %i" \
523 % (handle.tell(), read_index_offset + read_index_size))
524
525
526 -def _sff_read_seq_record(handle, number_of_flows_per_read, flow_chars,
527 key_sequence, alphabet, trim=False):
528 """Parse the next read in the file, return data as a SeqRecord (PRIVATE)."""
529
530
531
532
533
534
535
536
537
538
539 read_header_fmt = '>2HI4H'
540 read_header_size = struct.calcsize(read_header_fmt)
541 read_flow_fmt = ">%iH" % number_of_flows_per_read
542 read_flow_size = struct.calcsize(read_flow_fmt)
543
544 read_header_length, name_length, seq_len, clip_qual_left, \
545 clip_qual_right, clip_adapter_left, clip_adapter_right \
546 = struct.unpack(read_header_fmt, handle.read(read_header_size))
547 if clip_qual_left:
548 clip_qual_left -= 1
549 if clip_adapter_left:
550 clip_adapter_left -= 1
551 if read_header_length < 10 or read_header_length % 8 != 0:
552 raise ValueError("Malformed read header, says length is %i" \
553 % read_header_length)
554
555 name = _bytes_to_string(handle.read(name_length))
556 padding = read_header_length - read_header_size - name_length
557 if handle.read(padding).count(_null) != padding:
558 raise ValueError("Post name %i byte padding region contained data" \
559 % padding)
560
561
562 flow_values = handle.read(read_flow_size)
563 temp_fmt = ">%iB" % seq_len
564 flow_index = handle.read(seq_len)
565 seq = _bytes_to_string(handle.read(seq_len))
566 quals = list(struct.unpack(temp_fmt, handle.read(seq_len)))
567
568 padding = (read_flow_size + seq_len*3)%8
569 if padding:
570 padding = 8 - padding
571 if handle.read(padding).count(_null) != padding:
572 raise ValueError("Post quality %i byte padding region contained data" \
573 % padding)
574
575
576
577 clip_left = max(clip_qual_left, clip_adapter_left)
578
579 if clip_qual_right:
580 if clip_adapter_right:
581 clip_right = min(clip_qual_right, clip_adapter_right)
582 else:
583
584 clip_right = clip_qual_right
585 elif clip_adapter_right:
586 clip_right = clip_adapter_right
587 else:
588 clip_right = seq_len
589
590 if trim:
591 seq = seq[clip_left:clip_right].upper()
592 quals = quals[clip_left:clip_right]
593
594 annotations = {}
595 else:
596
597 seq = seq[:clip_left].lower() + \
598 seq[clip_left:clip_right].upper() + \
599 seq[clip_right:].lower()
600 annotations = {"flow_values":struct.unpack(read_flow_fmt, flow_values),
601 "flow_index":struct.unpack(temp_fmt, flow_index),
602 "flow_chars":flow_chars,
603 "flow_key":key_sequence,
604 "clip_qual_left":clip_qual_left,
605 "clip_qual_right":clip_qual_right,
606 "clip_adapter_left":clip_adapter_left,
607 "clip_adapter_right":clip_adapter_right}
608 record = SeqRecord(Seq(seq, alphabet),
609 id=name,
610 name=name,
611 description="",
612 annotations=annotations)
613
614
615 dict.__setitem__(record._per_letter_annotations,
616 "phred_quality", quals)
617
618 return record
619
620
622 """Extract the next read in the file as a raw (bytes) string (PRIVATE)."""
623 read_header_fmt = '>2HI'
624 read_header_size = struct.calcsize(read_header_fmt)
625 read_flow_fmt = ">%iH" % number_of_flows_per_read
626 read_flow_size = struct.calcsize(read_flow_fmt)
627
628 raw = handle.read(read_header_size)
629 read_header_length, name_length, seq_len \
630 = struct.unpack(read_header_fmt, raw)
631 if read_header_length < 10 or read_header_length % 8 != 0:
632 raise ValueError("Malformed read header, says length is %i" \
633 % read_header_length)
634
635 raw += handle.read(8 + name_length)
636
637 padding = read_header_length - read_header_size - 8 - name_length
638 pad = handle.read(padding)
639 if pad.count(_null) != padding:
640 raise ValueError("Post name %i byte padding region contained data" \
641 % padding)
642 raw += pad
643
644 raw += handle.read(read_flow_size + seq_len*3)
645 padding = (read_flow_size + seq_len*3)%8
646
647 if padding:
648 padding = 8 - padding
649 pad = handle.read(padding)
650 if pad.count(_null) != padding:
651 raise ValueError("Post quality %i byte padding region contained data" \
652 % padding)
653 raw += pad
654
655 return raw
656
658 """Wrapper for handles which do not support the tell method (PRIVATE).
659
660 Intended for use with things like network handles where tell (and reverse
661 seek) are not supported. The SFF file needs to track the current offset in
662 order to deal with the index block.
663 """
665 self._handle = handle
666 self._offset = 0
667
668 - def read(self, length):
672
675
676 - def seek(self, offset):
677 if offset < self._offset:
678 raise RunTimeError("Can't seek backwards")
679 self._handle.read(offset - self._offset)
680
682 return self._handle.close()
683
684
685
687 """Iterate over Standard Flowgram Format (SFF) reads (as SeqRecord objects).
688
689 handle - input file, an SFF file, e.g. from Roche 454 sequencing.
690 This must NOT be opened in universal read lines mode!
691 alphabet - optional alphabet, defaults to generic DNA.
692 trim - should the sequences be trimmed?
693
694 The resulting SeqRecord objects should match those from a paired FASTA
695 and QUAL file converted from the SFF file using the Roche 454 tool
696 ssfinfo. i.e. The sequence will be mixed case, with the trim regions
697 shown in lower case.
698
699 This function is used internally via the Bio.SeqIO functions:
700
701 >>> from Bio import SeqIO
702 >>> handle = open("Roche/E3MFGYR02_random_10_reads.sff", "rb")
703 >>> for record in SeqIO.parse(handle, "sff"):
704 ... print record.id, len(record)
705 E3MFGYR02JWQ7T 265
706 E3MFGYR02JA6IL 271
707 E3MFGYR02JHD4H 310
708 E3MFGYR02GFKUC 299
709 E3MFGYR02FTGED 281
710 E3MFGYR02FR9G7 261
711 E3MFGYR02GAZMS 278
712 E3MFGYR02HHZ8O 221
713 E3MFGYR02GPGB1 269
714 E3MFGYR02F7Z7G 219
715 >>> handle.close()
716
717 You can also call it directly:
718
719 >>> handle = open("Roche/E3MFGYR02_random_10_reads.sff", "rb")
720 >>> for record in SffIterator(handle):
721 ... print record.id, len(record)
722 E3MFGYR02JWQ7T 265
723 E3MFGYR02JA6IL 271
724 E3MFGYR02JHD4H 310
725 E3MFGYR02GFKUC 299
726 E3MFGYR02FTGED 281
727 E3MFGYR02FR9G7 261
728 E3MFGYR02GAZMS 278
729 E3MFGYR02HHZ8O 221
730 E3MFGYR02GPGB1 269
731 E3MFGYR02F7Z7G 219
732 >>> handle.close()
733
734 Or, with the trim option:
735
736 >>> handle = open("Roche/E3MFGYR02_random_10_reads.sff", "rb")
737 >>> for record in SffIterator(handle, trim=True):
738 ... print record.id, len(record)
739 E3MFGYR02JWQ7T 260
740 E3MFGYR02JA6IL 265
741 E3MFGYR02JHD4H 292
742 E3MFGYR02GFKUC 295
743 E3MFGYR02FTGED 277
744 E3MFGYR02FR9G7 256
745 E3MFGYR02GAZMS 271
746 E3MFGYR02HHZ8O 150
747 E3MFGYR02GPGB1 221
748 E3MFGYR02F7Z7G 130
749 >>> handle.close()
750
751 """
752 if isinstance(Alphabet._get_base_alphabet(alphabet),
753 Alphabet.ProteinAlphabet):
754 raise ValueError("Invalid alphabet, SFF files do not hold proteins.")
755 if isinstance(Alphabet._get_base_alphabet(alphabet),
756 Alphabet.RNAAlphabet):
757 raise ValueError("Invalid alphabet, SFF files do not hold RNA.")
758 try:
759 assert 0 == handle.tell()
760 except AttributeError:
761
762 handle = _AddTellHandle(handle)
763 header_length, index_offset, index_length, number_of_reads, \
764 number_of_flows_per_read, flow_chars, key_sequence \
765 = _sff_file_header(handle)
766
767
768
769
770
771
772
773
774
775
776 read_header_fmt = '>2HI4H'
777 read_header_size = struct.calcsize(read_header_fmt)
778 read_flow_fmt = ">%iH" % number_of_flows_per_read
779 read_flow_size = struct.calcsize(read_flow_fmt)
780 assert 1 == struct.calcsize(">B")
781 assert 1 == struct.calcsize(">s")
782 assert 1 == struct.calcsize(">c")
783 assert read_header_size % 8 == 0
784
785
786
787 for read in range(number_of_reads):
788 if index_offset and handle.tell() == index_offset:
789 offset = index_offset + index_length
790 if offset % 8:
791 offset += 8 - (offset % 8)
792 assert offset % 8 == 0
793 handle.seek(offset)
794
795
796 index_offset = 0
797 yield _sff_read_seq_record(handle,
798 number_of_flows_per_read,
799 flow_chars,
800 key_sequence,
801 alphabet,
802 trim)
803
804
805 if index_offset and handle.tell() == index_offset:
806 offset = index_offset + index_length
807 if offset % 8:
808 offset += 8 - (offset % 8)
809 assert offset % 8 == 0
810 handle.seek(offset)
811
812 if handle.read(1):
813 raise ValueError("Additional data at end of SFF file")
814
815
816
818 """Iterate over SFF reads (as SeqRecord objects) with trimming (PRIVATE)."""
819 return SffIterator(handle, alphabet, trim=True)
820
821
823 """SFF file writer."""
824
825 - def __init__(self, handle, index=True, xml=None):
826 """Creates the writer object.
827
828 handle - Output handle, ideally in binary write mode.
829 index - Boolean argument, should we try and write an index?
830 xml - Optional string argument, xml manifest to be recorded in the index
831 block (see function ReadRocheXmlManifest for reading this data).
832 """
833 if hasattr(handle,"mode") and "U" in handle.mode.upper():
834 raise ValueError("SFF files must NOT be opened in universal new "
835 "lines mode. Binary mode is required")
836 elif hasattr(handle,"mode") and "B" not in handle.mode.upper():
837 raise ValueError("SFF files must be opened in binary mode")
838 self.handle = handle
839 self._xml = xml
840 if index:
841 self._index = []
842 else:
843 self._index = None
844
846 """Use this to write an entire file containing the given records."""
847 try:
848 self._number_of_reads = len(records)
849 except TypeError:
850 self._number_of_reads = 0
851 if not hasattr(self.handle, "seek") \
852 or not hasattr(self.handle, "tell"):
853 raise ValueError("A handle with a seek/tell methods is "
854 "required in order to record the total "
855 "record count in the file header (once it "
856 "is known at the end).")
857 if self._index is not None and \
858 not (hasattr(self.handle, "seek") and hasattr(self.handle, "tell")):
859 import warnings
860 warnings.warn("A handle with a seek/tell methods is required in "
861 "order to record an SFF index.")
862 self._index = None
863 self._index_start = 0
864 self._index_length = 0
865 if not hasattr(records, "next"):
866 records = iter(records)
867
868
869 try:
870 record = records.next()
871 except StopIteration:
872 record = None
873 if record is None:
874
875
876
877 raise ValueError("Must have at least one sequence")
878 try:
879 self._key_sequence = _as_bytes(record.annotations["flow_key"])
880 self._flow_chars = _as_bytes(record.annotations["flow_chars"])
881 self._number_of_flows_per_read = len(self._flow_chars)
882 except KeyError:
883 raise ValueError("Missing SFF flow information")
884 self.write_header()
885 self.write_record(record)
886 count = 1
887 for record in records:
888 self.write_record(record)
889 count += 1
890 if self._number_of_reads == 0:
891
892 offset = self.handle.tell()
893 self.handle.seek(0)
894 self._number_of_reads = count
895 self.write_header()
896 self.handle.seek(offset)
897 else:
898 assert count == self._number_of_reads
899 if self._index is not None:
900 self._write_index()
901 return count
902
904 assert len(self._index)==self._number_of_reads
905 handle = self.handle
906 self._index.sort()
907 self._index_start = handle.tell()
908
909 if self._xml is not None:
910 xml = _as_bytes(self._xml)
911 else:
912 from Bio import __version__
913 xml = "<!-- This file was output with Biopython %s -->\n" % __version__
914 xml += "<!-- This XML and index block attempts to mimic Roche SFF files -->\n"
915 xml += "<!-- This file may be a combination of multiple SFF files etc -->\n"
916 xml = _as_bytes(xml)
917 xml_len = len(xml)
918
919 fmt = ">I4BLL"
920 fmt_size = struct.calcsize(fmt)
921 handle.write(_null*fmt_size + xml)
922 fmt2 = ">6B"
923 assert 6 == struct.calcsize(fmt2)
924 self._index.sort()
925 index_len = 0
926 for name, offset in self._index:
927
928
929
930 off3 = offset
931 off0 = off3 % 255
932 off3 -= off0
933 off1 = off3 % 65025
934 off3 -= off1
935 off2 = off3 % 16581375
936 off3 -= off2
937 assert offset == off0 + off1 + off2 + off3, \
938 "%i -> %i %i %i %i" % (offset, off0, off1, off2, off3)
939 off3, off2, off1, off0 = off3//16581375, off2//65025, \
940 off1//255, off0
941 assert off0 < 255 and off1 < 255 and off2 < 255 and off3 < 255, \
942 "%i -> %i %i %i %i" % (offset, off0, off1, off2, off3)
943 handle.write(name + struct.pack(fmt2, 0, \
944 off3, off2, off1, off0, 255))
945 index_len += len(name) + 6
946
947 self._index_length = fmt_size + xml_len + index_len
948
949
950
951 if self._index_length % 8:
952 padding = 8 - (self._index_length%8)
953 handle.write(_null*padding)
954 else:
955 padding = 0
956 offset = handle.tell()
957 assert offset == self._index_start + self._index_length + padding, \
958 "%i vs %i + %i + %i" % (offset, self._index_start, \
959 self._index_length, padding)
960
961 handle.seek(self._index_start)
962 handle.write(struct.pack(fmt, 778921588,
963 49,46,48,48,
964 xml_len, index_len) + xml)
965
966 handle.seek(0)
967 self.write_header()
968 handle.seek(offset)
969
971
972 key_length = len(self._key_sequence)
973
974
975
976
977
978
979
980
981
982
983
984
985 fmt = '>I4BQIIHHHB%is%is' % (self._number_of_flows_per_read, key_length)
986
987
988
989
990 if struct.calcsize(fmt) % 8 == 0:
991 padding = 0
992 else:
993 padding = 8 - (struct.calcsize(fmt) % 8)
994 header_length = struct.calcsize(fmt) + padding
995 assert header_length % 8 == 0
996 header = struct.pack(fmt, 779314790,
997 0, 0, 0, 1,
998 self._index_start, self._index_length,
999 self._number_of_reads,
1000 header_length, key_length,
1001 self._number_of_flows_per_read,
1002 1,
1003 self._flow_chars, self._key_sequence)
1004 self.handle.write(header + _null*padding)
1005
1007 """Write a single additional record to the output file.
1008
1009 This assumes the header has been done.
1010 """
1011
1012 name = _as_bytes(record.id)
1013 name_len = len(name)
1014 seq = _as_bytes(str(record.seq).upper())
1015 seq_len = len(seq)
1016
1017 try:
1018 quals = record.letter_annotations["phred_quality"]
1019 except KeyError:
1020 raise ValueError("Missing PHRED qualities information")
1021
1022 try:
1023 flow_values = record.annotations["flow_values"]
1024 flow_index = record.annotations["flow_index"]
1025 if self._key_sequence != _as_bytes(record.annotations["flow_key"]) \
1026 or self._flow_chars != _as_bytes(record.annotations["flow_chars"]):
1027 raise ValueError("Records have inconsistent SFF flow data")
1028 except KeyError:
1029 raise ValueError("Missing SFF flow information")
1030 except AttributeError:
1031 raise ValueError("Header not written yet?")
1032
1033 try:
1034 clip_qual_left = record.annotations["clip_qual_left"]
1035 if clip_qual_left:
1036 clip_qual_left += 1
1037 clip_qual_right = record.annotations["clip_qual_right"]
1038 clip_adapter_left = record.annotations["clip_adapter_left"]
1039 if clip_adapter_left:
1040 clip_adapter_left += 1
1041 clip_adapter_right = record.annotations["clip_adapter_right"]
1042 except KeyError:
1043 raise ValueError("Missing SFF clipping information")
1044
1045
1046 if self._index is not None:
1047 offset = self.handle.tell()
1048
1049
1050
1051 if offset > 4244897280:
1052 import warnings
1053 warnings.warn("Read %s has file offset %i, which is too large "
1054 "to store in the Roche SFF index structure. No "
1055 "index block will be recorded." % (name, offset))
1056
1057 self._index = None
1058 else:
1059 self._index.append((name, self.handle.tell()))
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075 read_header_fmt = '>2HI4H%is' % name_len
1076 if struct.calcsize(read_header_fmt) % 8 == 0:
1077 padding = 0
1078 else:
1079 padding = 8 - (struct.calcsize(read_header_fmt) % 8)
1080 read_header_length = struct.calcsize(read_header_fmt) + padding
1081 assert read_header_length % 8 == 0
1082 data = struct.pack(read_header_fmt,
1083 read_header_length,
1084 name_len, seq_len,
1085 clip_qual_left, clip_qual_right,
1086 clip_adapter_left, clip_adapter_right,
1087 name) + _null*padding
1088 assert len(data) == read_header_length
1089
1090
1091 read_flow_fmt = ">%iH" % self._number_of_flows_per_read
1092 read_flow_size = struct.calcsize(read_flow_fmt)
1093 temp_fmt = ">%iB" % seq_len
1094 data += struct.pack(read_flow_fmt, *flow_values) \
1095 + struct.pack(temp_fmt, *flow_index) \
1096 + seq \
1097 + struct.pack(temp_fmt, *quals)
1098
1099 padding = (read_flow_size + seq_len*3)%8
1100 if padding:
1101 padding = 8 - padding
1102 self.handle.write(data + _null*padding)
1103
1104
1105 if __name__ == "__main__":
1106 print "Running quick self test"
1107 filename = "../../Tests/Roche/E3MFGYR02_random_10_reads.sff"
1108 metadata = ReadRocheXmlManifest(open(filename, "rb"))
1109 index1 = sorted(_sff_read_roche_index(open(filename, "rb")))
1110 index2 = sorted(_sff_do_slow_index(open(filename, "rb")))
1111 assert index1 == index2
1112 assert len(index1) == len(list(SffIterator(open(filename, "rb"))))
1113 from StringIO import StringIO
1114 try:
1115
1116 from io import BytesIO
1117 except ImportError:
1118 BytesIO = StringIO
1119 assert len(index1) == len(list(SffIterator(BytesIO(open(filename,"rb").read()))))
1120
1121 if sys.platform != "win32":
1122 assert len(index1) == len(list(SffIterator(open(filename, "r"))))
1123 index2 = sorted(_sff_read_roche_index(open(filename)))
1124 assert index1 == index2
1125 index2 = sorted(_sff_do_slow_index(open(filename)))
1126 assert index1 == index2
1127 assert len(index1) == len(list(SffIterator(open(filename))))
1128 assert len(index1) == len(list(SffIterator(BytesIO(open(filename,"r").read()))))
1129 assert len(index1) == len(list(SffIterator(BytesIO(open(filename).read()))))
1130
1131 sff = list(SffIterator(open(filename, "rb")))
1132
1133 sff2 = list(SffIterator(open("../../Tests/Roche/E3MFGYR02_alt_index_at_end.sff", "rb")))
1134 assert len(sff) == len(sff2)
1135 for old, new in zip(sff, sff2):
1136 assert old.id == new.id
1137 assert str(old.seq) == str(new.seq)
1138
1139 sff2 = list(SffIterator(open("../../Tests/Roche/E3MFGYR02_alt_index_at_start.sff", "rb")))
1140 assert len(sff) == len(sff2)
1141 for old, new in zip(sff, sff2):
1142 assert old.id == new.id
1143 assert str(old.seq) == str(new.seq)
1144
1145 sff2 = list(SffIterator(open("../../Tests/Roche/E3MFGYR02_alt_index_in_middle.sff", "rb")))
1146 assert len(sff) == len(sff2)
1147 for old, new in zip(sff, sff2):
1148 assert old.id == new.id
1149 assert str(old.seq) == str(new.seq)
1150
1151 sff2 = list(SffIterator(open("../../Tests/Roche/E3MFGYR02_index_at_start.sff", "rb")))
1152 assert len(sff) == len(sff2)
1153 for old, new in zip(sff, sff2):
1154 assert old.id == new.id
1155 assert str(old.seq) == str(new.seq)
1156
1157 sff2 = list(SffIterator(open("../../Tests/Roche/E3MFGYR02_index_in_middle.sff", "rb")))
1158 assert len(sff) == len(sff2)
1159 for old, new in zip(sff, sff2):
1160 assert old.id == new.id
1161 assert str(old.seq) == str(new.seq)
1162
1163 sff_trim = list(SffIterator(open(filename, "rb"), trim=True))
1164
1165 print ReadRocheXmlManifest(open(filename, "rb"))
1166
1167 from Bio import SeqIO
1168 filename = "../../Tests/Roche/E3MFGYR02_random_10_reads_no_trim.fasta"
1169 fasta_no_trim = list(SeqIO.parse(open(filename,"rU"), "fasta"))
1170 filename = "../../Tests/Roche/E3MFGYR02_random_10_reads_no_trim.qual"
1171 qual_no_trim = list(SeqIO.parse(open(filename,"rU"), "qual"))
1172
1173 filename = "../../Tests/Roche/E3MFGYR02_random_10_reads.fasta"
1174 fasta_trim = list(SeqIO.parse(open(filename,"rU"), "fasta"))
1175 filename = "../../Tests/Roche/E3MFGYR02_random_10_reads.qual"
1176 qual_trim = list(SeqIO.parse(open(filename,"rU"), "qual"))
1177
1178 for s, sT, f, q, fT, qT in zip(sff, sff_trim, fasta_no_trim,
1179 qual_no_trim, fasta_trim, qual_trim):
1180
1181 print s.id
1182
1183
1184
1185 assert s.id == f.id == q.id
1186 assert str(s.seq) == str(f.seq)
1187 assert s.letter_annotations["phred_quality"] == q.letter_annotations["phred_quality"]
1188
1189 assert s.id == sT.id == fT.id == qT.id
1190 assert str(sT.seq) == str(fT.seq)
1191 assert sT.letter_annotations["phred_quality"] == qT.letter_annotations["phred_quality"]
1192
1193
1194 print "Writing with a list of SeqRecords..."
1195 handle = StringIO()
1196 w = SffWriter(handle, xml=metadata)
1197 w.write_file(sff)
1198 data = handle.getvalue()
1199 print "And again with an iterator..."
1200 handle = StringIO()
1201 w = SffWriter(handle, xml=metadata)
1202 w.write_file(iter(sff))
1203 assert data == handle.getvalue()
1204
1205 filename = "../../Tests/Roche/E3MFGYR02_random_10_reads.sff"
1206 original = open(filename,"rb").read()
1207 assert len(data) == len(original)
1208 assert data == original
1209 del data
1210 handle.close()
1211
1212 print "-"*50
1213 filename = "../../Tests/Roche/greek.sff"
1214 for record in SffIterator(open(filename,"rb")):
1215 print record.id
1216 index1 = sorted(_sff_read_roche_index(open(filename, "rb")))
1217 index2 = sorted(_sff_do_slow_index(open(filename, "rb")))
1218 assert index1 == index2
1219 try:
1220 print ReadRocheXmlManifest(open(filename, "rb"))
1221 assert False, "Should fail!"
1222 except ValueError:
1223 pass
1224
1225
1226 handle = open(filename, "rb")
1227 for record in SffIterator(handle):
1228 pass
1229 try:
1230 for record in SffIterator(handle):
1231 print record.id
1232 assert False, "Should have failed"
1233 except ValueError, err:
1234 print "Checking what happens on re-reading a handle:"
1235 print err
1236
1237
1238 """
1239 #Ugly code to make test files...
1240 index = ".diy1.00This is a fake index block (DIY = Do It Yourself), which is allowed under the SFF standard.\0"
1241 padding = len(index)%8
1242 if padding:
1243 padding = 8 - padding
1244 index += chr(0)*padding
1245 assert len(index)%8 == 0
1246
1247 #Ugly bit of code to make a fake index at start
1248 records = list(SffIterator(open("../../Tests/Roche/E3MFGYR02_random_10_reads.sff", "rb")))
1249 out_handle = open("../../Tests/Roche/E3MFGYR02_alt_index_at_start.sff", "w")
1250 index = ".diy1.00This is a fake index block (DIY = Do It Yourself), which is allowed under the SFF standard.\0"
1251 padding = len(index)%8
1252 if padding:
1253 padding = 8 - padding
1254 index += chr(0)*padding
1255 w = SffWriter(out_handle, index=False, xml=None)
1256 #Fake the header...
1257 w._number_of_reads = len(records)
1258 w._index_start = 0
1259 w._index_length = 0
1260 w._key_sequence = records[0].annotations["flow_key"]
1261 w._flow_chars = records[0].annotations["flow_chars"]
1262 w._number_of_flows_per_read = len(w._flow_chars)
1263 w.write_header()
1264 w._index_start = out_handle.tell()
1265 w._index_length = len(index)
1266 out_handle.seek(0)
1267 w.write_header() #this time with index info
1268 w.handle.write(index)
1269 for record in records:
1270 w.write_record(record)
1271 out_handle.close()
1272 records2 = list(SffIterator(open("../../Tests/Roche/E3MFGYR02_alt_index_at_start.sff", "rb")))
1273 for old, new in zip(records, records2):
1274 assert str(old.seq)==str(new.seq)
1275 i = list(_sff_do_slow_index(open("../../Tests/Roche/E3MFGYR02_alt_index_at_start.sff", "rb")))
1276
1277 #Ugly bit of code to make a fake index in middle
1278 records = list(SffIterator(open("../../Tests/Roche/E3MFGYR02_random_10_reads.sff", "rb")))
1279 out_handle = open("../../Tests/Roche/E3MFGYR02_alt_index_in_middle.sff", "w")
1280 index = ".diy1.00This is a fake index block (DIY = Do It Yourself), which is allowed under the SFF standard.\0"
1281 padding = len(index)%8
1282 if padding:
1283 padding = 8 - padding
1284 index += chr(0)*padding
1285 w = SffWriter(out_handle, index=False, xml=None)
1286 #Fake the header...
1287 w._number_of_reads = len(records)
1288 w._index_start = 0
1289 w._index_length = 0
1290 w._key_sequence = records[0].annotations["flow_key"]
1291 w._flow_chars = records[0].annotations["flow_chars"]
1292 w._number_of_flows_per_read = len(w._flow_chars)
1293 w.write_header()
1294 for record in records[:5]:
1295 w.write_record(record)
1296 w._index_start = out_handle.tell()
1297 w._index_length = len(index)
1298 w.handle.write(index)
1299 for record in records[5:]:
1300 w.write_record(record)
1301 out_handle.seek(0)
1302 w.write_header() #this time with index info
1303 out_handle.close()
1304 records2 = list(SffIterator(open("../../Tests/Roche/E3MFGYR02_alt_index_in_middle.sff", "rb")))
1305 for old, new in zip(records, records2):
1306 assert str(old.seq)==str(new.seq)
1307 j = list(_sff_do_slow_index(open("../../Tests/Roche/E3MFGYR02_alt_index_in_middle.sff", "rb")))
1308
1309 #Ugly bit of code to make a fake index at end
1310 records = list(SffIterator(open("../../Tests/Roche/E3MFGYR02_random_10_reads.sff", "rb")))
1311 out_handle = open("../../Tests/Roche/E3MFGYR02_alt_index_at_end.sff", "w")
1312 w = SffWriter(out_handle, index=False, xml=None)
1313 #Fake the header...
1314 w._number_of_reads = len(records)
1315 w._index_start = 0
1316 w._index_length = 0
1317 w._key_sequence = records[0].annotations["flow_key"]
1318 w._flow_chars = records[0].annotations["flow_chars"]
1319 w._number_of_flows_per_read = len(w._flow_chars)
1320 w.write_header()
1321 for record in records:
1322 w.write_record(record)
1323 w._index_start = out_handle.tell()
1324 w._index_length = len(index)
1325 out_handle.write(index)
1326 out_handle.seek(0)
1327 w.write_header() #this time with index info
1328 out_handle.close()
1329 records2 = list(SffIterator(open("../../Tests/Roche/E3MFGYR02_alt_index_at_end.sff", "rb")))
1330 for old, new in zip(records, records2):
1331 assert str(old.seq)==str(new.seq)
1332 try:
1333 print ReadRocheXmlManifest(open("../../Tests/Roche/E3MFGYR02_alt_index_at_end.sff", "rb"))
1334 assert False, "Should fail!"
1335 except ValueError:
1336 pass
1337 k = list(_sff_do_slow_index(open("../../Tests/Roche/E3MFGYR02_alt_index_at_end.sff", "rb")))
1338 """
1339
1340 print "Done"
1341