1
2
3
4
5 """Dictionary like indexing of sequence files (PRIVATE).
6
7 You are not expected to access this module, or any of its code, directly. This
8 is all handled internally by the Bio.SeqIO.index(...) function which is the
9 public interface for this functionality.
10
11 The basic idea is that we scan over a sequence file, looking for new record
12 markers. We then try and extract the string that Bio.SeqIO.parse/read would
13 use as the record id, ideally without actually parsing the full record. We
14 then use a subclassed Python dictionary to record the file offset for the
15 record start against the record id.
16
17 Note that this means full parsing is on demand, so any invalid or problem
18 record may not trigger an exception until it is accessed. This is by design.
19
20 This means our dictionary like objects have in memory ALL the keys (all the
21 record identifiers), which shouldn't be a problem even with second generation
22 sequencing. If this is an issue later on, storing the keys and offsets in a
23 temp lookup file might be one idea (e.g. using SQLite or an OBDA style index).
24 """
25
26 import os
27 try:
28 from collections import UserDict as _dict_base
29 except ImportError:
30 from UserDict import DictMixin as _dict_base
31 import re
32 import itertools
33 from StringIO import StringIO
34
35 try:
36 from sqlite3 import dbapi2 as _sqlite
37 from sqlite3 import IntegrityError as _IntegrityError
38 from sqlite3 import OperationalError as _OperationalError
39 except ImportError:
40
41
42 _sqlite = None
43 pass
44
45 from Bio._py3k import _bytes_to_string, _as_bytes, _as_string
46
47 from Bio import SeqIO
48 from Bio import Alphabet
49
51 """Read only dictionary interface to a sequential sequence file.
52
53 Keeps the keys and associated file offsets in memory, reads the file to
54 access entries as SeqRecord objects using Bio.SeqIO for parsing them.
55 This approach is memory limited, but will work even with millions of
56 sequences.
57
58 Note - as with the Bio.SeqIO.to_dict() function, duplicate keys
59 (record identifiers by default) are not allowed. If this happens,
60 a ValueError exception is raised.
61
62 By default the SeqRecord's id string is used as the dictionary
63 key. This can be changed by supplying an optional key_function,
64 a callback function which will be given the record id and must
65 return the desired key. For example, this allows you to parse
66 NCBI style FASTA identifiers, and extract the GI number to use
67 as the dictionary key.
68
69 Note that this dictionary is essentially read only. You cannot
70 add or change values, pop values, nor clear the dictionary.
71 """
    def __init__(self, filename, format, alphabet, key_function):
        """Scan the file, building an in-memory mapping of key -> file offset.

        Raises ValueError for an unsupported format, or if the same key
        is produced by two records (duplicate identifiers).
        """
        try:
            proxy_class = _FormatToRandomAccess[format]
        except KeyError:
            raise ValueError("Unsupported format '%s'" % format)
        random_access_proxy = proxy_class(filename, format, alphabet)
        self._proxy = random_access_proxy
        self._key_function = key_function
        if key_function:
            # Remap the raw record identifiers through the user's callback
            offset_iter = ((key_function(k),o,l) for (k,o,l) in random_access_proxy)
        else:
            offset_iter = random_access_proxy
        offsets = {}
        for key, offset, length in offset_iter:
            # Note only the start offset is recorded; the length is unused
            # here since records are re-parsed from the offset on demand.
            if key in offsets:
                raise ValueError("Duplicate key '%s'" % key)
            else:
                offsets[key] = offset
        self._offsets = offsets
95
97 return "SeqIO.index(%r, %r, alphabet=%r, key_function=%r)" \
98 % (self._proxy._handle.name, self._proxy._format,
99 self._proxy._alphabet, self._key_function)
100
102 if self:
103 return "{%s : SeqRecord(...), ...}" % repr(self.keys()[0])
104 else:
105 return "{}"
106
108 return key in self._offsets
109
111 """How many records are there?"""
112 return len(self._offsets)
113
114 if hasattr(dict, "iteritems"):
115
117 """Would be a list of the SeqRecord objects, but not implemented.
118
119 In general you can be indexing very very large files, with millions
120 of sequences. Loading all these into memory at once as SeqRecord
121 objects would (probably) use up all the RAM. Therefore we simply
122 don't support this dictionary method.
123 """
124 raise NotImplementedError("Due to memory concerns, when indexing a "
125 "sequence file you cannot access all the "
126 "records at once.")
127
129 """Would be a list of the (key, SeqRecord) tuples, but not implemented.
130
131 In general you can be indexing very very large files, with millions
132 of sequences. Loading all these into memory at once as SeqRecord
133 objects would (probably) use up all the RAM. Therefore we simply
134 don't support this dictionary method.
135 """
136 raise NotImplementedError("Due to memory concerns, when indexing a "
137 "sequence file you cannot access all the "
138 "records at once.")
139
141 """Return a list of all the keys (SeqRecord identifiers)."""
142
143 return self._offsets.keys()
144
146 """Iterate over the SeqRecord) items."""
147 for key in self.__iter__():
148 yield self.__getitem__(key)
149
151 """Iterate over the (key, SeqRecord) items."""
152 for key in self.__iter__():
153 yield key, self.__getitem__(key)
154
156 """Iterate over the keys."""
157 return self.__iter__()
158
159 else:
160
162 """Iterate over the (key, SeqRecord) items."""
163 for key in self.__iter__():
164 yield key, self.__getitem__(key)
165
167 """Iterate over the SeqRecord items."""
168 for key in self.__iter__():
169 yield self.__getitem__(key)
170
172 """Iterate over the keys."""
173 return self.__iter__()
174
176 """Iterate over the keys."""
177 return iter(self._offsets)
178
180 """x.__getitem__(y) <==> x[y]"""
181
182 record = self._proxy.get(self._offsets[key])
183 if self._key_function:
184 key2 = self._key_function(record.id)
185 else:
186 key2 = record.id
187 if key != key2:
188 raise ValueError("Key did not match (%s vs %s)" % (key, key2))
189 return record
190
191 - def get(self, k, d=None):
192 """D.get(k[,d]) -> D[k] if k in D, else d. d defaults to None."""
193 try:
194 return self.__getitem__(k)
195 except KeyError:
196 return d
197
199 """Similar to the get method, but returns the record as a raw string.
200
201 If the key is not found, a KeyError exception is raised.
202
203 Note that on Python 3 a bytes string is returned, not a typical
204 unicode string.
205
206 NOTE - This functionality is not supported for every file format.
207 """
208
209 return self._proxy.get_raw(self._offsets[key])
210
212 """Would allow setting or replacing records, but not implemented."""
213 raise NotImplementedError("An indexed a sequence file is read only.")
214
216 """Would allow adding more values, but not implemented."""
217 raise NotImplementedError("An indexed a sequence file is read only.")
218
219
    def pop(self, key, default=None):
        """Would remove specified record, but not implemented."""
        # Deliberately unsupported - the index is a read only view of the file.
        raise NotImplementedError("An indexed a sequence file is read only.")
223
225 """Would remove and return a SeqRecord, but not implemented."""
226 raise NotImplementedError("An indexed a sequence file is read only.")
227
228
230 """Would clear dictionary, but not implemented."""
231 raise NotImplementedError("An indexed a sequence file is read only.")
232
234 """A dictionary method which we don't implement."""
235 raise NotImplementedError("An indexed a sequence file doesn't "
236 "support this.")
237
239 """A dictionary method which we don't implement."""
240 raise NotImplementedError("An indexed a sequence file doesn't "
241 "support this.")
242
243
245 """Read only dictionary interface to many sequential sequence files.
246
247 Keeps the keys, file-numbers and offsets in an SQLite database. To access
248 a record by key, reads from the offset in the appropriate file using
249 Bio.SeqIO for parsing.
250
251 There are OS limits on the number of files that can be open at once,
252 so a pool are kept. If a record is required from a closed file, then
253 one of the open handles is closed first.
254 """
255 - def __init__(self, index_filename, filenames, format, alphabet,
256 key_function, max_open=10):
257 random_access_proxies = {}
258
259
260
261
262 if not _sqlite:
263
264 from Bio import MissingPythonDependencyError
265 raise MissingPythonDependencyError("Requires sqlite3, which is "
266 "included Python 2.5+")
267 if filenames is not None:
268 filenames = list(filenames)
269 if os.path.isfile(index_filename):
270
271 con = _sqlite.connect(index_filename)
272 self._con = con
273
274 try:
275 count, = con.execute("SELECT value FROM meta_data WHERE key=?;",
276 ("count",)).fetchone()
277 self._length = int(count)
278 if self._length == -1:
279 raise ValueError("Unfinished/partial database")
280 count, = con.execute("SELECT COUNT(key) FROM offset_data;").fetchone()
281 if self._length <> int(count):
282 raise ValueError("Corrupt database? %i entries not %i" \
283 % (int(count), self._length))
284 self._format, = con.execute("SELECT value FROM meta_data WHERE key=?;",
285 ("format",)).fetchone()
286 if format and format != self._format:
287 raise ValueError("Index file says format %s, not %s" \
288 % (self._format, format))
289 self._filenames = [row[0] for row in \
290 con.execute("SELECT name FROM file_data "
291 "ORDER BY file_number;").fetchall()]
292 if filenames and len(filenames) != len(self._filenames):
293 raise ValueError("Index file says %i files, not %i" \
294 % (len(self.filenames) != len(filenames)))
295 if filenames and filenames != self._filenames:
296 raise ValueError("Index file has different filenames")
297 except _OperationalError, err:
298 raise ValueError("Not a Biopython index database? %s" % err)
299
300 try:
301 proxy_class = _FormatToRandomAccess[self._format]
302 except KeyError:
303 raise ValueError("Unsupported format '%s'" % self._format)
304 else:
305 self._filenames = filenames
306 self._format = format
307 if not format or not filenames:
308 raise ValueError("Filenames to index and format required")
309 try:
310 proxy_class = _FormatToRandomAccess[format]
311 except KeyError:
312 raise ValueError("Unsupported format '%s'" % format)
313
314 con = _sqlite.connect(index_filename)
315 self._con = con
316
317
318 con.execute("PRAGMA synchronous='OFF'")
319 con.execute("PRAGMA locking_mode=EXCLUSIVE")
320
321
322
323 con.execute("CREATE TABLE meta_data (key TEXT, value TEXT);")
324 con.execute("INSERT INTO meta_data (key, value) VALUES (?,?);",
325 ("count", -1))
326 con.execute("INSERT INTO meta_data (key, value) VALUES (?,?);",
327 ("format", format))
328
329
330 con.execute("CREATE TABLE file_data (file_number INTEGER, name TEXT);")
331 con.execute("CREATE TABLE offset_data (key TEXT, file_number INTEGER, offset INTEGER, length INTEGER);")
332 count = 0
333 for i, filename in enumerate(filenames):
334 con.execute("INSERT INTO file_data (file_number, name) VALUES (?,?);",
335 (i, filename))
336 random_access_proxy = proxy_class(filename, format, alphabet)
337 if key_function:
338 offset_iter = ((key_function(k),i,o,l) for (k,o,l) in random_access_proxy)
339 else:
340 offset_iter = ((k,i,o,l) for (k,o,l) in random_access_proxy)
341 while True:
342 batch = list(itertools.islice(offset_iter, 100))
343 if not batch: break
344
345
346 con.executemany("INSERT INTO offset_data (key,file_number,offset,length) VALUES (?,?,?,?);",
347 batch)
348 con.commit()
349 count += len(batch)
350 if len(random_access_proxies) < max_open:
351 random_access_proxies[i] = random_access_proxy
352 else:
353 random_access_proxy._handle.close()
354 self._length = count
355
356 try:
357 con.execute("CREATE UNIQUE INDEX IF NOT EXISTS "
358 "key_index ON offset_data(key);")
359 except _IntegrityError, err:
360 raise ValueError("Duplicate key? %s" % err)
361 con.execute("PRAGMA locking_mode=NORMAL")
362 con.execute("UPDATE meta_data SET value = ? WHERE key = ?;",
363 (count, "count"))
364 con.commit()
365
366 self._proxies = random_access_proxies
367 self._max_open = max_open
368 self._index_filename = index_filename
369 self._alphabet = alphabet
370 self._key_function = key_function
371
373 return "SeqIO.index_db(%r, filenames=%r, format=%r, alphabet=%r, key_function=%r)" \
374 % (self._index_filename, self._filenames, self._format,
375 self._alphabet, self._key_function)
376
378 return bool(self._con.execute("SELECT key FROM offset_data WHERE key=?;",
379 (key,)).fetchone())
380
382 """How many records are there?"""
383 return self._length
384
385
387 """Iterate over the keys."""
388 for row in self._con.execute("SELECT key FROM offset_data;"):
389 yield str(row[0])
390
391 if hasattr(dict, "iteritems"):
392
393
395 """Return a list of all the keys (SeqRecord identifiers)."""
396 return [str(row[0]) for row in \
397 self._con.execute("SELECT key FROM offset_data;").fetchall()]
398
400 """x.__getitem__(y) <==> x[y]"""
401
402 row = self._con.execute("SELECT file_number, offset FROM offset_data WHERE key=?;",
403 (key,)).fetchone()
404 if not row: raise KeyError
405 file_number, offset = row
406 proxies = self._proxies
407 if file_number in proxies:
408 record = proxies[file_number].get(offset)
409 else:
410 if len(proxies) >= self._max_open:
411
412 proxies.popitem()[1]._handle.close()
413
414 proxy = _FormatToRandomAccess[self._format]( \
415 self._filenames[file_number],
416 self._format, self._alphabet)
417 record = proxy.get(offset)
418 proxies[file_number] = proxy
419 if self._key_function:
420 key2 = self._key_function(record.id)
421 else:
422 key2 = record.id
423 if key != key2:
424 raise ValueError("Key did not match (%s vs %s)" % (key, key2))
425 return record
426
    def get(self, k, d=None):
        """D.get(k[,d]) -> D[k] if k in D, else d. d defaults to None."""
        # EAFP: one SQLite lookup via __getitem__, falling back to the
        # supplied default when the key is absent.
        try:
            return self.__getitem__(k)
        except KeyError:
            return d
433
435 """Similar to the get method, but returns the record as a raw string.
436
437 If the key is not found, a KeyError exception is raised.
438
439 Note that on Python 3 a bytes string is returned, not a typical
440 unicode string.
441
442 NOTE - This functionality is not supported for every file format.
443 """
444
445 row = self._con.execute("SELECT file_number, offset, length FROM offset_data WHERE key=?;",
446 (key,)).fetchone()
447 if not row: raise KeyError
448 file_number, offset, length = row
449 proxies = self._proxies
450 if file_number in proxies:
451 if length:
452
453 h = proxies[file_number]._handle
454 h.seek(offset)
455 return h.read(length)
456 else:
457 return proxies[file_number].get_raw(offset)
458 else:
459
460 if len(proxies) >= self._max_open:
461
462 proxies.popitem()[1]._handle.close()
463
464 proxy = _FormatToRandomAccess[self._format]( \
465 self._filenames[file_number],
466 self._format, self._alphabet)
467 proxies[file_number] = proxy
468 if length:
469
470 h = proxy._handle
471 h.seek(offset)
472 return h.read(length)
473 else:
474 return proxy.get_raw(offset)
475
477 """Close any open file handles."""
478 proxies = self._proxies
479 while proxies:
480 proxies.popitem()[1]._handle.close()
481
482
483
484
    def __init__(self, filename, format, alphabet):
        """Open the file in binary mode and build a one-record parse function."""
        self._handle = open(filename, "rb")
        self._alphabet = alphabet
        self._format = format
        # Look up the SeqIO iterator for this format once, up front.
        i = SeqIO._FormatToIterator[format]
        # Simplified version of the alphabet handling in SeqIO.parse:
        if alphabet is None:
            def _parse(handle):
                """Dynamically generated parser function (PRIVATE)."""
                return i(handle).next()
        else:
            def _parse(handle):
                """Dynamically generated parser function (PRIVATE)."""
                try:
                    return i(handle, alphabet=alphabet).next()
                except TypeError:
                    # Not every format iterator accepts an alphabet argument,
                    # so fall back on forcing the alphabet after parsing.
                    return SeqIO._force_alphabet(i(handle),
                                                 alphabet).next()
        self._parse = _parse
509
511 """Returns (id,offset) tuples."""
512 raise NotImplementedError("Subclass should implement this")
513
514 - def get(self, offset):
518
520 """Returns bytes string (if implemented for this file format)."""
521
522 raise NotImplementedError("Not available for this file format.")
523
524
525
526
527
528
529
530
531
532
533
534
535
537 """Random access to a Standard Flowgram Format (SFF) file."""
538 - def __init__(self, filename, format, alphabet):
543
545 """Load any index block in the file, or build it the slow way (PRIVATE)."""
546 if self._alphabet is None:
547 self._alphabet = Alphabet.generic_dna
548 handle = self._handle
549 handle.seek(0)
550
551 header_length, index_offset, index_length, number_of_reads, \
552 self._flows_per_read, self._flow_chars, self._key_sequence \
553 = SeqIO.SffIO._sff_file_header(handle)
554 if index_offset and index_length:
555
556 count = 0
557 try :
558 for name, offset in SeqIO.SffIO._sff_read_roche_index(handle) :
559 yield name, offset, 0
560 count += 1
561 assert count == number_of_reads, \
562 "Indexed %i records, expected %i" \
563 % (count, number_of_reads)
564 return
565 except ValueError, err :
566 import warnings
567 warnings.warn("Could not parse the SFF index: %s" % err)
568 assert count==0, "Partially populated index"
569 handle.seek(0)
570 else :
571
572 import warnings
573 warnings.warn("No SFF index, doing it the slow way")
574
575 count = 0
576 for name, offset in SeqIO.SffIO._sff_do_slow_index(handle) :
577 yield name, offset, 0
578 count += 1
579 assert count == number_of_reads, \
580 "Indexed %i records, expected %i" % (count, number_of_reads)
581
582 - def get(self, offset) :
590
595
596
598 - def get(self, offset) :
607
608
609
610
611
612
614 - def __init__(self, filename, format, alphabet):
615 SeqFileRandomAccess.__init__(self, filename, format, alphabet)
616 marker = {"ace" : "CO ",
617 "embl" : "ID ",
618 "fasta" : ">",
619 "genbank" : "LOCUS ",
620 "gb": "LOCUS ",
621 "imgt" : "ID ",
622 "phd" : "BEGIN_SEQUENCE",
623 "pir" : ">..;",
624 "qual": ">",
625 "qual": ">",
626 "swiss" : "ID ",
627 "uniprot-xml" : "<entry ",
628 }[format]
629 self._marker = marker
630 self._marker_re = re.compile(_as_bytes("^%s" % marker))
631
633 """Returns (id,offset) tuples."""
634 marker_offset = len(self._marker)
635 marker_re = self._marker_re
636 handle = self._handle
637 handle.seek(0)
638
639 while True:
640 start_offset = handle.tell()
641 line = handle.readline()
642 if marker_re.match(line) or not line:
643 break
644
645 while marker_re.match(line):
646
647
648 id = line[marker_offset:].strip().split(None, 1)[0]
649 while True:
650 line = handle.readline()
651 if marker_re.match(line) or not line:
652 end_offset = handle.tell() - len(line)
653 yield _bytes_to_string(id), start_offset, end_offset - start_offset
654 start_offset = end_offset
655 break
656 assert not line, repr(line)
657
659 """Similar to the get method, but returns the record as a raw string."""
660
661 handle = self._handle
662 marker_re = self._marker_re
663 handle.seek(offset)
664 lines = [handle.readline()]
665 while True:
666 line = handle.readline()
667 if marker_re.match(line) or not line:
668
669 break
670 lines.append(line)
671 return _as_bytes("").join(lines)
672
673
674
675
676
677
679 """Indexed dictionary like access to a GenBank file."""
681 handle = self._handle
682 handle.seek(0)
683 marker_re = self._marker_re
684 dot_char = _as_bytes(".")
685 accession_marker = _as_bytes("ACCESSION ")
686 version_marker = _as_bytes("VERSION ")
687
688 while True:
689 start_offset = handle.tell()
690 line = handle.readline()
691 if marker_re.match(line) or not line:
692 break
693
694 while marker_re.match(line):
695
696
697 key = None
698 while True:
699 line = handle.readline()
700 if marker_re.match(line) or not line:
701 if not key:
702 raise ValueError("Did not find ACCESSION/VERSION lines")
703 end_offset = handle.tell() - len(line)
704 yield _bytes_to_string(key), start_offset, end_offset - start_offset
705 start_offset = end_offset
706 break
707 elif line.startswith(accession_marker):
708 key = line.rstrip().split()[1]
709 elif line.startswith(version_marker):
710 version_id = line.rstrip().split()[1]
711 if version_id.count(dot_char)==1 and version_id.split(dot_char)[1].isdigit():
712
713 key = version_id
714 assert not line, repr(line)
715
716
718 """Indexed dictionary like access to an EMBL file."""
720 handle = self._handle
721 handle.seek(0)
722 marker_re = self._marker_re
723 semi_char = _as_bytes(";")
724 dot_char = _as_bytes(".")
725 sv_marker = _as_bytes("SV ")
726
727 while True:
728 start_offset = handle.tell()
729 line = handle.readline()
730 if marker_re.match(line) or not line:
731 break
732
733 while marker_re.match(line):
734
735
736 if line[2:].count(semi_char) == 6:
737
738 parts = line[3:].rstrip().split(semi_char)
739 if parts[1].strip().startswith(sv_marker):
740
741 key = parts[0].strip() + dot_char + parts[1].strip().split()[1]
742 else:
743 key = parts[0].strip()
744 elif line[2:].count(semi_char) == 3:
745
746 key = line[3:].strip().split(None,1)[0]
747 else:
748 raise ValueError('Did not recognise the ID line layout:\n' + line)
749 while True:
750 line = handle.readline()
751 if marker_re.match(line) or not line:
752 end_offset = handle.tell() - len(line)
753 yield _bytes_to_string(key), start_offset, end_offset - start_offset
754 start_offset = end_offset
755 break
756 elif line.startswith(sv_marker):
757 key = line.rstrip().split()[1]
758 assert not line, repr(line)
759
760
762 """Random access to a SwissProt file."""
789
790
792 """Random access to a UniProt XML file."""
794 handle = self._handle
795 handle.seek(0)
796 marker_re = self._marker_re
797 start_acc_marker = _as_bytes("<accession>")
798 end_acc_marker = _as_bytes("</accession>")
799 end_entry_marker = _as_bytes("</entry>")
800
801 while True:
802 start_offset = handle.tell()
803 line = handle.readline()
804 if marker_re.match(line) or not line:
805 break
806
807 while marker_re.match(line):
808
809
810
811 key = None
812 done = False
813 while True:
814 line = handle.readline()
815 if key is None and start_acc_marker in line:
816 assert end_acc_marker in line, line
817 key = line[line.find(start_acc_marker)+11:].split(_as_bytes("<"))[0]
818 elif end_entry_marker in line:
819 end_offset = handle.tell() - len(line) \
820 + line.find(end_entry_marker) + 8
821 break
822 elif marker_re.match(line) or not line:
823
824 raise ValueError("Didn't find end of record")
825 if not key:
826 raise ValueError("Did not find <accession> line in bytes %i to %i" \
827 % (start_offset, end_offset))
828 yield _bytes_to_string(key), start_offset, end_offset - start_offset
829
830 while not marker_re.match(line) and line:
831 start_offset = handle.tell()
832 line = handle.readline()
833 assert not line, repr(line)
834
836 """Similar to the get method, but returns the record as a raw string."""
837 handle = self._handle
838 marker_re = self._marker_re
839 end_entry_marker = _as_bytes("</entry>")
840 handle.seek(offset)
841 data = handle.readline()
842 while True:
843 line = handle.readline()
844 i = line.find(end_entry_marker)
845 if i != -1:
846 data += line[:i+8]
847 break
848 if marker_re.match(line) or not line:
849
850 raise ValueError("Didn't find end of record")
851 data += line
852 return data
853
    def get(self, offset) :
        """Parse and return the record at the given offset as a SeqRecord.

        The raw <entry> XML is wrapped in a minimal <uniprot> document so
        the normal UniProt XML parser can be reused on a single record.
        """
        # NOTE(review): rebuilding the XML header on every call looks
        # wasteful - could the parser handle a bare <entry> directly?
        data = """<?xml version='1.0' encoding='UTF-8'?>
        <uniprot xmlns="http://uniprot.org/uniprot"
        xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
        xsi:schemaLocation="http://uniprot.org/uniprot
        http://www.uniprot.org/support/docs/uniprot.xsd">
        %s
        </uniprot>
        """ % _bytes_to_string(self.get_raw(offset))
        # The wrapped document contains exactly one entry; return it.
        return SeqIO.UniprotIO.UniprotIterator(data).next()
868
869
871 """Random access to a IntelliGenetics file."""
872 - def __init__(self, filename, format, alphabet):
875
877 handle = self._handle
878 handle.seek(0)
879 marker_re = self._marker_re
880 semi_char = _as_bytes(";")
881 while True:
882 offset = handle.tell()
883 line = handle.readline()
884 if marker_re.match(line):
885
886 while True:
887 line = handle.readline()
888 if line[0:1] != semi_char and line.strip():
889 key = line.split()[0]
890 yield _bytes_to_string(key), offset, 0
891 break
892 if not line:
893 raise ValueError("Premature end of file?")
894 elif not line:
895
896 break
897
898
913
915 """Random access to a simple tabbed file."""
917 handle = self._handle
918 handle.seek(0)
919 start_offset = handle.tell()
920 tab_char = _as_bytes("\t")
921 while True:
922 line = handle.readline()
923 if not line : break
924 try:
925 key = line.split(tab_char)[0]
926 except ValueError, err:
927 if not line.strip():
928
929 start_offset = handle.tell()
930 continue
931 else:
932 raise err
933 else:
934 end_offset = handle.tell()
935 yield _bytes_to_string(key), start_offset, end_offset - start_offset
936 start_offset = end_offset
937
943
944
945
946
947
948
950 """Random access to a FASTQ file (any supported variant).
951
952 With FASTQ the records all start with a "@" line, but so can quality lines.
953 Note this will cope with line-wrapped FASTQ files.
954 """
956 handle = self._handle
957 handle.seek(0)
958 id = None
959 start_offset = handle.tell()
960 line = handle.readline()
961 if not line:
962
963 return
964 at_char = _as_bytes("@")
965 plus_char = _as_bytes("+")
966 if line[0:1] != at_char:
967 raise ValueError("Problem with FASTQ @ line:\n%s" % repr(line))
968 while line:
969
970
971 id = line[1:].rstrip().split(None, 1)[0]
972
973 seq_len = 0
974 while line:
975 line = handle.readline()
976 if line.startswith(plus_char) : break
977 seq_len += len(line.strip())
978 if not line:
979 raise ValueError("Premature end of file in seq section")
980
981
982 qual_len = 0
983 while line:
984 if seq_len == qual_len:
985
986 line = handle.readline()
987 if line and line[0:1] != at_char:
988 ValueError("Problem with line %s" % repr(line))
989 break
990 else:
991 line = handle.readline()
992 qual_len += len(line.strip())
993 if seq_len != qual_len:
994 raise ValueError("Problem with quality section")
995 end_offset = handle.tell() - len(line)
996 yield _bytes_to_string(id), start_offset, end_offset - start_offset
997 start_offset = end_offset
998
999
1001 """Similar to the get method, but returns the record as a raw string."""
1002
1003 handle = self._handle
1004 handle.seek(offset)
1005 line = handle.readline()
1006 data = line
1007 at_char = _as_bytes("@")
1008 plus_char = _as_bytes("+")
1009 if line[0:1] != at_char:
1010 raise ValueError("Problem with FASTQ @ line:\n%s" % repr(line))
1011 identifier = line[1:].rstrip().split(None, 1)[0]
1012
1013 seq_len = 0
1014 while line:
1015 line = handle.readline()
1016 data += line
1017 if line.startswith(plus_char) : break
1018 seq_len += len(line.strip())
1019 if not line:
1020 raise ValueError("Premature end of file in seq section")
1021 assert line[0:1] == plus_char
1022
1023 qual_len = 0
1024 while line:
1025 if seq_len == qual_len:
1026
1027 pos = handle.tell()
1028 line = handle.readline()
1029 if line and line[0:1] != at_char:
1030 ValueError("Problem with line %s" % repr(line))
1031 break
1032 else:
1033 line = handle.readline()
1034 data += line
1035 qual_len += len(line.strip())
1036 if seq_len != qual_len:
1037 raise ValueError("Problem with quality section")
1038 return data
1039
1040
1041
1042
# Lookup table mapping each supported SeqIO format name to the proxy
# class providing random access to records in that file type.
_FormatToRandomAccess = {"ace" : SequentialSeqFileRandomAccess,
                         "embl" : EmblRandomAccess,
                         "fasta" : SequentialSeqFileRandomAccess,
                         "fastq" : FastqRandomAccess,
                         "fastq-sanger" : FastqRandomAccess,
                         "fastq-solexa" : FastqRandomAccess,
                         "fastq-illumina" : FastqRandomAccess,
                         "genbank" : GenBankRandomAccess,
                         "gb" : GenBankRandomAccess,
                         "ig" : IntelliGeneticsRandomAccess,
                         "imgt" : EmblRandomAccess,
                         "phd" : SequentialSeqFileRandomAccess,
                         "pir" : SequentialSeqFileRandomAccess,
                         "sff" : SffRandomAccess,
                         "sff-trim" : SffTrimedRandomAccess,
                         "swiss" : SwissRandomAccess,
                         "tab" : TabRandomAccess,
                         "qual" : SequentialSeqFileRandomAccess,
                         "uniprot-xml" : UniprotRandomAccess,
                         }
1063