Package Bio :: Package SeqIO :: Module _index
[hide private]
[frames] | [no frames]

Source Code for Module Bio.SeqIO._index

   1  # Copyright 2009-2011 by Peter Cock.  All rights reserved. 
   2  # This code is part of the Biopython distribution and governed by its 
   3  # license.  Please see the LICENSE file that should have been included 
   4  # as part of this package. 
   5  """Dictionary like indexing of sequence files (PRIVATE). 
   6   
   7  You are not expected to access this module, or any of its code, directly. This 
   8  is all handled internally by the Bio.SeqIO.index(...) function which is the 
   9  public interface for this functionality. 
  10   
  11  The basic idea is that we scan over a sequence file, looking for new record 
  12  markers. We then try and extract the string that Bio.SeqIO.parse/read would 
  13  use as the record id, ideally without actually parsing the full record. We 
  14  then use a subclassed Python dictionary to record the file offset for the 
  15  record start against the record id. 
  16   
  17  Note that this means full parsing is on demand, so any invalid or problem 
  18  record may not trigger an exception until it is accessed. This is by design. 
  19   
  20  This means our dictionary like objects have in memory ALL the keys (all the 
  21  record identifiers), which shouldn't be a problem even with second generation 
  22  sequencing. If this is an issue later on, storing the keys and offsets in a 
  23  temp lookup file might be one idea (e.g. using SQLite or an OBDA style index). 
  24  """ 
  25   
  26  import os 
  27  try: 
  28      from collections import UserDict as _dict_base 
  29  except ImportError: 
  30      from UserDict import DictMixin as _dict_base 
  31  import re 
  32  import itertools 
  33  from StringIO import StringIO 
  34   
  35  try: 
  36      from sqlite3 import dbapi2 as _sqlite 
  37      from sqlite3 import IntegrityError as _IntegrityError 
  38      from sqlite3 import OperationalError as _OperationalError 
  39  except ImportError: 
  40      #Not expected to be present on Python 2.4, ignore it 
  41      #and at least offer Bio.SeqIO.index() functionality 
  42      _sqlite = None 
  43      pass 
  44   
  45  from Bio._py3k import _bytes_to_string, _as_bytes, _as_string 
  46   
  47  from Bio import SeqIO 
  48  from Bio import Alphabet 
  49   
class _IndexedSeqFileDict(_dict_base):
    """Read only dictionary interface to a sequential sequence file.

    Keeps the keys and associated file offsets in memory, reads the file to
    access entries as SeqRecord objects using Bio.SeqIO for parsing them.
    This approach is memory limited, but will work even with millions of
    sequences.

    Note - as with the Bio.SeqIO.to_dict() function, duplicate keys
    (record identifiers by default) are not allowed. If this happens,
    a ValueError exception is raised.

    By default the SeqRecord's id string is used as the dictionary
    key. This can be changed by supplying an optional key_function,
    a callback function which will be given the record id and must
    return the desired key. For example, this allows you to parse
    NCBI style FASTA identifiers, and extract the GI number to use
    as the dictionary key.

    Note that this dictionary is essentially read only. You cannot
    add or change values, pop values, nor clear the dictionary.
    """
    def __init__(self, filename, format, alphabet, key_function):
        """Index the file, recording each record's key and offset (PRIVATE).

        Use key_function=None for the default (the record id string).
        Raises ValueError for unsupported formats or duplicate keys.
        """
        try:
            proxy_class = _FormatToRandomAccess[format]
        except KeyError:
            raise ValueError("Unsupported format '%s'" % format)
        random_access_proxy = proxy_class(filename, format, alphabet)
        self._proxy = random_access_proxy
        self._key_function = key_function
        if key_function:
            offset_iter = ((key_function(k), o, l) for (k, o, l) in random_access_proxy)
        else:
            offset_iter = random_access_proxy
        offsets = {}
        for key, offset, length in offset_iter:
            #Note - we don't store the length because I want to minimise the
            #memory requirements. With the SQLite backend the length is kept
            #and is used to speed up the get_raw method (by about 3 times).
            if key in offsets:
                raise ValueError("Duplicate key '%s'" % key)
            else:
                offsets[key] = offset
        self._offsets = offsets

    def __repr__(self):
        return "SeqIO.index(%r, %r, alphabet=%r, key_function=%r)" \
               % (self._proxy._handle.name, self._proxy._format,
                  self._proxy._alphabet, self._key_function)

    def __str__(self):
        #NOTE(review): self.keys()[0] assumes keys() returns a list, which
        #only holds on the Python 2 branch below - confirm before relying on
        #__str__ under Python 3.
        if self:
            return "{%s : SeqRecord(...), ...}" % repr(self.keys()[0])
        else:
            return "{}"

    def __contains__(self, key):
        return key in self._offsets

    def __len__(self):
        """How many records are there?"""
        return len(self._offsets)

    if hasattr(dict, "iteritems"):
        #Python 2, use iteritems but not items etc
        def values(self):
            """Would be a list of the SeqRecord objects, but not implemented.

            In general you can be indexing very very large files, with millions
            of sequences. Loading all these into memory at once as SeqRecord
            objects would (probably) use up all the RAM. Therefore we simply
            don't support this dictionary method.
            """
            raise NotImplementedError("Due to memory concerns, when indexing a "
                                      "sequence file you cannot access all the "
                                      "records at once.")

        def items(self):
            """Would be a list of the (key, SeqRecord) tuples, but not implemented.

            In general you can be indexing very very large files, with millions
            of sequences. Loading all these into memory at once as SeqRecord
            objects would (probably) use up all the RAM. Therefore we simply
            don't support this dictionary method.
            """
            raise NotImplementedError("Due to memory concerns, when indexing a "
                                      "sequence file you cannot access all the "
                                      "records at once.")

        def keys(self):
            """Return a list of all the keys (SeqRecord identifiers)."""
            #TODO - Stick a warning in here for large lists? Or just refuse?
            return self._offsets.keys()

        def itervalues(self):
            """Iterate over the SeqRecord items."""
            for key in self.__iter__():
                yield self.__getitem__(key)

        def iteritems(self):
            """Iterate over the (key, SeqRecord) items."""
            for key in self.__iter__():
                yield key, self.__getitem__(key)

        def iterkeys(self):
            """Iterate over the keys."""
            return self.__iter__()

    else:
        #Python 3 - define items and values as iterators
        def items(self):
            """Iterate over the (key, SeqRecord) items."""
            for key in self.__iter__():
                yield key, self.__getitem__(key)

        def values(self):
            """Iterate over the SeqRecord items."""
            for key in self.__iter__():
                yield self.__getitem__(key)

        def keys(self):
            """Iterate over the keys."""
            return self.__iter__()

    def __iter__(self):
        """Iterate over the keys."""
        return iter(self._offsets)

    def __getitem__(self, key):
        """x.__getitem__(y) <==> x[y]"""
        #Pass the offset to the proxy, then double-check the record parsed
        #there really has the requested key (guards against a stale index).
        record = self._proxy.get(self._offsets[key])
        if self._key_function:
            key2 = self._key_function(record.id)
        else:
            key2 = record.id
        if key != key2:
            raise ValueError("Key did not match (%s vs %s)" % (key, key2))
        return record

    def get(self, k, d=None):
        """D.get(k[,d]) -> D[k] if k in D, else d. d defaults to None."""
        try:
            return self.__getitem__(k)
        except KeyError:
            return d

    def get_raw(self, key):
        """Similar to the get method, but returns the record as a raw string.

        If the key is not found, a KeyError exception is raised.

        Note that on Python 3 a bytes string is returned, not a typical
        unicode string.

        NOTE - This functionality is not supported for every file format.
        """
        #Pass the offset to the proxy
        return self._proxy.get_raw(self._offsets[key])

    def __setitem__(self, key, value):
        """Would allow setting or replacing records, but not implemented."""
        raise NotImplementedError("An indexed sequence file is read only.")

    def update(self, **kwargs):
        """Would allow adding more values, but not implemented."""
        raise NotImplementedError("An indexed sequence file is read only.")

    def pop(self, key, default=None):
        """Would remove specified record, but not implemented."""
        raise NotImplementedError("An indexed sequence file is read only.")

    def popitem(self):
        """Would remove and return a SeqRecord, but not implemented."""
        raise NotImplementedError("An indexed sequence file is read only.")

    def clear(self):
        """Would clear dictionary, but not implemented."""
        raise NotImplementedError("An indexed sequence file is read only.")

    def fromkeys(self, keys, value=None):
        """A dictionary method which we don't implement."""
        raise NotImplementedError("An indexed sequence file doesn't "
                                  "support this.")

    def copy(self):
        """A dictionary method which we don't implement."""
        raise NotImplementedError("An indexed sequence file doesn't "
                                  "support this.")
244 -class _SQLiteManySeqFilesDict(_IndexedSeqFileDict):
245 """Read only dictionary interface to many sequential sequence files. 246 247 Keeps the keys, file-numbers and offsets in an SQLite database. To access 248 a record by key, reads from the offset in the approapriate file using 249 Bio.SeqIO for parsing. 250 251 There are OS limits on the number of files that can be open at once, 252 so a pool are kept. If a record is required from a closed file, then 253 one of the open handles is closed first. 254 """
255 - def __init__(self, index_filename, filenames, format, alphabet, 256 key_function, max_open=10):
257 random_access_proxies = {} 258 #TODO? - Don't keep filename list in memory (just in DB)? 259 #Should save a chunk of memory if dealing with 1000s of files. 260 #Furthermore could compare a generator to the DB on reloading 261 #(no need to turn it into a list) 262 if not _sqlite: 263 #Hack for Python 2.4 (of if Python is compiled without it) 264 from Bio import MissingPythonDependencyError 265 raise MissingPythonDependencyError("Requires sqlite3, which is " 266 "included Python 2.5+") 267 if filenames is not None: 268 filenames = list(filenames) #In case it was a generator 269 if os.path.isfile(index_filename): 270 #Reuse the index. 271 con = _sqlite.connect(index_filename) 272 self._con = con 273 #Check the count... 274 try: 275 count, = con.execute("SELECT value FROM meta_data WHERE key=?;", 276 ("count",)).fetchone() 277 self._length = int(count) 278 if self._length == -1: 279 raise ValueError("Unfinished/partial database") 280 count, = con.execute("SELECT COUNT(key) FROM offset_data;").fetchone() 281 if self._length <> int(count): 282 raise ValueError("Corrupt database? %i entries not %i" \ 283 % (int(count), self._length)) 284 self._format, = con.execute("SELECT value FROM meta_data WHERE key=?;", 285 ("format",)).fetchone() 286 if format and format != self._format: 287 raise ValueError("Index file says format %s, not %s" \ 288 % (self._format, format)) 289 self._filenames = [row[0] for row in \ 290 con.execute("SELECT name FROM file_data " 291 "ORDER BY file_number;").fetchall()] 292 if filenames and len(filenames) != len(self._filenames): 293 raise ValueError("Index file says %i files, not %i" \ 294 % (len(self.filenames) != len(filenames))) 295 if filenames and filenames != self._filenames: 296 raise ValueError("Index file has different filenames") 297 except _OperationalError, err: 298 raise ValueError("Not a Biopython index database? 
%s" % err) 299 #Now we have the format (from the DB if not given to us), 300 try: 301 proxy_class = _FormatToRandomAccess[self._format] 302 except KeyError: 303 raise ValueError("Unsupported format '%s'" % self._format) 304 else: 305 self._filenames = filenames 306 self._format = format 307 if not format or not filenames: 308 raise ValueError("Filenames to index and format required") 309 try: 310 proxy_class = _FormatToRandomAccess[format] 311 except KeyError: 312 raise ValueError("Unsupported format '%s'" % format) 313 #Create the index 314 con = _sqlite.connect(index_filename) 315 self._con = con 316 #print "Creating index" 317 # Sqlite PRAGMA settings for speed 318 con.execute("PRAGMA synchronous='OFF'") 319 con.execute("PRAGMA locking_mode=EXCLUSIVE") 320 #Don't index the key column until the end (faster) 321 #con.execute("CREATE TABLE offset_data (key TEXT PRIMARY KEY, " 322 # "offset INTEGER);") 323 con.execute("CREATE TABLE meta_data (key TEXT, value TEXT);") 324 con.execute("INSERT INTO meta_data (key, value) VALUES (?,?);", 325 ("count", -1)) 326 con.execute("INSERT INTO meta_data (key, value) VALUES (?,?);", 327 ("format", format)) 328 #TODO - Record the alphabet? 329 #TODO - Record the file size and modified date? 330 con.execute("CREATE TABLE file_data (file_number INTEGER, name TEXT);") 331 con.execute("CREATE TABLE offset_data (key TEXT, file_number INTEGER, offset INTEGER, length INTEGER);") 332 count = 0 333 for i, filename in enumerate(filenames): 334 con.execute("INSERT INTO file_data (file_number, name) VALUES (?,?);", 335 (i, filename)) 336 random_access_proxy = proxy_class(filename, format, alphabet) 337 if key_function: 338 offset_iter = ((key_function(k),i,o,l) for (k,o,l) in random_access_proxy) 339 else: 340 offset_iter = ((k,i,o,l) for (k,o,l) in random_access_proxy) 341 while True: 342 batch = list(itertools.islice(offset_iter, 100)) 343 if not batch: break 344 #print "Inserting batch of %i offsets, %s ... 
%s" \ 345 # % (len(batch), batch[0][0], batch[-1][0]) 346 con.executemany("INSERT INTO offset_data (key,file_number,offset,length) VALUES (?,?,?,?);", 347 batch) 348 con.commit() 349 count += len(batch) 350 if len(random_access_proxies) < max_open: 351 random_access_proxies[i] = random_access_proxy 352 else: 353 random_access_proxy._handle.close() 354 self._length = count 355 #print "About to index %i entries" % count 356 try: 357 con.execute("CREATE UNIQUE INDEX IF NOT EXISTS " 358 "key_index ON offset_data(key);") 359 except _IntegrityError, err: 360 raise ValueError("Duplicate key? %s" % err) 361 con.execute("PRAGMA locking_mode=NORMAL") 362 con.execute("UPDATE meta_data SET value = ? WHERE key = ?;", 363 (count, "count")) 364 con.commit() 365 #print "Index created" 366 self._proxies = random_access_proxies 367 self._max_open = max_open 368 self._index_filename = index_filename 369 self._alphabet = alphabet 370 self._key_function = key_function
371
372 - def __repr__(self):
373 return "SeqIO.index_db(%r, filenames=%r, format=%r, alphabet=%r, key_function=%r)" \ 374 % (self._index_filename, self._filenames, self._format, 375 self._alphabet, self._key_function)
376
377 - def __contains__(self, key):
378 return bool(self._con.execute("SELECT key FROM offset_data WHERE key=?;", 379 (key,)).fetchone())
380
381 - def __len__(self):
382 """How many records are there?""" 383 return self._length
384 #return self._con.execute("SELECT COUNT(key) FROM offset_data;").fetchone()[0] 385
386 - def __iter__(self):
387 """Iterate over the keys.""" 388 for row in self._con.execute("SELECT key FROM offset_data;"): 389 yield str(row[0])
390 391 if hasattr(dict, "iteritems"): 392 #Python 2, use iteritems but not items etc 393 #Just need to override this...
394 - def keys(self) :
395 """Return a list of all the keys (SeqRecord identifiers).""" 396 return [str(row[0]) for row in \ 397 self._con.execute("SELECT key FROM offset_data;").fetchall()]
398
399 - def __getitem__(self, key):
400 """x.__getitem__(y) <==> x[y]""" 401 #Pass the offset to the proxy 402 row = self._con.execute("SELECT file_number, offset FROM offset_data WHERE key=?;", 403 (key,)).fetchone() 404 if not row: raise KeyError 405 file_number, offset = row 406 proxies = self._proxies 407 if file_number in proxies: 408 record = proxies[file_number].get(offset) 409 else: 410 if len(proxies) >= self._max_open: 411 #Close an old handle... 412 proxies.popitem()[1]._handle.close() 413 #Open a new handle... 414 proxy = _FormatToRandomAccess[self._format]( \ 415 self._filenames[file_number], 416 self._format, self._alphabet) 417 record = proxy.get(offset) 418 proxies[file_number] = proxy 419 if self._key_function: 420 key2 = self._key_function(record.id) 421 else: 422 key2 = record.id 423 if key != key2: 424 raise ValueError("Key did not match (%s vs %s)" % (key, key2)) 425 return record
426
427 - def get(self, k, d=None):
428 """D.get(k[,d]) -> D[k] if k in D, else d. d defaults to None.""" 429 try: 430 return self.__getitem__(k) 431 except KeyError: 432 return d
433
434 - def get_raw(self, key):
435 """Similar to the get method, but returns the record as a raw string. 436 437 If the key is not found, a KeyError exception is raised. 438 439 Note that on Python 3 a bytes string is returned, not a typical 440 unicode string. 441 442 NOTE - This functionality is not supported for every file format. 443 """ 444 #Pass the offset to the proxy 445 row = self._con.execute("SELECT file_number, offset, length FROM offset_data WHERE key=?;", 446 (key,)).fetchone() 447 if not row: raise KeyError 448 file_number, offset, length = row 449 proxies = self._proxies 450 if file_number in proxies: 451 if length: 452 #Shortcut if we have the length 453 h = proxies[file_number]._handle 454 h.seek(offset) 455 return h.read(length) 456 else: 457 return proxies[file_number].get_raw(offset) 458 else: 459 #This code is duplicated from __getitem__ to avoid a function call 460 if len(proxies) >= self._max_open: 461 #Close an old handle... 462 proxies.popitem()[1]._handle.close() 463 #Open a new handle... 464 proxy = _FormatToRandomAccess[self._format]( \ 465 self._filenames[file_number], 466 self._format, self._alphabet) 467 proxies[file_number] = proxy 468 if length: 469 #Shortcut if we have the length 470 h = proxy._handle 471 h.seek(offset) 472 return h.read(length) 473 else: 474 return proxy.get_raw(offset)
475
476 - def close(self):
477 """Close any open file handles.""" 478 proxies = self._proxies 479 while proxies: 480 proxies.popitem()[1]._handle.close()
481 482 483 ############################################################################## 484
class SeqFileRandomAccess(object):
    """Base class giving random access to records in one file (PRIVATE).

    Subclasses implement __iter__ to scan the file yielding
    (key, offset, length) tuples, and usually get_raw to fetch the
    bytes of a single record given its offset.
    """
    def __init__(self, filename, format, alphabet):
        """Open the file in binary mode and prepare a record parser."""
        self._handle = open(filename, "rb")
        self._alphabet = alphabet
        self._format = format
        #Resolve the iterator factory once here, rather than doing the
        #dictionary lookup on every single get() call.
        iterator = SeqIO._FormatToIterator[format]
        #The alphabet handling below duplicates logic in Bio.SeqIO.parse()
        if alphabet is None:
            def _parse(handle):
                """Dynamically generated parser function (PRIVATE)."""
                return iterator(handle).next()
        else:
            #TODO - Detect alphabet support ONCE at __init__
            def _parse(handle):
                """Dynamically generated parser function (PRIVATE)."""
                try:
                    return iterator(handle, alphabet=alphabet).next()
                except TypeError:
                    return SeqIO._force_alphabet(iterator(handle),
                                                 alphabet).next()
        self._parse = _parse

    def __iter__(self):
        """Returns (id,offset) tuples."""
        raise NotImplementedError("Subclass should implement this")

    def get(self, offset):
        """Returns SeqRecord."""
        #Binary formats (e.g. SFF) override this method instead.
        raw = self.get_raw(offset)
        return self._parse(StringIO(_bytes_to_string(raw)))

    def get_raw(self, offset):
        """Returns bytes string (if implemented for this file format)."""
        #Should be done by each sub-class (if possible)
        raise NotImplementedError("Not available for this file format.")
class SffRandomAccess(SeqFileRandomAccess):
    """Random access to a Standard Flowgram Format (SFF) file."""
    def __init__(self, filename, format, alphabet):
        #Parse the SFF header once up front, caching the per-file flow
        #information (flows per read, flow characters, key sequence)
        #required later to decode individual reads in get()/get_raw().
        SeqFileRandomAccess.__init__(self, filename, format, alphabet)
        header_length, index_offset, index_length, number_of_reads, \
        self._flows_per_read, self._flow_chars, self._key_sequence \
            = SeqIO.SffIO._sff_file_header(self._handle)

    def __iter__(self):
        """Load any index block in the file, or build it the slow way (PRIVATE)."""
        if self._alphabet is None:
            self._alphabet = Alphabet.generic_dna
        handle = self._handle
        handle.seek(0)
        #Already did this in __init__ but need handle in right place
        header_length, index_offset, index_length, number_of_reads, \
        self._flows_per_read, self._flow_chars, self._key_sequence \
            = SeqIO.SffIO._sff_file_header(handle)
        if index_offset and index_length:
            #There is an index provided, try this the fast way:
            count = 0
            try:
                #Yield length 0 as the record lengths are not known here.
                for name, offset in SeqIO.SffIO._sff_read_roche_index(handle):
                    yield name, offset, 0
                    count += 1
                assert count == number_of_reads, \
                       "Indexed %i records, expected %i" \
                       % (count, number_of_reads)
                return
            except ValueError, err:
                #Fall through to the slow scan below, but only if nothing
                #was yielded yet (a partial index cannot be recovered from).
                import warnings
                warnings.warn("Could not parse the SFF index: %s" % err)
                assert count==0, "Partially populated index"
                handle.seek(0)
        else:
            #TODO - Remove this debug warning?
            import warnings
            warnings.warn("No SFF index, doing it the slow way")
        #Fall back on the slow way!
        count = 0
        for name, offset in SeqIO.SffIO._sff_do_slow_index(handle):
            yield name, offset, 0
            count += 1
        assert count == number_of_reads, \
               "Indexed %i records, expected %i" % (count, number_of_reads)

    def get(self, offset):
        #Decode the read at the given offset using the flow information
        #cached from the file header in __init__.
        handle = self._handle
        handle.seek(offset)
        return SeqIO.SffIO._sff_read_seq_record(handle,
                                                self._flows_per_read,
                                                self._flow_chars,
                                                self._key_sequence,
                                                self._alphabet)

    def get_raw(self, offset):
        #Return the raw bytes of one read record.
        handle = self._handle
        handle.seek(offset)
        return SeqIO.SffIO._sff_read_raw_record(handle, self._flows_per_read)
class SffTrimedRandomAccess(SffRandomAccess):
    """As SffRandomAccess, but get() returns the trimmed read (trim=True)."""
    def get(self, offset):
        """Return the SeqRecord at this offset, with trimming applied."""
        h = self._handle
        h.seek(offset)
        return SeqIO.SffIO._sff_read_seq_record(h,
                                                self._flows_per_read,
                                                self._flow_chars,
                                                self._key_sequence,
                                                self._alphabet,
                                                trim=True)
class SequentialSeqFileRandomAccess(SeqFileRandomAccess):
    """Random access for formats where a fixed marker starts each record.

    Works for any sequential text format whose records begin with a known
    line prefix (e.g. ">" for FASTA, "LOCUS " for GenBank). Record keys are
    taken as the first word after the marker; formats needing smarter key
    extraction (GenBank, EMBL, SwissProt, ...) subclass and override
    __iter__.
    """
    def __init__(self, filename, format, alphabet):
        SeqFileRandomAccess.__init__(self, filename, format, alphabet)
        #Fixed: the dict previously listed "qual" twice (duplicate key).
        marker = {"ace" : "CO ",
                  "embl" : "ID ",
                  "fasta" : ">",
                  "genbank" : "LOCUS ",
                  "gb": "LOCUS ",
                  "imgt" : "ID ",
                  "phd" : "BEGIN_SEQUENCE",
                  "pir" : ">..;",
                  "qual": ">",
                  "swiss" : "ID ",
                  "uniprot-xml" : "<entry ",
                  }[format]
        self._marker = marker
        self._marker_re = re.compile(_as_bytes("^%s" % marker))

    def __iter__(self):
        """Returns (id, offset, length) tuples (PRIVATE)."""
        marker_offset = len(self._marker)
        marker_re = self._marker_re
        handle = self._handle
        handle.seek(0)
        #Skip any header before first record
        while True:
            start_offset = handle.tell()
            line = handle.readline()
            if marker_re.match(line) or not line:
                break
        #Should now be at the start of a record, or end of the file
        while marker_re.match(line):
            #Here we can assume the record.id is the first word after the
            #marker. This is generally fine... but not for GenBank, EMBL, Swiss
            id = line[marker_offset:].strip().split(None, 1)[0]
            while True:
                line = handle.readline()
                if marker_re.match(line) or not line:
                    #The lookahead line belongs to the next record (or is
                    #EOF), so step back over it when computing the length.
                    end_offset = handle.tell() - len(line)
                    yield _bytes_to_string(id), start_offset, end_offset - start_offset
                    start_offset = end_offset
                    break
        assert not line, repr(line)

    def get_raw(self, offset):
        """Similar to the get method, but returns the record as a raw string."""
        #For non-trivial file formats this must be over-ridden in the subclass
        handle = self._handle
        marker_re = self._marker_re
        handle.seek(offset)
        lines = [handle.readline()]
        while True:
            line = handle.readline()
            if marker_re.match(line) or not line:
                #End of file, or start of next record => end of this record
                break
            lines.append(line)
        return _as_bytes("").join(lines)
class GenBankRandomAccess(SequentialSeqFileRandomAccess):
    """Indexed dictionary like access to a GenBank file."""
    def __iter__(self):
        #Yields (key, start_offset, length) tuples, one per record.
        #The key mimics the GenBank parser: the first word of the VERSION
        #line when it looks like accession.version, otherwise the first
        #word of the ACCESSION line.
        handle = self._handle
        handle.seek(0)
        marker_re = self._marker_re
        dot_char = _as_bytes(".")
        accession_marker = _as_bytes("ACCESSION ")
        version_marker = _as_bytes("VERSION ")
        #Skip any header before first record
        while True:
            start_offset = handle.tell()
            line = handle.readline()
            if marker_re.match(line) or not line:
                break
        #Should now be at the start of a record, or end of the file
        while marker_re.match(line):
            #We cannot assume the record.id is the first word after LOCUS,
            #normally the first entry on the VERSION or ACCESSION line is used.
            key = None
            while True:
                line = handle.readline()
                if marker_re.match(line) or not line:
                    if not key:
                        raise ValueError("Did not find ACCESSION/VERSION lines")
                    #The lookahead line belongs to the NEXT record (or is
                    #EOF), so subtract its length from the current position.
                    end_offset = handle.tell() - len(line)
                    yield _bytes_to_string(key), start_offset, end_offset - start_offset
                    start_offset = end_offset
                    break
                elif line.startswith(accession_marker):
                    key = line.rstrip().split()[1]
                elif line.startswith(version_marker):
                    version_id = line.rstrip().split()[1]
                    if version_id.count(dot_char)==1 and version_id.split(dot_char)[1].isdigit():
                        #This should mimic the GenBank parser...
                        #VERSION overrides ACCESSION when well formed.
                        key = version_id
        #Loop only exits once a non-record line is seen; it must be EOF.
        assert not line, repr(line)
class EmblRandomAccess(SequentialSeqFileRandomAccess):
    """Indexed dictionary like access to an EMBL file."""
    def __iter__(self):
        #Yields (key, start_offset, length) tuples, one per record.
        #The key is built from the ID line (both the 2006+ semi-colon
        #separated layout and the older layout are handled), and may be
        #overridden by a later SV line giving the version.
        handle = self._handle
        handle.seek(0)
        marker_re = self._marker_re
        semi_char = _as_bytes(";")
        dot_char = _as_bytes(".")
        sv_marker = _as_bytes("SV ")
        #Skip any header before first record
        while True:
            start_offset = handle.tell()
            line = handle.readline()
            if marker_re.match(line) or not line:
                break
        #Should now be at the start of a record, or end of the file
        while marker_re.match(line):
            #We cannot assume the record.id is the first word after ID,
            #normally the SV line is used.
            if line[2:].count(semi_char) == 6:
                #Looks like the semi colon separated style introduced in 2006
                parts = line[3:].rstrip().split(semi_char)
                if parts[1].strip().startswith(sv_marker):
                    #The SV bit gives the version; key is accession.version
                    key = parts[0].strip() + dot_char + parts[1].strip().split()[1]
                else:
                    key = parts[0].strip()
            elif line[2:].count(semi_char) == 3:
                #Looks like the pre 2006 style, take first word only
                key = line[3:].strip().split(None,1)[0]
            else:
                raise ValueError('Did not recognise the ID line layout:\n' + line)
            while True:
                line = handle.readline()
                if marker_re.match(line) or not line:
                    #Lookahead line belongs to the next record (or is EOF).
                    end_offset = handle.tell() - len(line)
                    yield _bytes_to_string(key), start_offset, end_offset - start_offset
                    start_offset = end_offset
                    break
                elif line.startswith(sv_marker):
                    #An explicit SV line replaces the key from the ID line.
                    key = line.rstrip().split()[1]
        #Loop only exits once a non-record line is seen; it must be EOF.
        assert not line, repr(line)
class SwissRandomAccess(SequentialSeqFileRandomAccess):
    """Random access to a SwissProt file."""
    def __iter__(self):
        #Yields (key, start_offset, length) tuples, one per record.
        #The key is the first accession on the AC line following the ID line.
        handle = self._handle
        handle.seek(0)
        marker_re = self._marker_re
        semi_char = _as_bytes(";")
        #Skip any header before first record
        while True:
            start_offset = handle.tell()
            line = handle.readline()
            if marker_re.match(line) or not line:
                break
        #Should now be at the start of a record, or end of the file
        while marker_re.match(line):
            #We cannot assume the record.id is the first word after ID,
            #normally the following AC line is used.
            line = handle.readline()
            #assumes the AC line immediately follows ID - TODO confirm this
            #holds for all SwissProt flat files.
            assert line.startswith(_as_bytes("AC "))
            key = line[3:].strip().split(semi_char)[0].strip()
            while True:
                line = handle.readline()
                if marker_re.match(line) or not line:
                    #Lookahead line belongs to the next record (or is EOF).
                    end_offset = handle.tell() - len(line)
                    yield _bytes_to_string(key), start_offset, end_offset - start_offset
                    start_offset = end_offset
                    break
        #Loop only exits once a non-record line is seen; it must be EOF.
        assert not line, repr(line)
class UniprotRandomAccess(SequentialSeqFileRandomAccess):
    """Random access to a UniProt XML file."""
    def __iter__(self):
        #Yields (key, start_offset, length) tuples, one per <entry> element.
        #NOTE(review): this is a line based scan, not real XML parsing - it
        #assumes <accession> and </entry> appear on their own lines.
        handle = self._handle
        handle.seek(0)
        marker_re = self._marker_re
        start_acc_marker = _as_bytes("<accession>")
        end_acc_marker = _as_bytes("</accession>")
        end_entry_marker = _as_bytes("</entry>")
        #Skip any header before first record
        while True:
            start_offset = handle.tell()
            line = handle.readline()
            if marker_re.match(line) or not line:
                break
        #Should now be at the start of a record, or end of the file
        while marker_re.match(line):
            #We expect the next line to be <accession>xxx</accession>
            #(possibly with leading spaces)
            #but allow it to be later on within the <entry>
            key = None
            done = False  #NOTE(review): never read - appears to be dead
            while True:
                line = handle.readline()
                if key is None and start_acc_marker in line:
                    assert end_acc_marker in line, line
                    #11 == len("<accession>"), slicing past the open tag.
                    key = line[line.find(start_acc_marker)+11:].split(_as_bytes("<"))[0]
                elif end_entry_marker in line:
                    #8 == len("</entry>"), so the record ends just after
                    #the close tag within this line.
                    end_offset = handle.tell() - len(line) \
                                 + line.find(end_entry_marker) + 8
                    break
                elif marker_re.match(line) or not line:
                    #Start of next record or end of file
                    raise ValueError("Didn't find end of record")
            if not key:
                raise ValueError("Did not find <accession> line in bytes %i to %i" \
                                 % (start_offset, end_offset))
            yield _bytes_to_string(key), start_offset, end_offset - start_offset
            #Find start of next record
            while not marker_re.match(line) and line:
                start_offset = handle.tell()
                line = handle.readline()
        #Loop only exits once a non-record line is seen; it must be EOF.
        assert not line, repr(line)

    def get_raw(self, offset):
        """Similar to the get method, but returns the record as a raw string."""
        handle = self._handle
        marker_re = self._marker_re
        end_entry_marker = _as_bytes("</entry>")
        handle.seek(offset)
        data = handle.readline()
        while True:
            line = handle.readline()
            i = line.find(end_entry_marker)
            if i != -1:
                #Include up to and including the </entry> tag (8 chars).
                data += line[:i+8]
                break
            if marker_re.match(line) or not line:
                #End of file, or start of next record
                raise ValueError("Didn't find end of record")
            data += line
        return data

    def get(self, offset):
        #TODO - Can we handle this directly in the parser?
        #This is a hack - use get_raw for <entry>...</entry> and wrap it with
        #the apparently required XML header and footer.
        data = """<?xml version='1.0' encoding='UTF-8'?>
        <uniprot xmlns="http://uniprot.org/uniprot"
        xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
        xsi:schemaLocation="http://uniprot.org/uniprot
        http://www.uniprot.org/support/docs/uniprot.xsd">
        %s
        </uniprot>
        """ % _bytes_to_string(self.get_raw(offset))
        #TODO - For consistency, this function should not accept a string:
        return SeqIO.UniprotIO.UniprotIterator(data).next()
class IntelliGeneticsRandomAccess(SeqFileRandomAccess):
    """Random access to a IntelliGenetics file."""
    def __init__(self, filename, format, alphabet):
        SeqFileRandomAccess.__init__(self, filename, format, alphabet)
        #Records begin with one or more ";" comment lines.
        self._marker_re = re.compile(_as_bytes("^;"))

    def __iter__(self):
        #Yields (key, offset, length) tuples; length is always 0 as record
        #lengths are not computed here (get_raw re-scans instead).
        handle = self._handle
        handle.seek(0)
        marker_re = self._marker_re
        semi_char = _as_bytes(";")
        while True:
            offset = handle.tell()
            line = handle.readline()
            if marker_re.match(line):
                #Now look for the first line which doesn't start ";" -
                #its first word is the record key. The yielded offset is
                #that of the first ";" comment line.
                while True:
                    line = handle.readline()
                    if line[0:1] != semi_char and line.strip():
                        key = line.split()[0]
                        yield _bytes_to_string(key), offset, 0
                        break
                    if not line:
                        raise ValueError("Premature end of file?")
            elif not line:
                #End of file
                break

    def get_raw(self, offset):
        #Return the raw record: the leading ";" comment lines followed by
        #all data lines up to (but excluding) the next ";" line or EOF.
        handle = self._handle
        handle.seek(offset)
        marker_re = self._marker_re
        lines = []
        line = handle.readline()
        semi_char = _as_bytes(";")
        while line.startswith(semi_char):
            lines.append(line)
            line = handle.readline()
        while line and not line.startswith(semi_char):
            lines.append(line)
            line = handle.readline()
        return _as_bytes("").join(lines)
class TabRandomAccess(SeqFileRandomAccess):
    """Random access to a simple tabbed file."""
    def __iter__(self):
        """Yield (key, line offset, line length in bytes) per data line.

        The key is everything before the first tab on the line.
        """
        handle = self._handle
        handle.seek(0)
        start_offset = handle.tell()
        tab_char = _as_bytes("\t")
        while True:
            line = handle.readline()
            if not line : break #End of file
            if not line.strip():
                #Ignore blank lines.  Bug fix: the old try/except
                #ValueError guard here was dead code (bytes.split with a
                #non-empty separator never raises ValueError), so blank
                #lines were being indexed under a bogus newline key.
                start_offset = handle.tell()
                continue
            key = line.split(tab_char)[0]
            end_offset = handle.tell()
            yield _bytes_to_string(key), start_offset, end_offset - start_offset
            start_offset = end_offset

    def get_raw(self, offset):
        """Like the get method, but returns the record as a raw string."""
        handle = self._handle
        handle.seek(offset)
        return handle.readline()
943 944 945 ########################## 946 # Now the FASTQ indexers # 947 ########################## 948
class FastqRandomAccess(SeqFileRandomAccess):
    """Random access to a FASTQ file (any supported variant).

    With FASTQ the records all start with a "@" line, but so can quality lines.
    Note this will cope with line-wrapped FASTQ files.
    """
    def __iter__(self):
        """Yield (identifier, record offset, record length in bytes) tuples.

        Quality lines may themselves start "@", so the end of a record is
        only recognised once as many quality characters as sequence
        characters have been seen.
        """
        handle = self._handle
        handle.seek(0)
        id = None
        start_offset = handle.tell()
        line = handle.readline()
        if not line:
            #Empty file!
            return
        at_char = _as_bytes("@")
        plus_char = _as_bytes("+")
        if line[0:1] != at_char:
            raise ValueError("Problem with FASTQ @ line:\n%s" % repr(line))
        while line:
            #This record seems OK (so far)
            id = line[1:].rstrip().split(None, 1)[0]
            #Find the seq line(s)
            seq_len = 0
            while line:
                line = handle.readline()
                if line.startswith(plus_char) : break
                seq_len += len(line.strip())
            if not line:
                raise ValueError("Premature end of file in seq section")
            #Now at the "+" line; find the qual line(s)
            qual_len = 0
            while line:
                if seq_len == qual_len:
                    #Should be end of record...
                    line = handle.readline()
                    if line and line[0:1] != at_char:
                        #Bug fix: the raise keyword was missing here, so
                        #this ValueError was built but silently discarded
                        raise ValueError("Problem with line %s" % repr(line))
                    break
                else:
                    line = handle.readline()
                    qual_len += len(line.strip())
            if seq_len != qual_len:
                raise ValueError("Problem with quality section")
            end_offset = handle.tell() - len(line)
            yield _bytes_to_string(id), start_offset, end_offset - start_offset
            start_offset = end_offset

    def get_raw(self, offset):
        """Similar to the get method, but returns the record as a raw string."""
        #TODO - Refactor this and the __iter__ method to reduce code duplication?
        handle = self._handle
        handle.seek(offset)
        line = handle.readline()
        data = line
        at_char = _as_bytes("@")
        plus_char = _as_bytes("+")
        if line[0:1] != at_char:
            raise ValueError("Problem with FASTQ @ line:\n%s" % repr(line))
        identifier = line[1:].rstrip().split(None, 1)[0]
        #Find the seq line(s)
        seq_len = 0
        while line:
            line = handle.readline()
            data += line
            if line.startswith(plus_char) : break
            seq_len += len(line.strip())
        if not line:
            raise ValueError("Premature end of file in seq section")
        assert line[0:1] == plus_char
        #Find the qual line(s)
        qual_len = 0
        while line:
            if seq_len == qual_len:
                #Should be end of record...
                line = handle.readline()
                if line and line[0:1] != at_char:
                    #Bug fix: the raise keyword was missing here too
                    raise ValueError("Problem with line %s" % repr(line))
                break
            else:
                line = handle.readline()
                data += line
                qual_len += len(line.strip())
        if seq_len != qual_len:
            raise ValueError("Problem with quality section")
        return data


###############################################################################

#Lookup table used by Bio.SeqIO.index(...) mapping each supported file
#format name (lower case) to the random access proxy class handling it.
#Several formats share SequentialSeqFileRandomAccess, which only needs
#the per-format record marker defined elsewhere in this module.
_FormatToRandomAccess = {"ace" : SequentialSeqFileRandomAccess,
                         "embl" : EmblRandomAccess,
                         "fasta" : SequentialSeqFileRandomAccess,
                         "fastq" : FastqRandomAccess, #Class handles all three variants
                         "fastq-sanger" : FastqRandomAccess, #alias of the above
                         "fastq-solexa" : FastqRandomAccess,
                         "fastq-illumina" : FastqRandomAccess,
                         "genbank" : GenBankRandomAccess,
                         "gb" : GenBankRandomAccess, #alias of the above
                         "ig" : IntelliGeneticsRandomAccess,
                         "imgt" : EmblRandomAccess,
                         "phd" : SequentialSeqFileRandomAccess,
                         "pir" : SequentialSeqFileRandomAccess,
                         "sff" : SffRandomAccess,
                         "sff-trim" : SffTrimedRandomAccess,
                         "swiss" : SwissRandomAccess,
                         "tab" : TabRandomAccess,
                         "qual" : SequentialSeqFileRandomAccess,
                         "uniprot-xml" : UniprotRandomAccess,
                         }
