Package Bio :: Package GenBank :: Module Scanner
[hide private]
[frames] | no frames]

Source Code for Module Bio.GenBank.Scanner

   1  # Copyright 2007-2010 by Peter Cock.  All rights reserved. 
   2  # Revisions copyright 2010 by Uri Laserson.  All rights reserved. 
   3  # This code is part of the Biopython distribution and governed by its 
   4  # license.  Please see the LICENSE file that should have been included 
   5  # as part of this package. 
   6  # 
   7  # This code is NOT intended for direct use.  It provides a basic scanner 
   8  # (for use with an event consumer such as Bio.GenBank._FeatureConsumer) 
   9  # to parse a GenBank or EMBL file (with their shared INSDC feature table). 
  10  # 
  11  # It is used by Bio.GenBank to parse GenBank files 
  12  # It is also used by Bio.SeqIO to parse GenBank and EMBL files 
  13  # 
  14  # Feature Table Documentation: 
  15  # http://www.insdc.org/files/feature_table.html 
  16  # http://www.ncbi.nlm.nih.gov/projects/collab/FT/index.html 
  17  # ftp://ftp.ncbi.nih.gov/genbank/docs/ 
  18  # 
  19  # 17-MAR-2009: added wgs, wgs_scafld for GenBank whole genome shotgun master records. 
  20  # These are GenBank files that summarize the content of a project, and provide lists of 
  21  # scaffold and contig files in the project. These will be in annotations['wgs'] and 
  22  # annotations['wgs_scafld']. These GenBank files do not have sequences. See 
  23  # http://groups.google.com/group/bionet.molbio.genbank/browse_thread/thread/51fb88bf39e7dc36 
  24  # http://is.gd/nNgk 
  25  # for more details of this format, and an example. 
  26  # Added by Ying Huang & Iddo Friedberg 
  27   
  28  import warnings 
  29  import os 
  30  import re 
  31  from Bio.Seq import Seq 
  32  from Bio.SeqRecord import SeqRecord 
  33  from Bio.Alphabet import generic_alphabet, generic_protein 
  34   
class InsdcScanner(object):
    """Basic functions for breaking up a GenBank/EMBL file into sub sections.

    The International Nucleotide Sequence Database Collaboration (INSDC) is
    a collaboration between the DDBJ, EMBL, and GenBank.  These organisations
    all use the same "Feature Table" layout in their plain text flat file
    formats.

    However, the header and sequence sections of an EMBL file are very
    different in layout to those produced by GenBank/DDBJ, so the values
    below are placeholders which the format specific sub classes override.
    """

    #These constants get redefined with sensible values in the sub classes.
    #__init__ asserts that len(RECORD_START) == HEADER_WIDTH and that
    #len(FEATURE_QUALIFIER_SPACER) == FEATURE_QUALIFIER_INDENT.
    RECORD_START = "XXX" # "LOCUS" or "ID", space padded to HEADER_WIDTH
    HEADER_WIDTH = 3   # 12 or 5
    FEATURE_START_MARKERS = ["XXX***FEATURES***XXX"]
    FEATURE_END_MARKERS = ["XXX***END FEATURES***XXX"]
    FEATURE_QUALIFIER_INDENT = 0
    FEATURE_QUALIFIER_SPACER = ""
    SEQUENCE_HEADERS=["XXX"] #with right hand side spaces removed
def __init__(self, debug=0):
    """Check the layout constants are self-consistent and reset the state.

    debug - verbosity level, stored as self.debug.
    """
    #The record start text must fill the header column exactly:
    assert self.HEADER_WIDTH == len(self.RECORD_START)
    #Sequence headers are compared against right-stripped text:
    assert not [m for m in self.SEQUENCE_HEADERS if m != m.rstrip()]
    #The spacer must be exactly as wide as the qualifier indent:
    assert self.FEATURE_QUALIFIER_INDENT == len(self.FEATURE_QUALIFIER_SPACER)
    self.line = None
    self.debug = debug
61
def set_handle(self, handle):
    """Attach the handle to read from and clear any buffered line."""
    self.line = ""
    self.handle = handle
65
def find_start(self):
    """Read in lines until find the ID/LOCUS line, which is returned.

    Any preamble (such as the header used by the NCBI on *.seq.gz archives)
    will be ignored.

    A line already buffered in self.line is examined before reading from
    the handle.  Returns the record start line (also stored in self.line,
    newline included), or None once the handle is exhausted.
    """
    while True:
        if self.line:
            line = self.line
            self.line = ""
        else:
            line = self.handle.readline()
        if not line:
            if self.debug:
                print("End of file")
            return None
        if line[:self.HEADER_WIDTH] == self.RECORD_START:
            if self.debug > 1:
                print("Found the start of a record:\n" + line)
            break
        #Anything else is skipped; the text is only needed for debug output.
        line = line.rstrip()
        if self.debug > 1:
            if line == "//":
                print("Skipping // marking end of last record")
            elif line == "":
                print("Skipping blank line before record")
            else:
                #Ignore any header before the first ID/LOCUS line.
                print("Skipping header line before record:\n" + line)
    self.line = line
    return line
94
def parse_header(self):
    """Return list of strings making up the header

    New line characters are removed.

    Assumes you have just read in the ID/LOCUS line.

    Reading stops at the start of the feature table or at a sequence
    header line; that line (right-stripped) is left in self.line and is
    not included in the returned list.  Raises ValueError if the handle
    runs out, or a '//' end of record marker is met, before then.
    """
    assert self.line[:self.HEADER_WIDTH]==self.RECORD_START, \
           "Not at start of record"

    header_lines = []
    while True:
        line = self.handle.readline()
        if not line:
            #The file ended while we were still inside the header.
            raise ValueError("Premature end of file during header")
        line = line.rstrip()
        if line in self.FEATURE_START_MARKERS:
            if self.debug:
                print("Found header table")
            break
        if line[:self.HEADER_WIDTH].rstrip() in self.SEQUENCE_HEADERS:
            if self.debug:
                print("Found start of sequence")
            break
        if line == "//":
            raise ValueError("Premature end of sequence data marker '//' found")
        header_lines.append(line)
    self.line = line
    return header_lines
125
def parse_features(self, skip=False):
    """Return list of tuples for the features (if present)

    Each feature is returned as a tuple (key, location, qualifiers)
    where key and location are strings (e.g. "CDS" and
    "complement(join(490883..490885,1..879))") while qualifiers
    is a list of two string tuples (feature qualifier keys and values).

    Assumes you have already read to the start of the features table.

    If skip is true the feature lines are read past but not parsed, and
    an empty list is returned.  On return self.line holds the first line
    after the feature table.  Raises ValueError if the handle runs out or
    a '//' marker is met inside the table.
    """
    if self.line.rstrip() not in self.FEATURE_START_MARKERS:
        if self.debug:
            print("Didn't find any feature table")
        return []

    while self.line.rstrip() in self.FEATURE_START_MARKERS:
        self.line = self.handle.readline()

    indent = self.FEATURE_QUALIFIER_INDENT
    spacer = self.FEATURE_QUALIFIER_SPACER
    features = []
    line = self.line
    while True:
        if not line:
            raise ValueError("Premature end of line during features table")
        if line[:self.HEADER_WIDTH].rstrip() in self.SEQUENCE_HEADERS:
            if self.debug:
                print("Found start of sequence")
            break
        line = line.rstrip()
        if line == "//":
            raise ValueError("Premature end of features table, marker '//' found")
        if line in self.FEATURE_END_MARKERS:
            if self.debug:
                print("Found end of features")
            line = self.handle.readline()
            break
        if line[2:indent].strip() == "":
            #This is an empty feature line between qualifiers. Empty
            #feature lines within qualifiers are handled below (ignored).
            line = self.handle.readline()
            continue

        if skip:
            line = self.handle.readline()
            while line[:indent] == spacer:
                line = self.handle.readline()
        else:
            #Build up a list of the lines making up this feature:
            if line[indent] != " " and " " in line[indent:]:
                #The feature table design enforces a length limit on the feature keys.
                #Some third party files (e.g. IGMT's EMBL like files) solve this by
                #over indenting the location and qualifiers.
                feature_key, line = line[2:].strip().split(None,1)
                feature_lines = [line]
                warnings.warn("Overindented %s feature?" % feature_key)
            else:
                feature_key = line[2:indent].strip()
                feature_lines = [line[indent:]]
            line = self.handle.readline()
            #Cope with blank lines in the midst of a feature, but stop at
            #end of file: readline() then returns "" forever, and treating
            #that as a blank line would loop without end.
            while line[:indent] == spacer \
            or (line != "" and line.rstrip() == ""):
                #Use strip to remove any harmless trailing white space AND leading
                #white space (e.g. out of spec files with too much indentation)
                feature_lines.append(line[indent:].strip())
                line = self.handle.readline()
            features.append(self.parse_feature(feature_key, feature_lines))
    self.line = line
    return features
191
def parse_feature(self, feature_key, lines):
    """Expects a feature as a list of strings, returns a tuple (key, location, qualifiers)

    For example given this GenBank feature:

         CDS             complement(join(490883..490885,1..879))
                         /locus_tag="NEQ001"
                         /note="conserved hypothetical [Methanococcus jannaschii];
                         COG1583:Uncharacterized ACR; IPR001472:Bipartite nuclear
                         localization signal; IPR002743: Protein of unknown
                         function DUF57"
                         /codon_start=1
                         /transl_table=11
                         /product="hypothetical protein"
                         /protein_id="NP_963295.1"
                         /db_xref="GI:41614797"
                         /db_xref="GeneID:2732620"
                         /translation="MRLLLELKALNSIDKKQLSNYLIQGFIYNILKNTEYSWLHNWKK
                         ...
                         LNSMGFGFVNTKKNSAR"

    Then should give input key="CDS" and the rest of the data as a list of strings
    lines=["complement(join(490883..490885,1..879))", ..., "LNSMGFGFVNTKKNSAR"]
    where the leading spaces and trailing newlines have been removed.

    Returns tuple containing: (key as string, location string, qualifiers as list)
    as follows for this example:

    key = "CDS", string
    location = "complement(join(490883..490885,1..879))", string
    qualifiers = list of string tuples:

    [('locus_tag', '"NEQ001"'),
     ('note', '"conserved hypothetical [Methanococcus jannaschii];\\nCOG1583:..."'),
     ('codon_start', '1'),
     ('transl_table', '11'),
     ('product', '"hypothetical protein"'),
     ('protein_id', '"NP_963295.1"'),
     ('db_xref', '"GI:41614797"'),
     ('db_xref', '"GeneID:2732620"'),
     ('translation', '"MRLLLELKALNSIDKKQLSNYLIQGFIYNILKNTEYSWLHNWKK\\nEKYFNFT..."')]

    In the above example, the "note" and "translation" were edited for compactness,
    and they would contain multiple new line characters (displayed above as \\n)

    If a qualifier is quoted (in this case, everything except codon_start and
    transl_table) then the quotes are NOT removed.

    A qualifier with no "=" (e.g. /pseudo) gets the value None, and one with
    nothing after the "=" gets the empty string.  Blank entries in lines are
    skipped.  The location is stripped; qualifier text is kept as given.

    Raises ValueError if the lines run out early (no location, or a quoted
    value which is never closed) or if a continuation line follows a
    qualifier which has no value.
    """
    #Skip any blank lines
    iterator = iter([entry for entry in lines if entry])
    try:
        feature_location = next(iterator).strip()
        while feature_location[-1:] == ",":
            #Multiline location, still more to come!
            feature_location += next(iterator).strip()

        qualifiers = []

        for line in iterator:
            if line[0] != "/":
                #Unquoted continuation of the previous qualifier
                assert len(qualifiers) > 0
                assert key == qualifiers[-1][0]
                previous = qualifiers[-1][1]
                if previous is None:
                    #e.g. /pseudo followed by stray text; there is no
                    #value to extend, so report the feature as broken.
                    raise ValueError("Problem with '%s' feature:\n%s" \
                                     % (feature_key, "\n".join(lines)))
                qualifiers[-1] = (key, previous + "\n" + line)
                continue

            #New qualifier
            i = line.find("=")
            if i == -1:
                #Qualifier with no value, e.g. /pseudo
                key = line[1:]
                qualifiers.append((key, None))
                continue
            key = line[1:i]
            value = line[i+1:]
            if not value:
                #ApE can output /note=
                qualifiers.append((key, ""))
            elif value[0] == '"':
                #Quoted...
                if value != '"':
                    #Gather lines until one ends with the closing quote
                    #(which may already be this first line).
                    parts = [value]
                    while parts[-1][-1] != '"':
                        parts.append(next(iterator))
                    value = "\n".join(parts)
                elif self.debug:
                    #The value is a lone quote character
                    print("Quoted line %s:%s" % (key, value))
                #DO NOT remove the quotes...
                qualifiers.append((key, value))
            else:
                #Unquoted
                qualifiers.append((key, value))
        return (feature_key, feature_location, qualifiers)
    except StopIteration:
        #Bummer, ran out of lines part way through the feature
        raise ValueError("Problem with '%s' feature:\n%s" \
                          % (feature_key, "\n".join(lines)))
298 319
def _feed_first_line(self, consumer, line):
    """Handle the LOCUS/ID line, passing data to the consumer

    This should be implemented by the EMBL / GenBank specific subclass;
    this base version does nothing.

    Called by feed(), and so used by the parse() and parse_records() methods.
    """
    pass
328
def _feed_header_lines(self, consumer, lines):
    """Handle the header lines (list of strings), passing data to the consumer

    This should be implemented by the EMBL / GenBank specific subclass;
    this base version does nothing.

    Called by feed(), and so used by the parse() and parse_records() methods.
    """
    pass
337 338
def _feed_feature_table(self, consumer, feature_tuples):
    """Pass the parsed feature table (list of tuples) on to the consumer

    Each tuple is (key, location string, qualifiers), where qualifiers is
    a list of (name, value) pairs.  A value of None means the qualifier
    has no description, so only its name is reported.

    Called by feed(), and so used by the parse() and parse_records() methods.
    """
    consumer.start_feature_table()
    for entry in feature_tuples:
        key, location, qualifier_pairs = entry
        consumer.feature_key(key)
        consumer.location(location)
        for name, text in qualifier_pairs:
            consumer.feature_qualifier_name([name])
            if text is None:
                continue
            #Multi-line values are flattened onto a single line
            consumer.feature_qualifier_description(" ".join(text.split("\n")))
352
def _feed_misc_lines(self, consumer, lines):
    """Handle any lines between features and sequence (list of strings), passing data to the consumer

    This should be implemented by the EMBL / GenBank specific subclass;
    this base version does nothing.

    Called by feed(), and so used by the parse() and parse_records() methods.
    """
    pass
361
def feed(self, handle, consumer, do_features=True):
    """Feed a set of data into the consumer.

    This method is intended for use with the "old" code in Bio.GenBank

    Arguments:
    handle - A handle with the information to parse.
    consumer - The consumer that should be informed of events.
    do_features - Boolean, should the features be parsed?
                  Skipping the features can be much faster.

    Return values:
    true  - Passed a record
    false - Did not find a record (consumer.data is then set to None)
    """
    #Should work with both EMBL and GenBank files provided the
    #equivalent Bio.GenBank._FeatureConsumer methods are called...
    self.set_handle(handle)
    found = self.find_start()
    if not found:
        #Could not find (another) record
        consumer.data = None
        return False

    #The generic methods break the record into simple pieces, while the
    #first line, header lines and misc lines after the features are
    #interpreted by the GenBank / EMBL specific derived classes.

    #First line and header:
    self._feed_first_line(consumer, self.line)
    header = self.parse_header()
    self._feed_header_lines(consumer, header)

    #Features (common to both EMBL and GenBank); when they are not
    #wanted they are still read past, but the (empty) result is ignored:
    skip_features = not do_features
    feature_tuples = self.parse_features(skip=skip_features)
    if not skip_features:
        self._feed_feature_table(consumer, feature_tuples)

    #Footer and sequence
    footer = self.parse_footer()
    misc_lines, sequence_string = footer
    self._feed_misc_lines(consumer, misc_lines)

    consumer.sequence(sequence_string)
    #Calls to consumer.base_number() do nothing anyway
    consumer.record_end("//")

    assert self.line == "//"

    #And we are done
    return True
411
def parse(self, handle, do_features=True):
    """Returns a SeqRecord (with SeqFeatures if do_features=True)

    Returns None if feed() does not find a record on the handle.

    See also the method parse_records() for use on multi-record files.
    """
    from Bio.GenBank import _FeatureConsumer
    from Bio.GenBank.utils import FeatureValueCleaner

    cleaner = FeatureValueCleaner()
    consumer = _FeatureConsumer(use_fuzziness = 1,
                                feature_cleaner = cleaner)

    found_record = self.feed(handle, consumer, do_features)
    if not found_record:
        return None
    return consumer.data
427 428
def parse_records(self, handle, do_features=True):
    """Returns a SeqRecord object iterator

    Each record (from the ID/LOCUS line to the // line) becomes a SeqRecord

    The SeqRecord objects include SeqFeatures if do_features=True

    This method is intended for use in Bio.SeqIO
    """
    #This is a generator function, which stops once parse() returns None
    record = self.parse(handle, do_features)
    while record is not None:
        #Records still holding the placeholder values are rejected:
        assert record.id is not None
        assert record.name != "<unknown name>"
        assert record.description != "<unknown description>"
        yield record
        record = self.parse(handle, do_features)
446
def parse_cds_features(self, handle,
                       alphabet=generic_protein,
                       tags2id=('protein_id','locus_tag','product')):
    """Returns SeqRecord object iterator

    Each CDS feature becomes a SeqRecord.

    alphabet - Used for any sequence found in a translation field.
    tags2id  - Tuple of three strings, the feature keys to use
               for the record id, name and description,

    Qualifiers other than translation and db_xref are stored in the
    record's annotations; when a qualifier is repeated the values are
    joined with a space.  Enclosing double quotes are removed.

    This method is intended for use in Bio.SeqIO
    """
    self.set_handle(handle)
    while self.find_start():
        #Got an EMBL or GenBank record...
        self.parse_header() # ignore header lines!
        feature_tuples = self.parse_features()
        #Read past the footer and sequence, up to the end of this record
        #(or the end of the file):
        while True:
            line = self.handle.readline()
            if not line:
                break
            if line[:2] == "//":
                break
        self.line = line.rstrip()

        #Now go though those features...
        for key, location_string, qualifiers in feature_tuples:
            if key != "CDS":
                continue
            #Create SeqRecord
            #================
            #SeqRecord objects cannot be created with annotations, they
            #must be added afterwards.  So create an empty record and
            #then populate it:
            record = SeqRecord(seq=None)
            annotations = record.annotations

            #Should we add a location object to the annotations?
            #I *think* that only makes sense for SeqFeatures with their
            #sub features...
            annotations['raw_location'] = location_string.replace(' ','')

            for (qualifier_name, qualifier_data) in qualifiers:
                #startswith/endswith are safe on the empty string, which
                #is what a bare "/note=" qualifier gives.
                if qualifier_data is not None \
                and qualifier_data.startswith('"') \
                and qualifier_data.endswith('"'):
                    #Remove quotes
                    qualifier_data = qualifier_data[1:-1]
                #Append the data to the annotation qualifier...
                if qualifier_name == "translation":
                    assert record.seq is None, "Multiple translations!"
                    record.seq = Seq(qualifier_data.replace("\n",""), alphabet)
                elif qualifier_name == "db_xref":
                    #its a list, possibly empty.  Its safe to extend
                    record.dbxrefs.append(qualifier_data)
                else:
                    if qualifier_data is not None:
                        #Join wrapped lines, then squeeze double spaces
                        qualifier_data = qualifier_data.replace("\n"," ").replace("  "," ")
                    previous = annotations.get(qualifier_name)
                    if previous is None:
                        #First value seen (or only valueless ones so far)
                        annotations[qualifier_name] = qualifier_data
                    elif qualifier_data is not None:
                        annotations[qualifier_name] = previous + " " + qualifier_data
                    #else: a repeated qualifier with no value adds nothing

            #Fill in the ID, Name, Description
            #=================================
            try:
                record.id = annotations[tags2id[0]]
            except KeyError:
                pass
            try:
                record.name = annotations[tags2id[1]]
            except KeyError:
                pass
            try:
                record.description = annotations[tags2id[2]]
            except KeyError:
                pass

            yield record
525 526
class EmblScanner(InsdcScanner):
    """For extracting chunks of information in EMBL files"""

    #RECORD_START must be exactly HEADER_WIDTH characters long (this is
    #asserted when a scanner is created), hence the space padding.
    HEADER_WIDTH = 5
    RECORD_START = "ID".ljust(HEADER_WIDTH)
    FEATURE_QUALIFIER_INDENT = 21
    FEATURE_QUALIFIER_SPACER = "FT" + " " * (FEATURE_QUALIFIER_INDENT-2)
    #The feature table heading is column aligned, with the word "Location"
    #starting in the same column as the locations and qualifiers:
    FEATURE_START_MARKERS = ["FH   Key".ljust(FEATURE_QUALIFIER_INDENT) \
                             + "Location/Qualifiers",
                             "FH"]
    FEATURE_END_MARKERS = ["XX"] #XX can also mark the end of many things!
    SEQUENCE_HEADERS=["SQ", "CO"] #Remove trailing spaces
def _feed_first_line(self, consumer, line):
    """Pass the EMBL ID line to the handler for its layout.

    The layout is recognised from the number of semi colons after the
    line type: six for the style introduced in 2006, three for the older
    style.  Anything else raises ValueError.
    """
    assert line[:self.HEADER_WIDTH].rstrip() == "ID"
    semicolons = line[self.HEADER_WIDTH:].count(";")
    if semicolons not in (3, 6):
        raise ValueError('Did not recognise the ID line layout:\n' + line)
    if semicolons == 6:
        #Looks like the semi colon separated style introduced in 2006
        self._feed_first_line_new(consumer, line)
    else:
        #Looks like the pre 2006 style
        self._feed_first_line_old(consumer, line)
583
def _feed_first_line_old(self, consumer, line):
    """Handle an ID line in the style used before 2006.

    For example:
    ID   SC10H5 standard; DNA; PRO; 4870 BP.
    ID   BSUB9999   standard; circular DNA; PRO; 4214630 BP.
    """
    assert line[:self.HEADER_WIDTH].rstrip() == "ID"
    text = line[self.HEADER_WIDTH:]
    #The first token is space separated, the others by semi colons:
    tokens = [text.split(None,1)[0]]
    tokens.extend(text.split(None,1)[1].split(";"))
    tokens = [token.strip() for token in tokens]
    #The tokens represent:
    # 0. Primary accession number
    #    (space sep)
    # 1. ??? (e.g. standard)
    #    (semi-colon)
    # 2. Topology and/or Molecule type (e.g. 'circular DNA' or 'DNA')
    # 3. Taxonomic division (e.g. 'PRO')
    # 4. Sequence length (e.g. '4639675 BP.')
    consumer.locus(tokens[0]) #Should we also call the accession consumer?
    consumer.residue_type(tokens[2])
    consumer.data_file_division(tokens[3])
    self._feed_seq_length(consumer, tokens[4])
606
def _feed_first_line_new(self, consumer, line):
    """Handle an ID line in the style introduced in 2006.

    For example:
    ID   X56734; SV 1; linear; mRNA; STD; PLN; 1859 BP.
    ID   CD789012; SV 4; linear; genomic DNA; HTG; MAM; 500 BP.
    """
    assert line[:self.HEADER_WIDTH].rstrip() == "ID"
    tokens = line[self.HEADER_WIDTH:].strip().split(";")
    tokens = [token.strip() for token in tokens]
    assert len(tokens) == 7
    #The tokens represent:
    # 0. Primary accession number
    # 1. Sequence version number
    # 2. Topology: 'circular' or 'linear'
    # 3. Molecule type (e.g. 'genomic DNA')
    # 4. Data class (e.g. 'STD')
    # 5. Taxonomic division (e.g. 'PRO')
    # 6. Sequence length (e.g. '4639675 BP.')
    accession, version, topology, molecule, data_class, division, length = tokens

    consumer.locus(accession)

    #Call the accession consumer now, to make sure we record
    #something as the record.id, in case there is no AC line
    consumer.accession(accession)

    #TODO - How to deal with the version field?  At the moment the consumer
    #will try and use this for the ID which isn't ideal for EMBL files.
    version_words = version.split()
    if len(version_words) == 2:
        label, number = version_words
        if label == "SV" and number.isdigit():
            consumer.version_suffix(number)

    #Based on how the old GenBank parser worked, merge these two:
    consumer.residue_type(topology + " " + molecule) #TODO - Store as two fields?

    #TODO - What should we do with the data class?

    consumer.data_file_division(division)

    self._feed_seq_length(consumer, length)
647
def _feed_seq_length(self, consumer, text):
    """Pass the number from a length field such as '4870 BP.' to the consumer."""
    pieces = text.split()
    assert len(pieces) == 2
    amount, units = pieces
    assert units.upper() in ["BP", "BP.", "AA."]
    consumer.size(amount)
653
def _feed_header_lines(self, consumer, lines):
    """Pass the EMBL header lines (list of strings) to the consumer.

    Each line is split into its line type (the first HEADER_WIDTH
    characters) and its data.  Some line types are reformatted to suit
    the GenBank based consumer, others are passed on as they are, and
    unknown ones are ignored (and reported if self.debug is set).

    A DR line without a ";" separator gives a warning and is skipped.
    """
    EMBL_INDENT = self.HEADER_WIDTH
    consumer_dict = {
        'AC' : 'accession',
        'SV' : 'version', # SV line removed in June 2006, now part of ID line
        'DE' : 'definition',
        #'RN' : 'reference_num',
        #'RC' : reference comment... TODO
        #'RP' : 'reference_bases',
        #'RX' : reference cross reference... DOI or Pubmed
        'RG' : 'consrtm', #optional consortium
        #'RA' : 'authors',
        #'RT' : 'title',
        'RL' : 'journal',
        'OS' : 'organism',
        'OC' : 'taxonomy',
        #'DR' : data reference
        'CC' : 'comment',
        #'XX' : splitter
    }
    #We have to handle the following specially:
    #RX (depending on reference type...)
    for line in lines:
        line_type = line[:EMBL_INDENT].strip()
        data = line[EMBL_INDENT:].strip()
        if line_type == 'XX':
            pass
        elif line_type == 'RN':
            # Reformat reference numbers for the GenBank based consumer
            # e.g. '[1]' becomes '1'
            # (startswith/endswith so that an empty RN line is harmless)
            if data.startswith("[") and data.endswith("]"):
                data = data[1:-1]
            consumer.reference_num(data)
        elif line_type == 'RP':
            # Reformat reference numbers for the GenBank based consumer
            # e.g. '1-4639675' becomes '(bases 1 to 4639675)'
            # and '160-550, 904-1055' becomes '(bases 160 to 550; 904 to 1055)'
            parts = [bases.replace("-"," to ").strip() for bases in data.split(",")]
            consumer.reference_bases("(bases %s)" % "; ".join(parts))
        elif line_type == 'RT':
            #Remove the enclosing quotes and trailing semi colon.
            #Note the title can be split over multiple lines.
            if data.startswith('"'):
                data = data[1:]
            if data.endswith('";'):
                data = data[:-2]
            consumer.title(data)
        elif line_type == 'RX':
            # EMBL support three reference types at the moment:
            # - PUBMED    PUBMED bibliographic database (NLM)
            # - DOI       Digital Object Identifier (International DOI Foundation)
            # - AGRICOLA  US National Agriculture Library (NAL) of the US Department
            #             of Agriculture (USDA)
            #
            # Format:
            # RX  resource_identifier; identifier.
            #
            # e.g.
            # RX   DOI; 10.1016/0024-3205(83)90010-3.
            # RX   PUBMED; 264242.
            #
            # Currently our reference object only supports PUBMED and MEDLINE
            # (as these were in GenBank files?).
            key, value = data.split(";",1)
            if value.endswith("."):
                value = value[:-1]
            value = value.strip()
            if key == "PUBMED":
                consumer.pubmed_id(value)
            #TODO - Handle other reference types (here and in BioSQL bindings)
        elif line_type == 'CC':
            # Have to pass a list of strings for this one (not just a string)
            consumer.comment([data])
        elif line_type == 'DR':
            # Database Cross-reference, format:
            # DR   database_identifier; primary_identifier; secondary_identifier.
            #
            # e.g.
            # DR   MGI; 98599; Tcrb-V4.
            #
            # TODO - How should we store any secondary identifier?
            parts = data.rstrip(".").split(";")
            if len(parts) < 2:
                #No primary identifier, so there is nothing to cross reference
                warnings.warn("Malformed DR line in EMBL file.")
            else:
                #Turn it into "database_identifier:primary_identifier" to
                #mimic the GenBank parser. e.g. "MGI:98599"
                consumer.dblink("%s:%s" % (parts[0].strip(),
                                           parts[1].strip()))
        elif line_type == 'RA':
            # Remove trailing ; at end of authors list
            consumer.authors(data.rstrip(";"))
        elif line_type == 'PR':
            # Remove trailing ; at end of the project reference
            # In GenBank files this corresponds to the old PROJECT
            # line which is being replaced with the DBLINK line.
            consumer.project(data.rstrip(";"))
        elif line_type in consumer_dict:
            #Its a semi-automatic entry!
            getattr(consumer, consumer_dict[line_type])(data)
        else:
            if self.debug:
                print("Ignoring EMBL header line:\n%s" % line)
753
def _feed_misc_lines(self, consumer, lines):
    """Pass any contig (CO) location in the misc lines to the consumer.

    The first CO line and the CO lines directly after it are joined into
    one string for consumer.contig_location().  Later lines are not
    looked at.  The list passed in is not modified.

    Raises ValueError if the first CO line is followed by a non-empty
    line which is not another CO line.
    """
    #TODO - Should we do something with the information on the SQ line(s)?
    line_iter = iter(lines)
    for line in line_iter:
        if not line.startswith("CO "):
            continue
        pieces = [line[5:].strip()]
        #Carry on with the same iterator to gather the continuation lines
        for line in line_iter:
            if not line:
                break
            if not line.startswith("CO "):
                raise ValueError('Expected CO (contig) continuation line, got:\n' + line)
            #Don't need to preseve the whitespace here.
            pieces.append(line[5:].strip())
        consumer.contig_location("".join(pieces))
        return
776 777
class _ImgtScanner(EmblScanner):
    """For extracting chunks of information in IMGT (EMBL like) files (PRIVATE).

    IMGT files are like EMBL files but in order to allow longer feature types
    the features should be indented by 25 characters not 21 characters. In
    practice the IMGT flat files tend to use either 21 or 25 characters, so we
    must cope with both.

    This is private to encourage use of Bio.SeqIO rather than Bio.GenBank.
    """

    #The heading is column aligned with the feature locations, so list it
    #for both the 21 and the 25 character indentation:
    FEATURE_START_MARKERS = ["FH   Key".ljust(21) + "Location/Qualifiers",
                             "FH   Key".ljust(21) + "Location/Qualifiers (from EMBL)",
                             "FH   Key".ljust(25) + "Location/Qualifiers",
                             "FH"]
def parse_features(self, skip=False):
    """Return list of tuples for the features (if present)

    Each feature is returned as a tuple (key, location, qualifiers)
    where key and location are strings (e.g. "CDS" and
    "complement(join(490883..490885,1..879))") while qualifiers
    is a list of two string tuples (feature qualifier keys and values).

    Assumes you have already read to the start of the features table.

    If skip is true the feature lines are read past but not parsed, and
    an empty list is returned.  On return self.line holds the first line
    after the feature table.  Raises ValueError if the handle runs out or
    a '//' marker is met inside the table.
    """
    if self.line.rstrip() not in self.FEATURE_START_MARKERS:
        if self.debug:
            print("Didn't find any feature table")
        return []

    while self.line.rstrip() in self.FEATURE_START_MARKERS:
        self.line = self.handle.readline()

    bad_position_re = re.compile(r'([0-9]+)>{1}')

    indent = self.FEATURE_QUALIFIER_INDENT
    spacer = self.FEATURE_QUALIFIER_SPACER
    features = []
    line = self.line
    while True:
        if not line:
            raise ValueError("Premature end of line during features table")
        if line[:self.HEADER_WIDTH].rstrip() in self.SEQUENCE_HEADERS:
            if self.debug:
                print("Found start of sequence")
            break
        line = line.rstrip()
        if line == "//":
            raise ValueError("Premature end of features table, marker '//' found")
        if line in self.FEATURE_END_MARKERS:
            if self.debug:
                print("Found end of features")
            line = self.handle.readline()
            break
        if line[2:indent].strip() == "":
            #This is an empty feature line between qualifiers. Empty
            #feature lines within qualifiers are handled below (ignored).
            line = self.handle.readline()
            continue

        if skip:
            line = self.handle.readline()
            while line[:indent] == spacer:
                line = self.handle.readline()
        else:
            assert line[:2] == "FT"
            try:
                feature_key, location_start = line[2:].strip().split()
            except ValueError:
                #e.g. "FT   TRANSMEMBRANE-REGION2163..2240\n"
                #Assume indent of 25 as per IMGT spec, with the location
                #start in column 26 (one-based).
                feature_key = line[2:25].strip()
                location_start = line[25:].strip()
            feature_lines = [location_start]
            line = self.handle.readline()
            #Stop at end of file (readline() then returns "") so that the
            #check at the top of the loop can report the truncated table.
            while line[:indent] == spacer \
            or (line != "" and line.rstrip() == ""):
                #Use strip to remove any harmless trailing white space AND leading
                #white space (copes with 21 or 26 indents and other variants)
                assert line[:2] == "FT"
                feature_lines.append(line[indent:].strip())
                line = self.handle.readline()
            feature_key, location, qualifiers = \
                         self.parse_feature(feature_key, feature_lines)
            #Try to handle known problems with IMGT locations here:
            if ">" in location:
                #Nasty hack for common IMGT bug, should be >123 not 123>
                #in a location string. At least here the meaning is clear,
                #and since it is so common I don't want to issue a warning
                location = bad_position_re.sub(r'>\1',location)
            features.append((feature_key, location, qualifiers))
    self.line = line
    return features
871
class GenBankScanner(InsdcScanner):
    """For extracting chunks of information in GenBank files"""

    #RECORD_START must be exactly HEADER_WIDTH characters long (this is
    #asserted when a scanner is created), hence the space padding.
    HEADER_WIDTH = 12
    RECORD_START = "LOCUS".ljust(HEADER_WIDTH)
    FEATURE_QUALIFIER_INDENT = 21
    FEATURE_QUALIFIER_SPACER = " " * FEATURE_QUALIFIER_INDENT
    #The feature table heading is column aligned, with the word "Location"
    #starting in the same column as the locations and qualifiers:
    FEATURE_START_MARKERS = ["FEATURES".ljust(FEATURE_QUALIFIER_INDENT) \
                             + "Location/Qualifiers",
                             "FEATURES"]
    FEATURE_END_MARKERS = []
    SEQUENCE_HEADERS=["CONTIG", "ORIGIN", "BASE COUNT", "WGS"] # trailing spaces removed
927 - def _feed_first_line(self, consumer, line):
928 """Scan over and parse GenBank LOCUS line (PRIVATE). 929 930 This must cope with several variants, primarily the old and new column 931 based standards from GenBank. Additionally EnsEMBL produces GenBank 932 files where the LOCUS line is space separated rather that following 933 the column based layout. 934 935 We also try to cope with GenBank like files with partial LOCUS lines. 936 """ 937 ##################################### 938 # LOCUS line # 939 ##################################### 940 GENBANK_INDENT = self.HEADER_WIDTH 941 GENBANK_SPACER = " "*GENBANK_INDENT 942 assert line[0:GENBANK_INDENT] == 'LOCUS ', \ 943 'LOCUS line does not start correctly:\n' + line 944 945 #Have to break up the locus line, and handle the different bits of it. 946 #There are at least two different versions of the locus line... 947 if line[29:33] in [' bp ', ' aa ',' rc '] and line[55:62] == ' ': 948 #Old... note we insist on the 55:62 being empty to avoid trying 949 #to parse space separated LOCUS lines from Ensembl etc, see below. 950 # 951 # Positions Contents 952 # --------- -------- 953 # 00:06 LOCUS 954 # 06:12 spaces 955 # 12:?? Locus name 956 # ??:?? space 957 # ??:29 Length of sequence, right-justified 958 # 29:33 space, bp, space 959 # 33:41 strand type 960 # 41:42 space 961 # 42:51 Blank (implies linear), linear or circular 962 # 51:52 space 963 # 52:55 The division code (e.g. 
BCT, VRL, INV) 964 # 55:62 space 965 # 62:73 Date, in the form dd-MMM-yyyy (e.g., 15-MAR-1991) 966 # 967 #assert line[29:33] in [' bp ', ' aa ',' rc '] , \ 968 # 'LOCUS line does not contain size units at expected position:\n' + line 969 assert line[41:42] == ' ', \ 970 'LOCUS line does not contain space at position 42:\n' + line 971 assert line[42:51].strip() in ['','linear','circular'], \ 972 'LOCUS line does not contain valid entry (linear, circular, ...):\n' + line 973 assert line[51:52] == ' ', \ 974 'LOCUS line does not contain space at position 52:\n' + line 975 #assert line[55:62] == ' ', \ 976 # 'LOCUS line does not contain spaces from position 56 to 62:\n' + line 977 if line[62:73].strip(): 978 assert line[64:65] == '-', \ 979 'LOCUS line does not contain - at position 65 in date:\n' + line 980 assert line[68:69] == '-', \ 981 'LOCUS line does not contain - at position 69 in date:\n' + line 982 983 name_and_length_str = line[GENBANK_INDENT:29] 984 while name_and_length_str.find(' ')!=-1: 985 name_and_length_str = name_and_length_str.replace(' ',' ') 986 name_and_length = name_and_length_str.split(' ') 987 assert len(name_and_length)<=2, \ 988 'Cannot parse the name and length in the LOCUS line:\n' + line 989 assert len(name_and_length)!=1, \ 990 'Name and length collide in the LOCUS line:\n' + line 991 #Should be possible to split them based on position, if 992 #a clear definition of the standard exists THAT AGREES with 993 #existing files. 994 consumer.locus(name_and_length[0]) 995 consumer.size(name_and_length[1]) 996 #consumer.residue_type(line[33:41].strip()) 997 998 if line[33:51].strip() == "" and line[29:33] == ' aa ': 999 #Amino acids -> protein (even if there is no residue type given) 1000 #We want to use a protein alphabet in this case, rather than a 1001 #generic one. 
Not sure if this is the best way to achieve this, 1002 #but it works because the scanner checks for this: 1003 consumer.residue_type("PROTEIN") 1004 else: 1005 consumer.residue_type(line[33:51].strip()) 1006 1007 consumer.data_file_division(line[52:55]) 1008 if line[62:73].strip(): 1009 consumer.date(line[62:73]) 1010 elif line[40:44] in [' bp ', ' aa ',' rc '] \ 1011 and line[54:64].strip() in ['','linear','circular']: 1012 #New... linear/circular/big blank test should avoid EnsEMBL style 1013 #LOCUS line being treated like a proper column based LOCUS line. 1014 # 1015 # Positions Contents 1016 # --------- -------- 1017 # 00:06 LOCUS 1018 # 06:12 spaces 1019 # 12:?? Locus name 1020 # ??:?? space 1021 # ??:40 Length of sequence, right-justified 1022 # 40:44 space, bp, space 1023 # 44:47 Blank, ss-, ds-, ms- 1024 # 47:54 Blank, DNA, RNA, tRNA, mRNA, uRNA, snRNA, cDNA 1025 # 54:55 space 1026 # 55:63 Blank (implies linear), linear or circular 1027 # 63:64 space 1028 # 64:67 The division code (e.g. 
BCT, VRL, INV) 1029 # 67:68 space 1030 # 68:79 Date, in the form dd-MMM-yyyy (e.g., 15-MAR-1991) 1031 # 1032 assert line[40:44] in [' bp ', ' aa ',' rc '] , \ 1033 'LOCUS line does not contain size units at expected position:\n' + line 1034 assert line[44:47] in [' ', 'ss-', 'ds-', 'ms-'], \ 1035 'LOCUS line does not have valid strand type (Single stranded, ...):\n' + line 1036 assert line[47:54].strip() == "" \ 1037 or line[47:54].strip().find('DNA') != -1 \ 1038 or line[47:54].strip().find('RNA') != -1, \ 1039 'LOCUS line does not contain valid sequence type (DNA, RNA, ...):\n' + line 1040 assert line[54:55] == ' ', \ 1041 'LOCUS line does not contain space at position 55:\n' + line 1042 assert line[55:63].strip() in ['','linear','circular'], \ 1043 'LOCUS line does not contain valid entry (linear, circular, ...):\n' + line 1044 assert line[63:64] == ' ', \ 1045 'LOCUS line does not contain space at position 64:\n' + line 1046 assert line[67:68] == ' ', \ 1047 'LOCUS line does not contain space at position 68:\n' + line 1048 if line[68:79].strip(): 1049 assert line[70:71] == '-', \ 1050 'LOCUS line does not contain - at position 71 in date:\n' + line 1051 assert line[74:75] == '-', \ 1052 'LOCUS line does not contain - at position 75 in date:\n' + line 1053 1054 name_and_length_str = line[GENBANK_INDENT:40] 1055 while name_and_length_str.find(' ')!=-1: 1056 name_and_length_str = name_and_length_str.replace(' ',' ') 1057 name_and_length = name_and_length_str.split(' ') 1058 assert len(name_and_length)<=2, \ 1059 'Cannot parse the name and length in the LOCUS line:\n' + line 1060 assert len(name_and_length)!=1, \ 1061 'Name and length collide in the LOCUS line:\n' + line 1062 #Should be possible to split them based on position, if 1063 #a clear definition of the stand exists THAT AGREES with 1064 #existing files. 
1065 consumer.locus(name_and_length[0]) 1066 consumer.size(name_and_length[1]) 1067 1068 if line[44:54].strip() == "" and line[40:44] == ' aa ': 1069 #Amino acids -> protein (even if there is no residue type given) 1070 #We want to use a protein alphabet in this case, rather than a 1071 #generic one. Not sure if this is the best way to achieve this, 1072 #but it works because the scanner checks for this: 1073 consumer.residue_type(("PROTEIN " + line[54:63]).strip()) 1074 else: 1075 consumer.residue_type(line[44:63].strip()) 1076 1077 consumer.data_file_division(line[64:67]) 1078 if line[68:79].strip(): 1079 consumer.date(line[68:79]) 1080 elif line[GENBANK_INDENT:].strip().count(" ")==0 : 1081 #Truncated LOCUS line, as produced by some EMBOSS tools - see bug 1762 1082 # 1083 #e.g. 1084 # 1085 # "LOCUS U00096" 1086 # 1087 #rather than: 1088 # 1089 # "LOCUS U00096 4639675 bp DNA circular BCT" 1090 # 1091 # Positions Contents 1092 # --------- -------- 1093 # 00:06 LOCUS 1094 # 06:12 spaces 1095 # 12:?? Locus name 1096 if line[GENBANK_INDENT:].strip() != "": 1097 consumer.locus(line[GENBANK_INDENT:].strip()) 1098 else: 1099 #Must just have just "LOCUS ", is this even legitimate? 1100 #We should be able to continue parsing... we need real world testcases! 1101 warnings.warn("Minimal LOCUS line found - is this correct?\n:%r" % line) 1102 elif len(line.split())==7 and line.split()[3] in ["aa","bp"]: 1103 #Cope with EnsEMBL genbank files which use space separation rather 1104 #than the expected column based layout. e.g. 1105 #LOCUS HG531_PATCH 1000000 bp DNA HTG 18-JUN-2011 1106 #LOCUS HG531_PATCH 759984 bp DNA HTG 18-JUN-2011 1107 #LOCUS HG506_HG1000_1_PATCH 814959 bp DNA HTG 18-JUN-2011 1108 #LOCUS HG506_HG1000_1_PATCH 1219964 bp DNA HTG 18-JUN-2011 1109 #Notice that the 'bp' can occur in the position expected by either 1110 #the old or the new fixed column standards (parsed above). 
1111 splitline = line.split() 1112 consumer.locus(splitline[1]) 1113 consumer.size(splitline[2]) 1114 consumer.residue_type(splitline[4]) 1115 consumer.data_file_division(splitline[5]) 1116 consumer.date(splitline[6]) 1117 elif len(line.split())>=4 and line.split()[3] in ["aa","bp"]: 1118 #Cope with EMBOSS seqret output where it seems the locus id can cause 1119 #the other fields to overflow. We just IGNORE the other fields! 1120 warnings.warn("Malformed LOCUS line found - is this correct?\n:%r" % line) 1121 consumer.locus(line.split()[1]) 1122 consumer.size(line.split()[2]) 1123 elif len(line.split())>=4 and line.split()[-1] in ["aa","bp"]: 1124 #Cope with psuedo-GenBank files like this: 1125 # "LOCUS RNA5 complete 1718 bp" 1126 #Treat everything between LOCUS and the size as the identifier. 1127 warnings.warn("Malformed LOCUS line found - is this correct?\n:%r" % line) 1128 consumer.locus(line[5:].rsplit(None,2)[0].strip()) 1129 consumer.size(line.split()[-2]) 1130 else: 1131 raise ValueError('Did not recognise the LOCUS line layout:\n' + line)
1132 1133
1134 - def _feed_header_lines(self, consumer, lines):
1135 #Following dictionary maps GenBank lines to the associated 1136 #consumer methods - the special cases like LOCUS where one 1137 #genbank line triggers several consumer calls have to be 1138 #handled individually. 1139 GENBANK_INDENT = self.HEADER_WIDTH 1140 GENBANK_SPACER = " "*GENBANK_INDENT 1141 consumer_dict = { 1142 'DEFINITION' : 'definition', 1143 'ACCESSION' : 'accession', 1144 'NID' : 'nid', 1145 'PID' : 'pid', 1146 'DBSOURCE' : 'db_source', 1147 'KEYWORDS' : 'keywords', 1148 'SEGMENT' : 'segment', 1149 'SOURCE' : 'source', 1150 'AUTHORS' : 'authors', 1151 'CONSRTM' : 'consrtm', 1152 'PROJECT' : 'project', 1153 'DBLINK' : 'dblink', 1154 'TITLE' : 'title', 1155 'JOURNAL' : 'journal', 1156 'MEDLINE' : 'medline_id', 1157 'PUBMED' : 'pubmed_id', 1158 'REMARK' : 'remark'} 1159 #We have to handle the following specially: 1160 #ORIGIN (locus, size, residue_type, data_file_division and date) 1161 #COMMENT (comment) 1162 #VERSION (version and gi) 1163 #REFERENCE (eference_num and reference_bases) 1164 #ORGANISM (organism and taxonomy) 1165 lines = filter(None,lines) 1166 lines.append("") #helps avoid getting StopIteration all the time 1167 line_iter = iter(lines) 1168 try: 1169 line = line_iter.next() 1170 while True: 1171 if not line : break 1172 line_type = line[:GENBANK_INDENT].strip() 1173 data = line[GENBANK_INDENT:].strip() 1174 1175 if line_type == 'VERSION': 1176 #Need to call consumer.version(), and maybe also consumer.gi() as well. 1177 #e.g. 1178 # VERSION AC007323.5 GI:6587720 1179 while data.find(' ')!=-1: 1180 data = data.replace(' ',' ') 1181 if data.find(' GI:')==-1: 1182 consumer.version(data) 1183 else: 1184 if self.debug : print "Version [" + data.split(' GI:')[0] + "], gi [" + data.split(' GI:')[1] + "]" 1185 consumer.version(data.split(' GI:')[0]) 1186 consumer.gi(data.split(' GI:')[1]) 1187 #Read in the next line! 
1188 line = line_iter.next() 1189 elif line_type == 'REFERENCE': 1190 if self.debug >1 : print "Found reference [" + data + "]" 1191 #Need to call consumer.reference_num() and consumer.reference_bases() 1192 #e.g. 1193 # REFERENCE 1 (bases 1 to 86436) 1194 # 1195 #Note that this can be multiline, see Bug 1968, e.g. 1196 # 1197 # REFERENCE 42 (bases 1517 to 1696; 3932 to 4112; 17880 to 17975; 21142 to 1198 # 28259) 1199 # 1200 #For such cases we will call the consumer once only. 1201 data = data.strip() 1202 1203 #Read in the next line, and see if its more of the reference: 1204 while True: 1205 line = line_iter.next() 1206 if line[:GENBANK_INDENT] == GENBANK_SPACER: 1207 #Add this continuation to the data string 1208 data += " " + line[GENBANK_INDENT:] 1209 if self.debug >1 : print "Extended reference text [" + data + "]" 1210 else: 1211 #End of the reference, leave this text in the variable "line" 1212 break 1213 1214 #We now have all the reference line(s) stored in a string, data, 1215 #which we pass to the consumer 1216 while data.find(' ')!=-1: 1217 data = data.replace(' ',' ') 1218 if data.find(' ')==-1: 1219 if self.debug >2 : print 'Reference number \"' + data + '\"' 1220 consumer.reference_num(data) 1221 else: 1222 if self.debug >2 : print 'Reference number \"' + data[:data.find(' ')] + '\", \"' + data[data.find(' ')+1:] + '\"' 1223 consumer.reference_num(data[:data.find(' ')]) 1224 consumer.reference_bases(data[data.find(' ')+1:]) 1225 elif line_type == 'ORGANISM': 1226 #Typically the first line is the organism, and subsequent lines 1227 #are the taxonomy lineage. However, given longer and longer 1228 #species names (as more and more strains and sub strains get 1229 #sequenced) the oragnism name can now get wrapped onto multiple 1230 #lines. The NCBI say we have to recognise the lineage line by 1231 #the presense of semi-colon delimited entries. In the long term, 1232 #they are considering adding a new keyword (e.g. LINEAGE). 
1233 #See Bug 2591 for details. 1234 organism_data = data 1235 lineage_data = "" 1236 while True: 1237 line = line_iter.next() 1238 if line[0:GENBANK_INDENT] == GENBANK_SPACER: 1239 if lineage_data or ";" in line: 1240 lineage_data += " " + line[GENBANK_INDENT:] 1241 else: 1242 organism_data += " " + line[GENBANK_INDENT:].strip() 1243 else: 1244 #End of organism and taxonomy 1245 break 1246 consumer.organism(organism_data) 1247 if lineage_data.strip() == "" and self.debug > 1: 1248 print "Taxonomy line(s) missing or blank" 1249 consumer.taxonomy(lineage_data.strip()) 1250 del organism_data, lineage_data 1251 elif line_type == 'COMMENT': 1252 if self.debug > 1 : print "Found comment" 1253 #This can be multiline, and should call consumer.comment() once 1254 #with a list where each entry is a line. 1255 comment_list=[] 1256 comment_list.append(data) 1257 while True: 1258 line = line_iter.next() 1259 if line[0:GENBANK_INDENT] == GENBANK_SPACER: 1260 data = line[GENBANK_INDENT:] 1261 comment_list.append(data) 1262 if self.debug > 2 : print "Comment continuation [" + data + "]" 1263 else: 1264 #End of the comment 1265 break 1266 consumer.comment(comment_list) 1267 del comment_list 1268 elif line_type in consumer_dict: 1269 #Its a semi-automatic entry! 1270 #Now, this may be a multi line entry... 1271 while True: 1272 line = line_iter.next() 1273 if line[0:GENBANK_INDENT] == GENBANK_SPACER: 1274 data += ' ' + line[GENBANK_INDENT:] 1275 else: 1276 #We now have all the data for this entry: 1277 getattr(consumer, consumer_dict[line_type])(data) 1278 #End of continuation - return to top of loop! 1279 break 1280 else: 1281 if self.debug: 1282 print "Ignoring GenBank header line:\n" % line 1283 #Read in next line 1284 line = line_iter.next() 1285 except StopIteration: 1286 raise ValueError("Problem in header")
1287
1288 - def _feed_misc_lines(self, consumer, lines):
1289 #Deals with a few misc lines between the features and the sequence 1290 GENBANK_INDENT = self.HEADER_WIDTH 1291 GENBANK_SPACER = " "*GENBANK_INDENT 1292 lines.append("") 1293 line_iter = iter(lines) 1294 try: 1295 for line in line_iter: 1296 if line.find('BASE COUNT')==0: 1297 line = line[10:].strip() 1298 if line: 1299 if self.debug : print "base_count = " + line 1300 consumer.base_count(line) 1301 if line.find("ORIGIN")==0: 1302 line = line[6:].strip() 1303 if line: 1304 if self.debug : print "origin_name = " + line 1305 consumer.origin_name(line) 1306 if line.find("WGS ")==0 : 1307 line = line[3:].strip() 1308 consumer.wgs(line) 1309 if line.find("WGS_SCAFLD")==0 : 1310 line = line[10:].strip() 1311 consumer.add_wgs_scafld(line) 1312 if line.find("CONTIG")==0: 1313 line = line[6:].strip() 1314 contig_location = line 1315 while True: 1316 line = line_iter.next() 1317 if not line: 1318 break 1319 elif line[:GENBANK_INDENT]==GENBANK_SPACER: 1320 #Don't need to preseve the whitespace here. 1321 contig_location += line[GENBANK_INDENT:].rstrip() 1322 else: 1323 raise ValueError('Expected CONTIG continuation line, got:\n' + line) 1324 consumer.contig_location(contig_location) 1325 return 1326 except StopIteration: 1327 raise ValueError("Problem in misc lines before sequence")
if __name__ == "__main__":
    # Quick self-test: run the scanners over three embedded sample records
    # (a GenBank nucleotide record, a GenPept protein record and an EMBL
    # record) and print what comes back.
    try:
        from StringIO import StringIO  # Python 2
    except ImportError:
        from io import StringIO  # Python 3

    # NOTE: GenBank and EMBL flat files are column based, so the white space
    # inside these samples is significant (12 column header keywords, feature
    # keys at column 5, locations and qualifiers at column 21, right-justified
    # sequence coordinates).

    gbk_example = \
"""LOCUS       SCU49845     5028 bp    DNA             PLN       21-JUN-1999
DEFINITION  Saccharomyces cerevisiae TCP1-beta gene, partial cds, and Axl2p
            (AXL2) and Rev7p (REV7) genes, complete cds.
ACCESSION   U49845
VERSION     U49845.1  GI:1293613
KEYWORDS    .
SOURCE      Saccharomyces cerevisiae (baker's yeast)
  ORGANISM  Saccharomyces cerevisiae
            Eukaryota; Fungi; Ascomycota; Saccharomycotina; Saccharomycetes;
            Saccharomycetales; Saccharomycetaceae; Saccharomyces.
REFERENCE   1  (bases 1 to 5028)
  AUTHORS   Torpey,L.E., Gibbs,P.E., Nelson,J. and Lawrence,C.W.
  TITLE     Cloning and sequence of REV7, a gene whose function is required for
            DNA damage-induced mutagenesis in Saccharomyces cerevisiae
  JOURNAL   Yeast 10 (11), 1503-1509 (1994)
  PUBMED    7871890
REFERENCE   2  (bases 1 to 5028)
  AUTHORS   Roemer,T., Madden,K., Chang,J. and Snyder,M.
  TITLE     Selection of axial growth sites in yeast requires Axl2p, a novel
            plasma membrane glycoprotein
  JOURNAL   Genes Dev. 10 (7), 777-793 (1996)
  PUBMED    8846915
REFERENCE   3  (bases 1 to 5028)
  AUTHORS   Roemer,T.
  TITLE     Direct Submission
  JOURNAL   Submitted (22-FEB-1996) Terry Roemer, Biology, Yale University, New
            Haven, CT, USA
FEATURES             Location/Qualifiers
     source          1..5028
                     /organism="Saccharomyces cerevisiae"
                     /db_xref="taxon:4932"
                     /chromosome="IX"
                     /map="9"
     CDS             <1..206
                     /codon_start=3
                     /product="TCP1-beta"
                     /protein_id="AAA98665.1"
                     /db_xref="GI:1293614"
                     /translation="SSIYNGISTSGLDLNNGTIADMRQLGIVESYKLKRAVVSSASEA
                     AEVLLRVDNIIRARPRTANRQHM"
     gene            687..3158
                     /gene="AXL2"
     CDS             687..3158
                     /gene="AXL2"
                     /note="plasma membrane glycoprotein"
                     /codon_start=1
                     /function="required for axial budding pattern of S.
                     cerevisiae"
                     /product="Axl2p"
                     /protein_id="AAA98666.1"
                     /db_xref="GI:1293615"
                     /translation="MTQLQISLLLTATISLLHLVVATPYEAYPIGKQYPPVARVNESF
                     TFQISNDTYKSSVDKTAQITYNCFDLPSWLSFDSSSRTFSGEPSSDLLSDANTTLYFN
                     VILEGTDSADSTSLNNTYQFVVTNRPSISLSSDFNLLALLKNYGYTNGKNALKLDPNE
                     VFNVTFDRSMFTNEESIVSYYGRSQLYNAPLPNWLFFDSGELKFTGTAPVINSAIAPE
                     TSYSFVIIATDIEGFSAVEVEFELVIGAHQLTTSIQNSLIINVTDTGNVSYDLPLNYV
                     YLDDDPISSDKLGSINLLDAPDWVALDNATISGSVPDELLGKNSNPANFSVSIYDTYG
                     DVIYFNFEVVSTTDLFAISSLPNINATRGEWFSYYFLPSQFTDYVNTNVSLEFTNSSQ
                     DHDWVKFQSSNLTLAGEVPKNFDKLSLGLKANQGSQSQELYFNIIGMDSKITHSNHSA
                     NATSTRSSHHSTSTSSYTSSTYTAKISSTSAAATSSAPAALPAANKTSSHNKKAVAIA
                     CGVAIPLGVILVALICFLIFWRRRRENPDDENLPHAISGPDLNNPANKPNQENATPLN
                     NPFDDDASSYDDTSIARRLAALNTLKLDNHSATESDISSVDEKRDSLSGMNTYNDQFQ
                     SQSKEELLAKPPVQPPESPFFDPQNRSSSVYMDSEPAVNKSWRYTGNLSPVSDIVRDS
                     YGSQKTVDTEKLFDLEAPEKEKRTSRDVTMSSLDPWNSNISPSPVRKSVTPSPYNVTK
                     HRNRHLQNIQDSQSGKNGITPTTMSTSSSDDFVPVKDGENFCWVHSMEPDRRPSKKRL
                     VDFSNKSNVNVGQVKDIHGRIPEML"
     gene            complement(3300..4037)
                     /gene="REV7"
     CDS             complement(3300..4037)
                     /gene="REV7"
                     /codon_start=1
                     /product="Rev7p"
                     /protein_id="AAA98667.1"
                     /db_xref="GI:1293616"
                     /translation="MNRWVEKWLRVYLKCYINLILFYRNVYPPQSFDYTTYQSFNLPQ
                     FVPINRHPALIDYIEELILDVLSKLTHVYRFSICIINKKNDLCIEKYVLDFSELQHVD
                     KDDQIITETEVFDEFRSSLNSLIMHLEKLPKVNDDTITFEAVINAIELELGHKLDRNR
                     RVDSLEEKAEIERDSNWVKCQEDENLPDNNGFQPPKIKLTSLVGSDVGPLIIHQFSEK
                     LISGDDKILNGVYSQYEEGESIFGSLF"
ORIGIN
        1 gatcctccat atacaacggt atctccacct caggtttaga tctcaacaac ggaaccattg
       61 ccgacatgag acagttaggt atcgtcgaga gttacaagct aaaacgagca gtagtcagct
      121 ctgcatctga agccgctgaa gttctactaa gggtggataa catcatccgt gcaagaccaa
      181 gaaccgccaa tagacaacat atgtaacata tttaggatat acctcgaaaa taataaaccg
      241 ccacactgtc attattataa ttagaaacag aacgcaaaaa ttatccacta tataattcaa
      301 agacgcgaaa aaaaaagaac aacgcgtcat agaacttttg gcaattcgcg tcacaaataa
      361 attttggcaa cttatgtttc ctcttcgagc agtactcgag ccctgtctca agaatgtaat
      421 aatacccatc gtaggtatgg ttaaagatag catctccaca acctcaaagc tccttgccga
      481 gagtcgccct cctttgtcga gtaattttca cttttcatat gagaacttat tttcttattc
      541 tttactctca catcctgtag tgattgacac tgcaacagcc accatcacta gaagaacaga
      601 acaattactt aatagaaaaa ttatatcttc ctcgaaacga tttcctgctt ccaacatcta
      661 cgtatatcaa gaagcattca cttaccatga cacagcttca gatttcatta ttgctgacag
      721 ctactatatc actactccat ctagtagtgg ccacgcccta tgaggcatat cctatcggaa
      781 aacaataccc cccagtggca agagtcaatg aatcgtttac atttcaaatt tccaatgata
      841 cctataaatc gtctgtagac aagacagctc aaataacata caattgcttc gacttaccga
      901 gctggctttc gtttgactct agttctagaa cgttctcagg tgaaccttct tctgacttac
      961 tatctgatgc gaacaccacg ttgtatttca atgtaatact cgagggtacg gactctgccg
     1021 acagcacgtc tttgaacaat acataccaat ttgttgttac aaaccgtcca tccatctcgc
     1081 tatcgtcaga tttcaatcta ttggcgttgt taaaaaacta tggttatact aacggcaaaa
     1141 acgctctgaa actagatcct aatgaagtct tcaacgtgac ttttgaccgt tcaatgttca
     1201 ctaacgaaga atccattgtg tcgtattacg gacgttctca gttgtataat gcgccgttac
     1261 ccaattggct gttcttcgat tctggcgagt tgaagtttac tgggacggca ccggtgataa
     1321 actcggcgat tgctccagaa acaagctaca gttttgtcat catcgctaca gacattgaag
     1381 gattttctgc cgttgaggta gaattcgaat tagtcatcgg ggctcaccag ttaactacct
     1441 ctattcaaaa tagtttgata atcaacgtta ctgacacagg taacgtttca tatgacttac
     1501 ctctaaacta tgtttatctc gatgacgatc ctatttcttc tgataaattg ggttctataa
     1561 acttattgga tgctccagac tgggtggcat tagataatgc taccatttcc gggtctgtcc
     1621 cagatgaatt actcggtaag aactccaatc ctgccaattt ttctgtgtcc atttatgata
     1681 cttatggtga tgtgatttat ttcaacttcg aagttgtctc cacaacggat ttgtttgcca
     1741 ttagttctct tcccaatatt aacgctacaa ggggtgaatg gttctcctac tattttttgc
     1801 cttctcagtt tacagactac gtgaatacaa acgtttcatt agagtttact aattcaagcc
     1861 aagaccatga ctgggtgaaa ttccaatcat ctaatttaac attagctgga gaagtgccca
     1921 agaatttcga caagctttca ttaggtttga aagcgaacca aggttcacaa tctcaagagc
     1981 tatattttaa catcattggc atggattcaa agataactca ctcaaaccac agtgcgaatg
     2041 caacgtccac aagaagttct caccactcca cctcaacaag ttcttacaca tcttctactt
     2101 acactgcaaa aatttcttct acctccgctg ctgctacttc ttctgctcca gcagcgctgc
     2161 cagcagccaa taaaacttca tctcacaata aaaaagcagt agcaattgcg tgcggtgttg
     2221 ctatcccatt aggcgttatc ctagtagctc tcatttgctt cctaatattc tggagacgca
     2281 gaagggaaaa tccagacgat gaaaacttac cgcatgctat tagtggacct gatttgaata
     2341 atcctgcaaa taaaccaaat caagaaaacg ctacaccttt gaacaacccc tttgatgatg
     2401 atgcttcctc gtacgatgat acttcaatag caagaagatt ggctgctttg aacactttga
     2461 aattggataa ccactctgcc actgaatctg atatttccag cgtggatgaa aagagagatt
     2521 ctctatcagg tatgaataca tacaatgatc agttccaatc ccaaagtaaa gaagaattat
     2581 tagcaaaacc cccagtacag cctccagaga gcccgttctt tgacccacag aataggtctt
     2641 cttctgtgta tatggatagt gaaccagcag taaataaatc ctggcgatat actggcaacc
     2701 tgtcaccagt ctctgatatt gtcagagaca gttacggatc acaaaaaact gttgatacag
     2761 aaaaactttt cgatttagaa gcaccagaga aggaaaaacg tacgtcaagg gatgtcacta
     2821 tgtcttcact ggacccttgg aacagcaata ttagcccttc tcccgtaaga aaatcagtaa
     2881 caccatcacc atataacgta acgaagcatc gtaaccgcca cttacaaaat attcaagact
     2941 ctcaaagcgg taaaaacgga atcactccca caacaatgtc aacttcatct tctgacgatt
     3001 ttgttccggt taaagatggt gaaaattttt gctgggtcca tagcatggaa ccagacagaa
     3061 gaccaagtaa gaaaaggtta gtagattttt caaataagag taatgtcaat gttggtcaag
     3121 ttaaggacat tcacggacgc atcccagaaa tgctgtgatt atacgcaacg atattttgct
     3181 taattttatt ttcctgtttt attttttatt agtggtttac agatacccta tattttattt
     3241 agtttttata cttagagaca tttaatttta attccattct tcaaatttca tttttgcact
     3301 taaaacaaag atccaaaaat gctctcgccc tcttcatatt gagaatacac tccattcaaa
     3361 attttgtcgt caccgctgat taatttttca ctaaactgat gaataatcaa aggccccacg
     3421 tcagaaccga ctaaagaagt gagttttatt ttaggaggtt gaaaaccatt attgtctggt
     3481 aaattttcat cttcttgaca tttaacccag tttgaatccc tttcaatttc tgctttttcc
     3541 tccaaactat cgaccctcct gtttctgtcc aacttatgtc ctagttccaa ttcgatcgca
     3601 ttaataactg cttcaaatgt tattgtgtca tcgttgactt taggtaattt ctccaaatgc
     3661 ataatcaaac tatttaagga agatcggaat tcgtcgaaca cttcagtttc cgtaatgatc
     3721 tgatcgtctt tatccacatg ttgtaattca ctaaaatcta aaacgtattt ttcaatgcat
     3781 aaatcgttct ttttattaat aatgcagatg gaaaatctgt aaacgtgcgt taatttagaa
     3841 agaacatcca gtataagttc ttctatatag tcaattaaag caggatgcct attaatggga
     3901 acgaactgcg gcaagttgaa tgactggtaa gtagtgtagt cgaatgactg aggtgggtat
     3961 acatttctat aaaataaaat caaattaatg tagcatttta agtataccct cagccacttc
     4021 tctacccatc tattcataaa gctgacgcaa cgattactat tttttttttc ttcttggatc
     4081 tcagtcgtcg caaaaacgta taccttcttt ttccgacctt ttttttagct ttctggaaaa
     4141 gtttatatta gttaaacagg gtctagtctt agtgtgaaag ctagtggttt cgattgactg
     4201 atattaagaa agtggaaatt aaattagtag tgtagacgta tatgcatatg tatttctcgc
     4261 ctgtttatgt ttctacgtac ttttgattta tagcaagggg aaaagaaata catactattt
     4321 tttggtaaag gtgaaagcat aatgtaaaag ctagaataaa atggacgaaa taaagagagg
     4381 cttagttcat cttttttcca aaaagcaccc aatgataata actaaaatga aaaggatttg
     4441 ccatctgtca gcaacatcag ttgtgtgagc aataataaaa tcatcacctc cgttgccttt
     4501 agcgcgtttg tcgtttgtat cttccgtaat tttagtctta tcaatgggaa tcataaattt
     4561 tccaatgaat tagcaatttc gtccaattct ttttgagctt cttcatattt gctttggaat
     4621 tcttcgcact tcttttccca ttcatctctt tcttcttcca aagcaacgat ccttctaccc
     4681 atttgctcag agttcaaatc ggcctctttc agtttatcca ttgcttcctt cagtttggct
     4741 tcactgtctt ctagctgttg ttctagatcc tggtttttct tggtgtagtt ctcattatta
     4801 gatctcaagt tattggagtc ttcagccaat tgctttgtat cagacaattg actctctaac
     4861 ttctccactt cactgtcgag ttgctcgttt ttagcggaca aagatttaat ctcgttttct
     4921 ttttcagtgt tagattgctc taattctttg agctgttctc tcagctcctc atatttttct
     4981 tgccatgact cagattctaa ttttaagcta ttcaatttct ctttgatc
//"""

    # GenBank format protein (aka GenPept) file from:
    # http://www.molecularevolution.org/resources/fileformats/
    gbk_example2 = \
"""LOCUS       AAD51968                 143 aa            linear   BCT 21-AUG-2001
DEFINITION  transcriptional regulator RovA [Yersinia enterocolitica].
ACCESSION   AAD51968
VERSION     AAD51968.1  GI:5805369
DBSOURCE    locus AF171097 accession AF171097.1
KEYWORDS    .
SOURCE      Yersinia enterocolitica
  ORGANISM  Yersinia enterocolitica
            Bacteria; Proteobacteria; Gammaproteobacteria; Enterobacteriales;
            Enterobacteriaceae; Yersinia.
REFERENCE   1  (residues 1 to 143)
  AUTHORS   Revell,P.A. and Miller,V.L.
  TITLE     A chromosomally encoded regulator is required for expression of the
            Yersinia enterocolitica inv gene and for virulence
  JOURNAL   Mol. Microbiol. 35 (3), 677-685 (2000)
  MEDLINE   20138369
  PUBMED    10672189
REFERENCE   2  (residues 1 to 143)
  AUTHORS   Revell,P.A. and Miller,V.L.
  TITLE     Direct Submission
  JOURNAL   Submitted (22-JUL-1999) Molecular Microbiology, Washington
            University School of Medicine, Campus Box 8230, 660 South Euclid,
            St. Louis, MO 63110, USA
COMMENT     Method: conceptual translation.
FEATURES             Location/Qualifiers
     source          1..143
                     /organism="Yersinia enterocolitica"
                     /mol_type="unassigned DNA"
                     /strain="JB580v"
                     /serotype="O:8"
                     /db_xref="taxon:630"
     Protein         1..143
                     /product="transcriptional regulator RovA"
                     /name="regulates inv expression"
     CDS             1..143
                     /gene="rovA"
                     /coded_by="AF171097.1:380..811"
                     /note="regulator of virulence"
                     /transl_table=11
ORIGIN
        1 mestlgsdla rlvrvwrali dhrlkplelt qthwvtlhni nrlppeqsqi qlakaigieq
       61 pslvrtldql eekglitrht candrrakri klteqsspii eqvdgvicst rkeilggisp
      121 deiellsgli dklerniiql qsk
//
"""

    embl_example="""ID   X56734; SV 1; linear; mRNA; STD; PLN; 1859 BP.
XX
AC   X56734; S46826;
XX
DT   12-SEP-1991 (Rel. 29, Created)
DT   25-NOV-2005 (Rel. 85, Last updated, Version 11)
XX
DE   Trifolium repens mRNA for non-cyanogenic beta-glucosidase
XX
KW   beta-glucosidase.
XX
OS   Trifolium repens (white clover)
OC   Eukaryota; Viridiplantae; Streptophyta; Embryophyta; Tracheophyta;
OC   Spermatophyta; Magnoliophyta; eudicotyledons; core eudicotyledons; rosids;
OC   eurosids I; Fabales; Fabaceae; Papilionoideae; Trifolieae; Trifolium.
XX
RN   [5]
RP   1-1859
RX   PUBMED; 1907511.
RA   Oxtoby E., Dunn M.A., Pancoro A., Hughes M.A.;
RT   "Nucleotide and derived amino acid sequence of the cyanogenic
RT   beta-glucosidase (linamarase) from white clover (Trifolium repens L.)";
RL   Plant Mol. Biol. 17(2):209-219(1991).
XX
RN   [6]
RP   1-1859
RA   Hughes M.A.;
RT   ;
RL   Submitted (19-NOV-1990) to the EMBL/GenBank/DDBJ databases.
RL   Hughes M.A., University of Newcastle Upon Tyne, Medical School, Newcastle
RL   Upon Tyne, NE2 4HH, UK
XX
FH   Key             Location/Qualifiers
FH
FT   source          1..1859
FT                   /organism="Trifolium repens"
FT                   /mol_type="mRNA"
FT                   /clone_lib="lambda gt10"
FT                   /clone="TRE361"
FT                   /tissue_type="leaves"
FT                   /db_xref="taxon:3899"
FT   CDS             14..1495
FT                   /product="beta-glucosidase"
FT                   /EC_number="3.2.1.21"
FT                   /note="non-cyanogenic"
FT                   /db_xref="GOA:P26204"
FT                   /db_xref="InterPro:IPR001360"
FT                   /db_xref="InterPro:IPR013781"
FT                   /db_xref="UniProtKB/Swiss-Prot:P26204"
FT                   /protein_id="CAA40058.1"
FT                   /translation="MDFIVAIFALFVISSFTITSTNAVEASTLLDIGNLSRSSFPRGFI
FT                   FGAGSSAYQFEGAVNEGGRGPSIWDTFTHKYPEKIRDGSNADITVDQYHRYKEDVGIMK
FT                   DQNMDSYRFSISWPRILPKGKLSGGINHEGIKYYNNLINELLANGIQPFVTLFHWDLPQ
FT                   VLEDEYGGFLNSGVINDFRDYTDLCFKEFGDRVRYWSTLNEPWVFSNSGYALGTNAPGR
FT                   CSASNVAKPGDSGTGPYIVTHNQILAHAEAVHVYKTKYQAYQKGKIGITLVSNWLMPLD
FT                   DNSIPDIKAAERSLDFQFGLFMEQLTTGDYSKSMRRIVKNRLPKFSKFESSLVNGSFDF
FT                   IGINYYSSSYISNAPSHGNAKPSYSTNPMTNISFEKHGIPLGPRAASIWIYVYPYMFIQ
FT                   EDFEIFCYILKINITILQFSITENGMNEFNDATLPVEEALLNTYRIDYYYRHLYYIRSA
FT                   IRAGSNVKGFYAWSFLDCNEWFAGFTVRFGLNFVD"
FT   mRNA            1..1859
FT                   /experiment="experimental evidence, no additional details
FT                   recorded"
XX
SQ   Sequence 1859 BP; 609 A; 314 C; 355 G; 581 T; 0 other;
     aaacaaacca aatatggatt ttattgtagc catatttgct ctgtttgtta ttagctcatt        60
     cacaattact tccacaaatg cagttgaagc ttctactctt cttgacatag gtaacctgag       120
     tcggagcagt tttcctcgtg gcttcatctt tggtgctgga tcttcagcat accaatttga       180
     aggtgcagta aacgaaggcg gtagaggacc aagtatttgg gataccttca cccataaata       240
     tccagaaaaa ataagggatg gaagcaatgc agacatcacg gttgaccaat atcaccgcta       300
     caaggaagat gttgggatta tgaaggatca aaatatggat tcgtatagat tctcaatctc       360
     ttggccaaga atactcccaa agggaaagtt gagcggaggc ataaatcacg aaggaatcaa       420
     atattacaac aaccttatca acgaactatt ggctaacggt atacaaccat ttgtaactct       480
     ttttcattgg gatcttcccc aagtcttaga agatgagtat ggtggtttct taaactccgg       540
     tgtaataaat gattttcgag actatacgga tctttgcttc aaggaatttg gagatagagt       600
     gaggtattgg agtactctaa atgagccatg ggtgtttagc aattctggat atgcactagg       660
     aacaaatgca ccaggtcgat gttcggcctc caacgtggcc aagcctggtg attctggaac       720
     aggaccttat atagttacac acaatcaaat tcttgctcat gcagaagctg tacatgtgta       780
     taagactaaa taccaggcat atcaaaaggg aaagataggc ataacgttgg tatctaactg       840
     gttaatgcca cttgatgata atagcatacc agatataaag gctgccgaga gatcacttga       900
     cttccaattt ggattgttta tggaacaatt aacaacagga gattattcta agagcatgcg       960
     gcgtatagtt aaaaaccgat tacctaagtt ctcaaaattc gaatcaagcc tagtgaatgg      1020
     ttcatttgat tttattggta taaactatta ctcttctagt tatattagca atgccccttc      1080
     acatggcaat gccaaaccca gttactcaac aaatcctatg accaatattt catttgaaaa      1140
     acatgggata cccttaggtc caagggctgc ttcaatttgg atatatgttt atccatatat      1200
     gtttatccaa gaggacttcg agatcttttg ttacatatta aaaataaata taacaatcct      1260
     gcaattttca atcactgaaa atggtatgaa tgaattcaac gatgcaacac ttccagtaga      1320
     agaagctctt ttgaatactt acagaattga ttactattac cgtcacttat actacattcg      1380
     ttctgcaatc agggctggct caaatgtgaa gggtttttac gcatggtcat ttttggactg      1440
     taatgaatgg tttgcaggct ttactgttcg ttttggatta aactttgtag attagaaaga      1500
     tggattaaaa aggtacccta agctttctgc ccaatggtac aagaactttc tcaaaagaaa      1560
     ctagctagta ttattaaaag aactttgtag tagattacag tacatcgttt gaagttgagt      1620
     tggtgcacct aattaaataa aagaggttac tcttaacata tttttaggcc attcgttgtg      1680
     aagttgttag gctgttattt ctattatact atgttgtagt aataagtgca ttgttgtacc      1740
     agaagctatg atcataacta taggttgatc cttcatgtat cagtttgatg ttgagaatac      1800
     tttgaattaa aagtcttttt ttattttttt aaaaaaaaaa aaaaaaaaaa aaaaaaaaa       1859
//
"""

    # Every print below takes a single, pre-formatted string so that the
    # output is the same whether print is a statement or a function.
    print("GenBank CDS Iteration")
    print("=====================")

    g = GenBankScanner()
    for record in g.parse_cds_features(StringIO(gbk_example)):
        print(record)

    g = GenBankScanner()
    for record in g.parse_cds_features(StringIO(gbk_example2),
                  tags2id=('gene','locus_tag','product')):
        print(record)

    g = GenBankScanner()
    for record in g.parse_cds_features(StringIO(gbk_example + "\n" + gbk_example2),
                                       tags2id=('gene','locus_tag','product')):
        print(record)

    print("")
    print("GenBank Iteration")
    print("=================")
    g = GenBankScanner()
    for record in g.parse_records(StringIO(gbk_example),do_features=False):
        print("%s %s %s" % (record.id, record.name, record.description))
        print(record.seq)

    g = GenBankScanner()
    for record in g.parse_records(StringIO(gbk_example),do_features=True):
        print("%s %s %s" % (record.id, record.name, record.description))
        print(record.seq)

    g = GenBankScanner()
    for record in g.parse_records(StringIO(gbk_example2),do_features=False):
        print("%s %s %s" % (record.id, record.name, record.description))
        print(record.seq)

    g = GenBankScanner()
    for record in g.parse_records(StringIO(gbk_example2),do_features=True):
        print("%s %s %s" % (record.id, record.name, record.description))
        print(record.seq)

    print("")
    print("EMBL CDS Iteration")
    print("==================")

    e = EmblScanner()
    for record in e.parse_cds_features(StringIO(embl_example)):
        print(record)

    print("")
    print("EMBL Iteration")
    print("==============")
    e = EmblScanner()
    for record in e.parse_records(StringIO(embl_example),do_features=True):
        print("%s %s %s" % (record.id, record.name, record.description))
        print(record.seq)