1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28 import warnings
29 import os
30 import re
31 from Bio.Seq import Seq
32 from Bio.SeqRecord import SeqRecord
33 from Bio.Alphabet import generic_alphabet, generic_protein
34
36 """Basic functions for breaking up a GenBank/EMBL file into sub sections.
37
38 The International Nucleotide Sequence Database Collaboration (INSDC)
39 is a partnership between the DDBJ, EMBL, and GenBank. These organisations all use the
40 same "Feature Table" layout in their plain text flat file formats.
41
42 However, the header and sequence sections of an EMBL file are very
43 different in layout to those produced by GenBank/DDBJ."""
44
45
46 RECORD_START = "XXX"
47 HEADER_WIDTH = 3
48 FEATURE_START_MARKERS = ["XXX***FEATURES***XXX"]
49 FEATURE_END_MARKERS = ["XXX***END FEATURES***XXX"]
50 FEATURE_QUALIFIER_INDENT = 0
51 FEATURE_QUALIFIER_SPACER = ""
52 SEQUENCE_HEADERS=["XXX"]
53
61
65
def find_start(self):
    """Read in lines until find the ID/LOCUS line, which is returned.

    Any preamble (such as the header used by the NCBI on *.seq.gz archives)
    will be ignored, as are blank lines and any "//" left over from the
    end of the previous record.

    Returns the record start line exactly as read (it is also stored in
    self.line), or None if the end of the handle is reached first.
    """
    while True:
        if self.line:
            # A line buffered by an earlier call takes priority over the handle.
            line = self.line
            self.line = ""
        else:
            line = self.handle.readline()
        if not line:
            if self.debug:
                print("End of file")
            return None
        if line[:self.HEADER_WIDTH] == self.RECORD_START:
            if self.debug > 1:
                print("Found the start of a record:\n" + line)
            break
        # Everything below is only classification for the debug output;
        # all non-start lines are skipped.
        line = line.rstrip()
        if line == "//":
            if self.debug > 1:
                print("Skipping // marking end of last record")
        elif line == "":
            if self.debug > 1:
                print("Skipping blank line before record")
        else:
            if self.debug > 1:
                print("Skipping header line before record:\n" + line)
    self.line = line
    return line
94
def parse_header(self):
    """Return list of strings making up the header.

    New line characters are removed.

    Assumes you have just read in the ID/LOCUS line (i.e. self.line holds
    the record start line).  Reading stops at the first feature table
    marker or sequence header, which is left in self.line and is not
    included in the returned list.

    Raises ValueError if the handle ends, or a "//" end of record marker
    is found, before either of those is reached.
    """
    assert self.line[:self.HEADER_WIDTH] == self.RECORD_START, \
        "Not at start of record"

    header_lines = []
    while True:
        line = self.handle.readline()
        if not line:
            raise ValueError("Premature end of file while reading the header")
        line = line.rstrip()
        if line in self.FEATURE_START_MARKERS:
            if self.debug:
                print("Found feature table")
            break
        # Records without a feature table go straight to the sequence section.
        if line[:self.HEADER_WIDTH].rstrip() in self.SEQUENCE_HEADERS:
            if self.debug:
                print("Found start of sequence")
            break
        if line == "//":
            raise ValueError("Premature end of sequence data marker '//' found")
        header_lines.append(line)
    self.line = line
    return header_lines
125
191
def parse_feature(self, feature_key, lines):
    """Expects a feature as a list of strings, returns a tuple (key, location, qualifiers)

    For example given this GenBank feature:

         CDS             complement(join(490883..490885,1..879))
                         /locus_tag="NEQ001"
                         /note="conserved hypothetical [Methanococcus jannaschii];
                         COG1583:Uncharacterized ACR; IPR001472:Bipartite nuclear
                         localization signal; IPR002743: Protein of unknown
                         function DUF57"
                         /codon_start=1
                         /transl_table=11
                         /product="hypothetical protein"
                         /protein_id="NP_963295.1"
                         /db_xref="GI:41614797"
                         /db_xref="GeneID:2732620"
                         /translation="MRLLLELKALNSIDKKQLSNYLIQGFIYNILKNTEYSWLHNWKK
                         EKYFNFTLIPKKDIIENKRYYLIISSPDKRFIEVLHNKIKDLDIITIGLAQFQLRKTK
                         KFDPKLRFPWVTITPIVLREGKIVILKGDKYYKVFVKRLEELKKYNLIKKKEPILEEP
                         IEISLNQIKDGWKIIDVKDRYYDFRNKSFSAFSNWLRDLKEQSLRKYNNFCGKNFYFE
                         EAIFEGFTFYKTVSIRIRINRGEAVYIGTLWKELNVYRKLDKEEREFYKFLYDCGLGS
                         LNSMGFGFVNTKKNSAR"

    Then should give input key="CDS" and the rest of the data as a list of strings
    lines=["complement(join(490883..490885,1..879))", ..., "LNSMGFGFVNTKKNSAR"]
    where the leading spaces and trailing newlines have been removed.

    Returns tuple containing: (key as string, location string, qualifiers as list)
    as follows for this example:

    key = "CDS", string
    location = "complement(join(490883..490885,1..879))", string
    qualifiers = list of string tuples:

    [('locus_tag', '"NEQ001"'),
     ('note', '"conserved hypothetical [Methanococcus jannaschii];\\nCOG1583:..."'),
     ('codon_start', '1'),
     ('transl_table', '11'),
     ('product', '"hypothetical protein"'),
     ('protein_id', '"NP_963295.1"'),
     ('db_xref', '"GI:41614797"'),
     ('db_xref', '"GeneID:2732620"'),
     ('translation', '"MRLLLELKALNSIDKKQLSNYLIQGFIYNILKNTEYSWLHNWKK\\nEKYFNFT..."')]

    In the above example, the "note" and "translation" were edited for compactness,
    and they would contain multiple new line characters (displayed above as \\n)

    If a qualifier is quoted (in this case, everything except codon_start and
    transl_table) then the quotes are NOT removed.

    A qualifier with no "=" (e.g. "/pseudo") is returned with value None, and
    one with nothing after the "=" is returned with value "".

    Note that no whitespace is removed.

    Raises ValueError if the lines run out part way through the location or
    part way through a quoted qualifier value.
    """
    # Drop empty strings up front so that line[0] and value[-1] are always safe.
    iterator = iter([entry for entry in lines if entry])
    try:
        line = next(iterator)

        feature_location = line.strip()
        while feature_location[-1:] == ",":
            # A trailing comma means the location continues on the next line.
            line = next(iterator)
            feature_location += line.strip()

        qualifiers = []

        for line in iterator:
            if line[0] == "/":
                # Start of a new qualifier.
                i = line.find("=")
                if i == -1:
                    # Flag style qualifier with no value at all.
                    key = line[1:]
                    qualifiers.append((key, None))
                    continue
                key = line[1:i]
                value = line[i + 1:]
                if not value:
                    qualifiers.append((key, ""))
                elif value[0] == '"':
                    if value != '"':
                        # Keep consuming lines until one ends with the
                        # closing quote (already true for one-line values).
                        while value[-1] != '"':
                            value += "\n" + next(iterator)
                    else:
                        # Only the opening quote is on this line; the text
                        # that follows is attached by the continuation
                        # branch below.
                        if self.debug:
                            print("Quoted line %s:%s" % (key, value))
                    # The quotes are deliberately left in place.
                    qualifiers.append((key, value))
                else:
                    # Unquoted value; may still be continued on later lines.
                    qualifiers.append((key, value))
            else:
                # Continuation of the most recent (unquoted) qualifier.
                assert len(qualifiers) > 0, \
                    "Continuation line found before any qualifier"
                assert key == qualifiers[-1][0]
                qualifiers[-1] = (key, qualifiers[-1][1] + "\n" + line)
        return (feature_key, feature_location, qualifiers)
    except StopIteration:
        # Ran out of lines inside the location or a quoted value.
        raise ValueError("Problem with '%s' feature:\n%s"
                         % (feature_key, "\n".join(lines)))
298
319
321 """Handle the LOCUS/ID line, passing data to the consumer
322
323 This should be implemented by the EMBL / GenBank specific subclass
324
325 Used by the parse_records() and parse() methods.
326 """
327 pass
328
330 """Handle the header lines (list of strings), passing data to the consumer
331
332 This should be implemented by the EMBL / GenBank specific subclass
333
334 Used by the parse_records() and parse() methods.
335 """
336 pass
337
338
352
354 """Handle any lines between features and sequence (list of strings), passing data to the consumer
355
356 This should be implemented by the EMBL / GenBank specific subclass
357
358 Used by the parse_records() and parse() methods.
359 """
360 pass
361
def feed(self, handle, consumer, do_features=True):
    """Feed a set of data into the consumer.

    This method is intended for use with the "old" code in Bio.GenBank

    Arguments:
    handle - A handle with the information to parse.
    consumer - The consumer that should be informed of events.
    do_features - Boolean, should the features be parsed?
                  Skipping the features can be much faster.

    Return values:
    True  - Passed a record
    False - Did not find a record (consumer.data is set to None)
    """
    self.set_handle(handle)
    if not self.find_start():
        # Could not find (another) record
        consumer.data = None
        return False

    # find_start() leaves the ID/LOCUS line in self.line.
    self._feed_first_line(consumer, self.line)
    self._feed_header_lines(consumer, self.parse_header())

    if do_features:
        self._feed_feature_table(consumer, self.parse_features(skip=False))
    else:
        # Still has to be called so the handle moves past the feature table.
        self.parse_features(skip=True)

    # Footer and sequence
    misc_lines, sequence_string = self.parse_footer()
    self._feed_misc_lines(consumer, misc_lines)

    consumer.sequence(sequence_string)
    consumer.record_end("//")

    # Sanity check: the footer parser should have stopped on the end marker.
    assert self.line == "//"

    return True
411
412 - def parse(self, handle, do_features=True):
427
428
430 """Returns a SeqRecord object iterator
431
432 Each record (from the ID/LOCUS line to the // line) becomes a SeqRecord
433
434 The SeqRecord objects include SeqFeatures if do_features=True
435
436 This method is intended for use in Bio.SeqIO
437 """
438
439 while True:
440 record = self.parse(handle, do_features)
441 if record is None : break
442 assert record.id is not None
443 assert record.name != "<unknown name>"
444 assert record.description != "<unknown description>"
445 yield record
446
450 """Returns SeqRecord object iterator
451
452 Each CDS feature becomes a SeqRecord.
453
454 alphabet - Used for any sequence found in a translation field.
455 tags2id - Tuple of three strings, the feature keys to use
456 for the record id, name and description,
457
458 This method is intended for use in Bio.SeqIO
459 """
460 self.set_handle(handle)
461 while self.find_start():
462
463 self.parse_header()
464 feature_tuples = self.parse_features()
465
466 while True:
467 line = self.handle.readline()
468 if not line : break
469 if line[:2]=="//" : break
470 self.line = line.rstrip()
471
472
473 for key, location_string, qualifiers in feature_tuples:
474 if key=="CDS":
475
476
477
478
479
480 record = SeqRecord(seq=None)
481 annotations = record.annotations
482
483
484
485
486 annotations['raw_location'] = location_string.replace(' ','')
487
488 for (qualifier_name, qualifier_data) in qualifiers:
489 if qualifier_data is not None \
490 and qualifier_data[0]=='"' and qualifier_data[-1]=='"':
491
492 qualifier_data = qualifier_data[1:-1]
493
494 if qualifier_name == "translation":
495 assert record.seq is None, "Multiple translations!"
496 record.seq = Seq(qualifier_data.replace("\n",""), alphabet)
497 elif qualifier_name == "db_xref":
498
499 record.dbxrefs.append(qualifier_data)
500 else:
501 if qualifier_data is not None:
502 qualifier_data = qualifier_data.replace("\n"," ").replace(" "," ")
503 try:
504 annotations[qualifier_name] += " " + qualifier_data
505 except KeyError:
506
507 annotations[qualifier_name]= qualifier_data
508
509
510
511 try:
512 record.id = annotations[tags2id[0]]
513 except KeyError:
514 pass
515 try:
516 record.name = annotations[tags2id[1]]
517 except KeyError:
518 pass
519 try:
520 record.description = annotations[tags2id[2]]
521 except KeyError:
522 pass
523
524 yield record
525
526
528 """For extracting chunks of information in EMBL files"""
529
530 RECORD_START = "ID "
531 HEADER_WIDTH = 5
532 FEATURE_START_MARKERS = ["FH Key Location/Qualifiers","FH"]
533 FEATURE_END_MARKERS = ["XX"]
534 FEATURE_QUALIFIER_INDENT = 21
535 FEATURE_QUALIFIER_SPACER = "FT" + " " * (FEATURE_QUALIFIER_INDENT-2)
536 SEQUENCE_HEADERS=["SQ", "CO"]
537
572
583
585
586
587
588 assert line[:self.HEADER_WIDTH].rstrip() == "ID"
589 fields = [line[self.HEADER_WIDTH:].split(None,1)[0]]
590 fields.extend(line[self.HEADER_WIDTH:].split(None,1)[1].split(";"))
591 fields = [entry.strip() for entry in fields]
592 """
593 The tokens represent:
594 0. Primary accession number
595 (space sep)
596 1. ??? (e.g. standard)
597 (semi-colon)
598 2. Topology and/or Molecule type (e.g. 'circular DNA' or 'DNA')
599 3. Taxonomic division (e.g. 'PRO')
600 4. Sequence length (e.g. '4639675 BP.')
601 """
602 consumer.locus(fields[0])
603 consumer.residue_type(fields[2])
604 consumer.data_file_division(fields[3])
605 self._feed_seq_length(consumer, fields[4])
606
608
609
610
611 assert line[:self.HEADER_WIDTH].rstrip() == "ID"
612 fields = [data.strip() for data in line[self.HEADER_WIDTH:].strip().split(";")]
613 assert len(fields) == 7
614 """
615 The tokens represent:
616 0. Primary accession number
617 1. Sequence version number
618 2. Topology: 'circular' or 'linear'
619 3. Molecule type (e.g. 'genomic DNA')
620 4. Data class (e.g. 'STD')
621 5. Taxonomic division (e.g. 'PRO')
622 6. Sequence length (e.g. '4639675 BP.')
623 """
624
625 consumer.locus(fields[0])
626
627
628
629 consumer.accession(fields[0])
630
631
632
633 version_parts = fields[1].split()
634 if len(version_parts)==2 \
635 and version_parts[0]=="SV" \
636 and version_parts[1].isdigit():
637 consumer.version_suffix(version_parts[1])
638
639
640 consumer.residue_type(" ".join(fields[2:4]))
641
642
643
644 consumer.data_file_division(fields[5])
645
646 self._feed_seq_length(consumer, fields[6])
647
def _feed_seq_length(self, consumer, text):
    """Pass the sequence length from an ID line field to the consumer.

    text is the length field of the ID line, e.g. '4639675 BP.', i.e. a
    number followed by a unit (BP or AA, with or without a trailing dot).
    Only the number, still as a string, is passed to consumer.size().
    """
    length_parts = text.split()
    assert len(length_parts) == 2, \
        "Invalid sequence length string %r" % text
    # "AA" is accepted without the dot to mirror "BP" / "BP.".
    assert length_parts[1].upper() in ["BP", "BP.", "AA", "AA."], \
        "Unrecognised sequence length units in %r" % text
    consumer.size(length_parts[0])
653
655 EMBL_INDENT = self.HEADER_WIDTH
656 EMBL_SPACER = " " * EMBL_INDENT
657 consumer_dict = {
658 'AC' : 'accession',
659 'SV' : 'version',
660 'DE' : 'definition',
661
662
663
664
665 'RG' : 'consrtm',
666
667
668 'RL' : 'journal',
669 'OS' : 'organism',
670 'OC' : 'taxonomy',
671
672 'CC' : 'comment',
673
674 }
675
676
677 for line in lines:
678 line_type = line[:EMBL_INDENT].strip()
679 data = line[EMBL_INDENT:].strip()
680 if line_type == 'XX':
681 pass
682 elif line_type == 'RN':
683
684
685 if data[0] == "[" and data[-1] == "]" : data = data[1:-1]
686 consumer.reference_num(data)
687 elif line_type == 'RP':
688
689
690
691 parts = [bases.replace("-"," to ").strip() for bases in data.split(",")]
692 consumer.reference_bases("(bases %s)" % "; ".join(parts))
693 elif line_type == 'RT':
694
695
696 if data.startswith('"'):
697 data = data[1:]
698 if data.endswith('";'):
699 data = data[:-2]
700 consumer.title(data)
701 elif line_type == 'RX':
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717 key, value = data.split(";",1)
718 if value.endswith(".") : value = value[:-1]
719 value = value.strip()
720 if key == "PUBMED":
721 consumer.pubmed_id(value)
722
723 elif line_type == 'CC':
724
725 consumer.comment([data])
726 elif line_type == 'DR':
727
728
729
730
731
732
733
734 parts = data.rstrip(".").split(";")
735
736
737 consumer.dblink("%s:%s" % (parts[0].strip(),
738 parts[1].strip()))
739 elif line_type == 'RA':
740
741 consumer.authors(data.rstrip(";"))
742 elif line_type == 'PR':
743
744
745
746 consumer.project(data.rstrip(";"))
747 elif line_type in consumer_dict:
748
749 getattr(consumer, consumer_dict[line_type])(data)
750 else:
751 if self.debug:
752 print "Ignoring EMBL header line:\n%s" % line
753
755
def _feed_misc_lines(self, consumer, lines):
    """Handle any lines between features and sequence (list of strings).

    Consecutive "CO" (contig) lines are joined, with surrounding white
    space stripped, into a single location string which is passed to
    consumer.contig_location().  All other lines are ignored.

    Raises ValueError if a run of CO lines is followed by anything other
    than another CO line or an empty line.
    """
    # Work on a copy with an empty sentinel appended, so that a CO run at
    # the very end terminates cleanly and the caller's list is not altered.
    line_iter = iter(list(lines) + [""])
    try:
        for line in line_iter:
            # The EMBL line code occupies the first five columns.
            if line.startswith("CO   "):
                contig_location = line[5:].strip()
                while True:
                    line = next(line_iter)
                    if not line:
                        break
                    elif line.startswith("CO   "):
                        # White space is not significant in the location.
                        contig_location += line[5:].strip()
                    else:
                        raise ValueError('Expected CO (contig) continuation line, got:\n' + line)
                consumer.contig_location(contig_location)
        return
    except StopIteration:
        raise ValueError("Problem in misc lines before sequence")
776
777
779 """For extracting chunks of information in IMGT (EMBL like) files (PRIVATE).
780
781 IMGT files are like EMBL files but in order to allow longer feature types
782 the features should be indented by 25 characters not 21 characters. In
783 practice the IMGT flat files tend to use either 21 or 25 characters, so we
784 must cope with both.
785
786 This is private to encourage use of Bio.SeqIO rather than Bio.GenBank.
787 """
788
789 FEATURE_START_MARKERS = ["FH Key Location/Qualifiers",
790 "FH Key Location/Qualifiers (from EMBL)",
791 "FH Key Location/Qualifiers",
792 "FH"]
793
795 """Return list of tuples for the features (if present)
796
797 Each feature is returned as a tuple (key, location, qualifiers)
798 where key and location are strings (e.g. "CDS" and
799 "complement(join(490883..490885,1..879))") while qualifiers
800 is a list of two string tuples (feature qualifier keys and values).
801
802 Assumes you have already read to the start of the features table.
803 """
804 if self.line.rstrip() not in self.FEATURE_START_MARKERS:
805 if self.debug : print "Didn't find any feature table"
806 return []
807
808 while self.line.rstrip() in self.FEATURE_START_MARKERS:
809 self.line = self.handle.readline()
810
811 bad_position_re = re.compile(r'([0-9]+)>{1}')
812
813 features = []
814 line = self.line
815 while True:
816 if not line:
817 raise ValueError("Premature end of line during features table")
818 if line[:self.HEADER_WIDTH].rstrip() in self.SEQUENCE_HEADERS:
819 if self.debug : print "Found start of sequence"
820 break
821 line = line.rstrip()
822 if line == "//":
823 raise ValueError("Premature end of features table, marker '//' found")
824 if line in self.FEATURE_END_MARKERS:
825 if self.debug : print "Found end of features"
826 line = self.handle.readline()
827 break
828 if line[2:self.FEATURE_QUALIFIER_INDENT].strip() == "":
829
830
831 line = self.handle.readline()
832 continue
833
834 if skip:
835 line = self.handle.readline()
836 while line[:self.FEATURE_QUALIFIER_INDENT] == self.FEATURE_QUALIFIER_SPACER:
837 line = self.handle.readline()
838 else:
839 assert line[:2] == "FT"
840 try:
841 feature_key, location_start = line[2:].strip().split()
842 except ValueError:
843
844
845
846 feature_key = line[2:25].strip()
847 location_start = line[25:].strip()
848 feature_lines = [location_start]
849 line = self.handle.readline()
850 while line[:self.FEATURE_QUALIFIER_INDENT] == self.FEATURE_QUALIFIER_SPACER \
851 or line.rstrip() == "" :
852
853
854 assert line[:2] == "FT"
855 feature_lines.append(line[self.FEATURE_QUALIFIER_INDENT:].strip())
856 line = self.handle.readline()
857 feature_key, location, qualifiers = \
858 self.parse_feature(feature_key, feature_lines)
859
860 if ">" in location:
861
862
863
864
865
866
867 location = bad_position_re.sub(r'>\1',location)
868 features.append((feature_key, location, qualifiers))
869 self.line = line
870 return features
871
873 """For extracting chunks of information in GenBank files"""
874
875 RECORD_START = "LOCUS "
876 HEADER_WIDTH = 12
877 FEATURE_START_MARKERS = ["FEATURES Location/Qualifiers","FEATURES"]
878 FEATURE_END_MARKERS = []
879 FEATURE_QUALIFIER_INDENT = 21
880 FEATURE_QUALIFIER_SPACER = " " * FEATURE_QUALIFIER_INDENT
881 SEQUENCE_HEADERS=["CONTIG", "ORIGIN", "BASE COUNT", "WGS"]
882
926
928 """Scan over and parse GenBank LOCUS line (PRIVATE).
929
930 This must cope with several variants, primarily the old and new column
931 based standards from GenBank. Additionally EnsEMBL produces GenBank
932 files where the LOCUS line is space separated rather than following
933 the column based layout.
934
935 We also try to cope with GenBank like files with partial LOCUS lines.
936 """
937
938
939
940 GENBANK_INDENT = self.HEADER_WIDTH
941 GENBANK_SPACER = " "*GENBANK_INDENT
942 assert line[0:GENBANK_INDENT] == 'LOCUS ', \
943 'LOCUS line does not start correctly:\n' + line
944
945
946
947 if line[29:33] in [' bp ', ' aa ',' rc '] and line[55:62] == ' ':
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969 assert line[41:42] == ' ', \
970 'LOCUS line does not contain space at position 42:\n' + line
971 assert line[42:51].strip() in ['','linear','circular'], \
972 'LOCUS line does not contain valid entry (linear, circular, ...):\n' + line
973 assert line[51:52] == ' ', \
974 'LOCUS line does not contain space at position 52:\n' + line
975
976
977 if line[62:73].strip():
978 assert line[64:65] == '-', \
979 'LOCUS line does not contain - at position 65 in date:\n' + line
980 assert line[68:69] == '-', \
981 'LOCUS line does not contain - at position 69 in date:\n' + line
982
983 name_and_length_str = line[GENBANK_INDENT:29]
984 while name_and_length_str.find(' ')!=-1:
985 name_and_length_str = name_and_length_str.replace(' ',' ')
986 name_and_length = name_and_length_str.split(' ')
987 assert len(name_and_length)<=2, \
988 'Cannot parse the name and length in the LOCUS line:\n' + line
989 assert len(name_and_length)!=1, \
990 'Name and length collide in the LOCUS line:\n' + line
991
992
993
994 consumer.locus(name_and_length[0])
995 consumer.size(name_and_length[1])
996
997
998 if line[33:51].strip() == "" and line[29:33] == ' aa ':
999
1000
1001
1002
1003 consumer.residue_type("PROTEIN")
1004 else:
1005 consumer.residue_type(line[33:51].strip())
1006
1007 consumer.data_file_division(line[52:55])
1008 if line[62:73].strip():
1009 consumer.date(line[62:73])
1010 elif line[40:44] in [' bp ', ' aa ',' rc '] \
1011 and line[54:64].strip() in ['','linear','circular']:
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032 assert line[40:44] in [' bp ', ' aa ',' rc '] , \
1033 'LOCUS line does not contain size units at expected position:\n' + line
1034 assert line[44:47] in [' ', 'ss-', 'ds-', 'ms-'], \
1035 'LOCUS line does not have valid strand type (Single stranded, ...):\n' + line
1036 assert line[47:54].strip() == "" \
1037 or line[47:54].strip().find('DNA') != -1 \
1038 or line[47:54].strip().find('RNA') != -1, \
1039 'LOCUS line does not contain valid sequence type (DNA, RNA, ...):\n' + line
1040 assert line[54:55] == ' ', \
1041 'LOCUS line does not contain space at position 55:\n' + line
1042 assert line[55:63].strip() in ['','linear','circular'], \
1043 'LOCUS line does not contain valid entry (linear, circular, ...):\n' + line
1044 assert line[63:64] == ' ', \
1045 'LOCUS line does not contain space at position 64:\n' + line
1046 assert line[67:68] == ' ', \
1047 'LOCUS line does not contain space at position 68:\n' + line
1048 if line[68:79].strip():
1049 assert line[70:71] == '-', \
1050 'LOCUS line does not contain - at position 71 in date:\n' + line
1051 assert line[74:75] == '-', \
1052 'LOCUS line does not contain - at position 75 in date:\n' + line
1053
1054 name_and_length_str = line[GENBANK_INDENT:40]
1055 while name_and_length_str.find(' ')!=-1:
1056 name_and_length_str = name_and_length_str.replace(' ',' ')
1057 name_and_length = name_and_length_str.split(' ')
1058 assert len(name_and_length)<=2, \
1059 'Cannot parse the name and length in the LOCUS line:\n' + line
1060 assert len(name_and_length)!=1, \
1061 'Name and length collide in the LOCUS line:\n' + line
1062
1063
1064
1065 consumer.locus(name_and_length[0])
1066 consumer.size(name_and_length[1])
1067
1068 if line[44:54].strip() == "" and line[40:44] == ' aa ':
1069
1070
1071
1072
1073 consumer.residue_type(("PROTEIN " + line[54:63]).strip())
1074 else:
1075 consumer.residue_type(line[44:63].strip())
1076
1077 consumer.data_file_division(line[64:67])
1078 if line[68:79].strip():
1079 consumer.date(line[68:79])
1080 elif line[GENBANK_INDENT:].strip().count(" ")==0 :
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096 if line[GENBANK_INDENT:].strip() != "":
1097 consumer.locus(line[GENBANK_INDENT:].strip())
1098 else:
1099
1100
1101 warnings.warn("Minimal LOCUS line found - is this correct?\n:%r" % line)
1102 elif len(line.split())==7 and line.split()[3] in ["aa","bp"]:
1103
1104
1105
1106
1107
1108
1109
1110
1111 splitline = line.split()
1112 consumer.locus(splitline[1])
1113 consumer.size(splitline[2])
1114 consumer.residue_type(splitline[4])
1115 consumer.data_file_division(splitline[5])
1116 consumer.date(splitline[6])
1117 elif len(line.split())>=4 and line.split()[3] in ["aa","bp"]:
1118
1119
1120 warnings.warn("Malformed LOCUS line found - is this correct?\n:%r" % line)
1121 consumer.locus(line.split()[1])
1122 consumer.size(line.split()[2])
1123 elif len(line.split())>=4 and line.split()[-1] in ["aa","bp"]:
1124
1125
1126
1127 warnings.warn("Malformed LOCUS line found - is this correct?\n:%r" % line)
1128 consumer.locus(line[5:].rsplit(None,2)[0].strip())
1129 consumer.size(line.split()[-2])
1130 else:
1131 raise ValueError('Did not recognise the LOCUS line layout:\n' + line)
1132
1133
1135
1136
1137
1138
def _feed_header_lines(self, consumer, lines):
    """Handle the GenBank header lines (list of strings), passing data to the consumer.

    Each entry starts with a keyword in the first self.HEADER_WIDTH columns;
    lines that are blank in those columns continue the previous entry.
    VERSION, REFERENCE, ORGANISM and COMMENT get special treatment, the
    keywords listed in consumer_dict are forwarded to the consumer method
    of the mapped name, and anything else is ignored.

    Raises ValueError("Problem in header") if the lines run out unexpectedly.
    """
    GENBANK_INDENT = self.HEADER_WIDTH
    GENBANK_SPACER = " " * GENBANK_INDENT
    # Keyword -> name of the consumer method that takes the (joined) text.
    consumer_dict = {
        'DEFINITION': 'definition',
        'ACCESSION': 'accession',
        'NID': 'nid',
        'PID': 'pid',
        'DBSOURCE': 'db_source',
        'KEYWORDS': 'keywords',
        'SEGMENT': 'segment',
        'SOURCE': 'source',
        'AUTHORS': 'authors',
        'CONSRTM': 'consrtm',
        'PROJECT': 'project',
        'DBLINK': 'dblink',
        'TITLE': 'title',
        'JOURNAL': 'journal',
        'MEDLINE': 'medline_id',
        'PUBMED': 'pubmed_id',
        'REMARK': 'remark'}

    # Drop blank lines, then add an empty sentinel: every continuation loop
    # below stops on it, and the outer loop exits when it becomes current.
    lines = [entry for entry in lines if entry]
    lines.append("")
    line_iter = iter(lines)
    try:
        line = next(line_iter)
        while True:
            if not line:
                break
            line_type = line[:GENBANK_INDENT].strip()
            data = line[GENBANK_INDENT:].strip()

            if line_type == 'VERSION':
                # Collapse runs of spaces so "ACC.1   GI:123" splits cleanly.
                while data.find('  ') != -1:
                    data = data.replace('  ', ' ')
                if data.find(' GI:') == -1:
                    consumer.version(data)
                else:
                    if self.debug:
                        print("Version [" + data.split(' GI:')[0]
                              + "], gi [" + data.split(' GI:')[1] + "]")
                    consumer.version(data.split(' GI:')[0])
                    consumer.gi(data.split(' GI:')[1])
                line = next(line_iter)
            elif line_type == 'REFERENCE':
                if self.debug > 1:
                    print("Found reference [" + data + "]")
                data = data.strip()

                # Gather any continuation lines of the REFERENCE entry.
                while True:
                    line = next(line_iter)
                    if line[:GENBANK_INDENT] == GENBANK_SPACER:
                        data += " " + line[GENBANK_INDENT:]
                        if self.debug > 1:
                            print("Extended reference text [" + data + "]")
                    else:
                        # Start of the next entry (or the sentinel).
                        break

                # Single spaces only, then split "number (bases ...)" on the
                # first space.
                while data.find('  ') != -1:
                    data = data.replace('  ', ' ')
                if data.find(' ') == -1:
                    if self.debug > 2:
                        print('Reference number "' + data + '"')
                    consumer.reference_num(data)
                else:
                    if self.debug > 2:
                        print('Reference number "' + data[:data.find(' ')]
                              + '", "' + data[data.find(' ') + 1:] + '"')
                    consumer.reference_num(data[:data.find(' ')])
                    consumer.reference_bases(data[data.find(' ') + 1:])
            elif line_type == 'ORGANISM':
                # Continuation lines belong to the organism name until the
                # first one containing ";"; from then on they are lineage.
                organism_data = data
                lineage_data = ""
                while True:
                    line = next(line_iter)
                    if line[0:GENBANK_INDENT] == GENBANK_SPACER:
                        if lineage_data or ";" in line:
                            lineage_data += " " + line[GENBANK_INDENT:]
                        else:
                            organism_data += " " + line[GENBANK_INDENT:].strip()
                    else:
                        # End of the ORGANISM entry.
                        break
                consumer.organism(organism_data)
                if lineage_data.strip() == "" and self.debug > 1:
                    print("Taxonomy line(s) missing or blank")
                consumer.taxonomy(lineage_data.strip())
                del organism_data, lineage_data
            elif line_type == 'COMMENT':
                if self.debug > 1:
                    print("Found comment")
                # The comment is handed over as a list, one item per line,
                # with continuation lines left unstripped.
                comment_list = []
                comment_list.append(data)
                while True:
                    line = next(line_iter)
                    if line[0:GENBANK_INDENT] == GENBANK_SPACER:
                        data = line[GENBANK_INDENT:]
                        comment_list.append(data)
                        if self.debug > 2:
                            print("Comment continuation [" + data + "]")
                    else:
                        # End of the COMMENT entry.
                        break
                consumer.comment(comment_list)
                del comment_list
            elif line_type in consumer_dict:
                # Generic entry: join continuation lines with single spaces
                # and pass the result to the mapped consumer method.
                while True:
                    line = next(line_iter)
                    if line[0:GENBANK_INDENT] == GENBANK_SPACER:
                        data += ' ' + line[GENBANK_INDENT:]
                    else:
                        getattr(consumer, consumer_dict[line_type])(data)
                        break
            else:
                if self.debug:
                    print("Ignoring GenBank header line:\n%s" % line)
                # Move on to the next line.
                line = next(line_iter)
    except StopIteration:
        raise ValueError("Problem in header")
1287
1328
1329 if __name__ == "__main__":
1330 from StringIO import StringIO
1331
1332 gbk_example = \
1333 """LOCUS SCU49845 5028 bp DNA PLN 21-JUN-1999
1334 DEFINITION Saccharomyces cerevisiae TCP1-beta gene, partial cds, and Axl2p
1335 (AXL2) and Rev7p (REV7) genes, complete cds.
1336 ACCESSION U49845
1337 VERSION U49845.1 GI:1293613
1338 KEYWORDS .
1339 SOURCE Saccharomyces cerevisiae (baker's yeast)
1340 ORGANISM Saccharomyces cerevisiae
1341 Eukaryota; Fungi; Ascomycota; Saccharomycotina; Saccharomycetes;
1342 Saccharomycetales; Saccharomycetaceae; Saccharomyces.
1343 REFERENCE 1 (bases 1 to 5028)
1344 AUTHORS Torpey,L.E., Gibbs,P.E., Nelson,J. and Lawrence,C.W.
1345 TITLE Cloning and sequence of REV7, a gene whose function is required for
1346 DNA damage-induced mutagenesis in Saccharomyces cerevisiae
1347 JOURNAL Yeast 10 (11), 1503-1509 (1994)
1348 PUBMED 7871890
1349 REFERENCE 2 (bases 1 to 5028)
1350 AUTHORS Roemer,T., Madden,K., Chang,J. and Snyder,M.
1351 TITLE Selection of axial growth sites in yeast requires Axl2p, a novel
1352 plasma membrane glycoprotein
1353 JOURNAL Genes Dev. 10 (7), 777-793 (1996)
1354 PUBMED 8846915
1355 REFERENCE 3 (bases 1 to 5028)
1356 AUTHORS Roemer,T.
1357 TITLE Direct Submission
1358 JOURNAL Submitted (22-FEB-1996) Terry Roemer, Biology, Yale University, New
1359 Haven, CT, USA
1360 FEATURES Location/Qualifiers
1361 source 1..5028
1362 /organism="Saccharomyces cerevisiae"
1363 /db_xref="taxon:4932"
1364 /chromosome="IX"
1365 /map="9"
1366 CDS <1..206
1367 /codon_start=3
1368 /product="TCP1-beta"
1369 /protein_id="AAA98665.1"
1370 /db_xref="GI:1293614"
1371 /translation="SSIYNGISTSGLDLNNGTIADMRQLGIVESYKLKRAVVSSASEA
1372 AEVLLRVDNIIRARPRTANRQHM"
1373 gene 687..3158
1374 /gene="AXL2"
1375 CDS 687..3158
1376 /gene="AXL2"
1377 /note="plasma membrane glycoprotein"
1378 /codon_start=1
1379 /function="required for axial budding pattern of S.
1380 cerevisiae"
1381 /product="Axl2p"
1382 /protein_id="AAA98666.1"
1383 /db_xref="GI:1293615"
1384 /translation="MTQLQISLLLTATISLLHLVVATPYEAYPIGKQYPPVARVNESF
1385 TFQISNDTYKSSVDKTAQITYNCFDLPSWLSFDSSSRTFSGEPSSDLLSDANTTLYFN
1386 VILEGTDSADSTSLNNTYQFVVTNRPSISLSSDFNLLALLKNYGYTNGKNALKLDPNE
1387 VFNVTFDRSMFTNEESIVSYYGRSQLYNAPLPNWLFFDSGELKFTGTAPVINSAIAPE
1388 TSYSFVIIATDIEGFSAVEVEFELVIGAHQLTTSIQNSLIINVTDTGNVSYDLPLNYV
1389 YLDDDPISSDKLGSINLLDAPDWVALDNATISGSVPDELLGKNSNPANFSVSIYDTYG
1390 DVIYFNFEVVSTTDLFAISSLPNINATRGEWFSYYFLPSQFTDYVNTNVSLEFTNSSQ
1391 DHDWVKFQSSNLTLAGEVPKNFDKLSLGLKANQGSQSQELYFNIIGMDSKITHSNHSA
1392 NATSTRSSHHSTSTSSYTSSTYTAKISSTSAAATSSAPAALPAANKTSSHNKKAVAIA
1393 CGVAIPLGVILVALICFLIFWRRRRENPDDENLPHAISGPDLNNPANKPNQENATPLN
1394 NPFDDDASSYDDTSIARRLAALNTLKLDNHSATESDISSVDEKRDSLSGMNTYNDQFQ
1395 SQSKEELLAKPPVQPPESPFFDPQNRSSSVYMDSEPAVNKSWRYTGNLSPVSDIVRDS
1396 YGSQKTVDTEKLFDLEAPEKEKRTSRDVTMSSLDPWNSNISPSPVRKSVTPSPYNVTK
1397 HRNRHLQNIQDSQSGKNGITPTTMSTSSSDDFVPVKDGENFCWVHSMEPDRRPSKKRL
1398 VDFSNKSNVNVGQVKDIHGRIPEML"
1399 gene complement(3300..4037)
1400 /gene="REV7"
1401 CDS complement(3300..4037)
1402 /gene="REV7"
1403 /codon_start=1
1404 /product="Rev7p"
1405 /protein_id="AAA98667.1"
1406 /db_xref="GI:1293616"
1407 /translation="MNRWVEKWLRVYLKCYINLILFYRNVYPPQSFDYTTYQSFNLPQ
1408 FVPINRHPALIDYIEELILDVLSKLTHVYRFSICIINKKNDLCIEKYVLDFSELQHVD
1409 KDDQIITETEVFDEFRSSLNSLIMHLEKLPKVNDDTITFEAVINAIELELGHKLDRNR
1410 RVDSLEEKAEIERDSNWVKCQEDENLPDNNGFQPPKIKLTSLVGSDVGPLIIHQFSEK
1411 LISGDDKILNGVYSQYEEGESIFGSLF"
1412 ORIGIN
1413 1 gatcctccat atacaacggt atctccacct caggtttaga tctcaacaac ggaaccattg
1414 61 ccgacatgag acagttaggt atcgtcgaga gttacaagct aaaacgagca gtagtcagct
1415 121 ctgcatctga agccgctgaa gttctactaa gggtggataa catcatccgt gcaagaccaa
1416 181 gaaccgccaa tagacaacat atgtaacata tttaggatat acctcgaaaa taataaaccg
1417 241 ccacactgtc attattataa ttagaaacag aacgcaaaaa ttatccacta tataattcaa
1418 301 agacgcgaaa aaaaaagaac aacgcgtcat agaacttttg gcaattcgcg tcacaaataa
1419 361 attttggcaa cttatgtttc ctcttcgagc agtactcgag ccctgtctca agaatgtaat
1420 421 aatacccatc gtaggtatgg ttaaagatag catctccaca acctcaaagc tccttgccga
1421 481 gagtcgccct cctttgtcga gtaattttca cttttcatat gagaacttat tttcttattc
1422 541 tttactctca catcctgtag tgattgacac tgcaacagcc accatcacta gaagaacaga
1423 601 acaattactt aatagaaaaa ttatatcttc ctcgaaacga tttcctgctt ccaacatcta
1424 661 cgtatatcaa gaagcattca cttaccatga cacagcttca gatttcatta ttgctgacag
1425 721 ctactatatc actactccat ctagtagtgg ccacgcccta tgaggcatat cctatcggaa
1426 781 aacaataccc cccagtggca agagtcaatg aatcgtttac atttcaaatt tccaatgata
1427 841 cctataaatc gtctgtagac aagacagctc aaataacata caattgcttc gacttaccga
1428 901 gctggctttc gtttgactct agttctagaa cgttctcagg tgaaccttct tctgacttac
1429 961 tatctgatgc gaacaccacg ttgtatttca atgtaatact cgagggtacg gactctgccg
1430 1021 acagcacgtc tttgaacaat acataccaat ttgttgttac aaaccgtcca tccatctcgc
1431 1081 tatcgtcaga tttcaatcta ttggcgttgt taaaaaacta tggttatact aacggcaaaa
1432 1141 acgctctgaa actagatcct aatgaagtct tcaacgtgac ttttgaccgt tcaatgttca
1433 1201 ctaacgaaga atccattgtg tcgtattacg gacgttctca gttgtataat gcgccgttac
1434 1261 ccaattggct gttcttcgat tctggcgagt tgaagtttac tgggacggca ccggtgataa
1435 1321 actcggcgat tgctccagaa acaagctaca gttttgtcat catcgctaca gacattgaag
1436 1381 gattttctgc cgttgaggta gaattcgaat tagtcatcgg ggctcaccag ttaactacct
1437 1441 ctattcaaaa tagtttgata atcaacgtta ctgacacagg taacgtttca tatgacttac
1438 1501 ctctaaacta tgtttatctc gatgacgatc ctatttcttc tgataaattg ggttctataa
1439 1561 acttattgga tgctccagac tgggtggcat tagataatgc taccatttcc gggtctgtcc
1440 1621 cagatgaatt actcggtaag aactccaatc ctgccaattt ttctgtgtcc atttatgata
1441 1681 cttatggtga tgtgatttat ttcaacttcg aagttgtctc cacaacggat ttgtttgcca
1442 1741 ttagttctct tcccaatatt aacgctacaa ggggtgaatg gttctcctac tattttttgc
1443 1801 cttctcagtt tacagactac gtgaatacaa acgtttcatt agagtttact aattcaagcc
1444 1861 aagaccatga ctgggtgaaa ttccaatcat ctaatttaac attagctgga gaagtgccca
1445 1921 agaatttcga caagctttca ttaggtttga aagcgaacca aggttcacaa tctcaagagc
1446 1981 tatattttaa catcattggc atggattcaa agataactca ctcaaaccac agtgcgaatg
1447 2041 caacgtccac aagaagttct caccactcca cctcaacaag ttcttacaca tcttctactt
1448 2101 acactgcaaa aatttcttct acctccgctg ctgctacttc ttctgctcca gcagcgctgc
1449 2161 cagcagccaa taaaacttca tctcacaata aaaaagcagt agcaattgcg tgcggtgttg
1450 2221 ctatcccatt aggcgttatc ctagtagctc tcatttgctt cctaatattc tggagacgca
1451 2281 gaagggaaaa tccagacgat gaaaacttac cgcatgctat tagtggacct gatttgaata
1452 2341 atcctgcaaa taaaccaaat caagaaaacg ctacaccttt gaacaacccc tttgatgatg
1453 2401 atgcttcctc gtacgatgat acttcaatag caagaagatt ggctgctttg aacactttga
1454 2461 aattggataa ccactctgcc actgaatctg atatttccag cgtggatgaa aagagagatt
1455 2521 ctctatcagg tatgaataca tacaatgatc agttccaatc ccaaagtaaa gaagaattat
1456 2581 tagcaaaacc cccagtacag cctccagaga gcccgttctt tgacccacag aataggtctt
1457 2641 cttctgtgta tatggatagt gaaccagcag taaataaatc ctggcgatat actggcaacc
1458 2701 tgtcaccagt ctctgatatt gtcagagaca gttacggatc acaaaaaact gttgatacag
1459 2761 aaaaactttt cgatttagaa gcaccagaga aggaaaaacg tacgtcaagg gatgtcacta
1460 2821 tgtcttcact ggacccttgg aacagcaata ttagcccttc tcccgtaaga aaatcagtaa
1461 2881 caccatcacc atataacgta acgaagcatc gtaaccgcca cttacaaaat attcaagact
1462 2941 ctcaaagcgg taaaaacgga atcactccca caacaatgtc aacttcatct tctgacgatt
1463 3001 ttgttccggt taaagatggt gaaaattttt gctgggtcca tagcatggaa ccagacagaa
1464 3061 gaccaagtaa gaaaaggtta gtagattttt caaataagag taatgtcaat gttggtcaag
1465 3121 ttaaggacat tcacggacgc atcccagaaa tgctgtgatt atacgcaacg atattttgct
1466 3181 taattttatt ttcctgtttt attttttatt agtggtttac agatacccta tattttattt
1467 3241 agtttttata cttagagaca tttaatttta attccattct tcaaatttca tttttgcact
1468 3301 taaaacaaag atccaaaaat gctctcgccc tcttcatatt gagaatacac tccattcaaa
1469 3361 attttgtcgt caccgctgat taatttttca ctaaactgat gaataatcaa aggccccacg
1470 3421 tcagaaccga ctaaagaagt gagttttatt ttaggaggtt gaaaaccatt attgtctggt
1471 3481 aaattttcat cttcttgaca tttaacccag tttgaatccc tttcaatttc tgctttttcc
1472 3541 tccaaactat cgaccctcct gtttctgtcc aacttatgtc ctagttccaa ttcgatcgca
1473 3601 ttaataactg cttcaaatgt tattgtgtca tcgttgactt taggtaattt ctccaaatgc
1474 3661 ataatcaaac tatttaagga agatcggaat tcgtcgaaca cttcagtttc cgtaatgatc
1475 3721 tgatcgtctt tatccacatg ttgtaattca ctaaaatcta aaacgtattt ttcaatgcat
1476 3781 aaatcgttct ttttattaat aatgcagatg gaaaatctgt aaacgtgcgt taatttagaa
1477 3841 agaacatcca gtataagttc ttctatatag tcaattaaag caggatgcct attaatggga
1478 3901 acgaactgcg gcaagttgaa tgactggtaa gtagtgtagt cgaatgactg aggtgggtat
1479 3961 acatttctat aaaataaaat caaattaatg tagcatttta agtataccct cagccacttc
1480 4021 tctacccatc tattcataaa gctgacgcaa cgattactat tttttttttc ttcttggatc
1481 4081 tcagtcgtcg caaaaacgta taccttcttt ttccgacctt ttttttagct ttctggaaaa
1482 4141 gtttatatta gttaaacagg gtctagtctt agtgtgaaag ctagtggttt cgattgactg
1483 4201 atattaagaa agtggaaatt aaattagtag tgtagacgta tatgcatatg tatttctcgc
1484 4261 ctgtttatgt ttctacgtac ttttgattta tagcaagggg aaaagaaata catactattt
1485 4321 tttggtaaag gtgaaagcat aatgtaaaag ctagaataaa atggacgaaa taaagagagg
1486 4381 cttagttcat cttttttcca aaaagcaccc aatgataata actaaaatga aaaggatttg
1487 4441 ccatctgtca gcaacatcag ttgtgtgagc aataataaaa tcatcacctc cgttgccttt
1488 4501 agcgcgtttg tcgtttgtat cttccgtaat tttagtctta tcaatgggaa tcataaattt
1489 4561 tccaatgaat tagcaatttc gtccaattct ttttgagctt cttcatattt gctttggaat
1490 4621 tcttcgcact tcttttccca ttcatctctt tcttcttcca aagcaacgat ccttctaccc
1491 4681 atttgctcag agttcaaatc ggcctctttc agtttatcca ttgcttcctt cagtttggct
1492 4741 tcactgtctt ctagctgttg ttctagatcc tggtttttct tggtgtagtt ctcattatta
1493 4801 gatctcaagt tattggagtc ttcagccaat tgctttgtat cagacaattg actctctaac
1494 4861 ttctccactt cactgtcgag ttgctcgttt ttagcggaca aagatttaat ctcgttttct
1495 4921 ttttcagtgt tagattgctc taattctttg agctgttctc tcagctcctc atatttttct
1496 4981 tgccatgact cagattctaa ttttaagcta ttcaatttct ctttgatc
1497 //"""
1498
1499
1500
1501 gbk_example2 = \
1502 """LOCUS AAD51968 143 aa linear BCT 21-AUG-2001
1503 DEFINITION transcriptional regulator RovA [Yersinia enterocolitica].
1504 ACCESSION AAD51968
1505 VERSION AAD51968.1 GI:5805369
1506 DBSOURCE locus AF171097 accession AF171097.1
1507 KEYWORDS .
1508 SOURCE Yersinia enterocolitica
1509 ORGANISM Yersinia enterocolitica
1510 Bacteria; Proteobacteria; Gammaproteobacteria; Enterobacteriales;
1511 Enterobacteriaceae; Yersinia.
1512 REFERENCE 1 (residues 1 to 143)
1513 AUTHORS Revell,P.A. and Miller,V.L.
1514 TITLE A chromosomally encoded regulator is required for expression of the
1515 Yersinia enterocolitica inv gene and for virulence
1516 JOURNAL Mol. Microbiol. 35 (3), 677-685 (2000)
1517 MEDLINE 20138369
1518 PUBMED 10672189
1519 REFERENCE 2 (residues 1 to 143)
1520 AUTHORS Revell,P.A. and Miller,V.L.
1521 TITLE Direct Submission
1522 JOURNAL Submitted (22-JUL-1999) Molecular Microbiology, Washington
1523 University School of Medicine, Campus Box 8230, 660 South Euclid,
1524 St. Louis, MO 63110, USA
1525 COMMENT Method: conceptual translation.
1526 FEATURES Location/Qualifiers
1527 source 1..143
1528 /organism="Yersinia enterocolitica"
1529 /mol_type="unassigned DNA"
1530 /strain="JB580v"
1531 /serotype="O:8"
1532 /db_xref="taxon:630"
1533 Protein 1..143
1534 /product="transcriptional regulator RovA"
1535 /name="regulates inv expression"
1536 CDS 1..143
1537 /gene="rovA"
1538 /coded_by="AF171097.1:380..811"
1539 /note="regulator of virulence"
1540 /transl_table=11
1541 ORIGIN
1542 1 mestlgsdla rlvrvwrali dhrlkplelt qthwvtlhni nrlppeqsqi qlakaigieq
1543 61 pslvrtldql eekglitrht candrrakri klteqsspii eqvdgvicst rkeilggisp
1544 121 deiellsgli dklerniiql qsk
1545 //
1546 """
1547
1548 embl_example="""ID X56734; SV 1; linear; mRNA; STD; PLN; 1859 BP.
1549 XX
1550 AC X56734; S46826;
1551 XX
1552 DT 12-SEP-1991 (Rel. 29, Created)
1553 DT 25-NOV-2005 (Rel. 85, Last updated, Version 11)
1554 XX
1555 DE Trifolium repens mRNA for non-cyanogenic beta-glucosidase
1556 XX
1557 KW beta-glucosidase.
1558 XX
1559 OS Trifolium repens (white clover)
1560 OC Eukaryota; Viridiplantae; Streptophyta; Embryophyta; Tracheophyta;
1561 OC Spermatophyta; Magnoliophyta; eudicotyledons; core eudicotyledons; rosids;
1562 OC eurosids I; Fabales; Fabaceae; Papilionoideae; Trifolieae; Trifolium.
1563 XX
1564 RN [5]
1565 RP 1-1859
1566 RX PUBMED; 1907511.
1567 RA Oxtoby E., Dunn M.A., Pancoro A., Hughes M.A.;
1568 RT "Nucleotide and derived amino acid sequence of the cyanogenic
1569 RT beta-glucosidase (linamarase) from white clover (Trifolium repens L.)";
1570 RL Plant Mol. Biol. 17(2):209-219(1991).
1571 XX
1572 RN [6]
1573 RP 1-1859
1574 RA Hughes M.A.;
1575 RT ;
1576 RL Submitted (19-NOV-1990) to the EMBL/GenBank/DDBJ databases.
1577 RL Hughes M.A., University of Newcastle Upon Tyne, Medical School, Newcastle
1578 RL Upon Tyne, NE2 4HH, UK
1579 XX
1580 FH Key Location/Qualifiers
1581 FH
1582 FT source 1..1859
1583 FT /organism="Trifolium repens"
1584 FT /mol_type="mRNA"
1585 FT /clone_lib="lambda gt10"
1586 FT /clone="TRE361"
1587 FT /tissue_type="leaves"
1588 FT /db_xref="taxon:3899"
1589 FT CDS 14..1495
1590 FT /product="beta-glucosidase"
1591 FT /EC_number="3.2.1.21"
1592 FT /note="non-cyanogenic"
1593 FT /db_xref="GOA:P26204"
1594 FT /db_xref="InterPro:IPR001360"
1595 FT /db_xref="InterPro:IPR013781"
1596 FT /db_xref="UniProtKB/Swiss-Prot:P26204"
1597 FT /protein_id="CAA40058.1"
1598 FT /translation="MDFIVAIFALFVISSFTITSTNAVEASTLLDIGNLSRSSFPRGFI
1599 FT FGAGSSAYQFEGAVNEGGRGPSIWDTFTHKYPEKIRDGSNADITVDQYHRYKEDVGIMK
1600 FT DQNMDSYRFSISWPRILPKGKLSGGINHEGIKYYNNLINELLANGIQPFVTLFHWDLPQ
1601 FT VLEDEYGGFLNSGVINDFRDYTDLCFKEFGDRVRYWSTLNEPWVFSNSGYALGTNAPGR
1602 FT CSASNVAKPGDSGTGPYIVTHNQILAHAEAVHVYKTKYQAYQKGKIGITLVSNWLMPLD
1603 FT DNSIPDIKAAERSLDFQFGLFMEQLTTGDYSKSMRRIVKNRLPKFSKFESSLVNGSFDF
1604 FT IGINYYSSSYISNAPSHGNAKPSYSTNPMTNISFEKHGIPLGPRAASIWIYVYPYMFIQ
1605 FT EDFEIFCYILKINITILQFSITENGMNEFNDATLPVEEALLNTYRIDYYYRHLYYIRSA
1606 FT IRAGSNVKGFYAWSFLDCNEWFAGFTVRFGLNFVD"
1607 FT mRNA 1..1859
1608 FT /experiment="experimental evidence, no additional details
1609 FT recorded"
1610 XX
1611 SQ Sequence 1859 BP; 609 A; 314 C; 355 G; 581 T; 0 other;
1612 aaacaaacca aatatggatt ttattgtagc catatttgct ctgtttgtta ttagctcatt 60
1613 cacaattact tccacaaatg cagttgaagc ttctactctt cttgacatag gtaacctgag 120
1614 tcggagcagt tttcctcgtg gcttcatctt tggtgctgga tcttcagcat accaatttga 180
1615 aggtgcagta aacgaaggcg gtagaggacc aagtatttgg gataccttca cccataaata 240
1616 tccagaaaaa ataagggatg gaagcaatgc agacatcacg gttgaccaat atcaccgcta 300
1617 caaggaagat gttgggatta tgaaggatca aaatatggat tcgtatagat tctcaatctc 360
1618 ttggccaaga atactcccaa agggaaagtt gagcggaggc ataaatcacg aaggaatcaa 420
1619 atattacaac aaccttatca acgaactatt ggctaacggt atacaaccat ttgtaactct 480
1620 ttttcattgg gatcttcccc aagtcttaga agatgagtat ggtggtttct taaactccgg 540
1621 tgtaataaat gattttcgag actatacgga tctttgcttc aaggaatttg gagatagagt 600
1622 gaggtattgg agtactctaa atgagccatg ggtgtttagc aattctggat atgcactagg 660
1623 aacaaatgca ccaggtcgat gttcggcctc caacgtggcc aagcctggtg attctggaac 720
1624 aggaccttat atagttacac acaatcaaat tcttgctcat gcagaagctg tacatgtgta 780
1625 taagactaaa taccaggcat atcaaaaggg aaagataggc ataacgttgg tatctaactg 840
1626 gttaatgcca cttgatgata atagcatacc agatataaag gctgccgaga gatcacttga 900
1627 cttccaattt ggattgttta tggaacaatt aacaacagga gattattcta agagcatgcg 960
1628 gcgtatagtt aaaaaccgat tacctaagtt ctcaaaattc gaatcaagcc tagtgaatgg 1020
1629 ttcatttgat tttattggta taaactatta ctcttctagt tatattagca atgccccttc 1080
1630 acatggcaat gccaaaccca gttactcaac aaatcctatg accaatattt catttgaaaa 1140
1631 acatgggata cccttaggtc caagggctgc ttcaatttgg atatatgttt atccatatat 1200
1632 gtttatccaa gaggacttcg agatcttttg ttacatatta aaaataaata taacaatcct 1260
1633 gcaattttca atcactgaaa atggtatgaa tgaattcaac gatgcaacac ttccagtaga 1320
1634 agaagctctt ttgaatactt acagaattga ttactattac cgtcacttat actacattcg 1380
1635 ttctgcaatc agggctggct caaatgtgaa gggtttttac gcatggtcat ttttggactg 1440
1636 taatgaatgg tttgcaggct ttactgttcg ttttggatta aactttgtag attagaaaga 1500
1637 tggattaaaa aggtacccta agctttctgc ccaatggtac aagaactttc tcaaaagaaa 1560
1638 ctagctagta ttattaaaag aactttgtag tagattacag tacatcgttt gaagttgagt 1620
1639 tggtgcacct aattaaataa aagaggttac tcttaacata tttttaggcc attcgttgtg 1680
1640 aagttgttag gctgttattt ctattatact atgttgtagt aataagtgca ttgttgtacc 1740
1641 agaagctatg atcataacta taggttgatc cttcatgtat cagtttgatg ttgagaatac 1800
1642 tttgaattaa aagtcttttt ttattttttt aaaaaaaaaa aaaaaaaaaa aaaaaaaaa 1859
1643 //
1644 """
1645
# Self-test driver: runs the GenBank and EMBL scanners over the example
# texts defined above and prints whatever each parser yields.
#
# NOTE(review): GenBankScanner, EmblScanner, StringIO and the *_example
# strings are expected to be bound earlier in this file -- confirm.
#
# Every print below is written as a single-argument call so the output is
# identical under the Python 2 print statement and the Python 3 print
# function (a multi-argument print(a, b) would print a tuple on Python 2).

print("GenBank CDS Iteration")
print("=====================")

# (input text, extra keyword arguments) for each CDS-iteration run, in the
# order the runs are reported.
cds_runs = [
    (gbk_example, {}),
    (gbk_example2, {"tags2id": ('gene', 'locus_tag', 'product')}),
    # Two records back to back; the first example text does not end with a
    # newline, hence the explicit separator.
    (gbk_example + "\n" + gbk_example2,
     {"tags2id": ('gene', 'locus_tag', 'product')}),
]
for text, kwargs in cds_runs:
    g = GenBankScanner()
    for record in g.parse_cds_features(StringIO(text), **kwargs):
        print(record)

print("")
print("GenBank Iteration")
print("=================")
# Each example is parsed twice: first without, then with, feature parsing.
for text in (gbk_example, gbk_example2):
    for with_features in (False, True):
        g = GenBankScanner()
        for record in g.parse_records(StringIO(text),
                                      do_features=with_features):
            print("%s %s %s" % (record.id, record.name, record.description))
            print(record.seq)

print("")
print("EMBL CDS Iteration")
print("==================")

e = EmblScanner()
for record in e.parse_cds_features(StringIO(embl_example)):
    print(record)

print("")
print("EMBL Iteration")
print("==============")
e = EmblScanner()
for record in e.parse_records(StringIO(embl_example), do_features=True):
    print("%s %s %s" % (record.id, record.name, record.description))
    print(record.seq)