7 """Code to work with GenBank formatted files.
8
9 Rather than using Bio.GenBank, you are now encouraged to use Bio.SeqIO with
10 the "genbank" or "embl" format names to parse GenBank or EMBL files into
11 SeqRecord and SeqFeature objects (see the Biopython tutorial for details).
12
13 Also, rather than using Bio.GenBank to search or download files from the NCBI,
14 you are now encouraged to use Bio.Entrez instead (again, see the Biopython
15 tutorial for details).
16
17 Currently the ONLY reason to use Bio.GenBank directly is for the RecordParser
18 which turns a GenBank file into GenBank-specific Record objects. This is a
19 much closer representation to the raw file contents that the SeqRecord
20 alternative from the FeatureParser (used in Bio.SeqIO).
21
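A minimal usage sketch for the RecordParser (the file name below is purely
illustrative):

    from Bio import GenBank
    parser = GenBank.RecordParser()
    handle = open("my_single_entry.gb")
    record = parser.parse(handle)   # a GenBank-specific Record object
    handle.close()
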
Classes:
Iterator              Iterate through a file of GenBank entries.
ErrorFeatureParser    Catch errors caused during parsing.
FeatureParser         Parse GenBank data into SeqRecord and SeqFeature objects.
RecordParser          Parse GenBank data into a Record object.

Exceptions:
ParserFailureError    Exception indicating a failure in the parser (i.e.
                      scanner or consumer).
LocationParserError   Exception indicating a problem with the spark based
                      location parser.


17-MAR-2009: added wgs, wgs_scafld for GenBank whole genome shotgun master
records. These are GenBank files that summarize the content of a project, and
provide lists of scaffold and contig files in the project. These will be in
annotations['wgs'] and annotations['wgs_scafld']. These GenBank files do not
have sequences. See
http://groups.google.com/group/bionet.molbio.genbank/browse_thread/thread/51fb88bf39e7dc36
and
http://is.gd/nNgk
for more details of this format, and an example.
Added by Ying Huang & Iddo Friedberg
"""
import cStringIO
import re

# Other Biopython modules
from Bio import SeqFeature
from Bio.ParserSupport import AbstractConsumer
from Bio import Entrez

# Other parts of the Bio.GenBank package
from utils import FeatureValueCleaner
from Scanner import GenBankScanner


# Constants for the GenBank flat file format layout (column positions)
GENBANK_INDENT = 12
GENBANK_SPACER = " " * GENBANK_INDENT

FEATURE_KEY_INDENT = 5
FEATURE_QUALIFIER_INDENT = 21
FEATURE_KEY_SPACER = " " * FEATURE_KEY_INDENT
FEATURE_QUALIFIER_SPACER = " " * FEATURE_QUALIFIER_INDENT


# Regular expressions for the different location formats found in
# GenBank/EMBL feature tables (the assertions act as basic self-tests)
_solo_location = r"[<>]?\d+"
_pair_location = r"[<>]?\d+\.\.[<>]?\d+"
_between_location = r"\d+\^\d+"

_within_position = r"\(\d+\.\d+\)"
_re_within_position = re.compile(_within_position)
_within_location = r"([<>]?\d+|%s)\.\.([<>]?\d+|%s)" \
                   % (_within_position,_within_position)
assert _re_within_position.match("(3.9)")
assert re.compile(_within_location).match("(3.9)..10")
assert re.compile(_within_location).match("26..(30.33)")
assert re.compile(_within_location).match("(13.19)..(20.28)")

_oneof_position = r"one\-of\(\d+(,\d+)+\)"
_re_oneof_position = re.compile(_oneof_position)
_oneof_location = r"([<>]?\d+|%s)\.\.([<>]?\d+|%s)" \
                  % (_oneof_position,_oneof_position)
assert _re_oneof_position.match("one-of(6,9)")
assert re.compile(_oneof_location).match("one-of(6,9)..101")
assert re.compile(_oneof_location).match("one-of(6,9)..one-of(101,104)")
assert re.compile(_oneof_location).match("6..one-of(101,104)")

assert not _re_oneof_position.match("one-of(3)")
assert _re_oneof_position.match("one-of(3,6)")
assert _re_oneof_position.match("one-of(3,6,9)")


_simple_location = r"\d+\.\.\d+"
_re_simple_location = re.compile(_simple_location)
_re_simple_compound = re.compile(r"^(join|order|bond)\(%s(,%s)*\)$" \
                                 % (_simple_location, _simple_location))
_complex_location = r"([a-zA-Z][a-zA-Z0-9]*(\.[a-zA-Z0-9]+)?\:)?(%s|%s|%s|%s|%s)" \
                    % (_pair_location, _solo_location, _between_location,
                       _within_location, _oneof_location)
_re_complex_location = re.compile(r"^%s$" % _complex_location)
_possibly_complemented_complex_location = r"(%s|complement\(%s\))" \
                                          % (_complex_location, _complex_location)
_re_complex_compound = re.compile(r"^(join|order|bond)\(%s(,%s)*\)$" \
                                  % (_possibly_complemented_complex_location,
                                     _possibly_complemented_complex_location))

assert _re_simple_location.match("104..160")
assert not _re_simple_location.match("<104..>160")
assert not _re_simple_location.match("104")
assert not _re_simple_location.match("<1")
assert not _re_simple_location.match(">99999")
assert not _re_simple_location.match("join(104..160,320..390,504..579)")
assert not _re_simple_compound.match("bond(12,63)")
assert _re_simple_compound.match("join(104..160,320..390,504..579)")
assert _re_simple_compound.match("order(1..69,1308..1465)")
assert not _re_simple_compound.match("order(1..69,1308..1465,1524)")
assert not _re_simple_compound.match("join(<1..442,992..1228,1524..>1983)")
assert not _re_simple_compound.match("join(<1..181,254..336,422..497,574..>590)")
assert not _re_simple_compound.match("join(1475..1577,2841..2986,3074..3193,3314..3481,4126..>4215)")
assert not _re_simple_compound.match("test(1..69,1308..1465)")
assert not _re_simple_compound.match("complement(1..69)")
assert not _re_simple_compound.match("(1..69)")
assert _re_complex_location.match("(3.9)..10")
assert _re_complex_location.match("26..(30.33)")
assert _re_complex_location.match("(13.19)..(20.28)")
assert _re_complex_location.match("41^42")
assert _re_complex_location.match("AL121804:41^42")
assert _re_complex_location.match("AL121804:41..610")
assert _re_complex_location.match("AL121804.2:41..610")
assert _re_complex_location.match("one-of(3,6)..101")
assert _re_complex_compound.match("join(153490..154269,AL121804.2:41..610,AL121804.2:672..1487)")
assert not _re_simple_compound.match("join(153490..154269,AL121804.2:41..610,AL121804.2:672..1487)")
assert _re_complex_compound.match("join(complement(69611..69724),139856..140650)")

def _pos(pos_str, offset=0):
    """Build a Position object (PRIVATE).

    For an end position, leave offset as zero (default):

    >>> _pos("5")
    ExactPosition(5)

    For a start position, set offset to minus one (for Python counting):

    >>> _pos("5", -1)
    ExactPosition(4)

    This also covers fuzzy positions:

    >>> _pos("<5")
    BeforePosition(5)
    >>> _pos(">5")
    AfterPosition(5)
    >>> _pos("one-of(5,8,11)")
    OneOfPosition([ExactPosition(5), ExactPosition(8), ExactPosition(11)])
    >>> _pos("(8.10)")
    WithinPosition(8,2)
    """
    if pos_str.startswith("<"):
        return SeqFeature.BeforePosition(int(pos_str[1:])+offset)
    elif pos_str.startswith(">"):
        return SeqFeature.AfterPosition(int(pos_str[1:])+offset)
    elif _re_within_position.match(pos_str):
        s,e = pos_str[1:-1].split(".")
        return SeqFeature.WithinPosition(int(s)+offset, int(e)-int(s))
    elif _re_oneof_position.match(pos_str):
        assert pos_str.startswith("one-of(")
        assert pos_str[-1]==")"
        parts = [SeqFeature.ExactPosition(int(pos)+offset) \
                 for pos in pos_str[7:-1].split(",")]
        return SeqFeature.OneOfPosition(parts)
    else:
        return SeqFeature.ExactPosition(int(pos_str)+offset)

def _loc(loc_str, expected_seq_length):
    """FeatureLocation from non-compound non-complement location (PRIVATE).

    Simple examples,

    >>> _loc("123..456", 1000)
    FeatureLocation(ExactPosition(122),ExactPosition(456))
    >>> _loc("<123..>456", 1000)
    FeatureLocation(BeforePosition(122),AfterPosition(456))

    A more complex location using within positions,

    >>> _loc("(9.10)..(20.25)", 1000)
    FeatureLocation(WithinPosition(8,1),WithinPosition(20,5))

    Zero length between feature,

    >>> _loc("123^124", 1000)
    FeatureLocation(ExactPosition(123),ExactPosition(123))

    The expected sequence length is needed for a special case, a between
    position at the start/end of a circular genome:

    >>> _loc("1000^1", 1000)
    FeatureLocation(ExactPosition(1000),ExactPosition(1000))

    Apart from this special case, between positions P^Q must have P+1==Q,

    >>> _loc("123^456", 1000)
    Traceback (most recent call last):
       ...
    ValueError: Invalid between location '123^456'
    """
    try:
        s, e = loc_str.split("..")
    except ValueError:
        assert ".." not in loc_str
        if "^" in loc_str:
            # A between position like "123^124" means there are no residues
            # between the two bases, i.e. a zero length location.  In Python
            # slice terms this is 123:123.  As a special case, on a circular
            # genome of length N the string "N^1" marks the origin, which we
            # also treat as a zero length location at position N.
            s, e = loc_str.split("^")
            if int(s)+1==int(e):
                pos = _pos(s)
            elif int(s)==expected_seq_length and e=="1":
                pos = _pos(s)
            else:
                raise ValueError("Invalid between location %s" % repr(loc_str))
            return SeqFeature.FeatureLocation(pos, pos)
        else:
            # A single base location, e.g. "123"
            s = loc_str
            e = loc_str
    return SeqFeature.FeatureLocation(_pos(s,-1), _pos(e))

237 """Split a tricky compound location string (PRIVATE).
238
239 >>> list(_split_compound_loc("123..145"))
240 ['123..145']
241 >>> list(_split_compound_loc("123..145,200..209"))
242 ['123..145', '200..209']
243 >>> list(_split_compound_loc("one-of(200,203)..300"))
244 ['one-of(200,203)..300']
245 >>> list(_split_compound_loc("complement(123..145),200..209"))
246 ['complement(123..145)', '200..209']
247 >>> list(_split_compound_loc("123..145,one-of(200,203)..209"))
248 ['123..145', 'one-of(200,203)..209']
249 >>> list(_split_compound_loc("123..145,one-of(200,203)..one-of(209,211),300"))
250 ['123..145', 'one-of(200,203)..one-of(209,211)', '300']
251 >>> list(_split_compound_loc("123..145,complement(one-of(200,203)..one-of(209,211)),300"))
252 ['123..145', 'complement(one-of(200,203)..one-of(209,211))', '300']
253 >>> list(_split_compound_loc("123..145,200..one-of(209,211),300"))
254 ['123..145', '200..one-of(209,211)', '300']
255 >>> list(_split_compound_loc("123..145,200..one-of(209,211)"))
256 ['123..145', '200..one-of(209,211)']
257 """
258 if "one-of(" in compound_loc:
259
260 while "," in compound_loc:
261 assert compound_loc[0] != ","
262 assert compound_loc[0:2] != ".."
263 i = compound_loc.find(",")
264 part = compound_loc[:i]
265 compound_loc = compound_loc[i:]
266 while part.count("(") > part.count(")"):
267 assert "one-of(" in part, (part, compound_loc)
268 i = compound_loc.find(")")
269 part += compound_loc[:i+1]
270 compound_loc = compound_loc[i+1:]
271 if compound_loc.startswith(".."):
272 i = compound_loc.find(",")
273 if i==-1:
274 part += compound_loc
275 compound_loc = ""
276 else:
277 part += compound_loc[:i]
278 compound_loc = compound_loc[i:]
279 while part.count("(") > part.count(")"):
280 assert part.count("one-of(") == 2
281 i = compound_loc.find(")")
282 part += compound_loc[:i+1]
283 compound_loc = compound_loc[i+1:]
284 if compound_loc.startswith(","):
285 compound_loc = compound_loc[1:]
286 assert part
287 yield part
288 if compound_loc:
289 yield compound_loc
290 else:
291
292 for part in compound_loc.split(","):
293 yield part
294
296 """Iterator interface to move over a file of GenBank entries one at a time.
    """
    def __init__(self, handle, parser = None):
        """Initialize the iterator.

        Arguments:
        o handle - A handle with GenBank entries to iterate through.
        o parser - An optional parser to pass the entries through before
          returning them. If None, then the raw entry will be returned.
        """
        self.handle = handle
        self._parser = parser

    def next(self):
        """Return the next GenBank record from the handle.

        Will return None if we ran out of records.
        """
        if self._parser is None:
            lines = []
            while True:
                line = self.handle.readline()
                if not line: return None
                lines.append(line)
                if line.rstrip() == "//": break
            return "".join(lines)
        try:
            return self._parser.parse(self.handle)
        except StopIteration:
            return None

    def __iter__(self):
        return iter(self.next, None)

331 """Failure caused by some kind of problem in the parser.
332 """
333 pass
334
336 """Could not Properly parse out a location from a GenBank file.
337 """
338 pass
339
341 """Parse GenBank files into Seq + Feature objects.
342 """
345 """Initialize a GenBank parser and Feature consumer.
346
347 Arguments:
348 o debug_level - An optional argument that species the amount of
349 debugging information the parser should spit out. By default we have
350 no debugging info (the fastest way to do things), but if you want
351 you can set this as high as two and see exactly where a parse fails.
352 o use_fuzziness - Specify whether or not to use fuzzy representations.
353 The default is 1 (use fuzziness).
354 o feature_cleaner - A class which will be used to clean out the
355 values of features. This class must implement the function
356 clean_value. GenBank.utils has a "standard" cleaner class, which
357 is used by default.
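
        For example (an illustrative sketch only; Bio.SeqIO is normally the
        easier way to get SeqRecord objects from GenBank files):

            from Bio.GenBank.utils import FeatureValueCleaner
            parser = FeatureParser(debug_level=0, use_fuzziness=1,
                                   feature_cleaner=FeatureValueCleaner())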
        """
        self._scanner = GenBankScanner(debug_level)
        self.use_fuzziness = use_fuzziness
        self._cleaner = feature_cleaner

    def parse(self, handle):
        """Parse the specified handle.
        """
        self._consumer = _FeatureConsumer(self.use_fuzziness,
                                          self._cleaner)
        self._scanner.feed(handle, self._consumer)
        return self._consumer.data

372 """Parse GenBank files into Record objects
373 """
375 """Initialize the parser.
376
377 Arguments:
378 o debug_level - An optional argument that species the amount of
379 debugging information the parser should spit out. By default we have
380 no debugging info (the fastest way to do things), but if you want
381 you can set this as high as two and see exactly where a parse fails.
382 """
383 self._scanner = GenBankScanner(debug_level)
384
385 - def parse(self, handle):
386 """Parse the specified handle into a GenBank record.
387 """
388 self._consumer = _RecordConsumer()
389 self._scanner.feed(handle, self._consumer)
390 return self._consumer.data
391
393 """Abstract GenBank consumer providing useful general functions.
394
395 This just helps to eliminate some duplication in things that most
396 GenBank consumers want to do.
397 """
398
399
400
401
402 remove_space_keys = ["translation"]
403
406
408 """Split a string of keywords into a nice clean list.
        """
        # Process the keywords into a Python list
        if keyword_string == "" or keyword_string == ".":
            keywords = ""
        elif keyword_string[-1] == '.':
            keywords = keyword_string[:-1]
        else:
            keywords = keyword_string
        keyword_list = keywords.split(';')
        clean_keyword_list = [x.strip() for x in keyword_list]
        return clean_keyword_list

422 """Split a string of accession numbers into a list.
423 """
424
425
426 accession = accession_string.replace("\n", " ").replace(";"," ")
427
428 return [x.strip() for x in accession.split() if x.strip()]
429
431 """Split a string with taxonomy info into a list.
432 """
433 if not taxonomy_string or taxonomy_string==".":
434
435 return []
436
437 if taxonomy_string[-1] == '.':
438 tax_info = taxonomy_string[:-1]
439 else:
440 tax_info = taxonomy_string
441 tax_list = tax_info.split(';')
442 new_tax_list = []
443 for tax_item in tax_list:
444 new_items = tax_item.split("\n")
445 new_tax_list.extend(new_items)
446 while '' in new_tax_list:
447 new_tax_list.remove('')
448 clean_tax_list = [x.strip() for x in new_tax_list]
449
450 return clean_tax_list
451
453 """Clean whitespace out of a location string.
454
455 The location parser isn't a fan of whitespace, so we clean it out
456 before feeding it into the parser.
457 """
458
459
460
461 return ''.join(location_string.split())
462
464 """Remove any newlines in the passed text, returning the new string.
465 """
466
467 newlines = ["\n", "\r"]
468 for ws in newlines:
469 text = text.replace(ws, "")
470
471 return text
472
474 """Replace multiple spaces in the passed text with single spaces.
475 """
476
477 text_parts = text.split(" ")
478 text_parts = filter(None, text_parts)
479 return ' '.join(text_parts)
480
482 """Remove all spaces from the passed text.
483 """
484 return text.replace(" ", "")
485
487 """Convert a start and end range to python notation.
488
489 In GenBank, starts and ends are defined in "biological" coordinates,
490 where 1 is the first base and [i, j] means to include both i and j.
491
492 In python, 0 is the first base and [i, j] means to include i, but
493 not j.
494
495 So, to convert "biological" to python coordinates, we need to
496 subtract 1 from the start, and leave the end and things should
497 be converted happily.
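
        e.g. (an illustrative sketch only):

            start, end = 1, 10                    # GenBank style, one based
            new_start, new_end = start - 1, end   # Python style, i.e. [0:10]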
        """
        new_start = start - 1
        new_end = end

        return new_start, new_end


505 """Create a SeqRecord object with Features to return.
506
507 Attributes:
508 o use_fuzziness - specify whether or not to parse with fuzziness in
509 feature locations.
510 o feature_cleaner - a class that will be used to provide specialized
511 cleaning-up of feature values.
512 """
513 - def __init__(self, use_fuzziness, feature_cleaner = None):
514 from Bio.SeqRecord import SeqRecord
515 _BaseGenBankConsumer.__init__(self)
516 self.data = SeqRecord(None, id = None)
517 self.data.id = None
518 self.data.description = ""
519
520 self._use_fuzziness = use_fuzziness
521 self._feature_cleaner = feature_cleaner
522
523 self._seq_type = ''
524 self._seq_data = []
525 self._cur_reference = None
526 self._cur_feature = None
527 self._cur_qualifier_key = None
528 self._cur_qualifier_value = None
529 self._expected_size = None
530
531 - def locus(self, locus_name):
532 """Set the locus name is set as the name of the Sequence.
533 """
534 self.data.name = locus_name
535
536 - def size(self, content):
537 """Record the sequence length."""
538 self._expected_size = int(content)
539
541 """Record the sequence type so we can choose an appropriate alphabet.
542 """
543 self._seq_type = type
544
547
    def date(self, submit_date):
        """Record the date of submission/last annotation as a string."""
        self.data.annotations['date'] = submit_date

562 """Set the accession number as the id of the sequence.
563
564 If we have multiple accession numbers, the first one passed is
565 used.
566 """
567 new_acc_nums = self._split_accessions(acc_num)
568
569
570 try:
571
572 for acc in new_acc_nums:
573
574 if acc not in self.data.annotations['accessions']:
575 self.data.annotations['accessions'].append(acc)
576 except KeyError:
577 self.data.annotations['accessions'] = new_acc_nums
578
579
580 if self.data.id is None:
581 if len(new_acc_nums) > 0:
582
583
584 self.data.id = self.data.annotations['accessions'][0]
585
586 - def wgs(self, content):
588
591
592 - def nid(self, content):
594
595 - def pid(self, content):
597
610
612 """Handle the information from the PROJECT line as a list of projects.
613
614 e.g.
615 PROJECT GenomeProject:28471
616
617 or:
618 PROJECT GenomeProject:13543 GenomeProject:99999
619
620 This is stored as dbxrefs in the SeqRecord to be consistent with the
621 projected switch of this line to DBLINK in future GenBank versions.
622 Note the NCBI plan to replace "GenomeProject:28471" with the shorter
623 "Project:28471" as part of this transition.
624 """
625 content = content.replace("GenomeProject:", "Project:")
626 self.data.dbxrefs.extend([p for p in content.split() if p])
627
629 """Store DBLINK cross references as dbxrefs in our record object.
630
631 This line type is expected to replace the PROJECT line in 2009. e.g.
632
633 During transition:
634
635 PROJECT GenomeProject:28471
636 DBLINK Project:28471
637 Trace Assembly Archive:123456
638
639 Once the project line is dropped:
640
641 DBLINK Project:28471
642 Trace Assembly Archive:123456
643
644 Note GenomeProject -> Project.
645
646 We'll have to see some real examples to be sure, but based on the
647 above example we can expect one reference per line.
648 """
649
650
651 if content.strip() not in self.data.dbxrefs:
652 self.data.dbxrefs.append(content.strip())
653
655 """Set the version to overwrite the id.
656
        Since the version provides the same information as the accession
658 number, plus some extra info, we set this as the id if we have
659 a version.
660 """
661
662
663
664
665
666
667
668
669
670
671 assert version.isdigit()
672 self.data.annotations['sequence_version'] = int(version)
673
676
677 - def gi(self, content):
679
682
685
687
688
    def source(self, content):
        # Strip off any trailing full stop from the SOURCE line
        if content == "":
            source_info = ""
        elif content[-1] == '.':
            source_info = content[:-1]
        else:
            source_info = content
        self.data.annotations['source'] = source_info

699
708
710 """Signal the beginning of a new reference object.
711 """
712
713
714 if self._cur_reference is not None:
715 self.data.annotations['references'].append(self._cur_reference)
716 else:
717 self.data.annotations['references'] = []
718
719 self._cur_reference = SeqFeature.Reference()
720
722 """Attempt to determine the sequence region the reference entails.
723
724 Possible types of information we may have to deal with:
725
726 (bases 1 to 86436)
727 (sites)
728 (bases 1 to 105654; 110423 to 111122)
729 1 (residues 1 to 182)
730 """
731
732 ref_base_info = content[1:-1]
733
734 all_locations = []
735
736 if ref_base_info.find('bases') != -1 and \
737 ref_base_info.find('to') != -1:
738
739 ref_base_info = ref_base_info[5:]
740 locations = self._split_reference_locations(ref_base_info)
741 all_locations.extend(locations)
742 elif (ref_base_info.find("residues") >= 0 and
743 ref_base_info.find("to") >= 0):
744 residues_start = ref_base_info.find("residues")
745
746 ref_base_info = ref_base_info[(residues_start + len("residues ")):]
747 locations = self._split_reference_locations(ref_base_info)
748 all_locations.extend(locations)
749
750
751
752 elif (ref_base_info == 'sites' or
753 ref_base_info.strip() == 'bases'):
754 pass
755
756 else:
757 raise ValueError("Could not parse base info %s in record %s" %
758 (ref_base_info, self.data.id))
759
760 self._cur_reference.location = all_locations
761
763 """Get reference locations out of a string of reference information
764
765 The passed string should be of the form:
766
767 1 to 20; 20 to 100
768
769 This splits the information out and returns a list of location objects
770 based on the reference locations.
771 """
772
773 all_base_info = location_string.split(';')
774
775 new_locations = []
776 for base_info in all_base_info:
777 start, end = base_info.split('to')
778 new_start, new_end = \
779 self._convert_to_python_numbers(int(start.strip()),
780 int(end.strip()))
781 this_location = SeqFeature.FeatureLocation(new_start, new_end)
782 new_locations.append(this_location)
783 return new_locations
784
    def authors(self, content):
        if self._cur_reference.authors:
            self._cur_reference.authors += ' ' + content
        else:
            self._cur_reference.authors = content

    def consrtm(self, content):
        if self._cur_reference.consrtm:
            self._cur_reference.consrtm += ' ' + content
        else:
            self._cur_reference.consrtm = content

    def title(self, content):
        if self._cur_reference is None:
            import warnings
            from Bio import BiopythonParserWarning
            warnings.warn("GenBank TITLE line without REFERENCE line.",
                          BiopythonParserWarning)
        elif self._cur_reference.title:
            self._cur_reference.title += ' ' + content
        else:
            self._cur_reference.title = content

    def journal(self, content):
        if self._cur_reference.journal:
            self._cur_reference.journal += ' ' + content
        else:
            self._cur_reference.journal = content

    def remark(self, content):
        """Deal with a reference comment."""
        if self._cur_reference.comment:
            self._cur_reference.comment += ' ' + content
        else:
            self._cur_reference.comment = content

834 """Get ready for the feature table when we reach the FEATURE line.
835 """
836 self.start_feature_table()
837
839 """Indicate we've got to the start of the feature table.
840 """
841
842 if self._cur_reference is not None:
843 self.data.annotations['references'].append(self._cur_reference)
844 self._cur_reference = None
845
847 """Utility function to add a feature to the SeqRecord.
848
849 This does all of the appropriate checking to make sure we haven't
850 left any info behind, and that we are only adding info if it
851 exists.
852 """
853 if self._cur_feature:
854
855
856 self._add_qualifier()
857
858 self._cur_qualifier_key = ''
859 self._cur_qualifier_value = ''
860 self.data.features.append(self._cur_feature)
861
875
877 """Parse out location information from the location string.
878
879 This uses simple Python code with some regular expressions to do the
880 parsing, and then translates the results into appropriate objects.
881 """
882
883
884 location_line = self._clean_location(content)
885
886
887
888
889
890
891 if location_line.find('replace') != -1:
892 comma_pos = location_line.find(',')
893 location_line = location_line[8:comma_pos]
894
895 cur_feature = self._cur_feature
896
897
898 if location_line.startswith("complement("):
899 assert location_line.endswith(")")
900 location_line = location_line[11:-1]
901 cur_feature.strand = -1
902
903
904
905 if _re_simple_location.match(location_line):
906
907 s, e = location_line.split("..")
908 cur_feature.location = SeqFeature.FeatureLocation(int(s)-1,
909 int(e))
910 return
911 if _re_simple_compound.match(location_line):
912
913 i = location_line.find("(")
914 cur_feature.location_operator = location_line[:i]
915
916 for part in location_line[i+1:-1].split(","):
917 s, e = part.split("..")
918 f = SeqFeature.SeqFeature(SeqFeature.FeatureLocation(int(s)-1,
919 int(e)),
920 location_operator=cur_feature.location_operator,
921 strand=cur_feature.strand, type=cur_feature.type)
922 cur_feature.sub_features.append(f)
923 s = cur_feature.sub_features[0].location.start
924 e = cur_feature.sub_features[-1].location.end
925 cur_feature.location = SeqFeature.FeatureLocation(s,e)
926 return
927
928
929 if _re_complex_location.match(location_line):
930
931 if ":" in location_line:
932 cur_feature.ref, location_line = location_line.split(":")
933 cur_feature.location = _loc(location_line, self._expected_size)
934 return
935 if _re_complex_compound.match(location_line):
936 i = location_line.find("(")
937 cur_feature.location_operator = location_line[:i]
938
939 for part in _split_compound_loc(location_line[i+1:-1]):
940 if part.startswith("complement("):
941 assert part[-1]==")"
942 part = part[11:-1]
943 assert cur_feature.strand != -1, "Double complement?"
944 strand = -1
945 else:
946 strand = cur_feature.strand
947 if ":" in part:
948 ref, part = part.split(":")
949 else:
950 ref = None
951 try:
952 loc = _loc(part, self._expected_size)
953 except ValueError, err:
954 print location_line
955 print part
956 raise err
957 f = SeqFeature.SeqFeature(location=loc, ref=ref,
958 location_operator=cur_feature.location_operator,
959 strand=strand, type=cur_feature.type)
960 cur_feature.sub_features.append(f)
961
962
963
964
965
966
967 strands = set(sf.strand for sf in cur_feature.sub_features)
968 if len(strands)==1:
969 cur_feature.strand = cur_feature.sub_features[0].strand
970 else:
971 cur_feature.strand = None
972 s = cur_feature.sub_features[0].location.start
973 e = cur_feature.sub_features[-1].location.end
974 cur_feature.location = SeqFeature.FeatureLocation(s,e)
975 return
976
977 if "order" in location_line and "join" in location_line:
978
979 msg = 'Combinations of "join" and "order" within the same ' + \
980 'location (nested operators) are illegal:\n' + location_line
981 raise LocationParserError(msg)
982 raise LocationParserError(location_line)
983
985 """Add a qualifier to the current feature without loss of info.
986
987 If there are multiple qualifier keys with the same name we
988 would lose some info in the dictionary, so we append a unique
989 number to the end of the name in case of conflicts.
990 """
991
992
993 if self._cur_qualifier_key:
994 key = self._cur_qualifier_key
995 value = "".join(self._cur_qualifier_value)
996 if self._feature_cleaner is not None:
997 value = self._feature_cleaner.clean_value(key, value)
998
999 if key in self._cur_feature.qualifiers:
1000 self._cur_feature.qualifiers[key].append(value)
1001
1002 else:
1003 self._cur_feature.qualifiers[key] = [value]
1004
1006 """When we get a qualifier key, use it as a dictionary key.
1007
1008 We receive a list of keys, since you can have valueless keys such as
1009 /pseudo which would be passed in with the next key (since no other
1010 tags separate them in the file)
1011 """
1012 for content in content_list:
1013
1014 self._add_qualifier()
1015
1016
1017 assert '/' not in content and '=' not in content \
1018 and content == content.strip(), content
1019
1020 self._cur_qualifier_key = content
1021 self._cur_qualifier_value = []
1022
1024
1025 qual_value = content.replace('"', '')
1026
1027 self._cur_qualifier_value.append(qual_value)
1028
1030 """Deal with CONTIG information."""
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046 self.data.annotations["contig"] = content
1047
1050
1053
1056
1058 """Add up sequence information as we get it.
1059
1060 To try and make things speedier, this puts all of the strings
1061 into a list of strings, and then uses string.join later to put
1062 them together. Supposedly, this is a big time savings
1063 """
1064 assert ' ' not in content
1065 self._seq_data.append(content.upper())
1066
1134
1136 """Create a GenBank Record object from scanner generated information.
1137 """
1147
1148 - def wgs(self, content):
1150
1153
1154 - def locus(self, content):
1156
1157 - def size(self, content):
1159
1162
1165
1166 - def date(self, content):
1168
1171
1176
1177 - def nid(self, content):
1179
1180 - def pid(self, content):
1182
1185
1188
1189 - def gi(self, content):
1191
1194
1197
1200
1203
1206
1209
1212
1214 """Grab the reference number and signal the start of a new reference.
1215 """
1216
1217 if self._cur_reference is not None:
1218 self.data.references.append(self._cur_reference)
1219
1220 import Record
1221 self._cur_reference = Record.Reference()
1222 self._cur_reference.number = content
1223
1225 self._cur_reference.bases = content
1226
1228 self._cur_reference.authors = content
1229
1231 self._cur_reference.consrtm = content
1232
1233 - def title(self, content):
1241
    def journal(self, content):
        self._cur_reference.journal = content

1247
1250
    def remark(self, content):
        self._cur_reference.remark = content

1256
1260
1263
1265 """Get ready for the feature table when we reach the FEATURE line.
1266 """
1267 self.start_feature_table()
1268
1270 """Signal the start of the feature table.
1271 """
1272
1273 if self._cur_reference is not None:
1274 self.data.references.append(self._cur_reference)
1275
1277 """Grab the key of the feature and signal the start of a new feature.
1278 """
1279
1280 self._add_feature()
1281
1282 import Record
1283 self._cur_feature = Record.Feature()
1284 self._cur_feature.key = content
1285
1287 """Utility function to add a feature to the Record.
1288
1289 This does all of the appropriate checking to make sure we haven't
1290 left any info behind, and that we are only adding info if it
1291 exists.
1292 """
1293 if self._cur_feature is not None:
1294
1295
1296 if self._cur_qualifier is not None:
1297 self._cur_feature.qualifiers.append(self._cur_qualifier)
1298
1299 self._cur_qualifier = None
1300 self.data.features.append(self._cur_feature)
1301
1304
1306 """Deal with qualifier names
1307
1308 We receive a list of keys, since you can have valueless keys such as
1309 /pseudo which would be passed in with the next key (since no other
1310 tags separate them in the file)
1311 """
1312 import Record
1313 for content in content_list:
1314
1315 if content.find("/") != 0:
1316 content = "/%s" % content
1317
1318 if self._cur_qualifier is not None:
1319 self._cur_feature.qualifiers.append(self._cur_qualifier)
1320
1321 self._cur_qualifier = Record.Qualifier()
1322 self._cur_qualifier.key = content
1323
1325
1326 if self._cur_qualifier.key.find("=") == -1:
1327 self._cur_qualifier.key = "%s=" % self._cur_qualifier.key
1328 cur_content = self._remove_newlines(content)
1329
1330
1331 for remove_space_key in self.__class__.remove_space_keys:
1332 if self._cur_qualifier.key.find(remove_space_key) >= 0:
1333 cur_content = self._remove_spaces(cur_content)
1334 self._cur_qualifier.value = self._normalize_spaces(cur_content)
1335
    def base_count(self, content):
        self.data.base_counts = content

    def origin_name(self, content):
        self.data.origin = content

    def contig_location(self, content):
        """Signal that we have contig information to add to the record.
        """
        self.data.contig = self._clean_location(content)

1348 """Add sequence information to a list of sequence strings.
1349
1350 This removes spaces in the data and uppercases the sequence, and
1351 then adds it to a list of sequences. Later on we'll join this
1352 list together to make the final sequence. This is faster than
1353 adding on the new string every time.
1354 """
1355 assert ' ' not in content
1356 self._seq_data.append(content.upper())
1357
1359 """Signal the end of the record and do any necessary clean-up.
1360 """
1361
1362
1363 self.data.sequence = "".join(self._seq_data)
1364
1365 self._add_feature()
1366
1367
1369 """Run the Bio.GenBank module's doctests."""
1370 print "Runing doctests..."
1371 import doctest
1372 doctest.testmod()
1373 print "Done"
1374
1375 if __name__ == "__main__":
1376 _test()
1377