1
2
3
4
5
6
7
8 """Code for calling standalone BLAST and parsing plain text output (OBSOLETE).
9
10 Rather than parsing the human readable plain text BLAST output (which seems to
11 change with every update to BLAST), we and the NCBI recommend you parse the
12 XML output instead. The plain text parser in this module still works at the
13 time of writing, but is considered obsolete and updating it to cope with the
14 latest versions of BLAST is not a priority for us.
15
16 This module also provides code to work with the "legacy" standalone version of
17 NCBI BLAST, tools blastall, rpsblast and blastpgp via three helper functions of
18 the same name. These functions are very limited for dealing with the output as
19 files rather than handles, for which the wrappers in Bio.Blast.Applications are
20 preferred. Furthermore, the NCBI themselves regard these command line tools as
21 "legacy", and encourage using the new BLAST+ tools instead. Biopython has
22 wrappers for these under Bio.Blast.Applications (see the tutorial).
23
24 Classes:
25 LowQualityBlastError Exception that indicates low quality query sequences.
26 BlastParser Parses output from blast.
27 BlastErrorParser Parses output and tries to diagnose possible errors.
28 PSIBlastParser Parses output from psi-blast.
29 Iterator Iterates over a file of blast results.
30
31 _Scanner Scans output from standalone BLAST.
32 _BlastConsumer Consumes output from blast.
33 _PSIBlastConsumer Consumes output from psi-blast.
34 _HeaderConsumer Consumes header information.
35 _DescriptionConsumer Consumes description information.
36 _AlignmentConsumer Consumes alignment information.
37 _HSPConsumer Consumes hsp information.
38 _DatabaseReportConsumer Consumes database report information.
39 _ParametersConsumer Consumes parameters information.
40
41 Functions:
42 blastall Execute blastall (OBSOLETE).
43 blastpgp Execute blastpgp (OBSOLETE).
44 rpsblast Execute rpsblast (OBSOLETE).
45
46 For calling the BLAST command line tools, we encourage you to use the
47 command line wrappers in Bio.Blast.Applications - the three functions
48 blastall, blastpgp and rpsblast are considered to be obsolete now, and
49 are likely to be deprecated and then removed in future releases.
50 """
51
52 import warnings
53 warnings.warn("The plain text parser in this module still works at the time of writing, but is considered obsolete and updating it to cope with the latest versions of BLAST is not a priority for us.", PendingDeprecationWarning)
54
55 import os
56 import re
57
58 from Bio import File
59 from Bio.ParserSupport import *
60 from Bio.Blast import Record
61 from Bio.Application import _escape_filename
62
64 """Error caused by running a low quality sequence through BLAST.
65
66 When low quality sequences (like GenBank entries containing only
67 stretches of a single nucleotide) are BLASTed, they will result in
68 BLAST generating an error and not being able to perform the BLAST
69 search. This error should be raised for the BLAST reports produced
70 in this case.
71 """
72 pass
73
75 """Error caused by running a short query sequence through BLAST.
76
77 If the query sequence is too short, BLAST outputs warnings and errors:
78 Searching[blastall] WARNING: [000.000] AT1G08320: SetUpBlastSearch failed.
79 [blastall] ERROR: [000.000] AT1G08320: Blast:
80 [blastall] ERROR: [000.000] AT1G08320: Blast: Query must be at least wordsize
81 done
82
83 This exception is raised when that condition is detected.
84
85 """
86 pass
87
88
90 """Scan BLAST output from blastall or blastpgp.
91
92 Tested with blastall and blastpgp v2.0.10, v2.0.11
93
94 Methods:
95 feed Feed data into the scanner.
96
97 """
98 - def feed(self, handle, consumer):
118
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163 consumer.start_header()
164
165 read_and_call(uhandle, consumer.version, contains='BLAST')
166 read_and_call_while(uhandle, consumer.noevent, blank=1)
167
168
169 attempt_read_and_call(uhandle, consumer.noevent, start="<pre>")
170
171
172 while attempt_read_and_call(uhandle,
173 consumer.reference, start='Reference'):
174
175
176 while 1:
177 line = uhandle.readline()
178 if is_blank_line(line):
179 consumer.noevent(line)
180 break
181 elif line.startswith("RID"):
182 break
183 else:
184
185 consumer.reference(line)
186
187
188 read_and_call_while(uhandle, consumer.noevent, blank=1)
189 attempt_read_and_call(uhandle, consumer.reference, start="RID:")
190 read_and_call_while(uhandle, consumer.noevent, blank=1)
191
192
193
194 if attempt_read_and_call(
195 uhandle, consumer.reference, start="Reference"):
196 read_and_call_until(uhandle, consumer.reference, blank=1)
197 read_and_call_while(uhandle, consumer.noevent, blank=1)
198
199
200 if attempt_read_and_call(
201 uhandle, consumer.reference, start="Reference"):
202 read_and_call_until(uhandle, consumer.reference, blank=1)
203 read_and_call_while(uhandle, consumer.noevent, blank=1)
204
205 line = uhandle.peekline()
206 assert line.strip() != ""
207 assert not line.startswith("RID:")
208 if line.startswith("Query="):
209
210
211
212 read_and_call(uhandle, consumer.query_info, start='Query=')
213 read_and_call_until(uhandle, consumer.query_info, blank=1)
214 read_and_call_while(uhandle, consumer.noevent, blank=1)
215
216
217 read_and_call_until(uhandle, consumer.database_info, end='total letters')
218 read_and_call(uhandle, consumer.database_info, contains='sequences')
219 read_and_call_while(uhandle, consumer.noevent, blank=1)
220 elif line.startswith("Database:"):
221
222 read_and_call_until(uhandle, consumer.database_info, end='total letters')
223 read_and_call(uhandle, consumer.database_info, contains='sequences')
224 read_and_call_while(uhandle, consumer.noevent, blank=1)
225
226
227
228
229 read_and_call(uhandle, consumer.query_info, start='Query=')
230
231 while True:
232 line = uhandle.peekline()
233 if not line.strip() : break
234 if "Score E" in line : break
235
236 read_and_call(uhandle, consumer.query_info)
237 read_and_call_while(uhandle, consumer.noevent, blank=1)
238 else:
239 raise ValueError("Invalid header?")
240
241 consumer.end_header()
242
260
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284 consumer.start_descriptions()
285
286
287
288 attempt_read_and_call(uhandle, consumer.noevent, start='Searching')
289
290
291
292 if not uhandle.peekline():
293 raise ValueError("Unexpected end of blast report. " + \
294 "Looks suspiciously like a PSI-BLAST crash.")
295
296
297
298
299
300
301
302
303
304 line = uhandle.peekline()
305 if line.find("ERROR:") != -1 or line.startswith("done"):
306 read_and_call_while(uhandle, consumer.noevent, contains="ERROR:")
307 read_and_call(uhandle, consumer.noevent, start="done")
308
309
310
311
312
313
314
315
316
317
318
319
320
321 read_and_call_while(uhandle, consumer.noevent, blank=1)
322
323 if attempt_read_and_call(uhandle, consumer.round, start='Results'):
324 read_and_call_while(uhandle, consumer.noevent, blank=1)
325
326
327
328
329
330
331
332
333 if not attempt_read_and_call(
334 uhandle, consumer.description_header,
335 has_re=re.compile(r'Score +E')):
336
337 attempt_read_and_call(uhandle, consumer.no_hits,
338 contains='No hits found')
339 try:
340 read_and_call_while(uhandle, consumer.noevent, blank=1)
341 except ValueError, err:
342 if str(err) != "Unexpected end of stream." : raise err
343
344 consumer.end_descriptions()
345
346 return
347
348
349 read_and_call(uhandle, consumer.description_header,
350 start='Sequences producing')
351
352
353 attempt_read_and_call(uhandle, consumer.model_sequences,
354 start='Sequences used in model')
355 read_and_call_while(uhandle, consumer.noevent, blank=1)
356
357
358
359
360 if safe_peekline(uhandle).startswith(" Database:"):
361 consumer.end_descriptions()
362
363 return
364
365
366
367 if not uhandle.peekline().startswith('Sequences not found'):
368 read_and_call_until(uhandle, consumer.description, blank=1)
369 read_and_call_while(uhandle, consumer.noevent, blank=1)
370
371
372
373
374
375 if attempt_read_and_call(uhandle, consumer.nonmodel_sequences,
376 start='Sequences not found'):
377
378 read_and_call_while(uhandle, consumer.noevent, blank=1)
379 l = safe_peekline(uhandle)
380
381
382
383
384 if not l.startswith('CONVERGED') and l[0] != '>' \
385 and not l.startswith('QUERY'):
386 read_and_call_until(uhandle, consumer.description, blank=1)
387 read_and_call_while(uhandle, consumer.noevent, blank=1)
388
389 attempt_read_and_call(uhandle, consumer.converged, start='CONVERGED')
390 read_and_call_while(uhandle, consumer.noevent, blank=1)
391
392 consumer.end_descriptions()
393
413
420
437
466
472
474
475
476
477
478
479
480 read_and_call(uhandle, consumer.score, start=' Score')
481 read_and_call(uhandle, consumer.identities, start=' Identities')
482
483 attempt_read_and_call(uhandle, consumer.strand, start = ' Strand')
484
485 attempt_read_and_call(uhandle, consumer.frame, start = ' Frame')
486 read_and_call(uhandle, consumer.noevent, blank=1)
487
489
490
491
492
493
494
495
496
497
498 while 1:
499
500 attempt_read_and_call(uhandle, consumer.noevent, start=' ')
501 read_and_call(uhandle, consumer.query, start='Query')
502 read_and_call(uhandle, consumer.align, start=' ')
503 read_and_call(uhandle, consumer.sbjct, start='Sbjct')
504 try:
505 read_and_call_while(uhandle, consumer.noevent, blank=1)
506 except ValueError, err:
507 if str(err) != "Unexpected end of stream." : raise err
508
509
510
511 break
512 line = safe_peekline(uhandle)
513
514 if not (line.startswith('Query') or line.startswith(' ')):
515 break
516
539
def _eof(self, uhandle):
    """Return True when there is nothing more to read from uhandle.

    The next line is obtained with safe_peekline.  A ValueError whose
    message is exactly "Unexpected end of stream." is taken to mean the
    input is exhausted; any other ValueError is propagated unchanged.

    Arguments:
    uhandle     The handle to inspect (passed straight to safe_peekline).

    Returns True if no line is available, False otherwise.
    """
    try:
        line = safe_peekline(uhandle)
    except ValueError as err:
        if str(err) != "Unexpected end of stream.":
            # Some other parsing failure: re-raise it as-is so the
            # original traceback is kept (bare raise, not "raise err").
            raise
        line = ""
    return not line
547
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575 if self._eof(uhandle) : return
576
577 consumer.start_database_report()
578
579
580
581
582 if attempt_read_and_call(uhandle, consumer.noevent, start=" Subset"):
583 read_and_call(uhandle, consumer.noevent, contains="letters")
584 read_and_call(uhandle, consumer.noevent, contains="sequences")
585 read_and_call(uhandle, consumer.noevent, start=" ")
586
587
588
589 while attempt_read_and_call(uhandle, consumer.database,
590 start=' Database'):
591
592
593
594 if not uhandle.peekline().strip() \
595 or uhandle.peekline().startswith("BLAST"):
596 consumer.end_database_report()
597 return
598
599
600 read_and_call_until(uhandle, consumer.database, start=' Posted')
601 read_and_call(uhandle, consumer.posted_date, start=' Posted')
602 read_and_call(uhandle, consumer.num_letters_in_database,
603 start=' Number of letters')
604 read_and_call(uhandle, consumer.num_sequences_in_database,
605 start=' Number of sequences')
606
607 attempt_read_and_call(uhandle, consumer.noevent, start=' ')
608
609 line = safe_readline(uhandle)
610 uhandle.saveline(line)
611 if line.find('Lambda') != -1:
612 break
613
614 read_and_call(uhandle, consumer.noevent, start='Lambda')
615 read_and_call(uhandle, consumer.ka_params)
616
617
618 attempt_read_and_call(uhandle, consumer.noevent, blank=1)
619
620
621 attempt_read_and_call(uhandle, consumer.gapped, start='Gapped')
622
623 if attempt_read_and_call(uhandle, consumer.noevent, start='Lambda'):
624 read_and_call(uhandle, consumer.ka_params_gap)
625
626
627
628
629 try:
630 read_and_call_while(uhandle, consumer.noevent, blank=1)
631 except ValueError, x:
632 if str(x) != "Unexpected end of stream.":
633 raise
634 consumer.end_database_report()
635
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694 if not uhandle.peekline().strip():
695 return
696
697
698
699 consumer.start_parameters()
700
701
702 attempt_read_and_call(uhandle, consumer.matrix, start='Matrix')
703
704 attempt_read_and_call(uhandle, consumer.gap_penalties, start='Gap')
705
706 attempt_read_and_call(uhandle, consumer.num_sequences,
707 start='Number of Sequences')
708 attempt_read_and_call(uhandle, consumer.num_hits,
709 start='Number of Hits')
710 attempt_read_and_call(uhandle, consumer.num_sequences,
711 start='Number of Sequences')
712 attempt_read_and_call(uhandle, consumer.num_extends,
713 start='Number of extensions')
714 attempt_read_and_call(uhandle, consumer.num_good_extends,
715 start='Number of successful')
716
717 attempt_read_and_call(uhandle, consumer.num_seqs_better_e,
718 start='Number of sequences')
719
720
721 if attempt_read_and_call(uhandle, consumer.hsps_no_gap,
722 start="Number of HSP's better"):
723
724 if attempt_read_and_call(uhandle, consumer.noevent,
725 start="Number of HSP's gapped:"):
726 read_and_call(uhandle, consumer.noevent,
727 start="Number of HSP's successfully")
728
729 attempt_read_and_call(uhandle, consumer.noevent,
730 start="Number of extra gapped extensions")
731 else:
732 read_and_call(uhandle, consumer.hsps_prelim_gapped,
733 start="Number of HSP's successfully")
734 read_and_call(uhandle, consumer.hsps_prelim_gap_attempted,
735 start="Number of HSP's that")
736 read_and_call(uhandle, consumer.hsps_gapped,
737 start="Number of HSP's gapped")
738
739 elif attempt_read_and_call(uhandle, consumer.noevent,
740 start="Number of HSP's gapped"):
741 read_and_call(uhandle, consumer.noevent,
742 start="Number of HSP's successfully")
743
744
745 attempt_read_and_call(uhandle, consumer.query_length,
746 has_re=re.compile(r"[Ll]ength of query"))
747
748 attempt_read_and_call(uhandle, consumer.database_length,
749 has_re=re.compile(r"[Ll]ength of \s*[Dd]atabase"))
750
751
752 attempt_read_and_call(uhandle, consumer.noevent,
753 start="Length adjustment")
754 attempt_read_and_call(uhandle, consumer.effective_hsp_length,
755 start='effective HSP')
756
757 attempt_read_and_call(
758 uhandle, consumer.effective_query_length,
759 has_re=re.compile(r'[Ee]ffective length of query'))
760
761
762 attempt_read_and_call(
763 uhandle, consumer.effective_database_length,
764 has_re=re.compile(r'[Ee]ffective length of \s*[Dd]atabase'))
765
766
767 attempt_read_and_call(
768 uhandle, consumer.effective_search_space,
769 has_re=re.compile(r'[Ee]ffective search space:'))
770
771 attempt_read_and_call(
772 uhandle, consumer.effective_search_space_used,
773 has_re=re.compile(r'[Ee]ffective search space used'))
774
775
776 attempt_read_and_call(uhandle, consumer.frameshift, start='frameshift')
777
778
779 attempt_read_and_call(uhandle, consumer.threshold, start='T')
780
781 attempt_read_and_call(uhandle, consumer.threshold, start='Neighboring words threshold')
782
783
784 attempt_read_and_call(uhandle, consumer.window_size, start='A')
785
786 attempt_read_and_call(uhandle, consumer.window_size, start='Window for multiple hits')
787
788
789 attempt_read_and_call(uhandle, consumer.dropoff_1st_pass, start='X1')
790
791 attempt_read_and_call(uhandle, consumer.gap_x_dropoff, start='X2')
792
793
794 attempt_read_and_call(uhandle, consumer.gap_x_dropoff_final,
795 start='X3')
796
797
798 attempt_read_and_call(uhandle, consumer.gap_trigger, start='S1')
799
800
801
802 if not is_blank_line(uhandle.peekline(), allow_spaces=1):
803 read_and_call(uhandle, consumer.blast_cutoff, start='S2')
804
805 consumer.end_parameters()
806
808 """Parses BLAST data into a Record.Blast object.
809
810 """
815
def parse(self, handle):
    """Run the scanner over handle and return the consumer's result.

    The scanner stored on this parser is fed the handle together with
    the stored consumer; afterwards the consumer's ``data`` attribute
    is returned.
    """
    scanner = self._scanner
    consumer = self._consumer
    scanner.feed(handle, consumer)
    return consumer.data
820
822 """Parses BLAST data into a Record.PSIBlast object.
823
824 """
829
def parse(self, handle):
    """Run the scanner over handle and return the consumer's result.

    The scanner stored on this parser is fed the handle together with
    the stored consumer; afterwards the consumer's ``data`` attribute
    is returned.
    """
    scanner = self._scanner
    consumer = self._consumer
    scanner.feed(handle, consumer)
    return consumer.data
834
838
840 c = line.split()
841 self._header.application = c[0]
842 self._header.version = c[1]
843 if len(c) > 2:
844
845
846 self._header.date = c[2][1:-1]
847
853
868
870 line = line.rstrip()
871 if line.startswith('Database: '):
872 self._header.database = line[10:]
873 elif not line.endswith('total letters'):
874 if self._header.database:
875
876 self._header.database = self._header.database + " " + line.strip()
877 else:
878 self._header.database = line.strip()
879 else:
880 sequences, letters =_re_search(
881 r"([0-9,]+) sequences; ([0-9,-]+) total letters", line,
882 "I could not find the sequences and letters in line\n%s" %line)
883 self._header.database_sequences = _safe_int(sequences)
884 self._header.database_letters = _safe_int(letters)
885
890
893 self._descriptions = []
894 self._model_sequences = []
895 self._nonmodel_sequences = []
896 self._converged = 0
897 self._type = None
898 self._roundnum = None
899
900 self.__has_n = 0
901
903 if line.startswith('Sequences producing'):
904 cols = line.split()
905 if cols[-1] == 'N':
906 self.__has_n = 1
907
909 dh = self._parse(line)
910 if self._type == 'model':
911 self._model_sequences.append(dh)
912 elif self._type == 'nonmodel':
913 self._nonmodel_sequences.append(dh)
914 else:
915 self._descriptions.append(dh)
916
919
921 self._type = 'nonmodel'
922
925
928
930 if not line.startswith('Results from round'):
931 raise ValueError("I didn't understand the round line\n%s" % line)
932 self._roundnum = _safe_int(line[18:].strip())
933
936
def _parse(self, description_line):
    """Turn one description line into a Record.Description.

    The line is split on whitespace.  Its trailing columns are the
    score and e-value, followed by the number of alignments when
    self.__has_n is set (otherwise that count is taken to be 1).
    Everything to the left of those columns, right-stripped, is the
    title.

    Raises ValueError if the line has fewer than three columns.
    """
    record = Record.Description()

    fields = description_line.split()
    if len(fields) < 3:
        raise ValueError( \
            "Line does not appear to contain description:\n%s" % description_line)

    # The numeric columns sit at the right-hand edge of the line.
    if self.__has_n:
        trailing = fields[-3:]
    else:
        trailing = fields[-2:]

    # Walk those columns right-to-left, each time searching only to the
    # left of the previous match, to find where the title stops.
    cut = len(description_line)
    for token in reversed(trailing):
        cut = description_line.rfind(token, 0, cut)

    if self.__has_n:
        score, evalue, count = trailing
    else:
        score, evalue = trailing
        count = 1

    record.title = description_line[:cut].rstrip()
    record.num_alignments = _safe_int(count)
    record.score = _safe_int(score)
    record.e = _safe_float(evalue)
    return record
969
971
972
973
974
978
980 if self._alignment.title:
981 self._alignment.title += " "
982 self._alignment.title += line.strip()
983
985
986 parts = line.replace(" ","").split("=")
987 assert len(parts)==2, "Unrecognised format length line"
988 self._alignment.length = parts[1]
989 self._alignment.length = _safe_int(self._alignment.length)
990
992
993 if line.startswith('QUERY') or line.startswith('blast_tmp'):
994
995
996
997
998
999 try:
1000 name, start, seq, end = line.split()
1001 except ValueError:
1002 raise ValueError("I do not understand the line\n%s" % line)
1003 self._start_index = line.index(start, len(name))
1004 self._seq_index = line.index(seq,
1005 self._start_index+len(start))
1006
1007 self._name_length = self._start_index - 1
1008 self._start_length = self._seq_index - self._start_index - 1
1009 self._seq_length = line.rfind(end) - self._seq_index - 1
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019 name = line[:self._name_length]
1020 name = name.rstrip()
1021 start = line[self._start_index:self._start_index+self._start_length]
1022 start = start.rstrip()
1023 if start:
1024 start = _safe_int(start)
1025 end = line[self._seq_index+self._seq_length:].rstrip()
1026 if end:
1027 end = _safe_int(end)
1028 seq = line[self._seq_index:self._seq_index+self._seq_length].rstrip()
1029
1030 if len(seq) < self._seq_length:
1031 seq = seq + ' '*(self._seq_length-len(seq))
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050 align = self._multiple_alignment.alignment
1051 align.append((name, start, seq, end))
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1101
1102 if self._alignment:
1103 self._alignment.title = self._alignment.title.rstrip()
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125 try:
1126 del self._seq_index
1127 del self._seq_length
1128 del self._start_index
1129 del self._start_length
1130 del self._name_length
1131 except AttributeError:
1132 pass
1133
1137
1139 self._hsp.bits, self._hsp.score = _re_search(
1140 r"Score =\s*([0-9.e+]+) bits \(([0-9]+)\)", line,
1141 "I could not find the score in line\n%s" % line)
1142 self._hsp.score = _safe_float(self._hsp.score)
1143 self._hsp.bits = _safe_float(self._hsp.bits)
1144
1145 x, y = _re_search(
1146 r"Expect\(?(\d*)\)? = +([0-9.e\-|\+]+)", line,
1147 "I could not find the expect in line\n%s" % line)
1148 if x:
1149 self._hsp.num_alignments = _safe_int(x)
1150 else:
1151 self._hsp.num_alignments = 1
1152 self._hsp.expect = _safe_float(y)
1153
1155 x, y = _re_search(
1156 r"Identities = (\d+)\/(\d+)", line,
1157 "I could not find the identities in line\n%s" % line)
1158 self._hsp.identities = _safe_int(x), _safe_int(y)
1159 self._hsp.align_length = _safe_int(y)
1160
1161 if line.find('Positives') != -1:
1162 x, y = _re_search(
1163 r"Positives = (\d+)\/(\d+)", line,
1164 "I could not find the positives in line\n%s" % line)
1165 self._hsp.positives = _safe_int(x), _safe_int(y)
1166 assert self._hsp.align_length == _safe_int(y)
1167
1168 if line.find('Gaps') != -1:
1169 x, y = _re_search(
1170 r"Gaps = (\d+)\/(\d+)", line,
1171 "I could not find the gaps in line\n%s" % line)
1172 self._hsp.gaps = _safe_int(x), _safe_int(y)
1173 assert self._hsp.align_length == _safe_int(y)
1174
1175
1177 self._hsp.strand = _re_search(
1178 r"Strand = (\w+) / (\w+)", line,
1179 "I could not find the strand in line\n%s" % line)
1180
1182
1183
1184
1185 if line.find('/') != -1:
1186 self._hsp.frame = _re_search(
1187 r"Frame = ([-+][123]) / ([-+][123])", line,
1188 "I could not find the frame in line\n%s" % line)
1189 else:
1190 self._hsp.frame = _re_search(
1191 r"Frame = ([-+][123])", line,
1192 "I could not find the frame in line\n%s" % line)
1193
1194
1195
1196
1197
1198
1199 _query_re = re.compile(r"Query(:?) \s*(\d+)\s*(.+) (\d+)")
1201 m = self._query_re.search(line)
1202 if m is None:
1203 raise ValueError("I could not find the query in line\n%s" % line)
1204
1205
1206
1207 colon, start, seq, end = m.groups()
1208 self._hsp.query = self._hsp.query + seq
1209 if self._hsp.query_start is None:
1210 self._hsp.query_start = _safe_int(start)
1211
1212
1213
1214 self._hsp.query_end = _safe_int(end)
1215
1216
1217 self._query_start_index = m.start(3)
1218 self._query_len = len(seq)
1219
1221 seq = line[self._query_start_index:].rstrip()
1222 if len(seq) < self._query_len:
1223
1224 seq = seq + ' ' * (self._query_len-len(seq))
1225 elif len(seq) < self._query_len:
1226 raise ValueError("Match is longer than the query in line\n%s" \
1227 % line)
1228 self._hsp.match = self._hsp.match + seq
1229
1230
1231
1232 _sbjct_re = re.compile(r"Sbjct(:?) \s*(\d+)\s*(.+) (\d+)")
1234 m = self._sbjct_re.search(line)
1235 if m is None:
1236 raise ValueError("I could not find the sbjct in line\n%s" % line)
1237 colon, start, seq, end = m.groups()
1238
1239
1240
1241
1242 if not seq.strip():
1243 seq = ' ' * self._query_len
1244 self._hsp.sbjct = self._hsp.sbjct + seq
1245 if self._hsp.sbjct_start is None:
1246 self._hsp.sbjct_start = _safe_int(start)
1247
1248 self._hsp.sbjct_end = _safe_int(end)
1249 if len(seq) != self._query_len:
1250 raise ValueError( \
1251 "QUERY and SBJCT sequence lengths don't match in line\n%s" \
1252 % line)
1253
1254 del self._query_start_index
1255 del self._query_len
1256
1259
1261
1264
1266 m = re.search(r"Database: (.+)$", line)
1267 if m:
1268 self._dr.database_name.append(m.group(1))
1269 elif self._dr.database_name:
1270
1271 self._dr.database_name[-1] = "%s%s" % (self._dr.database_name[-1],
1272 line.strip())
1273
def posted_date(self, line):
    """Record the posted date found on a "Posted date:" line.

    Whatever _re_search returns for the pattern below is appended to
    self._dr.posted_date; the third argument is the message handed to
    _re_search for the case where the pattern is not found.
    """
    store = self._dr.posted_date.append
    pattern = r"Posted date:\s*(.+)$"
    failure = "I could not find the posted date in line\n%s" % line
    store(_re_search(pattern, line, failure))
1278
1283
1288
1292
1295
1299
1302
1306
1309
1314
1316 if line.find('1st pass') != -1:
1317 x, = _get_cols(line, (-4,), ncols=11, expected={2:"Hits"})
1318 self._params.num_hits = _safe_int(x)
1319 else:
1320 x, = _get_cols(line, (-1,), ncols=6, expected={2:"Hits"})
1321 self._params.num_hits = _safe_int(x)
1322
1324 if line.find('1st pass') != -1:
1325 x, = _get_cols(line, (-4,), ncols=9, expected={2:"Sequences:"})
1326 self._params.num_sequences = _safe_int(x)
1327 else:
1328 x, = _get_cols(line, (-1,), ncols=4, expected={2:"Sequences:"})
1329 self._params.num_sequences = _safe_int(x)
1330
1332 if line.find('1st pass') != -1:
1333 x, = _get_cols(line, (-4,), ncols=9, expected={2:"extensions:"})
1334 self._params.num_extends = _safe_int(x)
1335 else:
1336 x, = _get_cols(line, (-1,), ncols=4, expected={2:"extensions:"})
1337 self._params.num_extends = _safe_int(x)
1338
1340 if line.find('1st pass') != -1:
1341 x, = _get_cols(line, (-4,), ncols=10, expected={3:"extensions:"})
1342 self._params.num_good_extends = _safe_int(x)
1343 else:
1344 x, = _get_cols(line, (-1,), ncols=5, expected={3:"extensions:"})
1345 self._params.num_good_extends = _safe_int(x)
1346
1352
1357
1363
1369
1374
1379
1384
1390
1396
1402
1408
1414
1418
1420 if line[:2] == "T:":
1421
1422 self._params.threshold, = _get_cols(
1423 line, (1,), ncols=2, expected={0:"T:"})
1424 elif line[:28] == "Neighboring words threshold:":
1425 self._params.threshold, = _get_cols(
1426 line, (3,), ncols=4, expected={0:"Neighboring", 1:"words", 2:"threshold:"})
1427 else:
1428 raise ValueError("Unrecognised threshold line:\n%s" % line)
1429 self._params.threshold = _safe_int(self._params.threshold)
1430
1432 if line[:2] == "A:":
1433 self._params.window_size, = _get_cols(
1434 line, (1,), ncols=2, expected={0:"A:"})
1435 elif line[:25] == "Window for multiple hits:":
1436 self._params.window_size, = _get_cols(
1437 line, (4,), ncols=5, expected={0:"Window", 2:"multiple", 3:"hits:"})
1438 else:
1439 raise ValueError("Unrecognised window size line:\n%s" % line)
1440 self._params.window_size = _safe_int(self._params.window_size)
1441
1447
1453
1459
1465
1471
1474
1475
1476 -class _BlastConsumer(AbstractConsumer,
1477 _HeaderConsumer,
1478 _DescriptionConsumer,
1479 _AlignmentConsumer,
1480 _HSPConsumer,
1481 _DatabaseReportConsumer,
1482 _ParametersConsumer
1483 ):
1484
1485
1486
1487
1488
1489
1490
1491
1492
1495
1497
1498 raise ValueError("This consumer doesn't handle PSI-BLAST data")
1499
1503
1507
1509 self.data.descriptions = self._descriptions
1510
1512 _AlignmentConsumer.end_alignment(self)
1513 if self._alignment.hsps:
1514 self.data.alignments.append(self._alignment)
1515 if self._multiple_alignment.alignment:
1516 self.data.multiple_alignment = self._multiple_alignment
1517
1519 _HSPConsumer.end_hsp(self)
1520 try:
1521 self._alignment.hsps.append(self._hsp)
1522 except AttributeError:
1523 raise ValueError("Found an HSP before an alignment")
1524
1528
1532
1533 -class _PSIBlastConsumer(AbstractConsumer,
1534 _HeaderConsumer,
1535 _DescriptionConsumer,
1536 _AlignmentConsumer,
1537 _HSPConsumer,
1538 _DatabaseReportConsumer,
1539 _ParametersConsumer
1540 ):
1543
1547
1551
1556
1558 _DescriptionConsumer.end_descriptions(self)
1559 self._round.number = self._roundnum
1560 if self._descriptions:
1561 self._round.new_seqs.extend(self._descriptions)
1562 self._round.reused_seqs.extend(self._model_sequences)
1563 self._round.new_seqs.extend(self._nonmodel_sequences)
1564 if self._converged:
1565 self.data.converged = 1
1566
1568 _AlignmentConsumer.end_alignment(self)
1569 if self._alignment.hsps:
1570 self._round.alignments.append(self._alignment)
1571 if self._multiple_alignment:
1572 self._round.multiple_alignment = self._multiple_alignment
1573
1575 _HSPConsumer.end_hsp(self)
1576 try:
1577 self._alignment.hsps.append(self._hsp)
1578 except AttributeError:
1579 raise ValueError("Found an HSP before an alignment")
1580
1584
1588
1590 """Iterates over a file of multiple BLAST results.
1591
1592 Methods:
1593 next Return the next record from the stream, or None.
1594
1595 """
def __init__(self, handle, parser=None):
    """Set up iteration over the BLAST results in handle.

    Arguments:
    handle      A file-like object; it must have a readline attribute.
    parser      Optional parser object stored for later use, or None
                (the default) to store no parser.

    Raises ValueError if handle has no readline attribute.
    """
    # Probe for readline with a private sentinel so that only a missing
    # attribute (AttributeError) counts as "not file-like".
    missing = object()
    if getattr(handle, "readline", missing) is missing:
        raise ValueError(
            "I expected a file handle or file-like object, got %s"
            % type(handle))

    self._uhandle = File.UndoHandle(handle)
    self._parser = parser
    self._header = []
1613
1615 """next(self) -> object
1616
1617 Return the next Blast record from the file. If no more records,
1618 return None.
1619
1620 """
1621 lines = []
1622 query = False
1623 while 1:
1624 line = self._uhandle.readline()
1625 if not line:
1626 break
1627
1628 if lines and (line.startswith('BLAST')
1629 or line.startswith('BLAST', 1)
1630 or line.startswith('<?xml ')):
1631 self._uhandle.saveline(line)
1632 break
1633
1634 if line.startswith("Query="):
1635 if not query:
1636 if not self._header:
1637 self._header = lines[:]
1638 query = True
1639 else:
1640
1641 self._uhandle.saveline(line)
1642 break
1643 lines.append(line)
1644
1645 if query and "BLAST" not in lines[0]:
1646
1647
1648
1649
1650
1651
1652 lines = self._header + lines
1653
1654 if not lines:
1655 return None
1656
1657 data = ''.join(lines)
1658 if self._parser is not None:
1659 return self._parser.parse(File.StringHandle(data))
1660 return data
1661
1663 return iter(self.next, None)
1664
1665 -def blastall(blastcmd, program, database, infile, align_view='7', **keywds):
1666 """Execute and retrieve data from standalone BLASTALL as handles (OBSOLETE).
1667
1668 NOTE - This function is obsolete, you are encouraged to use the command
1669 line wrapper Bio.Blast.Applications.BlastallCommandline instead.
1670
1671 Execute and retrieve data from blastall. blastcmd is the command
1672 used to launch the 'blastall' executable. program is the blast program
1673 to use, e.g. 'blastp', 'blastn', etc. database is the path to the database
1674 to search against. infile is the path to the file containing
1675 the sequence to search with.
1676
1677 The return values are two handles, for standard output and standard error.
1678
1679 You may pass more parameters to **keywds to change the behavior of
1680 the search. Otherwise, optional values will be chosen by blastall.
1681 The Blast output is by default in XML format. Use the align_view keyword
1682 for output in a different format.
1683
1684 Scoring
1685 matrix Matrix to use.
1686 gap_open Gap open penalty.
1687 gap_extend Gap extension penalty.
1688 nuc_match Nucleotide match reward. (BLASTN)
1689 nuc_mismatch Nucleotide mismatch penalty. (BLASTN)
1690 query_genetic_code Genetic code for Query.
1691 db_genetic_code Genetic code for database. (TBLAST[NX])
1692
1693 Algorithm
1694 gapped Whether to do a gapped alignment. T/F (not for TBLASTX)
1695 expectation Expectation value cutoff.
1696 wordsize Word size.
1697 strands Query strands to search against database.([T]BLAST[NX])
1698 keep_hits Number of best hits from a region to keep.
1699 xdrop Dropoff value (bits) for gapped alignments.
1700 hit_extend Threshold for extending hits.
1701 region_length Length of region used to judge hits.
1702 db_length Effective database length.
1703 search_length Effective length of search space.
1704
1705 Processing
1706 filter Filter query sequence for low complexity (with SEG)? T/F
1707 believe_query Believe the query defline. T/F
1708 restrict_gi Restrict search to these GI's.
1709 nprocessors Number of processors to use.
1710 oldengine Force use of old engine T/F
1711
1712 Formatting
1713 html Produce HTML output? T/F
1714 descriptions Number of one-line descriptions.
1715 alignments Number of alignments.
1716 align_view Alignment view. Integer 0-11,
1717 passed as a string or integer.
1718 show_gi Show GI's in deflines? T/F
1719 seqalign_file seqalign file to output.
1720 outfile Output file for report. Filename to write to, if
1721 ommitted standard output is used (which you can access
1722 from the returned handles).
1723 """
1724
1725 _security_check_parameters(keywds)
1726
1727 att2param = {
1728 'matrix' : '-M',
1729 'gap_open' : '-G',
1730 'gap_extend' : '-E',
1731 'nuc_match' : '-r',
1732 'nuc_mismatch' : '-q',
1733 'query_genetic_code' : '-Q',
1734 'db_genetic_code' : '-D',
1735
1736 'gapped' : '-g',
1737 'expectation' : '-e',
1738 'wordsize' : '-W',
1739 'strands' : '-S',
1740 'keep_hits' : '-K',
1741 'xdrop' : '-X',
1742 'hit_extend' : '-f',
1743 'region_length' : '-L',
1744 'db_length' : '-z',
1745 'search_length' : '-Y',
1746
1747 'program' : '-p',
1748 'database' : '-d',
1749 'infile' : '-i',
1750 'filter' : '-F',
1751 'believe_query' : '-J',
1752 'restrict_gi' : '-l',
1753 'nprocessors' : '-a',
1754 'oldengine' : '-V',
1755
1756 'html' : '-T',
1757 'descriptions' : '-v',
1758 'alignments' : '-b',
1759 'align_view' : '-m',
1760 'show_gi' : '-I',
1761 'seqalign_file' : '-O',
1762 'outfile' : '-o',
1763 }
1764 import warnings
1765 warnings.warn("This function is obsolete, you are encouraged to the command line wrapper Bio.Blast.Applications.BlastallCommandline instead.", PendingDeprecationWarning)
1766 from Applications import BlastallCommandline
1767 cline = BlastallCommandline(blastcmd)
1768 cline.set_parameter(att2param['program'], program)
1769 cline.set_parameter(att2param['database'], database)
1770 cline.set_parameter(att2param['infile'], infile)
1771 cline.set_parameter(att2param['align_view'], str(align_view))
1772 for key, value in keywds.iteritems():
1773 cline.set_parameter(att2param[key], str(value))
1774 return _invoke_blast(cline)
1775
1776
def blastpgp(blastcmd, database, infile, align_view='7', **keywds):
    """Execute and retrieve data from standalone BLASTPGP as handles (OBSOLETE).

    NOTE - This function is obsolete, you are encouraged to use the command
    line wrapper Bio.Blast.Applications.BlastpgpCommandline instead.

    Execute and retrieve data from blastpgp.  blastcmd is the command
    used to launch the 'blastpgp' executable.  database is the path to the
    database to search against.  infile is the path to the file containing
    the sequence to search with.

    The return values are two handles, for standard output and standard error.

    You may pass more parameters to **keywds to change the behavior of
    the search.  Otherwise, optional values will be chosen by blastpgp.
    The Blast output is by default in XML format.  Use the align_view keyword
    for output in a different format.

    Every keyword value is converted with str() before being handed to the
    command line wrapper.  A keyword that is not listed below raises KeyError.
    The keyword values are passed through _security_check_parameters before
    anything is executed.

    Scoring
        matrix              Matrix to use.
        gap_open            Gap open penalty.
        gap_extend          Gap extension penalty.
        window_size         Multiple hits window size.
        npasses             Number of passes.
        passes              Hits/passes.  Integer 0-2.

    Algorithm
        gapped              Whether to do a gapped alignment.  T/F
        expectation         Expectation value cutoff.
        wordsize            Word size.
        keep_hits           Number of best hits from a region to keep.
        xdrop               Dropoff value (bits) for gapped alignments.
        hit_extend          Threshold for extending hits.
        region_length       Length of region used to judge hits.
        db_length           Effective database length.
        search_length       Effective length of search space.
        nbits_gapping       Number of bits to trigger gapping.
        pseudocounts        Pseudocounts constants for multiple passes.
        xdrop_final         X dropoff for final gapped alignment.
        xdrop_extension     Dropoff for blast extensions.
        model_threshold     E-value threshold to include in multipass model.
        required_start      Start of required region in query.
        required_end        End of required region in query.

    Processing
        XXX should document default values
        program             The blast program to use. (PHI-BLAST)
        filter              Filter query sequence for low complexity (with SEG)?  T/F
        believe_query       Believe the query defline?  T/F
        nprocessors         Number of processors to use.

    Formatting
        html                Produce HTML output?  T/F
        descriptions        Number of one-line descriptions.
        alignments          Number of alignments.
        align_view          Alignment view.  Integer 0-11,
                            passed as a string or integer.
        show_gi             Show GI's in deflines?  T/F
        seqalign_file       seqalign file to output.
        align_outfile       Output file for alignment.  Filename to write to, if
                            omitted standard output is used (which you can access
                            from the returned handles).
        checkpoint_outfile  Output file for PSI-BLAST checkpointing.
        restart_infile      Input file for PSI-BLAST restart.
        hit_infile          Hit file for PHI-BLAST.
        matrix_outfile      Output file for PSI-BLAST matrix in ASCII.
        align_infile        Input alignment file for PSI-BLAST restart.

    """
    import warnings
    warnings.warn("This function is obsolete, you are encouraged to the command line wrapper Bio.Blast.Applications.BlastpgpCommandline instead.", PendingDeprecationWarning)
    _security_check_parameters(keywds)

    # Maps the friendly keyword names onto blastpgp command line switches.
    att2param = {
        'matrix' : '-M',
        'gap_open' : '-G',
        'gap_extend' : '-E',
        'window_size' : '-A',
        'npasses' : '-j',
        'passes' : '-P',

        'gapped' : '-g',
        'expectation' : '-e',
        'wordsize' : '-W',
        'keep_hits' : '-K',
        'xdrop' : '-X',
        'hit_extend' : '-f',
        'region_length' : '-L',
        # Lower case on purpose: upper case '-Z' is the final gapped X
        # dropoff (see 'xdrop_final'), so the two must not share a switch.
        'db_length' : '-z',
        'search_length' : '-Y',
        'nbits_gapping' : '-N',
        'pseudocounts' : '-c',
        'xdrop_final' : '-Z',
        'xdrop_extension' : '-y',
        'model_threshold' : '-h',
        'required_start' : '-S',
        'required_end' : '-H',

        'program' : '-p',
        'database' : '-d',
        'infile' : '-i',
        'filter' : '-F',
        'believe_query' : '-J',
        'nprocessors' : '-a',

        'html' : '-T',
        'descriptions' : '-v',
        'alignments' : '-b',
        'align_view' : '-m',
        'show_gi' : '-I',
        'seqalign_file' : '-O',
        'align_outfile' : '-o',
        'checkpoint_outfile' : '-C',
        'restart_infile' : '-R',
        'hit_infile' : '-k',
        'matrix_outfile' : '-Q',
        'align_infile' : '-B',
        }
    from Applications import BlastpgpCommandline
    cline = BlastpgpCommandline(blastcmd)
    # The three explicit arguments are always set; everything else is optional.
    cline.set_parameter(att2param['database'], database)
    cline.set_parameter(att2param['infile'], infile)
    cline.set_parameter(att2param['align_view'], str(align_view))
    for key, value in keywds.items():
        cline.set_parameter(att2param[key], str(value))
    return _invoke_blast(cline)
1906
1907
def rpsblast(blastcmd, database, infile, align_view="7", **keywds):
    """Execute and retrieve data from standalone RPS-BLAST as handles (OBSOLETE).

    NOTE - This function is obsolete, you are encouraged to use the command
    line wrapper Bio.Blast.Applications.RpsBlastCommandline instead.

    Execute and retrieve data from standalone RPS-BLAST.  blastcmd is the
    command used to launch the 'rpsblast' executable.  database is the path
    to the database to search against.  infile is the path to the file
    containing the sequence to search with.

    The return values are two handles, for standard output and standard error.

    You may pass more parameters to **keywds to change the behavior of
    the search.  Otherwise, optional values will be chosen by rpsblast.

    Please note that this function will give XML output by default, by
    setting align_view to seven (i.e. command line option -m 7).
    You should use the NCBIXML.parse() function to read the resulting output.
    This is because NCBIStandalone.BlastParser() does not understand the
    plain text output format from rpsblast.

    Every keyword value is converted with str() before being handed to the
    command line wrapper.  A keyword that is not listed below raises KeyError.
    The keyword values are passed through _security_check_parameters before
    anything is executed.

    WARNING - The following text and associated parameter handling has not
    received extensive testing.  Please report any errors we might have made...

    Algorithm/Scoring
        gapped              Whether to do a gapped alignment.  T/F
        multihit            0 for multiple hit (default), 1 for single hit
        expectation         Expectation value cutoff.
        range_restriction   Range restriction on query sequence (Format: start,stop) blastp only
                            0 in 'start' refers to the beginning of the sequence
                            0 in 'stop' refers to the end of the sequence
                            Default = 0,0
        xdrop               Dropoff value (bits) for gapped alignments.
        xdrop_final         X dropoff for final gapped alignment (in bits).
        xdrop_extension     Dropoff for blast extensions (in bits).
        search_length       Effective length of search space.
        nbits_gapping       Number of bits to trigger gapping.
        protein             Query sequence is protein.  T/F
        db_length           Effective database length.

    Processing
        filter              Filter query sequence for low complexity?  T/F
        case_filter         Use lower case filtering of FASTA sequence T/F, default F
        believe_query       Believe the query defline.  T/F
        nprocessors         Number of processors to use.
        logfile             Name of log file to use, default rpsblast.log

    Formatting
        html                Produce HTML output?  T/F
        descriptions        Number of one-line descriptions.
        alignments          Number of alignments.
        align_view          Alignment view.  Integer 0-11,
                            passed as a string or integer.
        show_gi             Show GI's in deflines?  T/F
        seqalign_file       seqalign file to output.
        align_outfile       Output file for alignment.  Filename to write to, if
                            omitted standard output is used (which you can access
                            from the returned handles).
    """
    import warnings
    # The message must name the wrapper class that really exists (the one
    # imported below), otherwise users are sent looking for a missing class.
    warnings.warn("This function is obsolete, you are encouraged to the command line wrapper Bio.Blast.Applications.RpsBlastCommandline instead.", PendingDeprecationWarning)
    _security_check_parameters(keywds)

    # Maps the friendly keyword names onto rpsblast command line switches.
    att2param = {
        'multihit' : '-P',
        'gapped' : '-g',
        'expectation' : '-e',
        'range_restriction' : '-L',
        'xdrop' : '-X',
        'xdrop_final' : '-Z',
        'xdrop_extension' : '-y',
        'search_length' : '-Y',
        'nbits_gapping' : '-N',
        'protein' : '-p',
        'db_length' : '-z',

        'database' : '-d',
        'infile' : '-i',
        'filter' : '-F',
        'case_filter' : '-U',
        'believe_query' : '-J',
        'nprocessors' : '-a',
        'logfile' : '-l',

        'html' : '-T',
        'descriptions' : '-v',
        'alignments' : '-b',
        'align_view' : '-m',
        'show_gi' : '-I',
        'seqalign_file' : '-O',
        'align_outfile' : '-o',
        }

    from Applications import RpsBlastCommandline
    cline = RpsBlastCommandline(blastcmd)
    # The three explicit arguments are always set; everything else is optional.
    cline.set_parameter(att2param['database'], database)
    cline.set_parameter(att2param['infile'], infile)
    cline.set_parameter(att2param['align_view'], str(align_view))
    for key, value in keywds.items():
        cline.set_parameter(att2param[key], str(value))
    return _invoke_blast(cline)
2011
2012
2014 m = re.search(regex, line)
2015 if not m:
2016 raise ValueError(error_msg)
2017 return m.groups()
2018
2019 -def _get_cols(line, cols_to_get, ncols=None, expected={}):
2020 cols = line.split()
2021
2022
2023 if ncols is not None and len(cols) != ncols:
2024 raise ValueError("I expected %d columns (got %d) in line\n%s" \
2025 % (ncols, len(cols), line))
2026
2027
2028 for k in expected:
2029 if cols[k] != expected[k]:
2030 raise ValueError("I expected '%s' in column %d in line\n%s" \
2031 % (expected[k], k, line))
2032
2033
2034 results = []
2035 for c in cols_to_get:
2036 results.append(cols[c])
2037 return tuple(results)
2038
2039
2041 try:
2042 return int(str)
2043 except ValueError:
2044
2045
2046 str = str.replace(',', '')
2047
2048
2049 try:
2050 return int(str)
2051 except ValueError:
2052 pass
2053
2054
2055
2056
2057
2058 return int(float(str))
2059
2060
2062
2063
2064
2065
2066
2067 if str and str[0] in ['E', 'e']:
2068 str = '1' + str
2069 try:
2070 return float(str)
2071 except ValueError:
2072
2073 str = str.replace(',', '')
2074
2075 return float(str)
2076
2077
2079 """Start BLAST and returns handles for stdout and stderr (PRIVATE).
2080
2081 Expects a command line wrapper object from Bio.Blast.Applications
2082 """
2083 import subprocess, sys
2084 blast_cmd = cline.program_name
2085 if not os.path.exists(blast_cmd):
2086 raise ValueError("BLAST executable does not exist at %s" % blast_cmd)
2087
2088
2089
2090
2091 blast_process = subprocess.Popen(str(cline),
2092 stdin=subprocess.PIPE,
2093 stdout=subprocess.PIPE,
2094 stderr=subprocess.PIPE,
2095 universal_newlines=True,
2096 shell=(sys.platform!="win32"))
2097 blast_process.stdin.close()
2098 return blast_process.stdout, blast_process.stderr
2099
2100
2102 """Look for any attempt to insert a command into a parameter.
2103
2104 e.g. blastall(..., matrix='IDENTITY -F 0; rm -rf /etc/passwd')
2105
2106 Looks for ";" or "&&" in the strings (Unix and Windows syntax
2107 for appending a command line), or ">", "<" or "|" (redirection)
2108 and if any are found raises an exception.
2109 """
2110 for key, value in param_dict.iteritems():
2111 str_value = str(value)
2112 for bad_str in [";", "&&", ">", "<", "|"]:
2113 if bad_str in str_value:
2114 raise ValueError("Rejecting suspicious argument for %s" % key)
2115
2126
2128 """Attempt to catch and diagnose BLAST errors while parsing.
2129
2130 This utilizes the BlastParser module but adds an additional layer
2131 of complexity on top of it by attempting to diagnose ValueErrors
2132 that may actually indicate problems during BLAST parsing.
2133
2134 Current BLAST problems this detects are:
2135 o LowQualityBlastError - When BLASTing really low quality sequences
2136 (ie. some GenBank entries which are just short streches of a single
2137 nucleotide), BLAST will report an error with the sequence and be
2138 unable to search with this. This will lead to a badly formatted
2139 BLAST report that the parsers choke on. The parser will convert the
2140 ValueError to a LowQualityBlastError and attempt to provide useful
2141 information.
2142
2143 """
def __init__(self, bad_report_handle=None):
    """Set up the scanner/consumer pair used to parse and diagnose reports.

    Arguments:
    o bad_report_handle - Optional handle that receives the text of every
    report that fails to parse, so the failures can be collected (for
    example in a file).  When left as None, bad reports are not saved.
    """
    # Kept so that parse() knows where (if anywhere) to dump bad reports.
    self._bad_report_handle = bad_report_handle

    # The scanner walks the report and drives the consumer, which builds
    # up the record as it goes.
    self._scanner = _Scanner()
    self._consumer = _BlastErrorConsumer()
2158
def parse(self, handle):
    """Parse a handle, attempting to diagnose errors.

    Arguments:
    o handle - A handle whose read() returns the text of one BLAST report.

    The complete report is read into memory first, so that the same text
    can be scanned a second time if parsing fails.

    Returns the record built by the consumer (self._consumer.data).

    If the scanner raises ValueError, the report text is written to the
    bad report handle (when one was supplied) and _diagnose_error is given
    a fresh handle on the text together with the partially built record.
    If that call does not raise anything itself, the original ValueError
    is re-raised unchanged.
    """
    results = handle.read()

    try:
        self._scanner.feed(File.StringHandle(results), self._consumer)
    except ValueError:
        # if we have a bad report handle, save the info to it first
        if self._bad_report_handle:
            # send the info to the error handle
            self._bad_report_handle.write(results)

        # now we want to try and diagnose the error
        self._diagnose_error(
            File.StringHandle(results), self._consumer.data)

        # if we got here we can't figure out the problem
        # so we should pass along the error we got
        raise
    return self._consumer.data
2180
2182 """Attempt to diagnose an error in the passed handle.
2183
2184 Arguments:
2185 o handle - The handle potentially containing the error
2186 o data_record - The data record partially created by the consumer.
2187 """
2188 line = handle.readline()
2189
2190 while line:
2191
2192
2193
2194 if line.startswith('Searchingdone'):
2195 raise LowQualityBlastError("Blast failure occured on query: ",
2196 data_record.query)
2197 line = handle.readline()
2198