1
2
3
4
5
6
7
8
9 """
10 This module provides code to work with the WWW version of BLAST
11 provided by the NCBI.
12 http://blast.ncbi.nlm.nih.gov/
13
14 Functions:
15 qblast Do a BLAST search using the QBLAST API.
16 """
17
18 import sys
19 try:
20 from cStringIO import StringIO
21 except ImportError:
22 from StringIO import StringIO
23
24 from Bio._py3k import _as_string
25
def qblast(program, database, sequence,
           auto_format=None, composition_based_statistics=None,
           db_genetic_code=None, endpoints=None, entrez_query='(none)',
           expect=10.0, filter=None, gapcosts=None, genetic_code=None,
           hitlist_size=50, i_thresh=None, layout=None, lcase_mask=None,
           matrix_name=None, nucl_penalty=None, nucl_reward=None,
           other_advanced=None, perc_ident=None, phi_pattern=None,
           query_file=None, query_believe_defline=None, query_from=None,
           query_to=None, searchsp_eff=None, service=None, threshold=None,
           ungapped_alignment=None, word_size=None,
           alignments=500, alignment_view=None, descriptions=500,
           entrez_links_new_window=None, expect_low=None, expect_high=None,
           format_entrez_query=None, format_object=None, format_type='XML',
           ncbi_gi=None, results_file=None, show_overview=None, megablast=None,
           ):
    """Do a BLAST search using the QBLAST server at NCBI.

    Supports all parameters of the qblast API for Put and Get.
    Some useful parameters:
    program        blastn, blastp, blastx, tblastn, or tblastx (lower case)
    database       Which database to search against (e.g. "nr").
    sequence       The sequence to search.
    ncbi_gi        TRUE/FALSE whether to give 'gi' identifier.
    descriptions   Number of descriptions to show.  Def 500.
    alignments     Number of alignments to show.  Def 500.
    expect         An expect value cutoff.  Def 10.0.
    matrix_name    Specify an alt. matrix (PAM30, PAM70, BLOSUM80, BLOSUM45).
    filter         "none" turns off filtering.  Default no filtering
    format_type    "HTML", "Text", "ASN.1", or "XML".  Def. "XML".
    entrez_query   Entrez query to limit Blast search
    hitlist_size   Number of hits to return. Default 50
    megablast      TRUE/FALSE whether to use Mega BLAST algorithm (blastn only)
    service        plain, psi, phi, rpsblast, megablast (lower case)

    Apart from the program name, this function does no checking of the
    validity of the parameters and passes the values to the server as is.
    Arguments left as None are not sent at all.  More help is available at:
    http://www.ncbi.nlm.nih.gov/BLAST/blast_overview.html

    The search is submitted with a "Put" request, then the server is polled
    with "Get" requests (at most one every three seconds) until the returned
    page either has no "Status=" line or reports a status of READY.

    Returns a StringIO handle holding the text of the final page.

    Raises ValueError if program is not one of the five supported names
    (checked before any network access), or if the submission reply cannot
    be parsed by _parse_qblast_ref_page.
    """
    # Validate before touching the network.  A real exception is raised
    # instead of using ``assert`` so the check still runs under ``python -O``.
    if program not in ('blastn', 'blastp', 'blastx', 'tblastn', 'tblastx'):
        raise ValueError("Unsupported BLAST program %r, expected one of "
                         "blastn, blastp, blastx, tblastn or tblastx"
                         % (program,))

    import urllib
    import urllib2
    import time

    url = "http://blast.ncbi.nlm.nih.gov/Blast.cgi"
    headers = {"User-Agent": "BiopythonClient"}

    # Step 1: submit the search ("Put").  Only parameters the caller actually
    # supplied (i.e. not None) are included in the request.
    parameters = [
        ('AUTO_FORMAT', auto_format),
        ('COMPOSITION_BASED_STATISTICS', composition_based_statistics),
        ('DATABASE', database),
        ('DB_GENETIC_CODE', db_genetic_code),
        ('ENDPOINTS', endpoints),
        ('ENTREZ_QUERY', entrez_query),
        ('EXPECT', expect),
        ('FILTER', filter),
        ('GAPCOSTS', gapcosts),
        ('GENETIC_CODE', genetic_code),
        ('HITLIST_SIZE', hitlist_size),
        ('I_THRESH', i_thresh),
        ('LAYOUT', layout),
        ('LCASE_MASK', lcase_mask),
        ('MEGABLAST', megablast),
        ('MATRIX_NAME', matrix_name),
        ('NUCL_PENALTY', nucl_penalty),
        ('NUCL_REWARD', nucl_reward),
        ('OTHER_ADVANCED', other_advanced),
        ('PERC_IDENT', perc_ident),
        ('PHI_PATTERN', phi_pattern),
        ('PROGRAM', program),
        ('QUERY', sequence),
        ('QUERY_FILE', query_file),
        ('QUERY_BELIEVE_DEFLINE', query_believe_defline),
        ('QUERY_FROM', query_from),
        ('QUERY_TO', query_to),
        ('SEARCHSP_EFF', searchsp_eff),
        ('SERVICE', service),
        ('THRESHOLD', threshold),
        ('UNGAPPED_ALIGNMENT', ungapped_alignment),
        ('WORD_SIZE', word_size),
        ('CMD', 'Put'),
        ]
    query = [x for x in parameters if x[1] is not None]
    message = urllib.urlencode(query)

    request = urllib2.Request(url, message, headers)
    handle = urllib2.urlopen(request)
    try:
        # The RTOE value is parsed (and validated) together with the RID,
        # but the polling loop below uses a fixed delay and ignores it.
        rid, _rtoe = _parse_qblast_ref_page(handle)
    finally:
        # Always release the connection, even if parsing raised.
        handle.close()

    # Step 2: build the "Get" request used to fetch the formatted results.
    parameters = [
        ('ALIGNMENTS', alignments),
        ('ALIGNMENT_VIEW', alignment_view),
        ('DESCRIPTIONS', descriptions),
        ('ENTREZ_LINKS_NEW_WINDOW', entrez_links_new_window),
        ('EXPECT_LOW', expect_low),
        ('EXPECT_HIGH', expect_high),
        ('FORMAT_ENTREZ_QUERY', format_entrez_query),
        ('FORMAT_OBJECT', format_object),
        ('FORMAT_TYPE', format_type),
        ('NCBI_GI', ncbi_gi),
        ('RID', rid),
        ('RESULTS_FILE', results_file),
        ('SERVICE', service),
        ('SHOW_OVERVIEW', show_overview),
        ('CMD', 'Get'),
        ]
    query = [x for x in parameters if x[1] is not None]
    message = urllib.urlencode(query)

    # Step 3: poll until the results are ready, leaving at least ``delay``
    # seconds between the start of consecutive requests.
    delay = 3.0
    previous = time.time()
    while True:
        current = time.time()
        wait = previous + delay - current
        if wait > 0:
            time.sleep(wait)
            previous = current + wait
        else:
            previous = current

        request = urllib2.Request(url, message, headers)
        handle = urllib2.urlopen(request)
        try:
            results = _as_string(handle.read())
        finally:
            # One connection is opened per poll; close each one promptly
            # rather than leaving them for the garbage collector.
            handle.close()

        # A page that is just two newlines carries no status information,
        # so simply ask again.
        if results == "\n\n":
            continue

        # No status line at all: treat the page as the final output.
        i = results.find("Status=")
        if i < 0:
            break

        # Take the rest of the status line; tolerate it being the last line
        # of the page with no terminating newline.
        status = results[i + len("Status="):].split("\n", 1)[0].strip()
        if status.upper() == "READY":
            break

    return StringIO(results)
179
def _qblast_labelled_value(page, label):
    """Return the stripped text following label on its line, or None (PRIVATE).

    None is returned when label does not occur in page.  If the label sits on
    the last line of the page and that line has no trailing newline, the
    whole remainder of the page is used.
    """
    start = page.find(label)
    if start == -1:
        return None
    # Splitting (rather than slicing up to str.find's result) avoids chopping
    # the final character when there is no newline after the value.
    return page[start + len(label):].split("\n", 1)[0].strip()


def _parse_qblast_ref_page(handle):
    """Extract a tuple of RID, RTOE from the 'please wait' page (PRIVATE).

    The NCBI FAQ pages use TOE for 'Time of Execution', so RTOE is probably
    'Request Time of Execution' and RID would be 'Request Identifier'.

    The argument handle must provide a read() method returning the page.

    Returns the tuple (rid, rtoe), with rid as a string and rtoe as an int.

    Raises ValueError if:
     - neither value is present (the message includes NCBI's own error text
       when one can be located in the page),
     - only one of the two values is present, or
     - the RTOE value is not an integer.
    """
    s = _as_string(handle.read())
    rid = _qblast_labelled_value(s, "RID =")
    rtoe = _qblast_labelled_value(s, "RTOE =")

    if not rid and not rtoe:
        # Neither value found, so look for an error message to report.
        # Two HTML wrappers are checked, in this order; for each, the first
        # line of text inside the element is taken as the message.
        for open_tag, close_tag in (('<div class="error msInf">', "</div>"),
                                    ('<p class="error">', "</p>")):
            i = s.find(open_tag)
            if i != -1:
                msg = s[i + len(open_tag):].strip()
                msg = msg.split(close_tag, 1)[0].split("\n", 1)[0].strip()
                if msg:
                    raise ValueError("Error message from NCBI: %s" % msg)

        # Otherwise look for a 'Message ID#' marker and report the text from
        # there up to the next tag or end of line.
        i = s.find('Message ID#')
        if i != -1:
            msg = s[i:].split("<", 1)[0].split("\n", 1)[0].strip()
            raise ValueError("Error message from NCBI: %s" % msg)

        # Nothing recognisable to pass on to the caller.
        raise ValueError("No RID and no RTOE found in the 'please wait' page, "
                         "there was probably an error in your request but we "
                         "could not extract a helpful error message.")
    elif not rid:
        raise ValueError("No RID found in the 'please wait' page."
                         " (although RTOE = %s)" % repr(rtoe))
    elif not rtoe:
        raise ValueError("No RTOE found in the 'please wait' page."
                         " (although RID = %s)" % repr(rid))

    try:
        return rid, int(rtoe)
    except ValueError:
        raise ValueError("A non-integer RTOE found in "
                         "the 'please wait' page, %s" % repr(rtoe))
247