Package Bio :: Package Blast :: Module NCBIWWW
[hide private]
[frames | no frames]

Source Code for Module Bio.Blast.NCBIWWW

  1  # Copyright 1999 by Jeffrey Chang.  All rights reserved. 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license.  Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5   
  6  # Patched by Brad Chapman. 
  7  # Chris Wroe added modifications for work in myGrid 
  8   
  9  """ 
 10  This module provides code to work with the WWW version of BLAST 
 11  provided by the NCBI. 
 12  http://blast.ncbi.nlm.nih.gov/ 
 13   
 14  Functions: 
 15  qblast        Do a BLAST search using the QBLAST API. 
 16  """ 
 17   
 18  import sys 
 19  try: 
 20      from cStringIO import StringIO 
 21  except ImportError: 
 22      from StringIO import StringIO 
 23   
 24  from Bio._py3k import _as_string 
 25   
def qblast(program, database, sequence,
           auto_format=None, composition_based_statistics=None,
           db_genetic_code=None, endpoints=None, entrez_query='(none)',
           expect=10.0, filter=None, gapcosts=None, genetic_code=None,
           hitlist_size=50, i_thresh=None, layout=None, lcase_mask=None,
           matrix_name=None, nucl_penalty=None, nucl_reward=None,
           other_advanced=None, perc_ident=None, phi_pattern=None,
           query_file=None, query_believe_defline=None, query_from=None,
           query_to=None, searchsp_eff=None, service=None, threshold=None,
           ungapped_alignment=None, word_size=None,
           alignments=500, alignment_view=None, descriptions=500,
           entrez_links_new_window=None, expect_low=None, expect_high=None,
           format_entrez_query=None, format_object=None, format_type='XML',
           ncbi_gi=None, results_file=None, show_overview=None, megablast=None,
           ):
    """Do a BLAST search using the QBLAST server at NCBI.

    Supports all parameters of the qblast API for Put and Get.
    Some useful parameters:
    program        blastn, blastp, blastx, tblastn, or tblastx (lower case)
    database       Which database to search against (e.g. "nr").
    sequence       The sequence to search.
    ncbi_gi        TRUE/FALSE whether to give 'gi' identifier.
    descriptions   Number of descriptions to show.  Def 500.
    alignments     Number of alignments to show.  Def 500.
    expect         An expect value cutoff.  Def 10.0.
    matrix_name    Specify an alt. matrix (PAM30, PAM70, BLOSUM80, BLOSUM45).
    filter         "none" turns off filtering.  Default no filtering
    format_type    "HTML", "Text", "ASN.1", or "XML".  Def. "XML".
    entrez_query   Entrez query to limit Blast search
    hitlist_size   Number of hits to return. Default 50
    megablast      TRUE/FALSE whether to use MEGABLAST algorithm (blastn only)
    service        plain, psi, phi, rpsblast, megablast (lower case)

    Apart from the program name, this function does no checking of the
    validity of the parameters and passes the values to the server as is.
    Arguments left as None are not sent at all.  More help is available at:
    http://www.ncbi.nlm.nih.gov/BLAST/blast_overview.html

    Returns a StringIO handle holding the text of the final results page.

    Raises AssertionError if program is not one of the five supported
    names, and ValueError (from _parse_qblast_ref_page) if the server's
    reply to the initial request does not contain a usable RID and RTOE.
    """
    # Validate before touching the network.  The error is raised explicitly
    # rather than with an ``assert`` statement so that the check is not
    # stripped when Python runs with -O; the exception type is kept as
    # AssertionError so existing callers see no difference.
    if program not in ('blastn', 'blastp', 'blastx', 'tblastn', 'tblastx'):
        raise AssertionError("Unsupported BLAST program %r, expected one of "
                             "blastn, blastp, blastx, tblastn or tblastx"
                             % (program,))

    import time
    import urllib
    import urllib2

    url = "http://blast.ncbi.nlm.nih.gov/Blast.cgi"
    headers = {"User-Agent": "BiopythonClient"}

    # Format the "Put" command, which sends search requests to qblast.
    # Parameters taken from http://www.ncbi.nlm.nih.gov/BLAST/Doc/node5.html on 9 July 2007
    # Additional parameters are taken from http://www.ncbi.nlm.nih.gov/BLAST/Doc/node9.html on 8 Oct 2010
    # To perform a PSI-BLAST or PHI-BLAST search the service ("Put" and "Get" commands) must be specified
    # (e.g. psi_blast = NCBIWWW.qblast("blastp", "refseq_protein", input_sequence, service="psi"))
    parameters = [
        ('AUTO_FORMAT', auto_format),
        ('COMPOSITION_BASED_STATISTICS', composition_based_statistics),
        ('DATABASE', database),
        ('DB_GENETIC_CODE', db_genetic_code),
        ('ENDPOINTS', endpoints),
        ('ENTREZ_QUERY', entrez_query),
        ('EXPECT', expect),
        ('FILTER', filter),
        ('GAPCOSTS', gapcosts),
        ('GENETIC_CODE', genetic_code),
        ('HITLIST_SIZE', hitlist_size),
        ('I_THRESH', i_thresh),
        ('LAYOUT', layout),
        ('LCASE_MASK', lcase_mask),
        ('MEGABLAST', megablast),
        ('MATRIX_NAME', matrix_name),
        ('NUCL_PENALTY', nucl_penalty),
        ('NUCL_REWARD', nucl_reward),
        ('OTHER_ADVANCED', other_advanced),
        ('PERC_IDENT', perc_ident),
        ('PHI_PATTERN', phi_pattern),
        ('PROGRAM', program),
        #('PSSM',pssm), - It is possible to use PSI-BLAST via this API?
        ('QUERY', sequence),
        ('QUERY_FILE', query_file),
        ('QUERY_BELIEVE_DEFLINE', query_believe_defline),
        ('QUERY_FROM', query_from),
        ('QUERY_TO', query_to),
        #('RESULTS_FILE',...), - Can we use this parameter?
        ('SEARCHSP_EFF', searchsp_eff),
        ('SERVICE', service),
        ('THRESHOLD', threshold),
        ('UNGAPPED_ALIGNMENT', ungapped_alignment),
        ('WORD_SIZE', word_size),
        ('CMD', 'Put'),
        ]
    # Only options the caller actually set are sent to the server.
    query = [x for x in parameters if x[1] is not None]
    message = urllib.urlencode(query)

    # Send off the initial query to qblast.
    # Note the NCBI do not currently impose a rate limit here, other
    # than the request not to make say 50 queries at once using multiple
    # threads.
    request = urllib2.Request(url, message, headers)
    handle = urllib2.urlopen(request)
    try:
        # The helper also checks that an integer RTOE is present; only the
        # RID is needed to fetch the results below.
        rid, _rtoe = _parse_qblast_ref_page(handle)
    finally:
        # Always release the connection, even if parsing raised.
        handle.close()

    # Format the "Get" command, which gets the formatted results from qblast
    # Parameters taken from http://www.ncbi.nlm.nih.gov/BLAST/Doc/node6.html on 9 July 2007
    parameters = [
        ('ALIGNMENTS', alignments),
        ('ALIGNMENT_VIEW', alignment_view),
        ('DESCRIPTIONS', descriptions),
        ('ENTREZ_LINKS_NEW_WINDOW', entrez_links_new_window),
        ('EXPECT_LOW', expect_low),
        ('EXPECT_HIGH', expect_high),
        ('FORMAT_ENTREZ_QUERY', format_entrez_query),
        ('FORMAT_OBJECT', format_object),
        ('FORMAT_TYPE', format_type),
        ('NCBI_GI', ncbi_gi),
        ('RID', rid),
        ('RESULTS_FILE', results_file),
        ('SERVICE', service),
        ('SHOW_OVERVIEW', show_overview),
        ('CMD', 'Get'),
        ]
    query = [x for x in parameters if x[1] is not None]
    message = urllib.urlencode(query)

    # Poll NCBI until the results are ready.  Use a 3 second wait
    delay = 3.0
    previous = time.time()
    while True:
        # Sleep just long enough that consecutive requests start at least
        # ``delay`` seconds apart, however long the previous one took.
        current = time.time()
        wait = previous + delay - current
        if wait > 0:
            time.sleep(wait)
            previous = current + wait
        else:
            previous = current

        request = urllib2.Request(url, message, headers)
        handle = urllib2.urlopen(request)
        try:
            results = _as_string(handle.read())
        finally:
            # One connection is opened per poll; close each one promptly.
            handle.close()

        # Can see an "\n\n" page while results are in progress,
        # if so just wait a bit longer...
        if results == "\n\n":
            continue
        # XML results don't have the Status tag when finished
        i = results.find("Status=")
        if i < 0:
            break
        j = results.find("\n", i)
        if j < 0:
            # Status line is the last line and has no trailing newline.
            j = len(results)
        status = results[i + len("Status="):j].strip()
        if status.upper() == "READY":
            break

    return StringIO(results)
179
def _parse_qblast_ref_page(handle):
    """Extract a tuple of RID, RTOE from the 'please wait' page (PRIVATE).

    The NCBI FAQ pages use TOE for 'Time of Execution', so RTOE is probably
    'Request Time of Execution' and RID would be 'Request Identifier'.

    The handle is read in full but is not closed here.  Returns the RID as
    a string and the RTOE converted to an integer.

    Raises ValueError if either value is missing (quoting the server's own
    error message when one can be found in the page) or if the RTOE is not
    an integer.
    """
    s = _as_string(handle.read())
    rid = _find_qblast_value(s, "RID =")
    rtoe = _find_qblast_value(s, "RTOE =")

    if not rid and not rtoe:
        #Can we reliably extract the error message from the HTML page?
        #e.g.  "Message ID#24 Error: Failed to read the Blast query:
        #       Nucleotide FASTA provided for protein sequence"
        #or    "Message ID#32 Error: Query contains no data: Query
        #       contains no sequence data"
        #
        #This used to occur inside a <div class="error msInf"> entry,
        #and in spring 2010 the markup was <p class="error"> instead.
        #Try each layout in that order.
        for start_tag, end_tag in (('<div class="error msInf">', "</div>"),
                                   ('<p class="error">', "</p>")):
            i = s.find(start_tag)
            if i != -1:
                msg = s[i + len(start_tag):].strip()
                msg = msg.split(end_tag, 1)[0].split("\n", 1)[0].strip()
                if msg:
                    raise ValueError("Error message from NCBI: %s" % msg)
        #Generic search based on the way the error messages start:
        i = s.find('Message ID#')
        if i != -1:
            #Break the message at the first HTML tag
            msg = s[i:].split("<", 1)[0].split("\n", 1)[0].strip()
            raise ValueError("Error message from NCBI: %s" % msg)
        #We didn't recognise the error layout :(
        raise ValueError("No RID and no RTOE found in the 'please wait' page, "
                         "there was probably an error in your request but we "
                         "could not extract a helpful error message.")
    elif not rid:
        #Can this happen?
        raise ValueError("No RID found in the 'please wait' page."
                         " (although RTOE = %s)" % repr(rtoe))
    elif not rtoe:
        #Can this happen?
        raise ValueError("No RTOE found in the 'please wait' page."
                         " (although RID = %s)" % repr(rid))

    try:
        return rid, int(rtoe)
    except ValueError:
        raise ValueError("A non-integer RTOE found in "
                         "the 'please wait' page, %s" % repr(rtoe))


def _find_qblast_value(text, key):
    """Return the stripped remainder of the line following key (PRIVATE).

    Returns None if key does not occur in text.  The value runs up to the
    next newline, or to the end of text when the line is the last one and
    has no trailing newline.
    """
    i = text.find(key)
    if i == -1:
        return None
    j = text.find("\n", i)
    if j == -1:
        # Without this, the -1 would be used as a slice end and silently
        # chop the final character off the value.
        j = len(text)
    return text[i + len(key):j].strip()
247