Package Bio :: Package Entrez
[hide private]
[frames] | no frames]

Source Code for Package Bio.Entrez

  1  # Copyright 1999-2000 by Jeffrey Chang.  All rights reserved. 
  2  # Copyright 2008 by Michiel de Hoon.  All rights reserved. 
  3  # This code is part of the Biopython distribution and governed by its 
  4  # license.  Please see the LICENSE file that should have been included 
  5  # as part of this package. 
  6   
  7  """Provides code to access NCBI over the WWW. 
  8   
  9  The main Entrez web page is available at: 
 10  http://www.ncbi.nlm.nih.gov/Entrez/ 
 11   
 12  A list of the Entrez utilities is available at: 
 13  http://www.ncbi.nlm.nih.gov/entrez/utils/utils_index.html 
 14   
 15  Variables: 
 16  email        Set the Entrez email parameter (default is not set). 
 17  tool         Set the Entrez tool parameter (default is  biopython). 
 18   
 19  Functions: 
 20  efetch       Retrieves records in the requested format from a list of one or 
 21               more primary IDs or from the user's environment 
 22  epost        Posts a file containing a list of primary IDs for future use in 
 23               the user's environment to use with subsequent search strategies 
 24  esearch      Searches and retrieves primary IDs (for use in EFetch, ELink, 
 25               and ESummary) and term translations and optionally retains 
 26               results for future use in the user's environment. 
 27  elink        Checks for the existence of an external or Related Articles link 
 28               from a list of one or more primary IDs.  Retrieves primary IDs 
 29               and relevancy scores for links to Entrez databases or Related 
 30               Articles;  creates a hyperlink to the primary LinkOut provider 
 31               for a specific ID and database, or lists LinkOut URLs 
 32               and Attributes for multiple IDs. 
 33  einfo        Provides field names, index term counts, last update, and available 
 34               links for each database. 
 35  esummary     Retrieves document summaries from a list of primary IDs or from 
 36               the user's environment. 
 37  egquery      Provides Entrez database counts in XML for a single search 
 38               using Global Query. 
 39  espell       Retrieves spelling suggestions. 
 40   
 41  read         Parses the XML results returned by any of the above functions. 
 42               Typical usage is: 
 43   
 44               >>> handle = Entrez.einfo() # or esearch, efetch, ... 
 45               >>> record = Entrez.read(handle) 
 46   
 47               where record is now a Python dictionary or list. 
 48   
 49  parse        Parses the XML results returned by any of the above functions, 
 50               returning records one by one. 
 51               Typical usage is: 
 52   
 53               >>> handle = Entrez.efetch(...) # or esummary, elink, ... 
 54               >>> records = Entrez.parse(handle) 
 55               >>> for record in records: 
 56               ...     # each record is a Python dictionary or list. 
 57               ...     print record 
 58   
 59               This function is appropriate only if the XML file contains 
 60               multiple records, and is particularly useful for large files.  
 61   
 62  _open        Internally used function. 
 63   
 64  """ 
 65  import urllib, urllib2, time, warnings 
 66  import os.path 
 67   
  # Contact address sent to NCBI as the "email" request parameter by _open
  # whenever a call does not supply its own "email" value; while this is
  # None, _open issues a UserWarning instead of sending an address.
  68  email = None 
  # Client name sent as the "tool" request parameter by _open unless the
  # call supplies its own "tool" value.
  69  tool = "biopython" 
 70   
 71   
 72  # XXX retmode? 
def epost(db, **keywds):
    """Upload a list of UIs to Entrez for later use and return a handle.

    EPost posts a file containing a list of identifiers so that they are
    available in the user's environment for subsequent search strategies.
    The request is sent with HTTP POST.

    db is the Entrez database name; any further keyword arguments are
    forwarded as request parameters.  See the online documentation for
    an explanation of the parameters:
    http://www.ncbi.nlm.nih.gov/entrez/query/static/epost_help.html

    Return a handle to the results.

    Raises an IOError exception if there's a network error.
    """
    url = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/epost.fcgi'
    # 'db' is a named argument, so it can never also occur in keywds.
    request = dict(db=db, **keywds)
    return _open(url, request, post=True)
90
def efetch(db, **keywds):
    """Fetch Entrez records and return them as a handle.

    EFetch retrieves records in the requested format from a list of one
    or more UIs, or from the user's environment.

    db is the Entrez database name; any further keyword arguments are
    forwarded as request parameters.  See the online documentation for
    an explanation of the parameters:
    http://www.ncbi.nlm.nih.gov/entrez/query/static/efetch_help.html

    Return a handle to the results.

    Raises an IOError exception if there's a network error.

    Short example:

    from Bio import Entrez
    handle = Entrez.efetch(db="nucleotide", id="57240072", rettype="gb")
    print handle.read()
    """
    url = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi'
    # 'db' is a named argument, so it can never also occur in keywds.
    request = dict(db=db, **keywds)
    return _open(url, request)
114
def esearch(db, term, **keywds):
    """Run an Entrez search and return a handle to the results.

    ESearch searches and retrieves primary IDs (for use in EFetch, ELink
    and ESummary) and term translations, and optionally retains results
    for future use in the user's environment.

    db is the Entrez database name and term is the query; any further
    keyword arguments are forwarded as request parameters.  See the
    online documentation for an explanation of the parameters:
    http://www.ncbi.nlm.nih.gov/entrez/query/static/esearch_help.html

    Return a handle to the results which are always in XML format.

    Raises an IOError exception if there's a network error.

    Short example:

    from Bio import Entrez
    handle = Entrez.esearch(db="nucleotide", retmax=10, term="Opuntia")
    record = Entrez.read(handle)
    print record["Count"]
    print record["IdList"]
    """
    url = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi'
    # 'db' and 'term' are named arguments, so neither can occur in keywds.
    request = dict(db=db, term=term, **keywds)
    return _open(url, request)
142 163
def einfo(**keywds):
    """Return a summary of the Entrez databases as a results handle.

    EInfo provides field names, index term counts, last update, and
    available links for each Entrez database.

    All keyword arguments are forwarded as request parameters.  See the
    online documentation for an explanation of the parameters:
    http://www.ncbi.nlm.nih.gov/entrez/query/static/einfo_help.html

    Return a handle to the results, by default in XML format.

    Raises an IOError exception if there's a network error.

    Short example:

    from Bio import Entrez
    record = Entrez.read(Entrez.einfo())
    print record['DbList']
    """
    url = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/einfo.fcgi'
    # Hand _open its own copy of the keyword arguments.
    request = dict(keywds)
    return _open(url, request)
187
def esummary(**keywds):
    """Retrieve document summaries and return them as a results handle.

    ESummary retrieves document summaries from a list of primary IDs or
    from the user's environment.

    All keyword arguments are forwarded as request parameters.  See the
    online documentation for an explanation of the parameters:
    http://www.ncbi.nlm.nih.gov/entrez/query/static/esummary_help.html

    Return a handle to the results, by default in XML format.

    Raises an IOError exception if there's a network error.
    """
    url = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi'
    # Hand _open its own copy of the keyword arguments.
    request = dict(keywds)
    return _open(url, request)
205
def egquery(**keywds):
    """Return Entrez database counts for a global search as a handle.

    EGQuery provides Entrez database counts in XML for a single search
    using Global Query.

    All keyword arguments are forwarded as request parameters.  See the
    online documentation for an explanation of the parameters:
    http://www.ncbi.nlm.nih.gov/entrez/query/static/egquery_help.html

    Return a handle to the results in XML format.

    Raises an IOError exception if there's a network error.
    """
    url = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/egquery.fcgi'
    # Hand _open its own copy of the keyword arguments.
    request = dict(keywds)
    return _open(url, request)
223
def espell(**keywds):
    """Retrieve spelling suggestions and return them as a results handle.

    ESpell retrieves spelling suggestions, if available.

    All keyword arguments are forwarded as request parameters.  See the
    online documentation for an explanation of the parameters:
    http://www.ncbi.nlm.nih.gov/entrez/query/static/espell_help.html

    Return a handle to the results, by default in XML format.

    Raises an IOError exception if there's a network error.

    Short example:

    from Bio import Entrez
    record = Entrez.read(Entrez.espell(term="biopythooon"))
    print record["Query"]
    print record["CorrectedQuery"]
    """
    url = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/espell.fcgi'
    # Hand _open its own copy of the keyword arguments.
    request = dict(keywds)
    return _open(url, request)
247
def read(handle, validate=True):
    """Parse an Entrez Utilities XML file into a single Python object.

    The XML produced by NCBI's Entrez Utilities is converted into a
    multilevel data structure of Python lists and dictionaries.  Most
    such XML files can be handled, provided the matching DTD is
    available; Biopython includes the DTDs for the most commonly used
    Entrez Utilities.

    With validate set to True (the default) the parser validates the
    XML against the DTD and raises an error when it meets a tag that
    the DTD does not describe.  With validate set to False such tags
    are simply skipped.

    The returned lists, dictionaries, strings and so on are really
    classes derived from those base types.  This lets every element
    carry its XML attributes (if any) in my_element.attributes and its
    tag name in my_element.tag.
    """
    # Imported here rather than at module level, as in the original code.
    from Parser import DataHandler
    return DataHandler(validate).read(handle)
272
def parse(handle, validate=True):
    """Parse an Entrez Utilities XML file record by record.

    The XML produced by NCBI's Entrez Utilities is converted into
    multilevel data structures of Python lists and dictionaries.  Use
    this function for XML files that (in Python) can be represented as
    a list of individual records: whereas 'read' consumes the complete
    file and returns one Python list, 'parse' hands back the records
    one by one, which makes it particularly useful for large files.

    Most XML files returned by NCBI's Entrez Utilities can be handled,
    provided the matching DTD is available; Biopython includes the DTDs
    for the most commonly used Entrez Utilities.

    With validate set to True (the default) the parser validates the
    XML against the DTD and raises an error when it meets a tag that
    the DTD does not describe.  With validate set to False such tags
    are simply skipped.

    The returned lists, dictionaries, strings and so on are really
    classes derived from those base types.  This lets every element
    carry its XML attributes (if any) in my_element.attributes and its
    tag name in my_element.tag.
    """
    # Imported here rather than at module level, as in the original code.
    from Parser import DataHandler
    return DataHandler(validate).parse(handle)
303
def _open(cgi, params=None, post=False):
    """Helper function to build the URL and open a handle to it (PRIVATE).

    Open a handle to Entrez.

    Arguments:
    cgi     URL of the CGI script to access.
    params  Dictionary with the options to pass to the script (optional).
            Entries whose value is None are left out of the request.
            The dictionary is copied, never modified.
    post    If True the options are sent with HTTP POST, otherwise they
            are appended to the URL and sent with HTTP GET.

    The "tool" and "email" options are filled in from the module level
    variables of the same name unless params already supplies them.  If
    no email address is available at all, a UserWarning is issued.

    This function also enforces the "up to three queries per second rule"
    to avoid abusing the NCBI servers.

    Return the handle created by urllib2.urlopen.  Errors raised while
    opening the URL (e.g. urllib2.HTTPError) propagate to the caller
    unchanged.
    """
    # NCBI requirement: At most three queries per second.
    # Equivalently, at least a third of second between queries
    delay = 0.333333334
    current = time.time()
    wait = _open.previous + delay - current
    if wait > 0:
        time.sleep(wait)
        _open.previous = current + wait
    else:
        _open.previous = current

    # Work on a private copy with the None values removed.  This keeps the
    # caller's dictionary untouched, avoids deleting keys from a dictionary
    # while looping over it, and (together with the None default) prevents
    # "tool"/"email" values from leaking from one call into the next.
    if params is None:
        params = {}
    params = dict((key, value) for key, value in params.items()
                  if value is not None)

    # Tell Entrez that we are using Biopython (or whatever the user has
    # specified explicitly in the parameters or by changing the default)
    if "tool" not in params:
        params["tool"] = tool
    # Tell Entrez who we are
    if "email" not in params:
        if email is not None:
            params["email"] = email
        else:
            warnings.warn("""
Email address is not specified.

To make use of NCBI's E-utilities, NCBI strongly recommends you to specify
your email address with each request. From June 1, 2010, this will be
mandatory. As an example, if your email address is A.N.Other@example.com, you
can specify it as follows:
   from Bio import Entrez
   Entrez.email = 'A.N.Other@example.com'
In case of excessive usage of the E-utilities, NCBI will attempt to contact
a user at the email address provided before blocking access to the
E-utilities.""", UserWarning)

    # Open a handle to Entrez.  Exceptions are deliberately not caught:
    # catching and re-raising them would only shorten the traceback.
    options = urllib.urlencode(params, doseq=True)
    if post:
        # HTTP POST
        return urllib2.urlopen(cgi, data=options)
    # HTTP GET
    return urllib2.urlopen(cgi + "?" + options)


# Time of the most recent request; 0 means no request has been made yet,
# so the first call never has to wait.
_open.previous = 0