Package Bio :: Package Entrez :: Module Parser
[hide private]
[frames] | no frames]

Source Code for Module Bio.Entrez.Parser

  1  # Copyright 2008 by Michiel de Hoon.  All rights reserved. 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license.  Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5   
  6  """Parser for XML results returned by NCBI's Entrez Utilities. This 
  7  parser is used by the read() function in Bio.Entrez, and is not intended 
  8  to be used directly. 
  9  """ 
 10   
 11  # The question is how to represent an XML file as Python objects. Some 
 12  # XML files returned by NCBI look like lists, others look like dictionaries, 
 13  # and others look like a mix of lists and dictionaries. 
 14  # 
 15  # My approach is to classify each possible element in the XML as a plain 
 16  # string, an integer, a list, a dictionary, or a structure. The latter is a 
 17  # dictionary where the same key can occur multiple times; in Python, it is 
 18  # represented as a dictionary where that key occurs once, pointing to a list 
 19  # of values found in the XML file. 
 20  # 
 21  # The parser then goes through the XML and creates the appropriate Python 
 22  # object for each element. The different levels encountered in the XML are 
 23  # preserved on the Python side. So a subelement of a subelement of an element 
 24  # is a value in a dictionary that is stored in a list which is a value in 
 25  # some other dictionary (or a value in a list which itself belongs to a list 
 26  # which is a value in a dictionary, and so on). Attributes encountered in  
 27  # the XML are stored as a dictionary in a member .attributes of each element, 
 28  # and the tag name is saved in a member .tag. 
 29  # 
 30  # To decide which kind of Python object corresponds to each element in the 
 31  # XML, the parser analyzes the DTD referred at the top of (almost) every 
 32  # XML file returned by the Entrez Utilities. This is preferred over a hand- 
 33  # written solution, since the number of DTDs is rather large and their 
 34  # contents may change over time. About half the code in this parser deals 
 35  # with parsing the DTD, and the other half with the XML itself. 
 36   
 37   
 38  import os.path 
 39  import urlparse 
 40  import urllib 
 41  import warnings 
 42  from xml.parsers import expat 
 43   
 44  # The following five classes are used to add a member .attributes to integers, 
 45  # strings, unicode strings, lists, and dictionaries, respectively. 
 46   
class IntegerElement(int):
    """An int that can additionally carry an .attributes member."""

    def __repr__(self):
        """Return the int repr; include .attributes when it has been set."""
        plain = int.__repr__(self)
        if not hasattr(self, "attributes"):
            # No attributes were attached; look like an ordinary int.
            return plain
        return "IntegerElement(%s, attributes=%s)" % (plain,
                                                      repr(self.attributes))
55
class StringElement(str):
    """A str that can additionally carry an .attributes member."""

    def __repr__(self):
        """Return the str repr; include .attributes when it has been set."""
        plain = str.__repr__(self)
        if not hasattr(self, "attributes"):
            # No attributes were attached; look like an ordinary string.
            return plain
        return "StringElement(%s, attributes=%s)" % (plain,
                                                     repr(self.attributes))
64
class UnicodeElement(unicode):
    """A unicode string that can additionally carry an .attributes member."""

    def __repr__(self):
        """Return the unicode repr; include .attributes when it has been set."""
        plain = unicode.__repr__(self)
        if not hasattr(self, "attributes"):
            # No attributes were attached; look like an ordinary unicode string.
            return plain
        return "UnicodeElement(%s, attributes=%s)" % (plain,
                                                      repr(self.attributes))
73
class ListElement(list):
    """A list that can additionally carry an .attributes member."""

    def __repr__(self):
        """Return the list repr; include .attributes when it has been set."""
        plain = list.__repr__(self)
        if not hasattr(self, "attributes"):
            # No attributes were attached; look like an ordinary list.
            return plain
        return "ListElement(%s, attributes=%s)" % (plain,
                                                   repr(self.attributes))
82
class DictionaryElement(dict):
    """A dict that can additionally carry an .attributes member."""

    def __repr__(self):
        """Return the dict repr; include .attributes when it has been set.

        Note that the extended form is labelled "DictElement".
        """
        plain = dict.__repr__(self)
        if not hasattr(self, "attributes"):
            # No attributes were attached; look like an ordinary dict.
            return plain
        return "DictElement(%s, attributes=%s)" % (plain,
                                                   repr(self.attributes))

# A StructureElement is like a dictionary, but some of its keys can have
# multiple values associated with it. These values are stored in a list
# under each key.
class StructureElement(dict):
    """Dictionary in which selected keys accumulate their values in a list."""

    def __init__(self, keys):
        """Create the structure with an empty list under each key in keys.

        keys is kept as self.listkeys; assignments to those keys append to
        the list instead of replacing it.
        """
        # dict.__init__ does not go through the overridden __setitem__, so
        # every list key starts out with a fresh, empty list.
        dict.__init__(self, [(key, []) for key in keys])
        self.listkeys = keys

    def __setitem__(self, key, value):
        """Append value for list keys; store it directly for other keys."""
        if key not in self.listkeys:
            dict.__setitem__(self, key, value)
            return
        self[key].append(value)

    def __repr__(self):
        """Return the dict repr; include .attributes when it has been set.

        Note that the extended form is labelled "DictElement".
        """
        plain = dict.__repr__(self)
        if not hasattr(self, "attributes"):
            return plain
        return "DictElement(%s, attributes=%s)" % (plain,
                                                   repr(self.attributes))
113 114
class NotXMLError(ValueError):
    """Raised when the input data do not appear to be XML at all."""

    def __init__(self, message):
        """Remember message (a description or an underlying error) as .msg."""
        self.msg = message

    def __str__(self):
        """Return the user-facing explanation, embedding the stored message."""
        template = "Failed to parse the XML data (%s). Please make sure that the input data are in XML format."
        return template % self.msg
120 121
class CorruptedXMLError(ValueError):
    """Raised when the input is XML but could not be parsed to the end."""

    def __init__(self, message):
        """Remember message (a description or an underlying error) as .msg."""
        self.msg = message

    def __str__(self):
        """Return the user-facing explanation, embedding the stored message."""
        template = "Failed to parse the XML data (%s). Please make sure that the input data are not corrupted."
        return template % self.msg
127 128
class ValidationError(ValueError):
    """Validating parsers raise this error if the parser finds a tag in the
    XML that is not defined in the DTD. Non-validating parsers do not raise
    this error. The Bio.Entrez.read and Bio.Entrez.parse functions use
    validating parsers by default (see those functions for more information).
    """

    def __init__(self, name):
        """Remember the name of the tag that was missing from the DTD."""
        self.name = name

    def __str__(self):
        """Return the user-facing explanation, naming the offending tag."""
        template = "Failed to find tag '%s' in the DTD. To skip all tags that are not represented in the DTD, please call Bio.Entrez.read or Bio.Entrez.parse with validate=False."
        return template % self.name
135 136
class DataHandler(object):
    """Expat callback handler that turns Entrez XML into Python objects.

    The DTD referenced by the XML is parsed first (see elementDecl and
    externalEntityRefHandler) to classify every element as an error, string,
    list, dictionary, structure or eSummary "Item". The start/end/character
    data callbacks then build the corresponding ListElement,
    DictionaryElement, StructureElement, StringElement, UnicodeElement or
    IntegerElement objects. The finished top-level object is stored in
    self.object.
    """

    # Per-user directory that is searched first for DTD files.
    home = os.path.expanduser('~')
    local_dtd_dir = os.path.join(home, '.biopython', 'Bio', 'Entrez', 'DTDs')
    del home

    # Directory inside the installed Bio.Entrez package, searched second.
    from Bio import Entrez
    global_dtd_dir = os.path.join(str(Entrez.__path__[0]), "DTDs")
    del Entrez

    def __init__(self, validate):
        """Create the expat parser and the (still empty) element tables.

        validate -- if true, an element that is not declared in the DTD
                    raises ValidationError; otherwise it is skipped.
        """
        self.stack = []         # objects for the currently open elements
        self.errors = []        # element names classified as errors
        self.integers = []      # element names classified as integers
        self.strings = []       # element names classified as strings
        self.lists = []         # element names classified as lists
        self.dictionaries = []  # element names classified as dictionaries
        self.structures = {}    # element name -> keys that may repeat
        self.items = []         # element names handled as eSummary Items
        self.dtd_urls = []      # URLs of the DTDs currently being loaded
        self.validating = validate
        self.parser = expat.ParserCreate(namespace_separator=" ")
        self.parser.SetParamEntityParsing(expat.XML_PARAM_ENTITY_PARSING_ALWAYS)
        # The remaining handlers are installed by xmlDeclHandler, so that
        # their presence tells us whether an XML declaration was seen.
        self.parser.XmlDeclHandler = self.xmlDeclHandler

    def _wrap_expat_error(self, error):
        """Return the exception to raise for an expat parsing error."""
        if self.parser.StartElementHandler:
            # We saw the initial <!xml declaration, so we can be sure that
            # we are parsing XML data. Most likely, the XML file is
            # corrupted.
            return CorruptedXMLError(error)
        # We have not seen the initial <!xml declaration, so probably
        # the input data is not in XML format.
        return NotXMLError(error)

    def _missing_result_error(self):
        """Return the exception to raise when self.object was never set."""
        if self.parser.StartElementHandler:
            # We saw the initial <!xml declaration, and expat didn't notice
            # any errors, so self.object should be defined. If not, this is
            # a bug.
            return RuntimeError("Failed to parse the XML file correctly, possibly due to a bug in Bio.Entrez. Please contact the Biopython developers at biopython-dev@biopython.org for assistance.")
        # We did not see the initial <!xml declaration, so probably
        # the input data is not in XML format.
        return NotXMLError("XML declaration not found")

    def _text_element(self, text):
        """Convert text to a StringElement, or a UnicodeElement if needed."""
        # Convert Unicode strings to plain strings if possible
        try:
            return StringElement(text)
        except UnicodeEncodeError:
            return UnicodeElement(text)

    def read(self, handle):
        """Set up the parser and let it parse the XML results.

        Returns the object built from the complete XML document.
        Raises CorruptedXMLError or NotXMLError if expat fails, and
        RuntimeError or NotXMLError if no result object was produced.
        """
        try:
            self.parser.ParseFile(handle)
        except expat.ExpatError as e:
            raise self._wrap_expat_error(e)
        try:
            return self.object
        except AttributeError:
            raise self._missing_result_error()

    def parse(self, handle):
        """Parse the XML in blocks and yield top-level records one by one.

        The top-level XML element must correspond to a list; otherwise
        ValueError is raised. Raises CorruptedXMLError or NotXMLError on
        malformed or non-XML input.
        """
        BLOCK = 1024
        while True:
            # Read in another block of the file...
            text = handle.read(BLOCK)
            if not text:
                # We have reached the end of the XML file
                if self.stack:
                    # No more XML data, but there is still some unfinished
                    # business. (CorruptedXMLError requires a message.)
                    raise CorruptedXMLError("Premature end of XML stream")
                try:
                    for record in self.object:
                        yield record
                except AttributeError:
                    raise self._missing_result_error()
                self.parser.Parse("", True)
                self.parser = None
                return

            try:
                self.parser.Parse(text, False)
            except expat.ExpatError as e:
                raise self._wrap_expat_error(e)

            if not self.stack:
                # Haven't read enough from the XML file yet
                continue

            records = self.stack[0]
            if not isinstance(records, list):
                raise ValueError("The XML file does not represent a list. Please use Entrez.read instead of Entrez.parse")
            # The last entry may still be under construction; everything
            # before it is finished and can be handed out.
            while len(records) > 1:
                record = records.pop(0)
                yield record

    def xmlDeclHandler(self, version, encoding, standalone):
        """Install the remaining handlers once the XML declaration is seen."""
        self.parser.StartElementHandler = self.startElementHandler
        self.parser.EndElementHandler = self.endElementHandler
        self.parser.CharacterDataHandler = self.characterDataHandler
        self.parser.ExternalEntityRefHandler = self.externalEntityRefHandler
        self.parser.StartNamespaceDeclHandler = self.startNamespaceDeclHandler

    def startNamespaceDeclHandler(self, prefix, un):
        """Reject XML that declares namespaces; they are not supported."""
        raise NotImplementedError("The Bio.Entrez parser cannot handle XML data that make use of XML namespaces")

    def startElementHandler(self, name, attrs):
        """Create the container for an opening tag and push it on the stack.

        For string, error and integer elements nothing is pushed; only the
        attributes are remembered until the matching end tag.
        """
        self.content = ""
        if name in self.lists:
            element = ListElement()
        elif name in self.dictionaries:
            element = DictionaryElement()
        elif name in self.structures:
            element = StructureElement(self.structures[name])
        elif name in self.items:  # Only appears in ESummary
            name = str(attrs["Name"])  # convert from Unicode
            del attrs["Name"]
            itemtype = str(attrs["Type"])  # convert from Unicode
            del attrs["Type"]
            if itemtype == "Structure":
                element = DictionaryElement()
            elif name in ("ArticleIds", "History"):
                element = StructureElement(["pubmed", "medline"])
            elif itemtype == "List":
                element = ListElement()
            else:
                # Empty placeholder; the real value is built at the end tag.
                element = StringElement()
            element.itemname = name
            element.itemtype = itemtype
        elif (name in self.strings or name in self.errors
              or name in self.integers):
            self.attributes = attrs
            return
        else:
            # Element not found in DTD
            if self.validating:
                raise ValidationError(name)
            # this will not be stored in the record
            element = ""
        # Empty-string placeholders (skipped elements and scalar Items) are
        # pushed on the stack but never tagged or attached to a parent.
        if element != "":
            element.tag = name
            if attrs:
                element.attributes = dict(attrs)
            if self.stack:
                current = self.stack[-1]
                # A "" parent is a placeholder that cannot hold children;
                # endElementHandler applies the same check.
                if current != "":
                    try:
                        current.append(element)
                    except AttributeError:
                        current[name] = element
        self.stack.append(element)

    def endElementHandler(self, name):
        """Finish the element for a closing tag.

        Containers are popped from the stack into self.object; scalar
        elements are converted from the collected character data and stored
        in the enclosing container. A non-empty error element raises
        RuntimeError with its text.
        """
        value = self.content
        if name in self.errors:
            if value == "":
                return
            raise RuntimeError(value)
        elif name in self.integers:
            value = IntegerElement(value)
        elif name in self.strings:
            value = self._text_element(value)
        elif name in self.items:
            self.object = self.stack.pop()
            if self.object.itemtype in ("List", "Structure"):
                return
            elif self.object.itemtype == "Integer" and value:
                value = IntegerElement(value)
            else:
                value = self._text_element(value)
            name = self.object.itemname
        else:
            self.object = self.stack.pop()
            return
        value.tag = name
        # self.attributes is only set by startElementHandler for scalar
        # elements and is deleted once consumed, so it may be absent here.
        attributes = getattr(self, "attributes", None)
        if attributes:
            value.attributes = dict(attributes)
            del self.attributes
        current = self.stack[-1]
        if current != "":
            try:
                current.append(value)
            except AttributeError:
                current[name] = value

    def characterDataHandler(self, content):
        """Accumulate character data for the element being read."""
        self.content += content

    def elementDecl(self, name, model):
        """This callback function is called for each element declaration:
        <!ELEMENT       name          (...)>
        encountered in a DTD. The purpose of this function is to determine
        whether this element should be regarded as a string, integer, list
        dictionary, structure, or error."""
        if name.upper() == "ERROR":
            self.errors.append(name)
            return
        item_model = (expat.model.XML_CTYPE_MIXED,
                      expat.model.XML_CQUANT_REP,
                      None,
                      ((expat.model.XML_CTYPE_NAME,
                        expat.model.XML_CQUANT_NONE,
                        'Item',
                        ()),
                       ))
        if name == 'Item' and model == item_model:
            # Special case. As far as I can tell, this only occurs in the
            # eSummary DTD.
            self.items.append(name)
            return
        # First, remove ignorable parentheses around declarations
        while (model[0] in (expat.model.XML_CTYPE_SEQ,
                            expat.model.XML_CTYPE_CHOICE)
               and model[1] in (expat.model.XML_CQUANT_NONE,
                                expat.model.XML_CQUANT_OPT)
               and len(model[3]) == 1):
            model = model[3][0]
        # PCDATA declarations correspond to strings
        if model[0] in (expat.model.XML_CTYPE_MIXED,
                        expat.model.XML_CTYPE_EMPTY):
            self.strings.append(name)
            return
        # List-type elements
        if (model[0] in (expat.model.XML_CTYPE_CHOICE,
                         expat.model.XML_CTYPE_SEQ)
                and model[1] in (expat.model.XML_CQUANT_PLUS,
                                 expat.model.XML_CQUANT_REP)):
            self.lists.append(name)
            return
        # This is the tricky case. Check which keys can occur multiple
        # times. If only one key is possible, and it can occur multiple
        # times, then this is a list. If more than one key is possible,
        # but none of them can occur multiple times, then this is a
        # dictionary. Otherwise, this is a structure.
        # In 'single' and 'multiple', we keep track which keys can occur
        # only once, and which can occur multiple times.
        single = []
        multiple = []

        # The 'count' function is called recursively to make sure all the
        # children in this model are counted. Error keys are ignored;
        # they raise an exception in Python.
        def count(submodel):
            quantifier, key, children = submodel[1:]
            if key is None:
                if quantifier in (expat.model.XML_CQUANT_PLUS,
                                  expat.model.XML_CQUANT_REP):
                    for child in children:
                        multiple.append(child[2])
                else:
                    for child in children:
                        count(child)
            elif key.upper() != "ERROR":
                if quantifier in (expat.model.XML_CQUANT_NONE,
                                  expat.model.XML_CQUANT_OPT):
                    single.append(key)
                elif quantifier in (expat.model.XML_CQUANT_PLUS,
                                    expat.model.XML_CQUANT_REP):
                    multiple.append(key)

        count(model)
        if len(single) == 0 and len(multiple) == 1:
            self.lists.append(name)
        elif len(multiple) == 0:
            self.dictionaries.append(name)
        else:
            self.structures.update({name: multiple})

    def open_dtd_file(self, filename):
        """Open filename from the local or global DTD directory.

        Returns an open binary file handle, or None if the file could not
        be opened in either directory.
        """
        for directory in (DataHandler.local_dtd_dir,
                          DataHandler.global_dtd_dir):
            path = os.path.join(directory, filename)
            try:
                return open(path, "rb")
            except IOError:
                pass
        return None

    def externalEntityRefHandler(self, context, base, systemId, publicId):
        """The purpose of this function is to load the DTD locally, instead
        of downloading it from the URL specified in the XML. Using the local
        DTD results in much faster parsing. If the DTD is not found locally,
        we try to download it. If new DTDs become available from NCBI,
        putting them in Bio/Entrez/DTDs will allow the parser to see them.

        Raises ValueError for a systemId with an unsupported URL scheme, and
        RuntimeError if the DTD has to be downloaded but cannot be accessed.
        Returns 1 to tell expat that the entity was handled.
        """
        # Index access instead of .scheme: the attribute requires Python 2.5+
        scheme = urlparse.urlparse(systemId)[0]
        if scheme in ('http', 'https', 'ftp'):
            # Then this is an absolute path to the DTD.
            url = systemId
        elif scheme == '':
            # Then this is a relative path to the DTD.
            # Look at the parent URL to find the full path.
            try:
                parent = self.dtd_urls[-1]
            except IndexError:
                # Assume the default URL for DTDs if the top parent
                # does not contain an absolute path
                parent = "http://www.ncbi.nlm.nih.gov/dtd/"
            # URLs always use forward slashes, so resolve the reference with
            # urljoin rather than os.path.join.
            url = urlparse.urljoin(parent, systemId)
        else:
            raise ValueError("Unexpected URL scheme %r in DTD reference %r"
                             % (scheme, systemId))
        filename = os.path.split(systemId)[1]
        self.dtd_urls.append(url)
        handle = None
        try:
            # First, try to load the local version of the DTD file
            handle = self.open_dtd_file(filename)
            if not handle:
                # DTD is not available as a local file. Try accessing it
                # through the internet instead.
                message = """\
Unable to load DTD file %s.

Bio.Entrez uses NCBI's DTD files to parse XML files returned by NCBI Entrez.
Though most of NCBI's DTD files are included in the Biopython distribution,
sometimes you may find that a particular DTD file is missing. While we can
access the DTD file through the internet, the parser is much faster if the
required DTD files are available locally.

For this purpose, please download %s from

%s

and save it either in directory

%s

or in directory

%s

in order for Bio.Entrez to find it.

Alternatively, you can save %s in the directory
Bio/Entrez/DTDs in the Biopython distribution, and reinstall Biopython.

Please also inform the Biopython developers about this missing DTD, by
reporting a bug on http://bugzilla.open-bio.org/ or sign up to our mailing
list and emailing us, so that we can include it with the next release of
Biopython.

Proceeding to access the DTD file through the internet...
""" % (filename, filename, url, self.global_dtd_dir, self.local_dtd_dir, filename)
                warnings.warn(message)
                try:
                    handle = urllib.urlopen(url)
                except IOError:
                    raise RuntimeError("Failed to access %s at %s" % (filename, url))

            parser = self.parser.ExternalEntityParserCreate(context)
            parser.ElementDeclHandler = self.elementDecl
            parser.ParseFile(handle)
        finally:
            # Release the DTD source and restore the URL stack even when
            # loading or parsing the DTD fails.
            if handle is not None:
                handle.close()
            self.dtd_urls.pop()
        return 1
513