1
2
3
4
5
6 """Parser for XML results returned by NCBI's Entrez Utilities. This
7 parser is used by the read() function in Bio.Entrez, and is not intended
8 to be used directly.
9 """
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38 import os.path
39 import urlparse
40 import urllib
41 import warnings
42 from xml.parsers import expat
43
44
45
46
49 text = int.__repr__(self)
50 try:
51 attributes = self.attributes
52 except AttributeError:
53 return text
54 return "IntegerElement(%s, attributes=%s)" % (text, repr(attributes))
55
58 text = str.__repr__(self)
59 try:
60 attributes = self.attributes
61 except AttributeError:
62 return text
63 return "StringElement(%s, attributes=%s)" % (text, repr(attributes))
64
67 text = unicode.__repr__(self)
68 try:
69 attributes = self.attributes
70 except AttributeError:
71 return text
72 return "UnicodeElement(%s, attributes=%s)" % (text, repr(attributes))
73
76 text = list.__repr__(self)
77 try:
78 attributes = self.attributes
79 except AttributeError:
80 return text
81 return "ListElement(%s, attributes=%s)" % (text, repr(attributes))
82
85 text = dict.__repr__(self)
86 try:
87 attributes = self.attributes
88 except AttributeError:
89 return text
90 return "DictElement(%s, attributes=%s)" % (text, repr(attributes))
91
92
93
94
107 text = dict.__repr__(self)
108 try:
109 attributes = self.attributes
110 except AttributeError:
111 return text
112 return "DictElement(%s, attributes=%s)" % (text, repr(attributes))
113
114
119 return "Failed to parse the XML data (%s). Please make sure that the input data are in XML format." % self.msg
120
121
126 return "Failed to parse the XML data (%s). Please make sure that the input data are not corrupted." % self.msg
127
128
130 """Validating parsers raise this error if the parser finds a tag in the XML that is not defined in the DTD. Non-validating parsers do not raise this error. The Bio.Entrez.read and Bio.Entrez.parse functions use validating parsers by default (see those functions for more information)"""
134 return "Failed to find tag '%s' in the DTD. To skip all tags that are not represented in the DTD, please call Bio.Entrez.read or Bio.Entrez.parse with validate=False." % self.name
135
136
138
139 home = os.path.expanduser('~')
140 local_dtd_dir = os.path.join(home, '.biopython', 'Bio', 'Entrez', 'DTDs')
141 del home
142
143 from Bio import Entrez
144 global_dtd_dir = os.path.join(str(Entrez.__path__[0]), "DTDs")
145 del Entrez
146
148 self.stack = []
149 self.errors = []
150 self.integers = []
151 self.strings = []
152 self.lists = []
153 self.dictionaries = []
154 self.structures = {}
155 self.items = []
156 self.dtd_urls = []
157 self.validating = validate
158 self.parser = expat.ParserCreate(namespace_separator=" ")
159 self.parser.SetParamEntityParsing(expat.XML_PARAM_ENTITY_PARSING_ALWAYS)
160 self.parser.XmlDeclHandler = self.xmlDeclHandler
161
def read(self, handle):
    """Parse the complete XML document in *handle* and return the result.

    The whole of *handle* is handed to the expat parser in a single call;
    afterwards the object stored on ``self.object`` is returned.

    Args:
        handle: File-like object passed to the expat parser's ParseFile().

    Returns:
        The value of ``self.object`` once parsing has finished.

    Raises:
        CorruptedXMLError: expat reported an error while a start-element
            handler was installed on the parser.
        NotXMLError: expat reported an error while no start-element handler
            was installed, or parsing finished without ``self.object``
            being set and without such a handler.
        RuntimeError: parsing finished without an expat error and with a
            start-element handler installed, yet ``self.object`` was never
            set.
    """
    try:
        self.parser.ParseFile(handle)
    except expat.ExpatError as e:
        # NOTE(review): a start-element handler is presumably installed only
        # once the XML declaration has been seen (by xmlDeclHandler, which is
        # not visible here) -- confirm. If so, its presence means the input
        # started out as XML and went wrong later.
        if self.parser.StartElementHandler:
            raise CorruptedXMLError(e)
        # No handler installed: the input most likely is not XML at all.
        raise NotXMLError(e)
    try:
        return self.object
    except AttributeError:
        if self.parser.StartElementHandler:
            # expat reported no error and handlers were installed, so the
            # result should exist; its absence points at a parser bug.
            raise RuntimeError("Failed to parse the XML file correctly, possibly due to a bug in Bio.Entrez. Please contact the Biopython developers at biopython-dev@biopython.org for assistance.")
        raise NotXMLError("XML declaration not found")
188
def parse(self, handle):
    """Incrementally parse an XML list from *handle*, yielding its records.

    This is a generator. Input is read in fixed-size blocks and fed to the
    expat parser; after each block, every entry of the top-level list
    except the last is removed from that list and yielded. When the input
    is exhausted, the entries of ``self.object`` are yielded, the parser is
    finalised and ``self.parser`` is set to None.

    Args:
        handle: File-like object providing ``read(size)``.

    Yields:
        The records of the top-level list, in document order.

    Raises:
        CorruptedXMLError: expat reported an error while a start-element
            handler was installed, or the input ended while ``self.stack``
            was still non-empty.
        NotXMLError: expat reported an error while no start-element handler
            was installed, or the input ended without ``self.object`` being
            set and without such a handler.
        RuntimeError: the input ended without error and with a
            start-element handler installed, yet ``self.object`` was never
            set.
        ValueError: the bottom entry of ``self.stack`` is not a list.
    """
    BLOCK = 1024
    while True:
        # Read in another block of the file.
        text = handle.read(BLOCK)
        if not text:
            # End of input.
            if self.stack:
                # Something is still on the stack, so the data stopped
                # before the document was complete. The error is raised with
                # a message, like every other CorruptedXMLError call site.
                raise CorruptedXMLError("Premature end of XML stream")
            # Only the attribute lookup is guarded, so an AttributeError
            # arising while records are being yielded is not misreported.
            try:
                remaining = self.object
            except AttributeError:
                # NOTE(review): a start-element handler is presumably
                # installed only after the XML declaration has been seen
                # (by xmlDeclHandler, not visible here) -- confirm.
                if self.parser.StartElementHandler:
                    raise RuntimeError("Failed to parse the XML file correctly, possibly due to a bug in Bio.Entrez. Please contact the Biopython developers at biopython-dev@biopython.org for assistance.")
                raise NotXMLError("XML declaration not found")
            for record in remaining:
                yield record
            # Tell expat that no more data will follow, then drop the parser.
            self.parser.Parse("", True)
            self.parser = None
            return

        try:
            self.parser.Parse(text, False)
        except expat.ExpatError as e:
            if self.parser.StartElementHandler:
                raise CorruptedXMLError(e)
            raise NotXMLError(e)

        if not self.stack:
            # Nothing has been pushed on the stack yet; read more input.
            continue

        records = self.stack[0]
        if not isinstance(records, list):
            raise ValueError("The XML file does not represent a list. Please use Entrez.read instead of Entrez.parse")
        # Hand out everything except the last entry, which is kept back
        # (presumably because it may still be under construction). Entries
        # are removed in place because the list is shared with the parser.
        while len(records) > 1:
            yield records.pop(0)
240
248
250 raise NotImplementedError("The Bio.Entrez parser cannot handle XML data that make use of XML namespaces")
251
253 self.content = ""
254 if name in self.lists:
255 object = ListElement()
256 elif name in self.dictionaries:
257 object = DictionaryElement()
258 elif name in self.structures:
259 object = StructureElement(self.structures[name])
260 elif name in self.items:
261 name = str(attrs["Name"])
262 del attrs["Name"]
263 itemtype = str(attrs["Type"])
264 del attrs["Type"]
265 if itemtype=="Structure":
266 object = DictionaryElement()
267 elif name in ("ArticleIds", "History"):
268 object = StructureElement(["pubmed", "medline"])
269 elif itemtype=="List":
270 object = ListElement()
271 else:
272 object = StringElement()
273 object.itemname = name
274 object.itemtype = itemtype
275 elif name in self.strings + self.errors + self.integers:
276 self.attributes = attrs
277 return
278 else:
279
280 if self.validating:
281 raise ValidationError(name)
282 else:
283
284 object = ""
285 if object!="":
286 object.tag = name
287 if attrs:
288 object.attributes = dict(attrs)
289 if len(self.stack)!=0:
290 current = self.stack[-1]
291 try:
292 current.append(object)
293 except AttributeError:
294 current[name] = object
295 self.stack.append(object)
296
338
340 self.content += content
def elementDecl(self, name, model):
    """Classify the element declared by ``<!ELEMENT name (...)>`` in a DTD.

    Called for each element declaration encountered in a DTD. Exactly one
    of the parser's registries receives *name*: ``errors``, ``items``,
    ``strings``, ``lists``, ``dictionaries`` or ``structures`` (which maps
    *name* to the list of its repeatable children).

    Args:
        name: Name of the declared element.
        model: Content model, indexed here as
            ``(type, quantifier, name, children)``.
    """
    m = expat.model
    group_types = (m.XML_CTYPE_SEQ, m.XML_CTYPE_CHOICE)
    once_at_most = (m.XML_CQUANT_NONE, m.XML_CQUANT_OPT)
    repeated = (m.XML_CQUANT_PLUS, m.XML_CQUANT_REP)

    if name.upper() == "ERROR":
        self.errors.append(name)
        return

    if name == 'Item':
        # An Item whose mixed content may contain further Items is
        # registered separately from ordinary elements.
        item_model = (m.XML_CTYPE_MIXED, m.XML_CQUANT_REP, None,
                      ((m.XML_CTYPE_NAME, m.XML_CQUANT_NONE, 'Item', ()),))
        if model == item_model:
            self.items.append(name)
            return

    # Unwrap groups that occur at most once and hold a single child; for
    # classification they are replaced by that child.
    while (model[0] in group_types
           and model[1] in once_at_most
           and len(model[3]) == 1):
        model = model[3][0]

    content_type, quantifier = model[0], model[1]

    if content_type in (m.XML_CTYPE_MIXED, m.XML_CTYPE_EMPTY):
        self.strings.append(name)
        return

    if content_type in group_types and quantifier in repeated:
        self.lists.append(name)
        return

    # What remains is a group with several children. Sort the children by
    # whether they can occur at most once or repeatedly.
    once = []
    many = []

    def tally(node):
        node_quantifier, child_name, children = node[1:]
        if child_name is None:
            # An unnamed node is a group.
            if node_quantifier in repeated:
                # Every direct child of a repeated group is repeatable.
                many.extend(child[2] for child in children)
            else:
                for child in children:
                    tally(child)
        elif child_name.upper() != "ERROR":
            if node_quantifier in once_at_most:
                once.append(child_name)
            elif node_quantifier in repeated:
                many.append(child_name)

    tally(model)

    if not once and len(many) == 1:
        self.lists.append(name)
    elif not many:
        self.dictionaries.append(name)
    else:
        self.structures[name] = many
419

# NOTE(review): the ``def`` line is absent from the visible source. The
# parameter list follows pyexpat's ExternalEntityRefHandler(context, base,
# systemId, publicId) callback; the method name is assumed -- confirm it
# against the place where the handler is registered on the parser.
def externalEntityRefHandler(self, context, base, systemId, publicId):
    """Load the DTD referenced by *systemId* and parse it with a sub-parser.

    A local copy of the DTD is preferred over downloading it from the URL
    given in the XML, since that makes parsing much faster. If no local
    copy is found, a warning is issued and the DTD is downloaded instead.
    If new DTDs become available from NCBI, putting them in
    Bio/Entrez/DTDs will allow the parser to see them.

    Args:
        context: Opaque value from expat, passed on to
            ExternalEntityParserCreate().
        base: Unused.
        systemId: System identifier of the DTD: an absolute URL, or a path
            relative to the DTD currently being processed.
        publicId: Unused.

    Returns:
        1, so that expat carries on parsing.

    Raises:
        ValueError: *systemId* uses a URL scheme other than http, https
            or ftp.
        RuntimeError: the DTD is not available locally and could not be
            downloaded.
    """
    scheme = urlparse.urlparse(systemId)[0]
    if scheme in ('http', 'https', 'ftp'):
        # Absolute URL to the DTD.
        url = systemId
    elif scheme == '':
        # Relative path: resolve it against the DTD that referenced it, or
        # against NCBI's DTD directory when there is no enclosing DTD.
        try:
            parent = self.dtd_urls[-1]
        except IndexError:
            parent = "http://www.ncbi.nlm.nih.gov/dtd/"
        # URLs always use forward slashes, so join them as URLs rather than
        # as file-system paths.
        url = urlparse.urljoin(parent, systemId)
    else:
        raise ValueError("Unexpected URL scheme %r in %s" % (scheme, systemId))

    self.dtd_urls.append(url)
    try:
        # First, try to load a local copy of the DTD file.
        filename = os.path.basename(systemId)
        handle = self.open_dtd_file(filename)
        if not handle:
            # No local copy: warn, then fall back to downloading it.
            message = """\
Unable to load DTD file %s.

Bio.Entrez uses NCBI's DTD files to parse XML files returned by NCBI Entrez.
Though most of NCBI's DTD files are included in the Biopython distribution,
sometimes you may find that a particular DTD file is missing. While we can
access the DTD file through the internet, the parser is much faster if the
required DTD files are available locally.

For this purpose, please download %s from

%s

and save it either in directory

%s

or in directory

%s

in order for Bio.Entrez to find it.

Alternatively, you can save %s in the directory
Bio/Entrez/DTDs in the Biopython distribution, and reinstall Biopython.

Please also inform the Biopython developers about this missing DTD, by
reporting a bug on http://bugzilla.open-bio.org/ or sign up to our mailing
list and emailing us, so that we can include it with the next release of
Biopython.

Proceeding to access the DTD file through the internet...
""" % (filename, filename, url, self.global_dtd_dir, self.local_dtd_dir, filename)
            warnings.warn(message)
            try:
                handle = urllib.urlopen(url)
            except IOError:
                raise RuntimeError("Failed to access %s at %s" % (filename, url))

        try:
            parser = self.parser.ExternalEntityParserCreate(context)
            parser.ElementDeclHandler = self.elementDecl
            parser.ParseFile(handle)
        finally:
            # Release the DTD handle even if parsing the DTD fails.
            handle.close()
    finally:
        # Keep the stack of DTD URLs balanced on every exit path.
        self.dtd_urls.pop()
    return 1
513