1
2
3
4
5
6
7
8
9 """Bio.SeqIO support for the "uniprot-xml" file format.
10
11 See also:
12
13 http://www.uniprot.org
14
15 The UniProt XML format essentially replaces the old plain text file format
16 originally introduced by SwissProt ("swiss" format in Bio.SeqIO).
17 """
18 import sys
19
20 from Bio import Seq
21 from Bio import SeqFeature
22 from Bio import Alphabet
23 from Bio.SeqRecord import SeqRecord
24 try:
25 from cStringIO import StringIO
26 except ImportError:
27 from StringIO import StringIO
28 import warnings
29 try:
30 if (3,0,0) <= sys.version_info[:3] <= (3,1,3):
31
32 from xml.etree import ElementTree as ElementTree
33 else:
34 from xml.etree import cElementTree as ElementTree
35 except ImportError:
36 try:
37 from xml.etree import ElementTree as ElementTree
38 except ImportError:
39
40 try:
41 from lxml import etree as ElementTree
42 except ImportError:
43 try:
44 import cElementTree as ElementTree
45 except ImportError:
46 try:
47 from elementtree import ElementTree
48 except ImportError:
49 ElementTree = None
50
51
52
53
54
55
56
57
# Namespace prefix prepended to a local tag name when matching UniProt XML
# elements, e.g. NS + "entry".
NS = "{http://uniprot.org/uniprot}"
# %-style template (filled from a dict) used to render a journal citation
# when name, volume, first page, last page and publication date are all known.
REFERENCE_JOURNAL = "%(name)s %(volume)s:%(first)s-%(last)s(%(pub_date)s)"
60
def UniprotIterator(handle, alphabet=Alphabet.ProteinAlphabet(), return_raw_comments=False):
    """Generator function to parse UniProt XML as SeqRecord objects.

    Parses one XML entry at a time from any UniProt XML file and yields a
    SeqRecord for each entry, so it can be used directly by Bio.SeqIO.

    Arguments:
     - handle - an object with a ``read`` method providing UniProt XML, or a
       string containing the XML itself.
     - alphabet - alphabet given to every parsed sequence.  Nucleotide
       alphabets (plain or wrapped in Alphabet.Gapped) are rejected.
     - return_raw_comments - if True, each comment field is additionally
       returned as complete XML to allow further processing.

    Raises ValueError for a nucleotide alphabet, Exception if ``handle`` is
    neither a readable object nor a string, and
    MissingExternalDependencyError if no ElementTree implementation could be
    imported.  Because this is a generator, these checks only run once
    iteration starts.
    """
    if isinstance(alphabet, Alphabet.NucleotideAlphabet):
        raise ValueError("Wrong alphabet %r" % alphabet)
    if isinstance(alphabet, Alphabet.Gapped):
        if isinstance(alphabet.alphabet, Alphabet.NucleotideAlphabet):
            raise ValueError("Wrong alphabet %r" % alphabet)

    if not hasattr(handle, "read"):
        if isinstance(handle, str):
            # A string is taken to be the XML document itself.
            handle = StringIO(handle)
        else:
            raise Exception('An XML-containing handler or an XML string must be passed')

    if ElementTree is None:
        from Bio import MissingExternalDependencyError
        raise MissingExternalDependencyError(
            "No ElementTree module was found. "
            "Use Python 2.5+, lxml or elementtree if you "
            "want to use Bio.SeqIO.UniprotIO.")

    # Only completed elements are of interest, so only "end" events are
    # requested from the incremental parser.
    for event, elem in ElementTree.iterparse(handle, events=("end",)):
        if elem.tag == NS + "entry":
            yield Parser(elem, alphabet=alphabet, return_raw_comments=return_raw_comments).parse()
            # The entry has been converted; drop its content before moving on.
            elem.clear()
95
class Parser(object):
    """Parse a UniProt XML entry to a SeqRecord.

    Arguments:
     - elem - the ElementTree element of a single UniProt ``entry``.
     - alphabet - alphabet given to the parsed sequence; the default is
       Alphabet.ProteinAlphabet() and can be changed if needed.
     - return_raw_comments - set to True to also get back every complete
       comment field in XML format.
    """

    # Comment types whose <text> children are stored as plain annotations.
    _SIMPLE_COMMENTS = (
        "allergen",
        "biotechnology",
        "biophysicochemical properties",
        "catalytic activity",
        "caution",
        "cofactor",
        "developmental stage",
        "disease",
        "domain",
        "disruption phenotype",
        "enzyme regulation",
        "function",
        "induction",
        "miscellaneous",
        "pathway",
        "pharmaceutical",
        "polymorphism",
        "PTM",
        "RNA editing",
        "similarity",
        "subunit",
        "tissue specificity",
        "toxic dose",
    )

    def __init__(self, elem, alphabet=Alphabet.ProteinAlphabet(), return_raw_comments=False):
        """Store the entry element and the parsing options."""
        self.entry = elem
        self.alphabet = alphabet
        self.return_raw_comments = return_raw_comments

    def parse(self):
        """Parse the input and return it as a SeqRecord.

        The record is also kept in ``self.ParsedSeqRecord``.  Each direct
        child of the entry is handed to the handler for its tag; children
        with an unhandled tag are ignored.

        Raises AssertionError if the stored element is not an ``entry``, and
        KeyError if the entry provides no accession to use as the record id.
        """
        assert self.entry.tag == NS + 'entry'

        record = SeqRecord('', id='')
        self.ParsedSeqRecord = record

        def iter_tag(node, tag):
            """Return a list of all elements named tag at or below node."""
            # Element.getiterator() was removed in Python 3.9 while very old
            # ElementTree releases lack Element.iter(); support both.  A list
            # is returned so that callers can safely index the result.
            finder = getattr(node, 'iter', None) or node.getiterator
            return list(finder(tag))

        def append_to_annotations(key, value):
            """Add value to the list annotation key, skipping duplicates."""
            values = record.annotations.setdefault(key, [])
            if value not in values:
                values.append(value)

        def _parse_name(element):
            """Use the entry name as record name and as a database xref."""
            record.name = element.text
            record.dbxrefs.append(self.dbname + ':' + element.text)

        def _parse_accession(element):
            """Record an accession as annotation and as a database xref."""
            append_to_annotations('accessions', element.text)
            record.dbxrefs.append(self.dbname + ':' + element.text)

        def _parse_protein(element):
            """Parse protein names (PRIVATE)."""
            descr_set = False
            for protein_element in element:
                # 'component' and 'domain' children are deliberately ignored.
                if protein_element.tag not in (NS + 'recommendedName', NS + 'alternativeName'):
                    continue
                for rec_name in protein_element:
                    ann_key = '%s_%s' % (protein_element.tag.replace(NS, ''),
                                         rec_name.tag.replace(NS, ''))
                    append_to_annotations(ann_key, rec_name.text)
                    # The first full name found becomes the description.
                    if rec_name.tag == NS + 'fullName' and not descr_set:
                        record.description = rec_name.text
                        descr_set = True

        def _parse_gene(element):
            """Store gene names; the primary name is a string, others lists."""
            for genename_element in element:
                if 'type' not in genename_element.attrib:
                    continue
                name_type = genename_element.attrib['type']
                ann_key = 'gene_%s_%s' % (genename_element.tag.replace(NS, ''), name_type)
                if name_type == 'primary':
                    record.annotations[ann_key] = genename_element.text
                else:
                    append_to_annotations(ann_key, genename_element.text)

        def _parse_geneLocation(element):
            """Store the gene location type."""
            append_to_annotations('geneLocation', element.attrib['type'])

        def _parse_organism(element):
            """Store organism names, taxonomy lineage and organism xrefs."""
            organism_name = com_name = sci_name = ''
            for organism_element in element:
                if organism_element.tag == NS + 'name':
                    if organism_element.text:
                        name_type = organism_element.attrib['type']
                        if name_type == 'scientific':
                            sci_name = organism_element.text
                        elif name_type == 'common':
                            com_name = organism_element.text
                        else:
                            # Any other kind of name is kept separately.
                            append_to_annotations("organism_name", organism_element.text)
                elif organism_element.tag == NS + 'dbReference':
                    record.dbxrefs.append(organism_element.attrib['type'] + ':' +
                                          organism_element.attrib['id'])
                elif organism_element.tag == NS + 'lineage':
                    for taxon_element in organism_element:
                        if taxon_element.tag == NS + 'taxon':
                            append_to_annotations('taxonomy', taxon_element.text)
            if sci_name and com_name:
                organism_name = '%s (%s)' % (sci_name, com_name)
            elif sci_name:
                organism_name = sci_name
            elif com_name:
                organism_name = com_name
            record.annotations['organism'] = organism_name

        def _parse_organismHost(element):
            """Store the names of the host organisms."""
            for organism_element in element:
                if organism_element.tag == NS + 'name':
                    append_to_annotations("organism_host", organism_element.text)

        def _parse_keyword(element):
            """Store a keyword."""
            append_to_annotations('keywords', element.text)

        def _parse_comment(element):
            """Parse comments (PRIVATE).

            Comment fields are very heterogeneous: each type has its own
            (frequently changing) schema.  Storing all the contained data
            would need more complex data structures, such as nested
            dictionaries.  This is left to the end user, by optionally
            setting:

            return_raw_comments=True

            in which case the original XML is also returned in the
            annotation fields.

            Available comment types at December 2009: the types listed in
            _SIMPLE_COMMENTS, plus "alternative products",
            "subcellular location", "sequence caution",
            "online information", "mass spectrometry" and "interaction".
            """
            comment_type = element.attrib['type']
            type_key = comment_type.replace(' ', '')

            if comment_type in self._SIMPLE_COMMENTS:
                ann_key = 'comment_%s' % type_key
                for text_element in iter_tag(element, NS + 'text'):
                    if text_element.text:
                        append_to_annotations(ann_key, text_element.text)
            elif comment_type == 'subcellular location':
                for subloc_element in iter_tag(element, NS + 'subcellularLocation'):
                    for el in subloc_element:
                        if el.text:
                            ann_key = 'comment_%s_%s' % (type_key, el.tag.replace(NS, ''))
                            append_to_annotations(ann_key, el.text)
            elif comment_type == 'interaction':
                ann_key = 'comment_%s_intactId' % comment_type
                for interact_element in iter_tag(element, NS + 'interactant'):
                    append_to_annotations(ann_key, interact_element.attrib['intactId'])
            elif comment_type == 'alternative products':
                ann_key = 'comment_%s_isoform' % type_key
                for alt_element in iter_tag(element, NS + 'isoform'):
                    for id_element in iter_tag(alt_element, NS + 'id'):
                        append_to_annotations(ann_key, id_element.text)
            elif comment_type == 'mass spectrometry':
                ann_key = 'comment_%s' % type_key
                start = end = 0
                for loc_element in iter_tag(element, NS + 'location'):
                    pos_els = iter_tag(loc_element, NS + 'position')
                    try:
                        if pos_els:
                            end = int(pos_els[0].attrib['position'])
                            start = end - 1
                        else:
                            start = int(iter_tag(loc_element, NS + 'begin')[0].attrib['position']) - 1
                            end = int(iter_tag(loc_element, NS + 'end')[0].attrib['position'])
                    except (KeyError, IndexError, ValueError):
                        # Undefined or wrongly mapped positions are
                        # tolerated; whatever was parsed so far is kept.
                        pass
                mass = element.attrib['mass']
                # The method used to be read from the 'mass' attribute by
                # mistake, so the mass was reported twice.
                method = element.attrib.get('method', '')
                if start == end == 0:
                    append_to_annotations(ann_key, 'undefined:%s|%s' % (mass, method))
                else:
                    append_to_annotations(ann_key, '%s..%s:%s|%s' % (start, end, mass, method))
            elif comment_type == 'online information':
                ann_key = 'comment_%s' % type_key
                for link_element in iter_tag(element, NS + 'link'):
                    append_to_annotations(ann_key, '%s@%s' % (element.attrib['name'],
                                                              link_element.attrib['uri']))
            # 'sequence caution' and unlisted comment types get no parsed
            # annotation; they are only available through the raw XML below.

            if self.return_raw_comments:
                ann_key = 'comment_%s_xml' % type_key
                append_to_annotations(ann_key, ElementTree.tostring(element))

        def _parse_dbReference(element):
            """Store a database cross reference."""
            db_type = element.attrib['type']
            record.dbxrefs.append(db_type + ':' + element.attrib['id'])
            if db_type != 'PDB':
                return
            method = ""
            resolution = ""
            for ref_element in element:
                if ref_element.tag != NS + 'property':
                    continue
                dat_type = ref_element.attrib['type']
                if dat_type == 'method':
                    method = ref_element.attrib['value']
                elif dat_type == 'resolution':
                    resolution = ref_element.attrib['value']
                elif dat_type == 'chains':
                    # The value is a comma separated list of
                    # "<chain>/<chain>=<begin>-<end>" items.
                    for chain_spec in ref_element.attrib['value'].split(','):
                        pair = chain_spec.strip().split('=')
                        if pair[1] != '-':
                            feature = SeqFeature.SeqFeature()
                            feature.type = db_type
                            feature.qualifiers['name'] = element.attrib['id']
                            feature.qualifiers['method'] = method
                            feature.qualifiers['resolution'] = resolution
                            feature.qualifiers['chains'] = pair[0].split('/')
                            start = int(pair[1].split('-')[0]) - 1
                            end = int(pair[1].split('-')[1])
                            feature.location = SeqFeature.FeatureLocation(start, end)
                            # NOTE(review): the feature is built but never
                            # attached to the record; left unchanged so the
                            # parsed output stays the same.

        def _parse_reference(element):
            """Build a Reference and store it under 'references'."""
            reference = SeqFeature.Reference()
            authors = []
            scopes = []
            tissues = []
            journal_name = ''
            pub_type = ''
            pub_date = ''
            j_volume = j_first = j_last = ''
            for ref_element in element:
                if ref_element.tag == NS + 'citation':
                    pub_type = ref_element.attrib['type']
                    if pub_type == 'submission':
                        pub_type += ' to the ' + ref_element.attrib['db']
                    if 'name' in ref_element.attrib:
                        journal_name = ref_element.attrib['name']
                    pub_date = ref_element.attrib.get('date', '')
                    j_volume = ref_element.attrib.get('volume', '')
                    j_first = ref_element.attrib.get('first', '')
                    j_last = ref_element.attrib.get('last', '')
                    for cit_element in ref_element:
                        if cit_element.tag == NS + 'title':
                            reference.title = cit_element.text
                        elif cit_element.tag == NS + 'authorList':
                            for person_element in cit_element:
                                authors.append(person_element.attrib['name'])
                        elif cit_element.tag == NS + 'dbReference':
                            xref_type = cit_element.attrib['type']
                            record.dbxrefs.append(xref_type + ':' + cit_element.attrib['id'])
                            if xref_type == 'PubMed':
                                reference.pubmed_id = cit_element.attrib['id']
                            elif xref_type == 'MEDLINE':
                                # The database name is on the dbReference,
                                # not on the enclosing citation.
                                reference.medline_id = cit_element.attrib['id']
                elif ref_element.tag == NS + 'scope':
                    scopes.append(ref_element.text)
                elif ref_element.tag == NS + 'source':
                    for source_element in ref_element:
                        if source_element.tag == NS + 'tissue':
                            tissues.append(source_element.text)
            if scopes:
                scopes_str = 'Scope: ' + ', '.join(scopes)
            else:
                scopes_str = ''
            if tissues:
                tissues_str = 'Tissue: ' + ', '.join(tissues)
            else:
                tissues_str = ''

            # No location is derived for a reference; it is stored as an
            # annotation only.
            reference.location = []
            reference.authors = ', '.join(authors)
            if journal_name:
                if pub_date and j_volume and j_first and j_last:
                    reference.journal = REFERENCE_JOURNAL % dict(
                        name=journal_name, volume=j_volume, first=j_first,
                        last=j_last, pub_date=pub_date)
                else:
                    reference.journal = journal_name
            reference.comment = ' | '.join((pub_type, pub_date, scopes_str, tissues_str))
            append_to_annotations('references', reference)

        def _parse_position(element, offset=0):
            """Turn a position element into a SeqFeature position object.

            offset is added to the 'position' attribute when it is present.
            Raises NotImplementedError for an unrecognised 'status' value.
            """
            try:
                position = int(element.attrib['position']) + offset
            except KeyError:
                position = None
            status = element.attrib.get('status', '')
            if status == 'unknown':
                assert position is None
                return SeqFeature.UnknownPosition()
            elif not status:
                return SeqFeature.ExactPosition(position)
            elif status == 'greater than':
                return SeqFeature.AfterPosition(position)
            elif status == 'less than':
                return SeqFeature.BeforePosition(position)
            elif status == 'uncertain':
                return SeqFeature.UncertainPosition(position)
            else:
                raise NotImplementedError("Position status %r" % status)

        def _parse_feature(element):
            """Convert a feature element into a SeqFeature on the record."""
            feature = SeqFeature.SeqFeature()
            for k, v in element.attrib.items():
                feature.qualifiers[k] = v
            feature.type = element.attrib.get('type', '')
            if 'id' in element.attrib:
                feature.id = element.attrib['id']
            for feature_element in element:
                if feature_element.tag == NS + 'location':
                    position_elements = feature_element.findall(NS + 'position')
                    if position_elements:
                        # A single position: start is shifted by -1, end is
                        # the position itself.
                        start_position = _parse_position(position_elements[0], -1)
                        end_position = _parse_position(position_elements[0])
                    else:
                        begin_element = feature_element.findall(NS + 'begin')[0]
                        start_position = _parse_position(begin_element, -1)
                        end_element = feature_element.findall(NS + 'end')[0]
                        end_position = _parse_position(end_element)
                    feature.location = SeqFeature.FeatureLocation(start_position, end_position)
                else:
                    try:
                        feature.qualifiers[feature_element.tag.replace(NS, '')] = feature_element.text
                    except (AttributeError, TypeError):
                        # Best effort: skip children whose tag is not a
                        # plain string.
                        pass
            record.features.append(feature)

        def _parse_proteinExistence(element):
            """Store the protein existence evidence level."""
            append_to_annotations('proteinExistence', element.attrib['type'])

        def _parse_evidence(element):
            """Store every attribute of an evidence element."""
            for k, v in element.attrib.items():
                append_to_annotations(k, v)

        def _parse_sequence(element):
            """Store the sequence and its attributes."""
            for k, v in element.attrib.items():
                if k in ("length", "mass", "version"):
                    record.annotations['sequence_%s' % k] = int(v)
                else:
                    record.annotations['sequence_%s' % k] = v
            # Whitespace inside the sequence text is removed.
            seq = ''.join(element.text.split())
            record.seq = Seq.Seq(seq, self.alphabet)

        # Entry attributes: the dataset name prefixes name/accession xrefs.
        self.dbname = self.entry.attrib.get('dataset', 'UnknownDataset')
        for k, v in self.entry.attrib.items():
            # Compare against the name itself: ``k in ("version")`` was a
            # substring test on a string, not tuple membership.
            if k == "version":
                record.annotations[k] = int(v)
            else:
                record.annotations[k] = v

        handlers = {
            NS + 'name': _parse_name,
            NS + 'accession': _parse_accession,
            NS + 'protein': _parse_protein,
            NS + 'gene': _parse_gene,
            NS + 'geneLocation': _parse_geneLocation,
            NS + 'organism': _parse_organism,
            NS + 'organismHost': _parse_organismHost,
            NS + 'keyword': _parse_keyword,
            NS + 'comment': _parse_comment,
            NS + 'dbReference': _parse_dbReference,
            NS + 'reference': _parse_reference,
            NS + 'feature': _parse_feature,
            NS + 'proteinExistence': _parse_proteinExistence,
            NS + 'evidence': _parse_evidence,
            NS + 'sequence': _parse_sequence,
        }
        for element in self.entry:
            handler = handlers.get(element.tag)
            if handler is not None:
                handler(element)

        # Cross references are made unique and sorted.
        record.dbxrefs = sorted(set(record.dbxrefs))

        # Fall back to the first accession when no id has been set.
        if not record.id:
            record.id = record.annotations['accessions'][0]

        return record
535