Package Bio :: Package SeqIO :: Module UniprotIO
[hide private]
[frames | no frames]

Source Code for Module Bio.SeqIO.UniprotIO

  1  # Copyright 2010 by Andrea Pierleoni 
  2  # Revisions copyright 2010 by Peter Cock 
  3  # All rights reserved. 
  4  # 
  5  # This code is part of the Biopython distribution and governed by its 
  6  # license.  Please see the LICENSE file that should have been included 
  7  # as part of this package. 
  8   
  9  """Bio.SeqIO support for the "uniprot-xml" file format. 
 10   
 11  See also: 
 12   
 13  http://www.uniprot.org 
 14   
 15  The UniProt XML format essentially replaces the old plain text file format 
 16  originally introduced by SwissProt ("swiss" format in Bio.SeqIO). 
 17  """ 
 18  import sys 
 19   
 20  from Bio import Seq 
 21  from Bio import SeqFeature 
 22  from Bio import Alphabet 
 23  from Bio.SeqRecord import SeqRecord 
 24  try: 
 25      from cStringIO import StringIO 
 26  except ImportError: 
 27      from StringIO import StringIO 
 28  import warnings 
 29  try: 
 30      if (3,0,0) <= sys.version_info[:3] <= (3,1,3): 
 31          #workaround for bug in python 3 to 3.1.3  see http://bugs.python.org/issue9257 
 32          from xml.etree import ElementTree as ElementTree 
 33      else: 
 34          from xml.etree import cElementTree as ElementTree 
 35  except ImportError: 
 36      try: 
 37          from xml.etree import ElementTree as ElementTree 
 38      except ImportError: 
 39          # Python 2.4 -- check for 3rd-party implementations 
 40          try: 
 41              from lxml import etree as ElementTree 
 42          except ImportError: 
 43              try: 
 44                  import cElementTree as ElementTree 
 45              except ImportError: 
 46                  try: 
 47                      from elementtree import ElementTree 
 48                  except ImportError: 
 49                      ElementTree = None 
 50                      #TODO - Clean this up after we drop Python 2.4, 
 51                      #for now delay the error so the tests pass on Python 2.4 
 52                      #from Bio import MissingPythonDependencyError 
 53                      #raise MissingPythonDependencyError( 
 54                      #        "No ElementTree module was found. " 
 55                      #        "Use Python 2.5+, lxml or elementtree if you " 
 56                      #        "want to use Bio.SeqIO.UniprotIO.") 
 57   
 58  NS = "{http://uniprot.org/uniprot}"  # UniProt XML namespace in ElementTree "{uri}" form; prepended to tag names when matching elements (e.g. NS + "entry")
 59  REFERENCE_JOURNAL = "%(name)s %(volume)s:%(first)s-%(last)s(%(pub_date)s)"  # template for a reference's journal string; filled from a dict with the keys name, volume, first, last and pub_date
 60   
def UniprotIterator(handle, alphabet=Alphabet.ProteinAlphabet(), return_raw_comments=False):
    """Generator function to parse UniProt XML as SeqRecord objects.

    Parses one XML entry at a time from any UniProt XML file and yields a
    SeqRecord for each entry, so it can be used from Bio.SeqIO.

    Arguments:
     - handle - an object with a ``read`` method, or a string holding the
       XML itself (it is then wrapped in a StringIO).
     - alphabet - alphabet given to every parsed sequence; nucleotide
       alphabets (also when wrapped in a Gapped alphabet) are rejected.
     - return_raw_comments - if True, comment fields are additionally
       returned as complete XML to allow further processing.

    Raises ValueError for a nucleotide alphabet, TypeError when handle is
    neither a handle nor a string, and MissingExternalDependencyError when
    no ElementTree implementation could be imported.  As this is a
    generator, these checks only run once iteration starts.
    """
    if isinstance(alphabet, Alphabet.NucleotideAlphabet):
        raise ValueError("Wrong alphabet %r" % alphabet)
    if (isinstance(alphabet, Alphabet.Gapped)
            and isinstance(alphabet.alphabet, Alphabet.NucleotideAlphabet)):
        raise ValueError("Wrong alphabet %r" % alphabet)

    if not hasattr(handle, "read"):
        if isinstance(handle, str):
            handle = StringIO(handle)
        else:
            # TypeError is still an Exception, so callers that caught the
            # previous bare Exception keep working.
            raise TypeError('An XML-containing handler or an XML string must be passed')

    if ElementTree is None:
        from Bio import MissingExternalDependencyError
        raise MissingExternalDependencyError(
            "No ElementTree module was found. "
            "Use Python 2.5+, lxml or elementtree if you "
            "want to use Bio.SeqIO.UniprotIO.")

    # Only "end" events are needed: an entry can be parsed once it is complete.
    for event, elem in ElementTree.iterparse(handle, events=("end",)):
        if elem.tag == NS + "entry":
            yield Parser(elem, alphabet=alphabet,
                         return_raw_comments=return_raw_comments).parse()
            # Release the children of the entry that has just been converted.
            elem.clear()
95
class Parser(object):
    """Parse a UniProt XML entry to a SeqRecord.

    Arguments:
     - elem - the ElementTree element of a single UniProt ``entry``.
     - alphabet - alphabet given to the parsed sequence; the default is a
       protein alphabet and can be replaced if needed.
     - return_raw_comments - if True, every comment element is also stored
       as complete XML under a ``comment_<type>_xml`` annotation key.
    """

    def __init__(self, elem, alphabet=Alphabet.ProteinAlphabet(), return_raw_comments=False):
        self.entry = elem
        self.alphabet = alphabet
        self.return_raw_comments = return_raw_comments

    def parse(self):
        """Parse the entry and return the resulting SeqRecord.

        The record is also kept in ``self.ParsedSeqRecord``.  Its id is the
        first accession found in the entry, and its dbxrefs are sorted with
        duplicates removed.
        """
        assert self.entry.tag == NS + 'entry'

        # The record must exist before any of the handlers below is called.
        record = self.ParsedSeqRecord = SeqRecord('', id='')
        annotations = record.annotations

        def iter_tag(element, tag):
            """Iterate over element and its descendants with the given tag (PRIVATE)."""
            try:
                walker = element.iter
            except AttributeError:
                # Old ElementTree implementations only offer getiterator,
                # while recent ones only offer iter.
                walker = element.getiterator
            return walker(tag)

        def append_to_annotations(key, value):
            """Add value to the list stored under key, skipping duplicates (PRIVATE)."""
            values = annotations.setdefault(key, [])
            if value not in values:
                values.append(value)

        def _parse_name(element):
            record.name = element.text
            record.dbxrefs.append(self.dbname + ':' + element.text)

        def _parse_accession(element):
            # stored as a list to cope with the SwissProt plain text parser
            append_to_annotations('accessions', element.text)
            record.dbxrefs.append(self.dbname + ':' + element.text)

        def _parse_protein(element):
            """Parse protein names (PRIVATE)."""
            name_tags = (NS + 'recommendedName', NS + 'alternativeName')
            descr_set = False
            for protein_element in element:
                # 'component' and 'domain' children are not parsed
                if protein_element.tag not in name_tags:
                    continue
                group = protein_element.tag.replace(NS, '')
                for rec_name in protein_element:
                    ann_key = '%s_%s' % (group, rec_name.tag.replace(NS, ''))
                    append_to_annotations(ann_key, rec_name.text)
                    # the first fullName found becomes the description
                    if rec_name.tag == NS + 'fullName' and not descr_set:
                        record.description = rec_name.text
                        descr_set = True

        def _parse_gene(element):
            for genename_element in element:
                name_type = genename_element.attrib.get('type')
                if name_type is None:
                    continue
                ann_key = 'gene_%s_%s' % (genename_element.tag.replace(NS, ''), name_type)
                if name_type == 'primary':
                    # the primary name is stored as a plain string, not a list
                    annotations[ann_key] = genename_element.text
                else:
                    append_to_annotations(ann_key, genename_element.text)

        def _parse_geneLocation(element):
            append_to_annotations('geneLocation', element.attrib['type'])

        def _parse_organism(element):
            com_name = sci_name = ''
            for organism_element in element:
                if organism_element.tag == NS + 'name':
                    if organism_element.text:
                        name_type = organism_element.attrib['type']
                        if name_type == 'scientific':
                            sci_name = organism_element.text
                        elif name_type == 'common':
                            com_name = organism_element.text
                        else:
                            # e.g. synonym
                            append_to_annotations("organism_name", organism_element.text)
                elif organism_element.tag == NS + 'dbReference':
                    record.dbxrefs.append(organism_element.attrib['type'] + ':'
                                          + organism_element.attrib['id'])
                elif organism_element.tag == NS + 'lineage':
                    for taxon_element in organism_element:
                        if taxon_element.tag == NS + 'taxon':
                            append_to_annotations('taxonomy', taxon_element.text)
            if sci_name and com_name:
                organism_name = '%s (%s)' % (sci_name, com_name)
            else:
                # whichever of the two is available, or '' if neither is
                organism_name = sci_name or com_name
            annotations['organism'] = organism_name

        def _parse_organismHost(element):
            for organism_element in element:
                if organism_element.tag == NS + 'name':
                    append_to_annotations("organism_host", organism_element.text)

        def _parse_keyword(element):
            append_to_annotations('keywords', element.text)

        def _parse_comment(element):
            """Parse comments (PRIVATE).

            Comment fields are very heterogeneous: each type has its own
            (frequently changed) schema.  Storing all the contained data
            would need more complex data structures, such as nested
            dictionaries.  This is left to the end user, who can set
            return_raw_comments=True to get the original XML of every
            comment in the annotations as well.

            Comment types available in December 2009 that are not listed
            in simple_comments below: "alternative products",
            "subcellular location", "sequence caution",
            "online information", "mass spectrometry", "interaction".
            """
            # types whose 'text' descendants are stored as they are
            simple_comments = ("allergen",
                               "biotechnology",
                               "biophysicochemical properties",
                               "catalytic activity",
                               "caution",
                               "cofactor",
                               "developmental stage",
                               "disease",
                               "domain",
                               "disruption phenotype",
                               "enzyme regulation",
                               "function",
                               "induction",
                               "miscellaneous",
                               "pathway",
                               "pharmaceutical",
                               "polymorphism",
                               "PTM",
                               "RNA editing",  # positions not parsed
                               "similarity",
                               "subunit",
                               "tissue specificity",
                               "toxic dose",
                               )

            comment_type = element.attrib['type']
            compact_type = comment_type.replace(' ', '')

            if comment_type in simple_comments:
                ann_key = 'comment_%s' % compact_type
                for text_element in iter_tag(element, NS + 'text'):
                    if text_element.text:
                        append_to_annotations(ann_key, text_element.text)
            elif comment_type == 'subcellular location':
                for subloc_element in iter_tag(element, NS + 'subcellularLocation'):
                    for el in subloc_element:
                        if el.text:
                            ann_key = 'comment_%s_%s' % (compact_type, el.tag.replace(NS, ''))
                            append_to_annotations(ann_key, el.text)
            elif comment_type == 'interaction':
                ann_key = 'comment_%s_intactId' % comment_type
                for interact_element in iter_tag(element, NS + 'interactant'):
                    append_to_annotations(ann_key, interact_element.attrib['intactId'])
            elif comment_type == 'alternative products':
                ann_key = 'comment_%s_isoform' % compact_type
                for alt_element in iter_tag(element, NS + 'isoform'):
                    for id_element in iter_tag(alt_element, NS + 'id'):
                        append_to_annotations(ann_key, id_element.text)
            elif comment_type == 'mass spectrometry':
                ann_key = 'comment_%s' % compact_type
                start = end = 0
                for loc_element in iter_tag(element, NS + 'location'):
                    pos_els = list(iter_tag(loc_element, NS + 'position'))
                    try:
                        if pos_els:
                            new_end = int(pos_els[0].attrib['position'])
                            new_start = new_end - 1
                        else:
                            begin_els = list(iter_tag(loc_element, NS + 'begin'))
                            end_els = list(iter_tag(loc_element, NS + 'end'))
                            new_start = int(begin_els[0].attrib['position']) - 1
                            new_end = int(end_els[0].attrib['position'])
                    except (IndexError, KeyError, ValueError):
                        # undefined or erroneously mapped positions: keep
                        # what was found so far instead of half a location
                        continue
                    start, end = new_start, new_end
                mass = element.attrib['mass']
                # The method used to be read from the 'mass' attribute by
                # mistake, which duplicated the mass in the annotation.
                method = element.attrib.get('method', '')
                if start == end == 0:
                    append_to_annotations(ann_key, 'undefined:%s|%s' % (mass, method))
                else:
                    append_to_annotations(ann_key, '%s..%s:%s|%s' % (start, end, mass, method))
            elif comment_type == 'sequence caution':
                pass  # not parsed: few information, complex structure
            elif comment_type == 'online information':
                ann_key = 'comment_%s' % compact_type
                for link_element in iter_tag(element, NS + 'link'):
                    append_to_annotations(ann_key, '%s@%s' % (element.attrib['name'],
                                                              link_element.attrib['uri']))

            # return raw XML comments if needed
            if self.return_raw_comments:
                ann_key = 'comment_%s_xml' % compact_type
                append_to_annotations(ann_key, ElementTree.tostring(element))

        def _parse_dbReference(element):
            record.dbxrefs.append(element.attrib['type'] + ':' + element.attrib['id'])
            # PDB references carry 'property' children, e.g.
            # <dbReference type="PDB" key="11" id="2GEZ">
            #   <property value="X-ray" type="method"/>
            #   <property value="2.60 A" type="resolution"/>
            #   <property value="A/C/E/G=1-192, B/D/F/H=193-325" type="chains"/>
            # </dbReference>
            # TODO - How best to store these, do SeqFeatures make sense?
            # The previous code built a SeqFeature per chain group but never
            # added it to the record, so that computation was dropped.  The
            # properties of other databases do not fit in a SeqRecord with a
            # simple list either; at least Ensembl and EMBL parsing could be
            # improved to add entries to dbxrefs.

        def _parse_reference(element):
            reference = SeqFeature.Reference()
            authors = []
            scopes = []
            tissues = []
            journal_name = ''
            pub_type = ''
            pub_date = ''
            j_volume = j_first = j_last = ''
            for ref_element in element:
                if ref_element.tag == NS + 'citation':
                    pub_type = ref_element.attrib['type']
                    if pub_type == 'submission':
                        pub_type += ' to the ' + ref_element.attrib['db']
                    if 'name' in ref_element.attrib:
                        journal_name = ref_element.attrib['name']
                    pub_date = ref_element.attrib.get('date', '')
                    j_volume = ref_element.attrib.get('volume', '')
                    j_first = ref_element.attrib.get('first', '')
                    j_last = ref_element.attrib.get('last', '')
                    for cit_element in ref_element:
                        if cit_element.tag == NS + 'title':
                            reference.title = cit_element.text
                        elif cit_element.tag == NS + 'authorList':
                            for person_element in cit_element:
                                authors.append(person_element.attrib['name'])
                        elif cit_element.tag == NS + 'dbReference':
                            db_type = cit_element.attrib['type']
                            db_id = cit_element.attrib['id']
                            record.dbxrefs.append(db_type + ':' + db_id)
                            if db_type == 'PubMed':
                                reference.pubmed_id = db_id
                            elif db_type == 'MEDLINE':
                                # this used to test the citation's own type,
                                # which is never 'MEDLINE'
                                reference.medline_id = db_id
                elif ref_element.tag == NS + 'scope':
                    scopes.append(ref_element.text)
                elif ref_element.tag == NS + 'source':
                    for source_element in ref_element:
                        if source_element.tag == NS + 'tissue':
                            tissues.append(source_element.text)
            if scopes:
                scopes_str = 'Scope: ' + ', '.join(scopes)
            else:
                scopes_str = ''
            if tissues:
                tissues_str = 'Tissue: ' + ', '.join(tissues)
            else:
                tissues_str = ''

            # locations cannot be parsed since they are actually written in
            # free text inside scopes, so all the references are put in the
            # annotations
            reference.location = []
            reference.authors = ', '.join(authors)
            if journal_name:
                if pub_date and j_volume and j_first and j_last:
                    reference.journal = REFERENCE_JOURNAL % dict(
                        name=journal_name, volume=j_volume, first=j_first,
                        last=j_last, pub_date=pub_date)
                else:
                    reference.journal = journal_name
            reference.comment = ' | '.join((pub_type, pub_date, scopes_str, tissues_str))
            append_to_annotations('references', reference)

        def _parse_position(element, offset=0):
            """Turn a position/begin/end element into a position object (PRIVATE)."""
            raw_position = element.attrib.get('position')
            if raw_position is None:
                position = None
            else:
                position = int(raw_position) + offset
            status = element.attrib.get('status', '')
            if status == 'unknown':
                assert position is None
                return SeqFeature.UnknownPosition()
            elif not status:
                return SeqFeature.ExactPosition(position)
            elif status == 'greater than':
                return SeqFeature.AfterPosition(position)
            elif status == 'less than':
                return SeqFeature.BeforePosition(position)
            elif status == 'uncertain':
                return SeqFeature.UncertainPosition(position)
            else:
                raise NotImplementedError("Position status %r" % status)

        def _parse_feature(element):
            feature = SeqFeature.SeqFeature()
            for k, v in element.attrib.items():
                feature.qualifiers[k] = v
            feature.type = element.attrib.get('type', '')
            if 'id' in element.attrib:
                feature.id = element.attrib['id']
            for feature_element in element:
                if feature_element.tag == NS + 'location':
                    position_elements = feature_element.findall(NS + 'position')
                    if position_elements:
                        # a single position spans one residue: the start is
                        # shifted by -1 to get a zero-based, end-exclusive range
                        begin_element = end_element = position_elements[0]
                    else:
                        begin_element = feature_element.findall(NS + 'begin')[0]
                        end_element = feature_element.findall(NS + 'end')[0]
                    start_position = _parse_position(begin_element, -1)
                    end_position = _parse_position(end_element)
                    feature.location = SeqFeature.FeatureLocation(start_position, end_position)
                else:
                    try:
                        qualifier = feature_element.tag.replace(NS, '')
                    except (AttributeError, TypeError):
                        continue  # skip unparsable tag
                    feature.qualifiers[qualifier] = feature_element.text
            record.features.append(feature)

        def _parse_proteinExistence(element):
            append_to_annotations('proteinExistence', element.attrib['type'])

        def _parse_evidence(element):
            for k, v in element.attrib.items():
                append_to_annotations(k, v)

        def _parse_sequence(element):
            for k, v in element.attrib.items():
                if k in ("length", "mass", "version"):
                    annotations['sequence_%s' % k] = int(v)
                else:
                    annotations['sequence_%s' % k] = v
            # drop the whitespace and line breaks used to lay out the sequence
            seq = ''.join((element.text or '').split())
            record.seq = Seq.Seq(seq, self.alphabet)

        # ============================================ #
        # Entry attribs parsing
        # Unknown dataset should not happen!
        self.dbname = self.entry.attrib.get('dataset', 'UnknownDataset')
        # Add attribs to annotations without an "entry_" prefix, to cope with
        # the SwissProt plain text parser.  This can cause errors if an attrib
        # has the same name as another annotation.
        for k, v in self.entry.attrib.items():
            if k == "version":
                annotations[k] = int(v)
            else:
                annotations[k] = v

        # Top-to-bottom entry children parsing; unknown tags are ignored
        handlers = {
            NS + 'name': _parse_name,
            NS + 'accession': _parse_accession,
            NS + 'protein': _parse_protein,
            NS + 'gene': _parse_gene,
            NS + 'geneLocation': _parse_geneLocation,
            NS + 'organism': _parse_organism,
            NS + 'organismHost': _parse_organismHost,
            NS + 'keyword': _parse_keyword,
            NS + 'comment': _parse_comment,
            NS + 'dbReference': _parse_dbReference,
            NS + 'reference': _parse_reference,
            NS + 'feature': _parse_feature,
            NS + 'proteinExistence': _parse_proteinExistence,
            NS + 'evidence': _parse_evidence,
            NS + 'sequence': _parse_sequence,
        }
        for element in self.entry:
            handler = handlers.get(element.tag)
            if handler is not None:
                handler(element)

        # remove duplicate dbxrefs
        record.dbxrefs = sorted(set(record.dbxrefs))

        # use first accession as id
        if not record.id:
            record.id = annotations['accessions'][0]

        return record