Package Bio :: Package SeqIO :: Module SeqXmlIO
[hide private]
[frames] | no frames]

Source Code for Module Bio.SeqIO.SeqXmlIO

  1  # Copyright 2010 by Thomas Schmitt.   
  2  # All rights reserved. 
  3  # 
  4  # This module is for reading and writing SeqXML format files as 
  5  # SeqRecord objects, and is expected to be used via the Bio.SeqIO API. 
  6  """Bio.SeqIO support for the "seqxml" file format, SeqXML. 
  7   
  8  You are expected to use this module via the Bio.SeqIO functions. 
  9   
 10  SeqXML is a lightweight XML format which is supposed be an alternative for 
 11  FASTA files. For more Information see http://www.seqXML.org and Schmitt et al 
 12  (2011), http://dx.doi.org/10.1093/bib/bbr025  
 13  """ 
 14   
 15  from xml.sax.saxutils import XMLGenerator 
 16  from xml.sax.xmlreader import AttributesImpl 
 17  from xml.dom import pulldom 
 18  from xml.sax import SAXParseException 
 19   
 20  from Bio import Alphabet 
 21  from Bio.Seq import Seq 
 22  from Bio.Seq import UnknownSeq 
 23  from Bio.SeqRecord import SeqRecord 
 24  from Interfaces import SequentialSequenceWriter 
 25   
 26   
27 -class XMLRecordIterator:
28 """Base class for building iterators for record style XML formats. 29 30 It is assumed that all information for one record can be found within a 31 record element or above. Two types of methods are called when the start 32 tag of an element is reached. To receive only the attributes of an 33 element before its end tag is reached implement _attr_TAGNAME. 34 To get an element and its children as a DOM tree implement _elem_TAGNAME. 35 Everything that is part of the DOM tree will not trigger any further 36 method calls. 37 """ 38
39 - def __init__(self,handle,recordTag,namespace=None):
40 """Creating the object and initializing the XML parser.""" 41 42 self._recordTag=recordTag 43 self._namespace=namespace 44 self._events=pulldom.parse(handle)
45 46
47 - def __iter__(self):
48 """Iterate over the records in the XML file. 49 Returns the last parsed record.""" 50 51 record = None 52 try: 53 for event,node in self._events: 54 55 if event == "START_ELEMENT" and node.namespaceURI == self._namespace: 56 57 if node.localName == self._recordTag: 58 #create an empty SeqRecord 59 record = SeqRecord('', id='') 60 61 #call matching methods with attributes only 62 if hasattr(self,"_attr_" + node.localName): 63 getattr(self,"_attr_" + node.localName)(self._attributes(node),record) 64 65 #call matching methods with DOM tree 66 if hasattr(self,"_elem_" + node.localName): 67 #read the element and all nested elements into a DOM tree 68 self._events.expandNode(node) 69 node.normalize() 70 71 getattr(self,"_elem_" + node.localName)(node,record) 72 73 elif event == "END_ELEMENT" and node.namespaceURI == self._namespace and node.localName == self._recordTag: 74 yield record 75 76 except SAXParseException, e: 77 78 if e.getLineNumber() == 1 and e.getColumnNumber() == 0: 79 #empty file 80 pass 81 else: 82 import os 83 if e.getLineNumber() == 1 and e.getColumnNumber() == 1 \ 84 and os.name == "java": 85 #empty file, see http://bugs.jython.org/issue1774 86 pass 87 else: 88 raise
89 90
91 - def _attributes(self,node):
92 """Return the attributes of a DOM node as dictionary.""" 93 94 return dict( (node.attributes.item(i).name,node.attributes.item(i).value) for i in xrange(node.attributes.length) )
95 96 97
98 -class SeqXmlIterator(XMLRecordIterator):
99 """Breaks seqXML file into SeqRecords. 100 101 Assumes valid seqXML please validate beforehand.""" 102
103 - def __init__(self,handle):
104 """Create the object.""" 105 XMLRecordIterator.__init__(self, handle,"entry") 106 107 self._source = None 108 self._source_version = None 109 self._version = None 110 self._speciesName = None 111 self._ncbiTaxId = None
112
113 - def _attr_seqXML(self,attr_dict,record):
114 """Parse the document metadata.""" 115 116 if "source" in attr_dict: 117 self._source = attr_dict["source"] 118 if "sourceVersion" in attr_dict: 119 self._source_version = attr_dict["sourceVersion"] 120 if "version" in attr_dict: 121 self._version = attr_dict["seqXMLversion"] 122 if "ncbiTaxID" in attr_dict: 123 self._ncbiTaxId = attr_dict["ncbiTaxID"] 124 if "speciesName" in attr_dict: 125 self._speciesName = attr_dict["speciesName"]
126
127 - def _attr_property(self,attr_dict,record):
128 """Parse key value pair properties and store them as annotations.""" 129 130 if "name" not in attr_dict: 131 raise ValueError("Malformed property element.") 132 133 value = attr_dict.get("value",None) 134 135 if attr_dict["name"] not in record.annotations: 136 record.annotations[attr_dict["name"]] = value 137 elif isinstance(record.annotations[attr_dict["name"]],list): 138 record.annotations[attr_dict["name"]].append(value) 139 else: 140 record.annotations[attr_dict["name"]] = [record.annotations[attr_dict["name"]],value]
141 142
143 - def _attr_species(self,attr_dict,record):
144 """Parse the species information.""" 145 146 if "name" not in attr_dict or "ncbiTaxID" not in attr_dict: 147 raise ValueError("Malformed species element!") 148 149 #the keywords for the species annotation are taken from SwissIO 150 record.annotations["organism"] = attr_dict["name"] 151 record.annotations["ncbi_taxid"] = attr_dict["ncbiTaxID"]
152
153 - def _attr_entry(self,attr_dict,record):
154 """New entry set id and the optional entry source.""" 155 156 if "id" not in attr_dict: 157 raise ValueError("Malformed entry! Identifier is missing.") 158 159 record.id = attr_dict["id"] 160 if "source" in attr_dict: 161 record.annotations["source"] = attr_dict["source"] 162 elif self._source != None: 163 record.annotations["source"] = self._source 164 165 #initialize entry with global species definition 166 #the keywords for the species annotation are taken from SwissIO 167 if self._ncbiTaxId != None: 168 record.annotations["ncbi_taxid"] = self._ncbiTaxId 169 if self._speciesName != None: 170 record.annotations["organism"] = self._speciesName
171 172
173 - def _elem_DNAseq(self,node,record):
174 """Parse DNA sequence.""" 175 176 if not (node.hasChildNodes() and len(node.firstChild.data) > 0): 177 raise ValueError("Sequence length should be greater than 0.") 178 179 record.seq = Seq(node.firstChild.data,Alphabet.generic_dna)
180 181
182 - def _elem_RNAseq(self,node,record):
183 """Parse RNA sequence.""" 184 185 if not (node.hasChildNodes() and len(node.firstChild.data) > 0): 186 raise ValueError("Sequence length should be greater than 0.") 187 188 record.seq = Seq(node.firstChild.data,Alphabet.generic_rna)
189
190 - def _elem_AAseq(self,node,record):
191 """Parse protein sequence.""" 192 193 if not (node.hasChildNodes() and len(node.firstChild.data) > 0): 194 raise ValueError("Sequence length should be greater than 0.") 195 196 record.seq = Seq(node.firstChild.data,Alphabet.generic_protein)
197 198
199 - def _elem_description(self,node,record):
200 """Parse the description.""" 201 202 if node.hasChildNodes() and len(node.firstChild.data) > 0: 203 record.description = node.firstChild.data
204
205 - def _attr_DBRef(self,attr_dict,record):
206 """Parse a database cross reference""" 207 208 if "source" not in attr_dict or "id" not in attr_dict: 209 raise ValueError("Invalid DB cross reference.") 210 211 if "%s:%s" % (attr_dict["source"],attr_dict["id"]) not in record.dbxrefs: 212 record.dbxrefs.append("%s:%s" % (attr_dict["source"],attr_dict["id"]) )
213 214 215
216 -class SeqXmlWriter(SequentialSequenceWriter):
217 """Writes SeqRecords into seqXML file. 218 219 SeqXML requires the sequence alphabet be explicitly RNA, DNA or protein, 220 i.e. an instance or subclass of Bio.Alphapet.RNAAlphabet, 221 Bio.Alphapet.DNAAlphabet or Bio.Alphapet.ProteinAlphabet. 222 """ 223
224 - def __init__(self, handle,source=None,source_version=None,species=None,ncbiTaxId=None):
225 """Create Object and start the xml generator.""" 226 227 SequentialSequenceWriter.__init__(self, handle) 228 229 self.xml_generator = XMLGenerator(handle, "utf-8") 230 self.xml_generator.startDocument() 231 self.source = source 232 self.source_version = source_version 233 self.species = species 234 self.ncbiTaxId = ncbiTaxId
235
236 - def write_header(self):
237 """Write root node with document metadata.""" 238 SequentialSequenceWriter.write_header(self) 239 240 attrs = {"xmlns:xsi":"http://www.w3.org/2001/XMLSchema-instance", 241 "xsi:noNamespaceSchemaLocation":"http://www.seqxml.org/0.4/seqxml.xsd", 242 "seqXMLversion":"0.4"} 243 244 if self.source != None: 245 attrs["source"] = self.source 246 if self.source_version != None: 247 attrs["sourceVersion"] = self.source_ersion 248 if self.species != None: 249 if not isinstance(species,basestring): 250 raise TypeError("species should be of type string") 251 attrs["speciesName"] = self.species 252 if self.ncbiTaxId != None: 253 if not isinstance(self.ncbiTaxId,(basestring,int)): 254 raise TypeError("ncbiTaxID should be of type string or int") 255 attrs["ncbiTaxID"] = self.ncbiTaxId 256 257 self.xml_generator.startElement("seqXML", AttributesImpl(attrs))
258 259
260 - def write_record(self, record):
261 """Write one record.""" 262 263 if not record.id or record.id == "<unknown id>": 264 raise ValueError("SeqXML requires identifier") 265 266 if not isinstance(record.id,basestring): 267 raise TypeError("Identifier should be of type string") 268 269 attrb = {"id" : record.id} 270 271 if "source" in record.annotations and self.source != record.annotations["source"]: 272 if not isinstance(record.annotations["source"],basestring): 273 raise TypeError("source should be of type string") 274 attrb["source"] = record.annotations["source"] 275 276 self.xml_generator.startElement("entry", AttributesImpl(attrb)) 277 self._write_species(record) 278 self._write_description(record) 279 self._write_seq(record) 280 self._write_dbxrefs(record) 281 self._write_properties(record) 282 self.xml_generator.endElement("entry")
283 291
292 - def _write_species(self,record):
293 """Write the species if given.""" 294 295 if "organism" in record.annotations and "ncbi_taxid" in record.annotations: 296 297 if not isinstance(record.annotations["organism"],basestring): 298 raise TypeError("organism should be of type string") 299 300 if not isinstance(record.annotations["ncbi_taxid"],(basestring,int)): 301 raise TypeError("ncbiTaxID should be of type string or int") 302 303 #The local species definition is only written if it differs from the global species definition 304 if record.annotations["organism"] != self.species or record.annotations["ncbi_taxid"] != self.ncbiTaxId: 305 306 attr = { "name" : record.annotations["organism"], "ncbiTaxID" :record.annotations["ncbi_taxid"] } 307 self.xml_generator.startElement("species",AttributesImpl(attr)) 308 self.xml_generator.endElement("species")
309 310
311 - def _write_description(self,record):
312 """Write the description if given.""" 313 314 if record.description: 315 316 if not isinstance(record.description,basestring): 317 raise TypeError("Description should be of type string") 318 319 description = record.description 320 if description == "<unknown description>": 321 description = "" 322 323 if len(record.description) > 0: 324 self.xml_generator.startElement("description",AttributesImpl( {} )) 325 self.xml_generator.characters(description) 326 self.xml_generator.endElement("description")
327
328 - def _write_seq(self,record):
329 """Write the sequence. 330 331 Note that SeqXML requires a DNA, RNA or protein alphabet. 332 """ 333 334 if isinstance(record.seq,UnknownSeq): 335 raise TypeError("Sequence type is UnknownSeq but SeqXML requires sequence") 336 337 seq = record.seq.tostring() 338 339 if not len(seq) > 0: 340 raise ValueError("The sequence length should be greater than 0") 341 342 #Get the base alphabet (underneath any Gapped or StopCodon encoding) 343 alphabet = Alphabet._get_base_alphabet(record.seq.alphabet) 344 if isinstance(alphabet,Alphabet.RNAAlphabet): 345 seqElem = "RNAseq" 346 elif isinstance(alphabet,Alphabet.DNAAlphabet): 347 seqElem = "DNAseq" 348 elif isinstance(alphabet,Alphabet.ProteinAlphabet): 349 seqElem = "AAseq" 350 else: 351 raise ValueError("Need a DNA, RNA or Protein alphabet") 352 353 self.xml_generator.startElement(seqElem,AttributesImpl( {} )) 354 self.xml_generator.characters(seq) 355 self.xml_generator.endElement(seqElem)
356 357
358 - def _write_dbxrefs(self,record):
359 """Write all database cross references.""" 360 if record.dbxrefs != None: 361 362 for dbxref in record.dbxrefs: 363 364 if not isinstance(dbxref,basestring): 365 raise TypeError("dbxrefs should be of type list of string") 366 if dbxref.find(':') < 1: 367 raise ValueError("dbxrefs should be in the form ['source:id', 'source:id' ]") 368 369 dbsource,dbid = dbxref.split(':',1) 370 371 attr = { "source" : dbsource, "id" : dbid } 372 self.xml_generator.startElement("DBRef",AttributesImpl(attr)) 373 self.xml_generator.endElement("DBRef")
374 375
376 - def _write_properties(self,record):
377 """Write all annotations that are key value pairs with values of a primitive type or list of primitive types.""" 378 379 for key,value in record.annotations.items(): 380 381 if key not in ("organism","ncbi_taxid","source"): 382 383 if value == None: 384 385 attr = { "name" : key } 386 self.xml_generator.startElement("property",AttributesImpl(attr)) 387 self.xml_generator.endElement("property") 388 389 elif isinstance(value,list): 390 391 for v in value: 392 if isinstance(value,(int,float,basestring)): 393 attr = { "name" : key , "value" : v } 394 self.xml_generator.startElement("property",AttributesImpl(attr)) 395 self.xml_generator.endElement("property") 396 397 elif isinstance(value,(int,float,basestring)): 398 399 attr = { "name" : key , "value" : str(value) } 400 self.xml_generator.startElement("property",AttributesImpl(attr)) 401 self.xml_generator.endElement("property")
402 403 if __name__ == "__main__": 404 print "Running quick self test" 405 406 from Bio import SeqIO 407 import sys 408 409 fileHandle = open("Tests/SeqXML/protein_example.xml","r") 410 records = list(SeqIO.parse(fileHandle, "seqxml")) 411 412 from StringIO import StringIO 413 stringHandle = StringIO() 414 415 SeqIO.write(records,stringHandle,"seqxml") 416 SeqIO.write(records,sys.stdout,"seqxml") 417 print 418 419 stringHandle.seek(0) 420 records = list(SeqIO.parse(stringHandle,"seqxml")) 421 422 SeqIO.write(records,sys.stdout,"seqxml") 423 print 424