1
2
3
4
5
6 """Bio.SeqIO support for the "seqxml" file format, SeqXML.
7
8 You are expected to use this module via the Bio.SeqIO functions.
9
10 SeqXML is a lightweight XML format which is supposed be an alternative for
11 FASTA files. For more Information see http://www.seqXML.org and Schmitt et al
12 (2011), http://dx.doi.org/10.1093/bib/bbr025
13 """
14
15 from xml.sax.saxutils import XMLGenerator
16 from xml.sax.xmlreader import AttributesImpl
17 from xml.dom import pulldom
18 from xml.sax import SAXParseException
19
20 from Bio import Alphabet
21 from Bio.Seq import Seq
22 from Bio.Seq import UnknownSeq
23 from Bio.SeqRecord import SeqRecord
24 from Interfaces import SequentialSequenceWriter
25
26
28 """Base class for building iterators for record style XML formats.
29
30 It is assumed that all information for one record can be found within a
31 record element or above. Two types of methods are called when the start
32 tag of an element is reached. To receive only the attributes of an
33 element before its end tag is reached implement _attr_TAGNAME.
34 To get an element and its children as a DOM tree implement _elem_TAGNAME.
35 Everything that is part of the DOM tree will not trigger any further
36 method calls.
37 """
38
39 - def __init__(self,handle,recordTag,namespace=None):
40 """Creating the object and initializing the XML parser."""
41
42 self._recordTag=recordTag
43 self._namespace=namespace
44 self._events=pulldom.parse(handle)
45
46
48 """Iterate over the records in the XML file.
49 Returns the last parsed record."""
50
51 record = None
52 try:
53 for event,node in self._events:
54
55 if event == "START_ELEMENT" and node.namespaceURI == self._namespace:
56
57 if node.localName == self._recordTag:
58
59 record = SeqRecord('', id='')
60
61
62 if hasattr(self,"_attr_" + node.localName):
63 getattr(self,"_attr_" + node.localName)(self._attributes(node),record)
64
65
66 if hasattr(self,"_elem_" + node.localName):
67
68 self._events.expandNode(node)
69 node.normalize()
70
71 getattr(self,"_elem_" + node.localName)(node,record)
72
73 elif event == "END_ELEMENT" and node.namespaceURI == self._namespace and node.localName == self._recordTag:
74 yield record
75
76 except SAXParseException, e:
77
78 if e.getLineNumber() == 1 and e.getColumnNumber() == 0:
79
80 pass
81 else:
82 import os
83 if e.getLineNumber() == 1 and e.getColumnNumber() == 1 \
84 and os.name == "java":
85
86 pass
87 else:
88 raise
89
90
92 """Return the attributes of a DOM node as dictionary."""
93
94 return dict( (node.attributes.item(i).name,node.attributes.item(i).value) for i in xrange(node.attributes.length) )
95
96
97
99 """Breaks seqXML file into SeqRecords.
100
101 Assumes valid seqXML please validate beforehand."""
102
104 """Create the object."""
105 XMLRecordIterator.__init__(self, handle,"entry")
106
107 self._source = None
108 self._source_version = None
109 self._version = None
110 self._speciesName = None
111 self._ncbiTaxId = None
112
114 """Parse the document metadata."""
115
116 if "source" in attr_dict:
117 self._source = attr_dict["source"]
118 if "sourceVersion" in attr_dict:
119 self._source_version = attr_dict["sourceVersion"]
120 if "version" in attr_dict:
121 self._version = attr_dict["seqXMLversion"]
122 if "ncbiTaxID" in attr_dict:
123 self._ncbiTaxId = attr_dict["ncbiTaxID"]
124 if "speciesName" in attr_dict:
125 self._speciesName = attr_dict["speciesName"]
126
128 """Parse key value pair properties and store them as annotations."""
129
130 if "name" not in attr_dict:
131 raise ValueError("Malformed property element.")
132
133 value = attr_dict.get("value",None)
134
135 if attr_dict["name"] not in record.annotations:
136 record.annotations[attr_dict["name"]] = value
137 elif isinstance(record.annotations[attr_dict["name"]],list):
138 record.annotations[attr_dict["name"]].append(value)
139 else:
140 record.annotations[attr_dict["name"]] = [record.annotations[attr_dict["name"]],value]
141
142
144 """Parse the species information."""
145
146 if "name" not in attr_dict or "ncbiTaxID" not in attr_dict:
147 raise ValueError("Malformed species element!")
148
149
150 record.annotations["organism"] = attr_dict["name"]
151 record.annotations["ncbi_taxid"] = attr_dict["ncbiTaxID"]
152
153 - def _attr_entry(self,attr_dict,record):
154 """New entry set id and the optional entry source."""
155
156 if "id" not in attr_dict:
157 raise ValueError("Malformed entry! Identifier is missing.")
158
159 record.id = attr_dict["id"]
160 if "source" in attr_dict:
161 record.annotations["source"] = attr_dict["source"]
162 elif self._source != None:
163 record.annotations["source"] = self._source
164
165
166
167 if self._ncbiTaxId != None:
168 record.annotations["ncbi_taxid"] = self._ncbiTaxId
169 if self._speciesName != None:
170 record.annotations["organism"] = self._speciesName
171
172
174 """Parse DNA sequence."""
175
176 if not (node.hasChildNodes() and len(node.firstChild.data) > 0):
177 raise ValueError("Sequence length should be greater than 0.")
178
179 record.seq = Seq(node.firstChild.data,Alphabet.generic_dna)
180
181
183 """Parse RNA sequence."""
184
185 if not (node.hasChildNodes() and len(node.firstChild.data) > 0):
186 raise ValueError("Sequence length should be greater than 0.")
187
188 record.seq = Seq(node.firstChild.data,Alphabet.generic_rna)
189
197
198
204
206 """Parse a database cross reference"""
207
208 if "source" not in attr_dict or "id" not in attr_dict:
209 raise ValueError("Invalid DB cross reference.")
210
211 if "%s:%s" % (attr_dict["source"],attr_dict["id"]) not in record.dbxrefs:
212 record.dbxrefs.append("%s:%s" % (attr_dict["source"],attr_dict["id"]) )
213
214
215
217 """Writes SeqRecords into seqXML file.
218
219 SeqXML requires the sequence alphabet be explicitly RNA, DNA or protein,
220 i.e. an instance or subclass of Bio.Alphapet.RNAAlphabet,
221 Bio.Alphapet.DNAAlphabet or Bio.Alphapet.ProteinAlphabet.
222 """
223
224 - def __init__(self, handle,source=None,source_version=None,species=None,ncbiTaxId=None):
235
237 """Write root node with document metadata."""
238 SequentialSequenceWriter.write_header(self)
239
240 attrs = {"xmlns:xsi":"http://www.w3.org/2001/XMLSchema-instance",
241 "xsi:noNamespaceSchemaLocation":"http://www.seqxml.org/0.4/seqxml.xsd",
242 "seqXMLversion":"0.4"}
243
244 if self.source != None:
245 attrs["source"] = self.source
246 if self.source_version != None:
247 attrs["sourceVersion"] = self.source_ersion
248 if self.species != None:
249 if not isinstance(species,basestring):
250 raise TypeError("species should be of type string")
251 attrs["speciesName"] = self.species
252 if self.ncbiTaxId != None:
253 if not isinstance(self.ncbiTaxId,(basestring,int)):
254 raise TypeError("ncbiTaxID should be of type string or int")
255 attrs["ncbiTaxID"] = self.ncbiTaxId
256
257 self.xml_generator.startElement("seqXML", AttributesImpl(attrs))
258
259
261 """Write one record."""
262
263 if not record.id or record.id == "<unknown id>":
264 raise ValueError("SeqXML requires identifier")
265
266 if not isinstance(record.id,basestring):
267 raise TypeError("Identifier should be of type string")
268
269 attrb = {"id" : record.id}
270
271 if "source" in record.annotations and self.source != record.annotations["source"]:
272 if not isinstance(record.annotations["source"],basestring):
273 raise TypeError("source should be of type string")
274 attrb["source"] = record.annotations["source"]
275
276 self.xml_generator.startElement("entry", AttributesImpl(attrb))
277 self._write_species(record)
278 self._write_description(record)
279 self._write_seq(record)
280 self._write_dbxrefs(record)
281 self._write_properties(record)
282 self.xml_generator.endElement("entry")
283
291
293 """Write the species if given."""
294
295 if "organism" in record.annotations and "ncbi_taxid" in record.annotations:
296
297 if not isinstance(record.annotations["organism"],basestring):
298 raise TypeError("organism should be of type string")
299
300 if not isinstance(record.annotations["ncbi_taxid"],(basestring,int)):
301 raise TypeError("ncbiTaxID should be of type string or int")
302
303
304 if record.annotations["organism"] != self.species or record.annotations["ncbi_taxid"] != self.ncbiTaxId:
305
306 attr = { "name" : record.annotations["organism"], "ncbiTaxID" :record.annotations["ncbi_taxid"] }
307 self.xml_generator.startElement("species",AttributesImpl(attr))
308 self.xml_generator.endElement("species")
309
310
327
329 """Write the sequence.
330
331 Note that SeqXML requires a DNA, RNA or protein alphabet.
332 """
333
334 if isinstance(record.seq,UnknownSeq):
335 raise TypeError("Sequence type is UnknownSeq but SeqXML requires sequence")
336
337 seq = record.seq.tostring()
338
339 if not len(seq) > 0:
340 raise ValueError("The sequence length should be greater than 0")
341
342
343 alphabet = Alphabet._get_base_alphabet(record.seq.alphabet)
344 if isinstance(alphabet,Alphabet.RNAAlphabet):
345 seqElem = "RNAseq"
346 elif isinstance(alphabet,Alphabet.DNAAlphabet):
347 seqElem = "DNAseq"
348 elif isinstance(alphabet,Alphabet.ProteinAlphabet):
349 seqElem = "AAseq"
350 else:
351 raise ValueError("Need a DNA, RNA or Protein alphabet")
352
353 self.xml_generator.startElement(seqElem,AttributesImpl( {} ))
354 self.xml_generator.characters(seq)
355 self.xml_generator.endElement(seqElem)
356
357
359 """Write all database cross references."""
360 if record.dbxrefs != None:
361
362 for dbxref in record.dbxrefs:
363
364 if not isinstance(dbxref,basestring):
365 raise TypeError("dbxrefs should be of type list of string")
366 if dbxref.find(':') < 1:
367 raise ValueError("dbxrefs should be in the form ['source:id', 'source:id' ]")
368
369 dbsource,dbid = dbxref.split(':',1)
370
371 attr = { "source" : dbsource, "id" : dbid }
372 self.xml_generator.startElement("DBRef",AttributesImpl(attr))
373 self.xml_generator.endElement("DBRef")
374
375
377 """Write all annotations that are key value pairs with values of a primitive type or list of primitive types."""
378
379 for key,value in record.annotations.items():
380
381 if key not in ("organism","ncbi_taxid","source"):
382
383 if value == None:
384
385 attr = { "name" : key }
386 self.xml_generator.startElement("property",AttributesImpl(attr))
387 self.xml_generator.endElement("property")
388
389 elif isinstance(value,list):
390
391 for v in value:
392 if isinstance(value,(int,float,basestring)):
393 attr = { "name" : key , "value" : v }
394 self.xml_generator.startElement("property",AttributesImpl(attr))
395 self.xml_generator.endElement("property")
396
397 elif isinstance(value,(int,float,basestring)):
398
399 attr = { "name" : key , "value" : str(value) }
400 self.xml_generator.startElement("property",AttributesImpl(attr))
401 self.xml_generator.endElement("property")
402
403 if __name__ == "__main__":
404 print "Running quick self test"
405
406 from Bio import SeqIO
407 import sys
408
409 fileHandle = open("Tests/SeqXML/protein_example.xml","r")
410 records = list(SeqIO.parse(fileHandle, "seqxml"))
411
412 from StringIO import StringIO
413 stringHandle = StringIO()
414
415 SeqIO.write(records,stringHandle,"seqxml")
416 SeqIO.write(records,sys.stdout,"seqxml")
417 print
418
419 stringHandle.seek(0)
420 records = list(SeqIO.parse(stringHandle,"seqxml"))
421
422 SeqIO.write(records,sys.stdout,"seqxml")
423 print
424