Package Bio :: Package Blast :: Module NCBIXML
[hide private]
[frames] | no frames]

Source Code for Module Bio.Blast.NCBIXML

  1  # Copyright 2000 by Bertrand Frottier .  All rights reserved. 
  2  # Revisions 2005-2006 copyright Michiel de Hoon 
  3  # Revisions 2006-2009 copyright Peter Cock 
  4  # This code is part of the Biopython distribution and governed by its 
  5  # license.  Please see the LICENSE file that should have been included 
  6  # as part of this package. 
  7  """This module provides code to work with the BLAST XML output 
  8  following the DTD available on the NCBI FTP 
  9  ftp://ftp.ncbi.nlm.nih.gov/blast/documents/xml/NCBI_BlastOutput.dtd 
 10   
 11  Classes: 
 12  BlastParser         Parses XML output from BLAST (direct use discouraged). 
 13                      This (now) returns a list of Blast records. 
 14                      Historically it returned a single Blast record. 
 15                      You are expected to use this via the parse or read functions. 
 16   
 17  _XMLParser          Generic SAX parser (private). 
 18   
 19  Functions: 
 20  parse               Incremental parser, this is an iterator that returns 
 21                      Blast records.  It uses the BlastParser internally. 
 22  read                Returns a single Blast record. Uses the BlastParser internally. 
 23  """ 
 24  from Bio.Blast import Record 
 25  import xml.sax 
 26  from xml.sax.handler import ContentHandler 
 27   
28 -class _XMLparser(ContentHandler):
29 """Generic SAX Parser 30 31 Just a very basic SAX parser. 32 33 Redefine the methods startElement, characters and endElement. 34 """
35 - def __init__(self, debug=0):
36 """Constructor 37 38 debug - integer, amount of debug information to print 39 """ 40 self._tag = [] 41 self._value = '' 42 self._debug = debug 43 self._debug_ignore_list = []
44
45 - def _secure_name(self, name):
46 """Removes 'dangerous' from tag names 47 48 name -- name to be 'secured' 49 """ 50 # Replace '-' with '_' in XML tag names 51 return name.replace('-', '_')
52
53 - def startElement(self, name, attr):
54 """Found XML start tag 55 56 No real need of attr, BLAST DTD doesn't use them 57 58 name -- name of the tag 59 60 attr -- tag attributes 61 """ 62 self._tag.append(name) 63 64 # Try to call a method (defined in subclasses) 65 method = self._secure_name('_start_' + name) 66 67 #Note could use try / except AttributeError 68 #BUT I found often triggered by nested errors... 69 if hasattr(self, method): 70 eval("self.%s()" % method) 71 if self._debug > 4: 72 print "NCBIXML: Parsed: " + method 73 else: 74 # Doesn't exist (yet) 75 if method not in self._debug_ignore_list: 76 if self._debug > 3: 77 print "NCBIXML: Ignored: " + method 78 self._debug_ignore_list.append(method) 79 80 #We don't care about white space in parent tags like Hsp, 81 #but that white space doesn't belong to child tags like Hsp_midline 82 if self._value.strip(): 83 raise ValueError("What should we do with %s before the %s tag?" \ 84 % (repr(self._value), name)) 85 self._value = ""
86
87 - def characters(self, ch):
88 """Found some text 89 90 ch -- characters read 91 """ 92 self._value += ch # You don't ever get the whole string
93
94 - def endElement(self, name):
95 """Found XML end tag 96 97 name -- tag name 98 """ 99 # DON'T strip any white space, we may need it e.g. the hsp-midline 100 101 # Try to call a method (defined in subclasses) 102 method = self._secure_name('_end_' + name) 103 #Note could use try / except AttributeError 104 #BUT I found often triggered by nested errors... 105 if hasattr(self, method): 106 eval("self.%s()" % method) 107 if self._debug > 2: 108 print "NCBIXML: Parsed: " + method, self._value 109 else: 110 # Doesn't exist (yet) 111 if method not in self._debug_ignore_list: 112 if self._debug > 1: 113 print "NCBIXML: Ignored: " + method, self._value 114 self._debug_ignore_list.append(method) 115 116 # Reset character buffer 117 self._value = ''
118
119 -class BlastParser(_XMLparser):
120 """Parse XML BLAST data into a Record.Blast object 121 122 All XML 'action' methods are private methods and may be: 123 _start_TAG called when the start tag is found 124 _end_TAG called when the end tag is found 125 """ 126
127 - def __init__(self, debug=0):
128 """Constructor 129 130 debug - integer, amount of debug information to print 131 """ 132 # Calling superclass method 133 _XMLparser.__init__(self, debug) 134 135 self._parser = xml.sax.make_parser() 136 self._parser.setContentHandler(self) 137 138 # To avoid ValueError: unknown url type: NCBI_BlastOutput.dtd 139 self._parser.setFeature(xml.sax.handler.feature_validation, 0) 140 self._parser.setFeature(xml.sax.handler.feature_namespaces, 0) 141 self._parser.setFeature(xml.sax.handler.feature_external_pes, 0) 142 self._parser.setFeature(xml.sax.handler.feature_external_ges, 0) 143 144 self.reset()
145
146 - def reset(self):
147 """Reset all the data allowing reuse of the BlastParser() object""" 148 self._records = [] 149 self._header = Record.Header() 150 self._parameters = Record.Parameters() 151 self._parameters.filter = None #Maybe I should update the class?
152
153 - def _start_Iteration(self):
154 self._blast = Record.Blast() 155 pass
156
157 - def _end_Iteration(self):
158 # We stored a lot of generic "top level" information 159 # in self._header (an object of type Record.Header) 160 self._blast.reference = self._header.reference 161 self._blast.date = self._header.date 162 self._blast.version = self._header.version 163 self._blast.database = self._header.database 164 self._blast.application = self._header.application 165 166 # These are required for "old" pre 2.2.14 files 167 # where only <BlastOutput_query-ID>, <BlastOutput_query-def> 168 # and <BlastOutput_query-len> were used. Now they 169 # are suplemented/replaced by <Iteration_query-ID>, 170 # <Iteration_query-def> and <Iteration_query-len> 171 if not hasattr(self._blast, "query") \ 172 or not self._blast.query: 173 self._blast.query = self._header.query 174 if not hasattr(self._blast, "query_id") \ 175 or not self._blast.query_id: 176 self._blast.query_id = self._header.query_id 177 if not hasattr(self._blast, "query_letters") \ 178 or not self._blast.query_letters: 179 self._blast.query_letters = self._header.query_letters 180 181 # Hack to record the query length as both the query_letters and 182 # query_length properties (as in the plain text parser, see 183 # Bug 2176 comment 12): 184 self._blast.query_length = self._blast.query_letters 185 # Perhaps in the long term we should deprecate one, but I would 186 # prefer to drop query_letters - so we need a transition period 187 # with both. 188 189 # Hack to record the claimed database size as database_length 190 # (as well as in num_letters_in_database, see Bug 2176 comment 13): 191 self._blast.database_length = self._blast.num_letters_in_database 192 # TODO? Deprecate database_letters next? 193 194 # Hack to record the claimed database sequence count as database_sequences 195 self._blast.database_sequences = self._blast.num_sequences_in_database 196 197 # Apply the "top level" parameter information 198 self._blast.matrix = self._parameters.matrix 199 self._blast.num_seqs_better_e = self._parameters.num_seqs_better_e 200 self._blast.gap_penalties = self._parameters.gap_penalties 201 self._blast.filter = self._parameters.filter 202 self._blast.expect = self._parameters.expect 203 self._blast.sc_match = self._parameters.sc_match 204 self._blast.sc_mismatch = self._parameters.sc_mismatch 205 206 #Add to the list 207 self._records.append(self._blast) 208 #Clear the object (a new empty one is create in _start_Iteration) 209 self._blast = None 210 211 if self._debug : "NCBIXML: Added Blast record to results"
212 213 # Header
214 - def _end_BlastOutput_program(self):
215 """BLAST program, e.g., blastp, blastn, etc. 216 217 Save this to put on each blast record object 218 """ 219 self._header.application = self._value.upper()
220
221 - def _end_BlastOutput_version(self):
222 """version number and date of the BLAST engine. 223 224 e.g. "BLASTX 2.2.12 [Aug-07-2005]" but there can also be 225 variants like "BLASTP 2.2.18+" without the date. 226 227 Save this to put on each blast record object 228 """ 229 parts = self._value.split() 230 #TODO - Check the first word starts with BLAST? 231 232 #The version is the second word (field one) 233 self._header.version = parts[1] 234 235 #Check there is a third word (the date) 236 if len(parts) >= 3: 237 if parts[2][0] == "[" and parts[2][-1] == "]": 238 self._header.date = parts[2][1:-1] 239 else: 240 #Assume this is still a date, but without the 241 #square brackets 242 self._header.date = parts[2]
243
245 """a reference to the article describing the algorithm 246 247 Save this to put on each blast record object 248 """ 249 self._header.reference = self._value
250
251 - def _end_BlastOutput_db(self):
252 """the database(s) searched 253 254 Save this to put on each blast record object 255 """ 256 self._header.database = self._value
257
259 """the identifier of the query 260 261 Important in old pre 2.2.14 BLAST, for recent versions 262 <Iteration_query-ID> is enough 263 """ 264 self._header.query_id = self._value
265
267 """the definition line of the query 268 269 Important in old pre 2.2.14 BLAST, for recent versions 270 <Iteration_query-def> is enough 271 """ 272 self._header.query = self._value
273
275 """the length of the query 276 277 Important in old pre 2.2.14 BLAST, for recent versions 278 <Iteration_query-len> is enough 279 """ 280 self._header.query_letters = int(self._value)
281
282 - def _end_Iteration_query_ID(self):
283 """the identifier of the query 284 """ 285 self._blast.query_id = self._value
286
287 - def _end_Iteration_query_def(self):
288 """the definition line of the query 289 """ 290 self._blast.query = self._value
291
292 - def _end_Iteration_query_len(self):
293 """the length of the query 294 """ 295 self._blast.query_letters = int(self._value)
296 297 ## def _end_BlastOutput_query_seq(self): 298 ## """the query sequence 299 ## """ 300 ## pass # XXX Missing in Record.Blast ? 301 302 ## def _end_BlastOutput_iter_num(self): 303 ## """the psi-blast iteration number 304 ## """ 305 ## pass # XXX TODO PSI 306
307 - def _end_BlastOutput_hits(self):
308 """hits to the database sequences, one for every sequence 309 """ 310 self._blast.num_hits = int(self._value)
311 312 ## def _end_BlastOutput_message(self): 313 ## """error messages 314 ## """ 315 ## pass # XXX What to do ? 316 317 # Parameters
318 - def _end_Parameters_matrix(self):
319 """matrix used (-M) 320 """ 321 self._parameters.matrix = self._value
322
323 - def _end_Parameters_expect(self):
324 """expect values cutoff (-e) 325 """ 326 # NOTE: In old text output there was a line: 327 # Number of sequences better than 1.0e-004: 1 328 # As far as I can see, parameters.num_seqs_better_e 329 # would take the value of 1, and the expectation 330 # value was not recorded. 331 # 332 # Anyway we should NOT record this against num_seqs_better_e 333 self._parameters.expect = self._value
334 335 ## def _end_Parameters_include(self): 336 ## """inclusion threshold for a psi-blast iteration (-h) 337 ## """ 338 ## pass # XXX TODO PSI 339
340 - def _end_Parameters_sc_match(self):
341 """match score for nucleotide-nucleotide comparaison (-r) 342 """ 343 self._parameters.sc_match = int(self._value)
344
346 """mismatch penalty for nucleotide-nucleotide comparaison (-r) 347 """ 348 self._parameters.sc_mismatch = int(self._value)
349
350 - def _end_Parameters_gap_open(self):
351 """gap existence cost (-G) 352 """ 353 self._parameters.gap_penalties = int(self._value)
354
356 """gap extension cose (-E) 357 """ 358 self._parameters.gap_penalties = (self._parameters.gap_penalties, 359 int(self._value))
360
361 - def _end_Parameters_filter(self):
362 """filtering options (-F) 363 """ 364 self._parameters.filter = self._value
365 366 ## def _end_Parameters_pattern(self): 367 ## """pattern used for phi-blast search 368 ## """ 369 ## pass # XXX TODO PSI 370 371 ## def _end_Parameters_entrez_query(self): 372 ## """entrez query used to limit search 373 ## """ 374 ## pass # XXX TODO PSI 375 376 # Hits
377 - def _start_Hit(self):
378 self._blast.alignments.append(Record.Alignment()) 379 self._blast.descriptions.append(Record.Description()) 380 self._blast.multiple_alignment = [] 381 self._hit = self._blast.alignments[-1] 382 self._descr = self._blast.descriptions[-1] 383 self._descr.num_alignments = 0
384
385 - def _end_Hit(self):
386 #Cleanup 387 self._blast.multiple_alignment = None 388 self._hit = None 389 self._descr = None
390
391 - def _end_Hit_id(self):
392 """identifier of the database sequence 393 """ 394 self._hit.hit_id = self._value 395 self._hit.title = self._value + ' '
396
397 - def _end_Hit_def(self):
398 """definition line of the database sequence 399 """ 400 self._hit.hit_def = self._value 401 self._hit.title += self._value 402 self._descr.title = self._hit.title
403
404 - def _end_Hit_accession(self):
405 """accession of the database sequence 406 """ 407 self._hit.accession = self._value 408 self._descr.accession = self._value
409
410 - def _end_Hit_len(self):
411 self._hit.length = int(self._value)
412 413 # HSPs
414 - def _start_Hsp(self):
415 #Note that self._start_Hit() should have been called 416 #to setup things like self._blast.multiple_alignment 417 self._hit.hsps.append(Record.HSP()) 418 self._hsp = self._hit.hsps[-1] 419 self._descr.num_alignments += 1 420 self._blast.multiple_alignment.append(Record.MultipleAlignment()) 421 self._mult_al = self._blast.multiple_alignment[-1]
422 423 # Hsp_num is useless
424 - def _end_Hsp_score(self):
425 """raw score of HSP 426 """ 427 self._hsp.score = float(self._value) 428 if self._descr.score == None: 429 self._descr.score = float(self._value)
430
431 - def _end_Hsp_bit_score(self):
432 """bit score of HSP 433 """ 434 self._hsp.bits = float(self._value) 435 if self._descr.bits == None: 436 self._descr.bits = float(self._value)
437
438 - def _end_Hsp_evalue(self):
439 """expect value value of the HSP 440 """ 441 self._hsp.expect = float(self._value) 442 if self._descr.e == None: 443 self._descr.e = float(self._value)
444
445 - def _end_Hsp_query_from(self):
446 """offset of query at the start of the alignment (one-offset) 447 """ 448 self._hsp.query_start = int(self._value)
449
450 - def _end_Hsp_query_to(self):
451 """offset of query at the end of the alignment (one-offset) 452 """ 453 self._hsp.query_end = int(self._value)
454
455 - def _end_Hsp_hit_from(self):
456 """offset of the database at the start of the alignment (one-offset) 457 """ 458 self._hsp.sbjct_start = int(self._value)
459
460 - def _end_Hsp_hit_to(self):
461 """offset of the database at the end of the alignment (one-offset) 462 """ 463 self._hsp.sbjct_end = int(self._value)
464 465 ## def _end_Hsp_pattern_from(self): 466 ## """start of phi-blast pattern on the query (one-offset) 467 ## """ 468 ## pass # XXX TODO PSI 469 470 ## def _end_Hsp_pattern_to(self): 471 ## """end of phi-blast pattern on the query (one-offset) 472 ## """ 473 ## pass # XXX TODO PSI 474
475 - def _end_Hsp_query_frame(self):
476 """frame of the query if applicable 477 """ 478 self._hsp.frame = (int(self._value),)
479
480 - def _end_Hsp_hit_frame(self):
481 """frame of the database sequence if applicable 482 """ 483 self._hsp.frame += (int(self._value),)
484
485 - def _end_Hsp_identity(self):
486 """number of identities in the alignment 487 """ 488 self._hsp.identities = int(self._value)
489
490 - def _end_Hsp_positive(self):
491 """number of positive (conservative) substitutions in the alignment 492 """ 493 self._hsp.positives = int(self._value)
494
495 - def _end_Hsp_gaps(self):
496 """number of gaps in the alignment 497 """ 498 self._hsp.gaps = int(self._value)
499
500 - def _end_Hsp_align_len(self):
501 """length of the alignment 502 """ 503 self._hsp.align_length = int(self._value)
504 505 ## def _en_Hsp_density(self): 506 ## """score density 507 ## """ 508 ## pass # XXX ??? 509
510 - def _end_Hsp_qseq(self):
511 """alignment string for the query 512 """ 513 self._hsp.query = self._value
514
515 - def _end_Hsp_hseq(self):
516 """alignment string for the database 517 """ 518 self._hsp.sbjct = self._value
519
520 - def _end_Hsp_midline(self):
521 """Formatting middle line as normally seen in BLAST report 522 """ 523 self._hsp.match = self._value # do NOT strip spaces! 524 assert len(self._hsp.match)==len(self._hsp.query) 525 assert len(self._hsp.match)==len(self._hsp.sbjct)
526 527 # Statistics
528 - def _end_Statistics_db_num(self):
529 """number of sequences in the database 530 """ 531 self._blast.num_sequences_in_database = int(self._value)
532
533 - def _end_Statistics_db_len(self):
534 """number of letters in the database 535 """ 536 self._blast.num_letters_in_database = int(self._value)
537
538 - def _end_Statistics_hsp_len(self):
539 """the effective HSP length 540 """ 541 self._blast.effective_hsp_length = int(self._value)
542
544 """the effective search space 545 """ 546 self._blast.effective_search_space = float(self._value)
547
548 - def _end_Statistics_kappa(self):
549 """Karlin-Altschul parameter K 550 """ 551 self._blast.ka_params = float(self._value)
552
553 - def _end_Statistics_lambda(self):
554 """Karlin-Altschul parameter Lambda 555 """ 556 self._blast.ka_params = (float(self._value), 557 self._blast.ka_params)
558
559 - def _end_Statistics_entropy(self):
560 """Karlin-Altschul parameter H 561 """ 562 self._blast.ka_params = self._blast.ka_params + (float(self._value),)
563
564 -def read(handle, debug=0):
565 """Returns a single Blast record (assumes just one query). 566 567 This function is for use when there is one and only one BLAST 568 result in your XML file. 569 570 Use the Bio.Blast.NCBIXML.parse() function if you expect more than 571 one BLAST record (i.e. if you have more than one query sequence). 572 573 """ 574 iterator = parse(handle, debug) 575 try: 576 first = iterator.next() 577 except StopIteration: 578 first = None 579 if first is None: 580 raise ValueError("No records found in handle") 581 try: 582 second = iterator.next() 583 except StopIteration: 584 second = None 585 if second is not None: 586 raise ValueError("More than one record found in handle") 587 return first
588 589
590 -def parse(handle, debug=0):
591 """Returns an iterator a Blast record for each query. 592 593 handle - file handle to and XML file to parse 594 debug - integer, amount of debug information to print 595 596 This is a generator function that returns multiple Blast records 597 objects - one for each query sequence given to blast. The file 598 is read incrementally, returning complete records as they are read 599 in. 600 601 Should cope with new BLAST 2.2.14+ which gives a single XML file 602 for mutliple query records. 603 604 Should also cope with XML output from older versions BLAST which 605 gave multiple XML files concatenated together (giving a single file 606 which strictly speaking wasn't valid XML).""" 607 from xml.parsers import expat 608 BLOCK = 1024 609 MARGIN = 10 # must be at least length of newline + XML start 610 XML_START = "<?xml" 611 612 text = handle.read(BLOCK) 613 pending = "" 614 615 if not text: 616 #NO DATA FOUND! 617 raise ValueError("Your XML file was empty") 618 619 while text: 620 #We are now starting a new XML file 621 if not text.startswith(XML_START): 622 raise ValueError("Your XML file did not start with %s... " 623 "but instead %s" \ 624 % (XML_START, repr(text[:20]))) 625 626 expat_parser = expat.ParserCreate() 627 blast_parser = BlastParser(debug) 628 expat_parser.StartElementHandler = blast_parser.startElement 629 expat_parser.EndElementHandler = blast_parser.endElement 630 expat_parser.CharacterDataHandler = blast_parser.characters 631 632 expat_parser.Parse(text, False) 633 while blast_parser._records: 634 record = blast_parser._records[0] 635 blast_parser._records = blast_parser._records[1:] 636 yield record 637 638 while True: 639 #Read in another block of the file... 640 text, pending = pending + handle.read(BLOCK), "" 641 if not text: 642 #End of the file! 643 expat_parser.Parse("", True) # End of XML record 644 break 645 646 #Now read a little bit more so we can check for the 647 #start of another XML file... 648 pending = handle.read(MARGIN) 649 650 if (text+pending).find("\n" + XML_START) == -1: 651 # Good - still dealing with the same XML file 652 expat_parser.Parse(text, False) 653 while blast_parser._records: 654 yield blast_parser._records.pop(0) 655 else: 656 # This is output from pre 2.2.14 BLAST, 657 # one XML file for each query! 658 659 # Finish the old file: 660 text, pending = (text+pending).split("\n" + XML_START,1) 661 pending = XML_START + pending 662 663 expat_parser.Parse(text, True) # End of XML record 664 while blast_parser._records: 665 yield blast_parser._records.pop(0) 666 667 #Now we are going to re-loop, reset the 668 #parsers and start reading the next XML file 669 text, pending = pending, "" 670 break 671 672 #this was added because it seems that the Jython expat parser 673 #was adding records later then the Python one 674 while blast_parser._records: 675 yield blast_parser._records.pop(0) 676 677 #At this point we have finished the first XML record. 678 #If the file is from an old version of blast, it may 679 #contain more XML records (check if text==""). 680 assert pending=="" 681 assert len(blast_parser._records) == 0 682 683 #We should have finished the file! 684 assert text=="" 685 assert pending=="" 686 assert len(blast_parser._records) == 0
687 688 if __name__ == '__main__': 689 import sys 690 import os 691 handle = open(sys.argv[1]) 692 r_list = parse(handle) 693 694 for r in r_list: 695 # Small test 696 print 'Blast of', r.query 697 print 'Found %s alignments with a total of %s HSPs' % (len(r.alignments), 698 reduce(lambda a,b: a+b, 699 [len(a.hsps) for a in r.alignments])) 700 701 for al in r.alignments: 702 print al.title[:50], al.length, 'bp', len(al.hsps), 'HSPs' 703 704 # Cookbook example 705 E_VALUE_THRESH = 0.04 706 for alignment in r.alignments: 707 for hsp in alignment.hsps: 708 if hsp.expect < E_VALUE_THRESH: 709 print '*****' 710 print 'sequence', alignment.title 711 print 'length', alignment.length 712 print 'e value', hsp.expect 713 print hsp.query[:75] + '...' 714 print hsp.match[:75] + '...' 715 print hsp.sbjct[:75] + '...' 716