
Source Code for Module Bio.Parsers.spark

#  Copyright (c) 1998-2000 John Aycock
#
#  Permission is hereby granted, free of charge, to any person obtaining
#  a copy of this software and associated documentation files (the
#  "Software"), to deal in the Software without restriction, including
#  without limitation the rights to use, copy, modify, merge, publish,
#  distribute, sublicense, and/or sell copies of the Software, and to
#  permit persons to whom the Software is furnished to do so, subject to
#  the following conditions:
#
#  The above copyright notice and this permission notice shall be
#  included in all copies or substantial portions of the Software.
#
#  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
#  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
#  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
#  IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
#  CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
#  TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
#  SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

"""Copy of John Aycock's SPARK parser included with Biopython (DEPRECATED).

To clarify, when we say deprecated we just mean to warn you that Biopython
will no longer include this module in a future release. This does not affect
the status of John Aycock's SPARK parser, which can be installed separately
from:
http://pages.cpsc.ucalgary.ca/~aycock/spark/

Biopython included this copy of SPARK purely for parsing GenBank/EMBL feature
location strings using Bio.GenBank.LocationParser, and that code has now been
replaced with something simpler and faster using regular expressions.
"""
# Don't issue a deprecation warning here; it is issued via Bio.Parsers
# instead. This avoids the user seeing multiple deprecation warnings.

__version__ = 'SPARK-0.6.1'

import re
import sys

def _namelist(instance):
    namelist, namedict, classlist = [], {}, [instance.__class__]
    for c in classlist:
        for b in c.__bases__:
            classlist.append(b)
        for name in dir(c):
            if name not in namedict:
                namelist.append(name)
                namedict[name] = 1
    return namelist

class GenericScanner(object):
    def __init__(self):
        pattern = self.reflect()
        self.re = re.compile(pattern, re.VERBOSE)

        self.index2func = {}
        for name, number in self.re.groupindex.items():
            self.index2func[number - 1] = getattr(self, 't_' + name)

    def makeRE(self, name):
        doc = getattr(self, name).__doc__
        rv = '(?P<%s>%s)' % (name[2:], doc)
        return rv

    def reflect(self):
        rv = []
        for name in _namelist(self):
            if name[:2] == 't_' and name != 't_default':
                rv.append(self.makeRE(name))

        rv.append(self.makeRE('t_default'))
        return '|'.join(rv)

    def error(self, s, pos):
        print >>sys.stderr, "Lexical error at position %s" % pos
        raise SystemExit

    def tokenize(self, s):
        pos = 0
        n = len(s)
        while pos < n:
            m = self.re.match(s, pos)
            if m is None:
                self.error(s, pos)

            groups = m.groups()
            for i in range(len(groups)):
                if groups[i] and i in self.index2func:
                    self.index2func[i](groups[i])
            pos = m.end()

    def t_default(self, s):
        r'( . | \n )+'
        pass
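
# --- Illustrative usage sketch; not part of the original SPARK source. ---
# A GenericScanner subclass declares one 't_' method per token kind, with a
# verbose-mode regex as the method's docstring; the matched text is passed to
# the method.  The class _DemoScanner and its token list are hypothetical.
class _DemoScanner(GenericScanner):
    def __init__(self):
        GenericScanner.__init__(self)
        self.tokens = []

    def tokenize(self, s):
        self.tokens = []
        GenericScanner.tokenize(self, s)
        return self.tokens

    def t_whitespace(self, s):
        r' \s+ '
        pass  # skip blanks silently

    def t_number(self, s):
        r' \d+ '
        self.tokens.append(s)

    def t_plus(self, s):
        r' \+ '
        self.tokens.append(s)

# _DemoScanner().tokenize('2 + 3') would return ['2', '+', '3'].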

class GenericParser(object):
    def __init__(self, start):
        self.rules = {}
        self.rule2func = {}
        self.rule2name = {}
        self.collectRules()
        self.startRule = self.augment(start)
        self.ruleschanged = 1

    _START = 'START'
    _EOF = 'EOF'

    #
    # A hook for GenericASTBuilder and GenericASTMatcher.
    #
    def preprocess(self, rule, func):
        return rule, func

    def addRule(self, doc, func):
        rules = doc.split()

        index = []
        for i in range(len(rules)):
            if rules[i] == '::=':
                index.append(i - 1)
        index.append(len(rules))

        for i in range(len(index) - 1):
            lhs = rules[index[i]]
            rhs = rules[index[i] + 2:index[i + 1]]
            rule = (lhs, tuple(rhs))

            rule, fn = self.preprocess(rule, func)

            if lhs in self.rules:
                self.rules[lhs].append(rule)
            else:
                self.rules[lhs] = [rule]
            self.rule2func[rule] = fn
            self.rule2name[rule] = func.__name__[2:]
        self.ruleschanged = 1

    def collectRules(self):
        for name in _namelist(self):
            if name[:2] == 'p_':
                func = getattr(self, name)
                doc = func.__doc__
                self.addRule(doc, func)

    def augment(self, start):
        #
        # Tempting though it is, this isn't made into a call
        # to self.addRule() because the start rule shouldn't
        # be subject to preprocessing.
        #
        startRule = (self._START, (start, self._EOF))
        self.rule2func[startRule] = lambda args: args[0]
        self.rules[self._START] = [startRule]
        self.rule2name[startRule] = ''
        return startRule

    def makeFIRST(self):
        union = {}
        self.first = {}

        for rulelist in self.rules.values():
            for lhs, rhs in rulelist:
                if lhs not in self.first:
                    self.first[lhs] = {}

                if len(rhs) == 0:
                    self.first[lhs][None] = 1
                    continue

                sym = rhs[0]
                if sym not in self.rules:
                    self.first[lhs][sym] = 1
                else:
                    union[(sym, lhs)] = 1
        changes = 1
        while changes:
            changes = 0
            for src, dest in union.keys():
                destlen = len(self.first[dest])
                self.first[dest].update(self.first[src])
                if len(self.first[dest]) != destlen:
                    changes = 1

    #
    # An Earley parser, as per J. Earley, "An Efficient Context-Free
    # Parsing Algorithm", CACM 13(2), pp. 94-102.  Also J. C. Earley,
    # "An Efficient Context-Free Parsing Algorithm", Ph.D. thesis,
    # Carnegie-Mellon University, August 1968, p. 27.
    #

    def typestring(self, token):
        return None

    def error(self, token):
        print >>sys.stderr, "Syntax error at or near `%s' token" % token
        raise SystemExit

    def parse(self, tokens):
        tree = {}
        tokens.append(self._EOF)
        states = {0: [(self.startRule, 0, 0)]}

        if self.ruleschanged:
            self.makeFIRST()

        for i in xrange(len(tokens)):
            states[i + 1] = []

            if states[i] == []:
                break
            self.buildState(tokens[i], states, i, tree)

        #_dump(tokens, states)

        if i < len(tokens) - 1 or states[i + 1] != [(self.startRule, 2, 0)]:
            del tokens[-1]
            self.error(tokens[i - 1])
        rv = self.buildTree(tokens, tree, ((self.startRule, 2, 0), i + 1))
        del tokens[-1]
        return rv

    def buildState(self, token, states, i, tree):
        needsCompletion = {}
        state = states[i]
        predicted = {}

        for item in state:
            rule, pos, parent = item
            lhs, rhs = rule

            #
            # A -> a . (completer)
            #
            if pos == len(rhs):
                if len(rhs) == 0:
                    needsCompletion[lhs] = (item, i)

                for pitem in states[parent]:
                    if pitem is item:
                        break

                    prule, ppos, pparent = pitem
                    plhs, prhs = prule

                    if prhs[ppos:ppos + 1] == (lhs,):
                        new = (prule,
                               ppos + 1,
                               pparent)
                        if new not in state:
                            state.append(new)
                            tree[(new, i)] = [(item, i)]
                        else:
                            tree[(new, i)].append((item, i))
                continue

            nextSym = rhs[pos]

            #
            # A -> a . B (predictor)
            #
            if nextSym in self.rules:
                #
                # Work on completer step some more; for rules
                # with empty RHS, the "parent state" is the
                # current state we're adding Earley items to,
                # so the Earley items the completer step needs
                # may not all be present when it runs.
                #
                if nextSym in needsCompletion:
                    new = (rule, pos + 1, parent)
                    olditem_i = needsCompletion[nextSym]
                    if new not in state:
                        state.append(new)
                        tree[(new, i)] = [olditem_i]
                    else:
                        tree[(new, i)].append(olditem_i)

                #
                # Has this been predicted already?
                #
                if nextSym in predicted:
                    continue
                predicted[nextSym] = 1

                ttype = token is not self._EOF and \
                        self.typestring(token) or \
                        None
                if ttype is not None:
                    #
                    # Even smarter predictor, when the
                    # token's type is known.  The code is
                    # grungy, but runs pretty fast.  Three
                    # cases are looked for: rules with
                    # empty RHS; first symbol on RHS is a
                    # terminal; first symbol on RHS is a
                    # nonterminal (and isn't nullable).
                    #
                    for prule in self.rules[nextSym]:
                        new = (prule, 0, i)
                        prhs = prule[1]
                        if len(prhs) == 0:
                            state.append(new)
                            continue
                        prhs0 = prhs[0]
                        if prhs0 not in self.rules:
                            if prhs0 != ttype:
                                continue
                            else:
                                state.append(new)
                                continue
                        first = self.first[prhs0]
                        if None not in first and \
                           ttype not in first:
                            continue
                        state.append(new)
                    continue

                for prule in self.rules[nextSym]:
                    #
                    # Smarter predictor, as per Grune &
                    # Jacobs' _Parsing Techniques_.  Not
                    # as good as FIRST sets though.
                    #
                    prhs = prule[1]
                    if len(prhs) > 0 and \
                       prhs[0] not in self.rules and \
                       token != prhs[0]:
                        continue
                    state.append((prule, 0, i))

            #
            # A -> a . c (scanner)
            #
            elif token == nextSym:
                #assert new not in states[i+1]
                states[i + 1].append((rule, pos + 1, parent))

    def buildTree(self, tokens, tree, root):
        stack = []
        self.buildTree_r(stack, tokens, -1, tree, root)
        return stack[0]

    def buildTree_r(self, stack, tokens, tokpos, tree, root):
        (rule, pos, parent), state = root

        while pos > 0:
            want = ((rule, pos, parent), state)
            if want not in tree:
                #
                # Since pos > 0, it didn't come from closure,
                # and if it isn't in tree[], then there must
                # be a terminal symbol to the left of the dot.
                # (It must be from a "scanner" step.)
                #
                pos = pos - 1
                state = state - 1
                stack.insert(0, tokens[tokpos])
                tokpos = tokpos - 1
            else:
                #
                # There's a NT to the left of the dot.
                # Follow the tree pointer recursively (>1
                # tree pointers from it indicates ambiguity).
                # Since the item must have come about from a
                # "completer" step, the state where the item
                # came from must be the parent state of the
                # item the tree pointer points to.
                #
                children = tree[want]
                if len(children) > 1:
                    child = self.ambiguity(children)
                else:
                    child = children[0]

                tokpos = self.buildTree_r(stack,
                                          tokens, tokpos,
                                          tree, child)
                pos = pos - 1
                (crule, cpos, cparent), cstate = child
                state = cparent

        lhs, rhs = rule
        result = self.rule2func[rule](stack[:len(rhs)])
        stack[:len(rhs)] = [result]
        return tokpos

    def ambiguity(self, children):
        #
        # XXX - problem here and in collectRules() if the same
        #       rule appears in >1 method.  But in that case the
        #       user probably gets what they deserve :-)  Also
        #       undefined results if rules causing the ambiguity
        #       appear in the same method.
        #
        sortlist = []
        name2index = {}
        for i in range(len(children)):
            ((rule, pos, parent), index) = children[i]
            lhs, rhs = rule
            name = self.rule2name[rule]
            sortlist.append((len(rhs), name))
            name2index[name] = i
        sortlist.sort()
        list = map(lambda (a, b): b, sortlist)
        return children[name2index[self.resolve(list)]]

    def resolve(self, list):
        #
        # Resolve ambiguity in favor of the shortest RHS.
        # Since we walk the tree from the top down, this
        # should effectively resolve in favor of a "shift".
        #
        return list[0]
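
# --- Illustrative usage sketch; not part of the original SPARK source. ---
# Grammar rules live in the docstrings of 'p_' methods, as "lhs ::= rhs".
# Terminals are compared to tokens with ==, so the hypothetical _Token class
# below compares equal to its type name.  _SumParser evaluates sums such as
# "2 + 3" from a token list; both names are made up for this sketch.
class _Token(object):
    def __init__(self, type, attr=None):
        self.type = type
        self.attr = attr

    def __cmp__(self, other):
        # Compare equal to the grammar terminal naming this token's type.
        return cmp(self.type, other)

class _SumParser(GenericParser):
    def __init__(self):
        GenericParser.__init__(self, 'expr')

    def p_expr_term(self, args):
        ' expr ::= term '
        return args[0]

    def p_expr_plus(self, args):
        ' expr ::= expr + term '
        return args[0] + args[2]

    def p_term(self, args):
        ' term ::= number '
        return int(args[0].attr)

# _SumParser().parse([_Token('number', '2'), _Token('+'), _Token('number', '3')])
# would return 5.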

#
# GenericASTBuilder automagically constructs a concrete/abstract syntax tree
# for a given input.  The extra argument is a class (not an instance!)
# which supports the "__setslice__" and "__len__" methods.
#
# XXX - silently overrides any user code in methods.
#

class GenericASTBuilder(GenericParser):
    def __init__(self, AST, start):
        GenericParser.__init__(self, start)
        self.AST = AST

    def preprocess(self, rule, func):
        rebind = lambda lhs, self=self: \
                     lambda args, lhs=lhs, self=self: \
                         self.buildASTNode(args, lhs)
        lhs, rhs = rule
        return rule, rebind(lhs)

    def buildASTNode(self, args, lhs):
        children = []
        for arg in args:
            if isinstance(arg, self.AST):
                children.append(arg)
            else:
                children.append(self.terminal(arg))
        return self.nonterminal(lhs, children)

    def terminal(self, token):
        return token

    def nonterminal(self, type, args):
        rv = self.AST(type)
        rv[:len(args)] = args
        return rv
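
# --- Illustrative usage sketch; not part of the original SPARK source. ---
# A minimal node class of the kind GenericASTBuilder expects: the AST argument
# is a class supporting "__setslice__" and "__len__" (the Python 2 slice
# assignment protocol).  The name _AST is hypothetical.  A GenericASTBuilder
# subclass defining the same 'p_' docstring rules as a GenericParser would
# then return a tree of _AST nodes from parse() instead of computed values.
class _AST(object):
    def __init__(self, type):
        self.type = type
        self._kids = []

    def __getitem__(self, i):
        return self._kids[i]  # also drives "for kid in node" iteration

    def __len__(self):
        return len(self._kids)

    def __setslice__(self, low, high, seq):
        self._kids[low:high] = seq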

#
# GenericASTTraversal is a Visitor pattern according to Design Patterns.  For
# each node it attempts to invoke the method n_<node type>, falling
# back onto the default() method if the n_* can't be found.  The preorder
# traversal also looks for an exit hook named n_<node type>_exit (no default
# routine is called if it's not found).  To prematurely halt traversal
# of a subtree, call the prune() method -- this only makes sense for a
# preorder traversal.  Node type is determined via the typestring() method.
#

class GenericASTTraversalPruningException(Exception):
    pass

class GenericASTTraversal(object):
    def __init__(self, ast):
        self.ast = ast

    def typestring(self, node):
        return node.type

    def prune(self):
        raise GenericASTTraversalPruningException

    def preorder(self, node=None):
        if node is None:
            node = self.ast

        try:
            name = 'n_' + self.typestring(node)
            if hasattr(self, name):
                func = getattr(self, name)
                func(node)
            else:
                self.default(node)
        except GenericASTTraversalPruningException:
            return

        for kid in node:
            self.preorder(kid)

        name = name + '_exit'
        if hasattr(self, name):
            func = getattr(self, name)
            func(node)

    def postorder(self, node=None):
        if node is None:
            node = self.ast

        for kid in node:
            self.postorder(kid)

        name = 'n_' + self.typestring(node)
        if hasattr(self, name):
            func = getattr(self, name)
            func(node)
        else:
            self.default(node)

    def default(self, node):
        pass
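
# --- Illustrative usage sketch; not part of the original SPARK source. ---
# A hypothetical traversal that gathers 'term' nodes in preorder.  Handlers
# are looked up as n_<node type>; everything else falls through to default().
# Calling self.prune() inside a handler would skip that node's subtree.
class _TermCollector(GenericASTTraversal):
    def __init__(self, ast):
        GenericASTTraversal.__init__(self, ast)
        self.terms = []
        self.preorder()

    def n_term(self, node):
        self.terms.append(node)

    def default(self, node):
        pass

# _TermCollector(tree).terms would list the 'term' nodes of 'tree', assuming
# each node exposes a .type attribute and supports child indexing.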

#
# GenericASTMatcher.  AST nodes must have "__getitem__" and "__cmp__"
# implemented.
#
# XXX - makes assumptions about how GenericParser walks the parse tree.
#

class GenericASTMatcher(GenericParser):
    def __init__(self, start, ast):
        GenericParser.__init__(self, start)
        self.ast = ast

    def preprocess(self, rule, func):
        rebind = lambda func, self=self: \
                     lambda args, func=func, self=self: \
                         self.foundMatch(args, func)
        lhs, rhs = rule
        rhslist = list(rhs)
        rhslist.reverse()

        return (lhs, tuple(rhslist)), rebind(func)

    def foundMatch(self, args, func):
        func(args[-1])
        return args[-1]

    def match_r(self, node):
        self.input.insert(0, node)
        children = 0

        for child in node:
            if children == 0:
                self.input.insert(0, '(')
            children = children + 1
            self.match_r(child)

        if children > 0:
            self.input.insert(0, ')')

    def match(self, ast=None):
        if ast is None:
            ast = self.ast
        self.input = []

        self.match_r(ast)
        self.parse(self.input)

    def resolve(self, list):
        #
        # Resolve ambiguity in favor of the longest RHS.
        #
        return list[-1]

def _dump(tokens, states):
    for i in range(len(states)):
        print 'state', i
        for (lhs, rhs), pos, parent in states[i]:
            print '\t', lhs, '::=',
            print ' '.join(rhs[:pos]),
            print '.',
            print ' '.join(rhs[pos:]),
            print ',', parent
        if i < len(tokens):
            print
            print 'token', str(tokens[i])
            print