Package Bio :: Package Parsers :: Module spark
[hide private]
[frames] | [no frames]

Source Code for Module Bio.Parsers.spark

  1  #  Copyright (c) 1998-2000 John Aycock 
  2  #   
  3  #  Permission is hereby granted, free of charge, to any person obtaining 
  4  #  a copy of this software and associated documentation files (the 
  5  #  "Software"), to deal in the Software without restriction, including 
  6  #  without limitation the rights to use, copy, modify, merge, publish, 
  7  #  distribute, sublicense, and/or sell copies of the Software, and to 
  8  #  permit persons to whom the Software is furnished to do so, subject to 
  9  #  the following conditions: 
 10  #   
 11  #  The above copyright notice and this permission notice shall be 
 12  #  included in all copies or substantial portions of the Software. 
 13  #   
 14  #  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
 15  #  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 
 16  #  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
 17  #  IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 
 18  #  CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 
 19  #  TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 
 20  #  SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
 21   
 22  __version__ = 'SPARK-0.6.1' 
 23   
 24  import re 
 25  import sys 
 26   
27 -def _namelist(instance):
28 namelist, namedict, classlist = [], {}, [instance.__class__] 29 for c in classlist: 30 for b in c.__bases__: 31 classlist.append(b) 32 for name in dir(c): 33 if name not in namedict: 34 namelist.append(name) 35 namedict[name] = 1 36 return namelist
37
class GenericScanner:
    """Reflective regex-based tokenizer.

    Subclasses define methods named ``t_<name>`` whose docstrings are
    regular-expression fragments.  All fragments are combined into one
    verbose regex, and the matching ``t_*`` method is invoked for each
    token found by tokenize().  ``t_default`` is the lowest-priority
    catch-all rule.
    """

    def __init__(self):
        pattern = self.reflect()
        self.re = re.compile(pattern, re.VERBOSE)

        # Map regex group number -> handler method, so tokenize() can
        # dispatch on whichever alternative matched.
        self.index2func = {}
        for name, number in self.re.groupindex.items():
            self.index2func[number - 1] = getattr(self, 't_' + name)

    def makeRE(self, name):
        """Build a named regex group from the docstring of method *name*."""
        doc = getattr(self, name).__doc__
        rv = '(?P<%s>%s)' % (name[2:], doc)
        return rv

    def reflect(self):
        """Combine every t_* docstring pattern into one alternation.

        t_default is always appended last so it matches only when no
        other rule does.
        """
        rv = []
        for name in _namelist(self):
            if name[:2] == 't_' and name != 't_default':
                rv.append(self.makeRE(name))

        rv.append(self.makeRE('t_default'))
        return '|'.join(rv)

    def error(self, s, pos):
        """Report a lexical error at *pos* in *s* and abort."""
        # print() call form works under both Python 2 and 3 (the
        # original used a Python 2 print statement).
        print("Lexical error at position %s" % pos)
        raise SystemExit

    def tokenize(self, s):
        """Scan string *s*, calling the t_* handler for each match."""
        pos = 0
        n = len(s)
        while pos < n:
            m = self.re.match(s, pos)
            if m is None:
                self.error(s, pos)

            groups = m.groups()
            for i in range(len(groups)):
                if groups[i] and i in self.index2func:
                    self.index2func[i](groups[i])
            pos = m.end()

    def t_default(self, s):
        r'( . | \n )+'
        pass
82
class GenericParser:
    """An Earley parser.

    Subclasses define methods named ``p_<something>`` whose docstrings
    hold one or more grammar rules of the form ``lhs ::= sym sym ...``;
    the method itself is the semantic action invoked with the matched
    children when the rule reduces.
    """

    def __init__(self, start):
        self.rules = {}
        self.rule2func = {}
        self.rule2name = {}
        self.collectRules()
        self.startRule = self.augment(start)
        self.ruleschanged = 1

    _START = 'START'
    _EOF = 'EOF'

    #
    #  A hook for GenericASTBuilder and GenericASTMatcher.
    #
    def preprocess(self, rule, func):
        return rule, func

    def addRule(self, doc, func):
        """Parse *doc* ("lhs ::= sym sym ..." one or more times) and
        register each rule with *func* as its semantic action."""
        rules = doc.split()

        index = []
        for i in range(len(rules)):
            if rules[i] == '::=':
                index.append(i - 1)
        index.append(len(rules))

        for i in range(len(index) - 1):
            lhs = rules[index[i]]
            rhs = rules[index[i] + 2:index[i + 1]]
            rule = (lhs, tuple(rhs))

            rule, fn = self.preprocess(rule, func)

            if lhs in self.rules:
                self.rules[lhs].append(rule)
            else:
                self.rules[lhs] = [rule]
            self.rule2func[rule] = fn
            self.rule2name[rule] = func.__name__[2:]
        self.ruleschanged = 1

    def collectRules(self):
        """Harvest grammar rules from every p_* method's docstring."""
        for name in _namelist(self):
            if name[:2] == 'p_':
                func = getattr(self, name)
                doc = func.__doc__
                self.addRule(doc, func)

    def augment(self, start):
        """Add the synthetic start rule START ::= <start> EOF."""
        #
        #  Tempting though it is, this isn't made into a call
        #  to self.addRule() because the start rule shouldn't
        #  be subject to preprocessing.
        #
        startRule = (self._START, (start, self._EOF))
        self.rule2func[startRule] = lambda args: args[0]
        self.rules[self._START] = [startRule]
        self.rule2name[startRule] = ''
        return startRule

    def makeFIRST(self):
        """Compute FIRST sets for all nonterminals (None marks a
        nullable LHS)."""
        union = {}
        self.first = {}

        for rulelist in self.rules.values():
            for lhs, rhs in rulelist:
                if lhs not in self.first:
                    self.first[lhs] = {}

                if len(rhs) == 0:
                    self.first[lhs][None] = 1
                    continue

                sym = rhs[0]
                if sym not in self.rules:
                    self.first[lhs][sym] = 1
                else:
                    # FIRST(lhs) must absorb FIRST(sym); iterate to a
                    # fixed point below.
                    union[(sym, lhs)] = 1
        changes = 1
        while changes:
            changes = 0
            for src, dest in union.keys():
                destlen = len(self.first[dest])
                self.first[dest].update(self.first[src])
                if len(self.first[dest]) != destlen:
                    changes = 1

    #
    #  An Earley parser, as per J. Earley, "An Efficient Context-Free
    #  Parsing Algorithm", CACM 13(2), pp. 94-102.  Also J. C. Earley,
    #  "An Efficient Context-Free Parsing Algorithm", Ph.D. thesis,
    #  Carnegie-Mellon University, August 1968, p. 27.
    #

    def typestring(self, token):
        """Token-type hook; return None when types are unknown."""
        return None

    def error(self, token):
        """Report a syntax error near *token* and abort."""
        print("Syntax error at or near `%s' token" % token)
        raise SystemExit

    def parse(self, tokens):
        """Parse *tokens* and return the semantic value of the start
        rule; *tokens* is left unmodified on return."""
        tree = {}
        tokens.append(self._EOF)
        states = {0: [(self.startRule, 0, 0)]}

        if self.ruleschanged:
            self.makeFIRST()
            # Avoid rebuilding FIRST sets on every parse; addRule()
            # sets the flag again when the grammar changes.
            self.ruleschanged = 0

        for i in range(len(tokens)):
            states[i + 1] = []

            if states[i] == []:
                break
            self.buildState(tokens[i], states, i, tree)

        #_dump(tokens, states)

        if i < len(tokens) - 1 or states[i + 1] != [(self.startRule, 2, 0)]:
            del tokens[-1]
            self.error(tokens[i - 1])
        rv = self.buildTree(tokens, tree, ((self.startRule, 2, 0), i + 1))
        del tokens[-1]
        return rv

    def buildState(self, token, states, i, tree):
        """One step of Earley recognition: complete/predict/scan the
        items of states[i], extending states[i+1] and the *tree*
        back-pointer table."""
        needsCompletion = {}
        state = states[i]
        predicted = {}

        for item in state:
            rule, pos, parent = item
            lhs, rhs = rule

            #
            #  A -> a . (completer)
            #
            if pos == len(rhs):
                if len(rhs) == 0:
                    needsCompletion[lhs] = (item, i)

                for pitem in states[parent]:
                    if pitem is item:
                        break

                    prule, ppos, pparent = pitem
                    plhs, prhs = prule

                    if prhs[ppos:ppos + 1] == (lhs,):
                        new = (prule,
                               ppos + 1,
                               pparent)
                        if new not in state:
                            state.append(new)
                            tree[(new, i)] = [(item, i)]
                        else:
                            tree[(new, i)].append((item, i))
                continue

            nextSym = rhs[pos]

            #
            #  A -> a . B (predictor)
            #
            if nextSym in self.rules:
                #
                #  Work on completer step some more; for rules
                #  with empty RHS, the "parent state" is the
                #  current state we're adding Earley items to,
                #  so the Earley items the completer step needs
                #  may not all be present when it runs.
                #
                if nextSym in needsCompletion:
                    new = (rule, pos + 1, parent)
                    olditem_i = needsCompletion[nextSym]
                    if new not in state:
                        state.append(new)
                        tree[(new, i)] = [olditem_i]
                    else:
                        tree[(new, i)].append(olditem_i)

                #
                #  Has this been predicted already?
                #
                if nextSym in predicted:
                    continue
                predicted[nextSym] = 1

                ttype = token is not self._EOF and \
                    self.typestring(token) or \
                    None
                if ttype is not None:
                    #
                    #  Even smarter predictor, when the
                    #  token's type is known.  The code is
                    #  grungy, but runs pretty fast.  Three
                    #  cases are looked for: rules with
                    #  empty RHS; first symbol on RHS is a
                    #  terminal; first symbol on RHS is a
                    #  nonterminal (and isn't nullable).
                    #
                    for prule in self.rules[nextSym]:
                        new = (prule, 0, i)
                        prhs = prule[1]
                        if len(prhs) == 0:
                            state.append(new)
                            continue
                        prhs0 = prhs[0]
                        if prhs0 not in self.rules:
                            if prhs0 != ttype:
                                continue
                            else:
                                state.append(new)
                                continue
                        first = self.first[prhs0]
                        if None not in first and \
                           ttype not in first:
                            continue
                        state.append(new)
                    continue

                for prule in self.rules[nextSym]:
                    #
                    #  Smarter predictor, as per Grune &
                    #  Jacobs' _Parsing Techniques_.  Not
                    #  as good as FIRST sets though.
                    #
                    prhs = prule[1]
                    if len(prhs) > 0 and \
                       prhs[0] not in self.rules and \
                       token != prhs[0]:
                        continue
                    state.append((prule, 0, i))

            #
            #  A -> a . c (scanner)
            #
            elif token == nextSym:
                #assert new not in states[i+1]
                states[i + 1].append((rule, pos + 1, parent))

    def buildTree(self, tokens, tree, root):
        """Run the semantic actions bottom-up from *root* and return
        the resulting value."""
        stack = []
        self.buildTree_r(stack, tokens, -1, tree, root)
        return stack[0]

    def buildTree_r(self, stack, tokens, tokpos, tree, root):
        (rule, pos, parent), state = root

        while pos > 0:
            want = ((rule, pos, parent), state)
            if want not in tree:
                #
                #  Since pos > 0, it didn't come from closure,
                #  and if it isn't in tree[], then there must
                #  be a terminal symbol to the left of the dot.
                #  (It must be from a "scanner" step.)
                #
                pos = pos - 1
                state = state - 1
                stack.insert(0, tokens[tokpos])
                tokpos = tokpos - 1
            else:
                #
                #  There's a NT to the left of the dot.
                #  Follow the tree pointer recursively (>1
                #  tree pointers from it indicates ambiguity).
                #  Since the item must have come about from a
                #  "completer" step, the state where the item
                #  came from must be the parent state of the
                #  item the tree pointer points to.
                #
                children = tree[want]
                if len(children) > 1:
                    child = self.ambiguity(children)
                else:
                    child = children[0]

                tokpos = self.buildTree_r(stack,
                                          tokens, tokpos,
                                          tree, child)
                pos = pos - 1
                (crule, cpos, cparent), cstate = child
                state = cparent

        lhs, rhs = rule
        result = self.rule2func[rule](stack[:len(rhs)])
        stack[:len(rhs)] = [result]
        return tokpos

    def ambiguity(self, children):
        """Pick one of several competing tree pointers via resolve()."""
        #
        #  XXX - problem here and in collectRules() if the same
        #  rule appears in >1 method.  But in that case the
        #  user probably gets what they deserve :-)  Also
        #  undefined results if rules causing the ambiguity
        #  appear in the same method.
        #
        sortlist = []
        name2index = {}
        for i in range(len(children)):
            ((rule, pos, parent), index) = children[i]
            lhs, rhs = rule
            name = self.rule2name[rule]
            sortlist.append((len(rhs), name))
            name2index[name] = i
        sortlist.sort()
        # Tuple-parameter lambdas are Python 2 only (PEP 3113); use a
        # comprehension to project out the rule names.
        names = [name for _, name in sortlist]
        return children[name2index[self.resolve(names)]]

    def resolve(self, list):
        #
        #  Resolve ambiguity in favor of the shortest RHS.
        #  Since we walk the tree from the top down, this
        #  should effectively resolve in favor of a "shift".
        #
        return list[0]
400 401 # 402 # GenericASTBuilder automagically constructs a concrete/abstract syntax tree 403 # for a given input. The extra argument is a class (not an instance!) 404 # which supports the "__setslice__" and "__len__" methods. 405 # 406 # XXX - silently overrides any user code in methods. 407 # 408
class GenericASTBuilder(GenericParser):
    """A GenericParser that automatically builds a syntax tree.

    Every grammar rule's user action is replaced by one that wraps the
    matched children in an *AST* node (the class handed to __init__,
    which must support ``__setslice__``/slice assignment and ``__len__``).
    """

    def __init__(self, AST, start):
        GenericParser.__init__(self, start)
        self.AST = AST

    def preprocess(self, rule, func):
        # Ignore the user-supplied action; bind this rule's LHS into a
        # closure that builds an AST node from the matched children.
        lhs, _rhs = rule

        def action(args, lhs=lhs, self=self):
            return self.buildASTNode(args, lhs)

        return rule, action

    def buildASTNode(self, args, lhs):
        # Raw tokens (anything that isn't already an AST node) are
        # passed through the terminal() hook first.
        children = [arg if isinstance(arg, self.AST) else self.terminal(arg)
                    for arg in args]
        return self.nonterminal(lhs, children)

    def terminal(self, token):
        return token

    def nonterminal(self, type, args):
        rv = self.AST(type)
        rv[:len(args)] = args
        return rv
436 437 # 438 # GenericASTTraversal is a Visitor pattern according to Design Patterns. For 439 # each node it attempts to invoke the method n_<node type>, falling 440 # back onto the default() method if the n_* can't be found. The preorder 441 # traversal also looks for an exit hook named n_<node type>_exit (no default 442 # routine is called if it's not found). To prematurely halt traversal 443 # of a subtree, call the prune() method -- this only makes sense for a 444 # preorder traversal. Node type is determined via the typestring() method. 445 # 446
class GenericASTTraversalPruningException(Exception):
    """Raised by GenericASTTraversal.prune() to stop descent into the
    current subtree during a preorder traversal.

    Derives from Exception: Python 3 requires raised objects to
    inherit from BaseException (the original classic class could not
    be raised there).
    """
    pass
449
class GenericASTTraversal:
    """Visitor-pattern traversal over an AST.

    For each node the traversal invokes ``n_<node type>`` if defined,
    falling back to default().  preorder() also calls an exit hook
    ``n_<node type>_exit`` after the children (no default for it), and
    honors prune() to skip a subtree.  Node type comes from
    typestring(); nodes must be iterable over their children.
    """

    def __init__(self, ast):
        self.ast = ast

    def typestring(self, node):
        return node.type

    def prune(self):
        # Abort descent into the current subtree (meaningful only
        # during preorder(), which catches this exception).
        raise GenericASTTraversalPruningException

    def preorder(self, node=None):
        """Visit *node* (default: the whole AST), then its children."""
        if node is None:
            node = self.ast

        try:
            name = 'n_' + self.typestring(node)
            if hasattr(self, name):
                func = getattr(self, name)
                func(node)
            else:
                self.default(node)
        except GenericASTTraversalPruningException:
            # The handler called prune(): skip this node's children.
            return

        for kid in node:
            self.preorder(kid)

        name = name + '_exit'
        if hasattr(self, name):
            func = getattr(self, name)
            func(node)

    def postorder(self, node=None):
        """Visit children of *node* (default: the whole AST), then it."""
        if node is None:
            node = self.ast

        for kid in node:
            self.postorder(kid)

        name = 'n_' + self.typestring(node)
        if hasattr(self, name):
            func = getattr(self, name)
            func(node)
        else:
            self.default(node)

    def default(self, node):
        """Called for nodes with no matching n_* handler; a no-op."""
        pass
499 500 # 501 # GenericASTMatcher. AST nodes must have "__getitem__" and "__cmp__" 502 # implemented. 503 # 504 # XXX - makes assumptions about how GenericParser walks the parse tree. 505 # 506
class GenericASTMatcher(GenericParser):
    """Match grammar rules against an AST by linearizing the tree into
    a token stream and running it through the Earley parser.

    AST nodes must implement ``__getitem__`` and ``__cmp__``.
    """

    def __init__(self, start, ast):
        GenericParser.__init__(self, start)
        self.ast = ast

    def preprocess(self, rule, func):
        # The tree is linearized back-to-front, so each rule's RHS is
        # reversed; the user's action is wrapped so it receives the
        # matched node via foundMatch().
        def action(args, func=func, self=self):
            return self.foundMatch(args, func)

        lhs, rhs = rule
        return (lhs, tuple(reversed(rhs))), action

    def foundMatch(self, args, func):
        # The matched node is the last element of the (reversed) RHS.
        func(args[-1])
        return args[-1]

    def match_r(self, node):
        """Recursively flatten *node* onto self.input, bracketing any
        child list with '(' and ')'."""
        self.input.insert(0, node)
        count = 0

        for child in node:
            if count == 0:
                self.input.insert(0, '(')
            count = count + 1
            self.match_r(child)

        if count > 0:
            self.input.insert(0, ')')

    def match(self, ast=None):
        if ast is None:
            ast = self.ast
        self.input = []

        self.match_r(ast)
        self.parse(self.input)

    def resolve(self, list):
        # Resolve ambiguity in favor of the longest RHS.
        return list[-1]
552
553 -def _dump(tokens, states):
554 for i in range(len(states)): 555 print 'state', i 556 for (lhs, rhs), pos, parent in states[i]: 557 print '\t', lhs, '::=', 558 print ' '.join(rhs[:pos]), 559 print '.', 560 print ' '.join(rhs[pos:]), 561 print ',', parent 562 if i < len(tokens): 563 print 564 print 'token', str(tokens[i]) 565 print
566