Package Bio :: Package Parsers :: Module spark
[hide private]
[frames] | [no frames]

Source Code for Module Bio.Parsers.spark

  1  #  Copyright (c) 1998-2000 John Aycock 
  2  #   
  3  #  Permission is hereby granted, free of charge, to any person obtaining 
  4  #  a copy of this software and associated documentation files (the 
  5  #  "Software"), to deal in the Software without restriction, including 
  6  #  without limitation the rights to use, copy, modify, merge, publish, 
  7  #  distribute, sublicense, and/or sell copies of the Software, and to 
  8  #  permit persons to whom the Software is furnished to do so, subject to 
  9  #  the following conditions: 
 10  #   
 11  #  The above copyright notice and this permission notice shall be 
 12  #  included in all copies or substantial portions of the Software. 
 13  #   
 14  #  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
 15  #  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 
 16  #  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
 17  #  IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 
 18  #  CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 
 19  #  TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 
 20  #  SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
 21   
 22  __version__ = 'SPARK-0.6.1' 
 23   
 24  import re 
 25  import sys 
 26  import string 
 27   
28 -def _namelist(instance):
29 namelist, namedict, classlist = [], {}, [instance.__class__] 30 for c in classlist: 31 for b in c.__bases__: 32 classlist.append(b) 33 for name in dir(c): 34 if not namedict.has_key(name): 35 namelist.append(name) 36 namedict[name] = 1 37 return namelist
38
class GenericScanner:
    """Regexp-driven tokenizer.

    Subclasses define t_* methods whose docstrings are regular
    expressions; reflect() joins them into one alternation (with
    t_default last, as the catch-all) and tokenize() dispatches each
    match to the corresponding t_* method.
    """

    def __init__(self):
        pattern = self.reflect()
        self.re = re.compile(pattern, re.VERBOSE)

        # Map each named group's 0-based index to its t_* handler so
        # tokenize() can dispatch on whichever alternative matched.
        self.index2func = {}
        for name, number in self.re.groupindex.items():
            self.index2func[number - 1] = getattr(self, 't_' + name)

    def makeRE(self, name):
        """Build a named group from the t_* method's docstring (its regexp)."""
        doc = getattr(self, name).__doc__
        rv = '(?P<%s>%s)' % (name[2:], doc)
        return rv

    def reflect(self):
        """Join all t_* patterns into one alternation; t_default goes last."""
        rv = []
        for name in _namelist(self):
            if name[:2] == 't_' and name != 't_default':
                rv.append(self.makeRE(name))

        rv.append(self.makeRE('t_default'))
        return '|'.join(rv)

    def error(self, s, pos):
        """Report a lexical error at *pos* and abort."""
        print("Lexical error at position %s" % pos)
        raise SystemExit

    def tokenize(self, s):
        """Scan string *s*, dispatching each match to its t_* handler."""
        pos = 0
        n = len(s)
        while pos < n:
            m = self.re.match(s, pos)
            if m is None:
                self.error(s, pos)

            groups = m.groups()
            for i in range(len(groups)):
                # Only named (t_*) groups dispatch; nested groups inside a
                # pattern have no entry in index2func and are skipped.
                if groups[i] and i in self.index2func:
                    self.index2func[i](groups[i])
            pos = m.end()

    def t_default(self, s):
        r'( . | \n )+'
        # Catch-all: silently consume anything no other pattern matched.
        pass
83
class GenericParser:
    """Earley context-free parser driven by p_* method docstrings.

    Subclasses define p_* methods whose docstrings contain grammar rules
    of the form ``lhs ::= sym sym ...``; parse() runs the Earley
    algorithm over a token list and evaluates the p_* actions bottom-up
    while rebuilding the parse tree.
    """

    def __init__(self, start):
        self.rules = {}        # lhs -> list of (lhs, rhs-tuple) rules
        self.rule2func = {}    # rule -> semantic action callable
        self.rule2name = {}    # rule -> p_* method name suffix
        self.collectRules()
        self.startRule = self.augment(start)
        self.ruleschanged = 1  # forces FIRST-set recomputation in parse()

    _START = 'START'
    _EOF = 'EOF'

    #
    # A hook for GenericASTBuilder and GenericASTMatcher.
    #
    def preprocess(self, rule, func):
        return rule, func

    def addRule(self, doc, func):
        """Add grammar rule(s) from *doc*; several ``::=`` rules may share it."""
        rules = doc.split()

        # Start index of each rule: the token just before every '::='.
        index = []
        for i in range(len(rules)):
            if rules[i] == '::=':
                index.append(i - 1)
        index.append(len(rules))

        for i in range(len(index) - 1):
            lhs = rules[index[i]]
            rhs = rules[index[i] + 2:index[i + 1]]
            rule = (lhs, tuple(rhs))

            rule, fn = self.preprocess(rule, func)

            if lhs in self.rules:
                self.rules[lhs].append(rule)
            else:
                self.rules[lhs] = [rule]
            self.rule2func[rule] = fn
            self.rule2name[rule] = func.__name__[2:]
        self.ruleschanged = 1

    def collectRules(self):
        """Harvest grammar rules from every p_* method's docstring."""
        for name in _namelist(self):
            if name[:2] == 'p_':
                func = getattr(self, name)
                doc = func.__doc__
                self.addRule(doc, func)

    def augment(self, start):
        """Install the synthetic ``START ::= start EOF`` rule."""
        #
        # Tempting though it is, this isn't made into a call
        # to self.addRule() because the start rule shouldn't
        # be subject to preprocessing.
        #
        startRule = (self._START, (start, self._EOF))
        self.rule2func[startRule] = lambda args: args[0]
        self.rules[self._START] = [startRule]
        self.rule2name[startRule] = ''
        return startRule

    def makeFIRST(self):
        """Compute FIRST sets: terminals that can begin each nonterminal."""
        union = {}
        self.first = {}

        for rulelist in self.rules.values():
            for lhs, rhs in rulelist:
                if lhs not in self.first:
                    self.first[lhs] = {}

                if len(rhs) == 0:
                    # Empty RHS: the nonterminal is nullable.
                    self.first[lhs][None] = 1
                    continue

                sym = rhs[0]
                if sym not in self.rules:
                    self.first[lhs][sym] = 1   # terminal starts lhs
                else:
                    union[(sym, lhs)] = 1      # FIRST(lhs) includes FIRST(sym)
        # Propagate the subset constraints to a fixed point.
        changes = 1
        while changes:
            changes = 0
            for src, dest in union.keys():
                destlen = len(self.first[dest])
                self.first[dest].update(self.first[src])
                if len(self.first[dest]) != destlen:
                    changes = 1

    #
    # An Earley parser, as per J. Earley, "An Efficient Context-Free
    # Parsing Algorithm", CACM 13(2), pp. 94-102.  Also J. C. Earley,
    # "An Efficient Context-Free Parsing Algorithm", Ph.D. thesis,
    # Carnegie-Mellon University, August 1968, p. 27.
    #

    def typestring(self, token):
        # Hook: return a type name for *token* to enable the FIRST-set
        # predictor in buildState(); None disables that optimization.
        return None

    def error(self, token):
        """Report a syntax error at *token* and abort."""
        print("Syntax error at or near `%s' token" % token)
        raise SystemExit

    def parse(self, tokens):
        """Parse *tokens*; return the semantic value of the start rule."""
        tree = {}
        tokens.append(self._EOF)   # temporary sentinel, removed before return
        states = {0: [(self.startRule, 0, 0)]}

        if self.ruleschanged:
            self.makeFIRST()
            # Clear the flag so repeated parse() calls don't recompute
            # FIRST sets for an unchanged grammar.
            self.ruleschanged = 0

        for i in range(len(tokens)):
            states[i + 1] = []

            if states[i] == []:
                break
            self.buildState(tokens[i], states, i, tree)

        #_dump(tokens, states)

        if i < len(tokens) - 1 or states[i + 1] != [(self.startRule, 2, 0)]:
            del tokens[-1]
            self.error(tokens[i - 1])
        rv = self.buildTree(tokens, tree, ((self.startRule, 2, 0), i + 1))
        del tokens[-1]
        return rv

    def buildState(self, token, states, i, tree):
        """One Earley step (completer/predictor/scanner) for state set *i*."""
        needsCompletion = {}
        state = states[i]
        predicted = {}

        # NOTE: 'state' is extended while being iterated; that is the
        # Earley worklist and must be preserved.
        for item in state:
            rule, pos, parent = item
            lhs, rhs = rule

            #
            # A -> a . (completer)
            #
            if pos == len(rhs):
                if len(rhs) == 0:
                    needsCompletion[lhs] = (item, i)

                for pitem in states[parent]:
                    if pitem is item:
                        break

                    prule, ppos, pparent = pitem
                    plhs, prhs = prule

                    if prhs[ppos:ppos + 1] == (lhs,):
                        new = (prule,
                               ppos + 1,
                               pparent)
                        if new not in state:
                            state.append(new)
                            tree[(new, i)] = [(item, i)]
                        else:
                            tree[(new, i)].append((item, i))
                continue

            nextSym = rhs[pos]

            #
            # A -> a . B (predictor)
            #
            if nextSym in self.rules:
                #
                # Work on completer step some more; for rules
                # with empty RHS, the "parent state" is the
                # current state we're adding Earley items to,
                # so the Earley items the completer step needs
                # may not all be present when it runs.
                #
                if nextSym in needsCompletion:
                    new = (rule, pos + 1, parent)
                    olditem_i = needsCompletion[nextSym]
                    if new not in state:
                        state.append(new)
                        tree[(new, i)] = [olditem_i]
                    else:
                        tree[(new, i)].append(olditem_i)

                #
                # Has this been predicted already?
                #
                if nextSym in predicted:
                    continue
                predicted[nextSym] = 1

                ttype = token is not self._EOF and \
                    self.typestring(token) or \
                    None
                if ttype is not None:
                    #
                    # Even smarter predictor, when the token's type is
                    # known.  Three cases: rules with empty RHS; first
                    # symbol on RHS is a terminal; first symbol on RHS
                    # is a nonterminal (and isn't nullable).
                    #
                    for prule in self.rules[nextSym]:
                        new = (prule, 0, i)
                        prhs = prule[1]
                        if len(prhs) == 0:
                            state.append(new)
                            continue
                        prhs0 = prhs[0]
                        if prhs0 not in self.rules:
                            if prhs0 != ttype:
                                continue
                            else:
                                state.append(new)
                                continue
                        first = self.first[prhs0]
                        if None not in first and \
                           ttype not in first:
                            continue
                        state.append(new)
                    continue

                for prule in self.rules[nextSym]:
                    #
                    # Smarter predictor, as per Grune &
                    # Jacobs' _Parsing Techniques_.  Not
                    # as good as FIRST sets though.
                    #
                    prhs = prule[1]
                    if len(prhs) > 0 and \
                       prhs[0] not in self.rules and \
                       token != prhs[0]:
                        continue
                    state.append((prule, 0, i))

            #
            # A -> a . c (scanner)
            #
            elif token == nextSym:
                #assert new not in states[i+1]
                states[i + 1].append((rule, pos + 1, parent))

    def buildTree(self, tokens, tree, root):
        """Rebuild the parse tree from the completed chart; return its value."""
        stack = []
        self.buildTree_r(stack, tokens, -1, tree, root)
        return stack[0]

    def buildTree_r(self, stack, tokens, tokpos, tree, root):
        """Recursive right-to-left tree walk; returns the new token position."""
        (rule, pos, parent), state = root

        while pos > 0:
            want = ((rule, pos, parent), state)
            if want not in tree:
                #
                # Since pos > 0, it didn't come from closure,
                # and if it isn't in tree[], then there must
                # be a terminal symbol to the left of the dot.
                # (It must be from a "scanner" step.)
                #
                pos = pos - 1
                state = state - 1
                stack.insert(0, tokens[tokpos])
                tokpos = tokpos - 1
            else:
                #
                # There's a NT to the left of the dot.
                # Follow the tree pointer recursively (>1
                # tree pointers from it indicates ambiguity).
                # Since the item must have come about from a
                # "completer" step, the state where the item
                # came from must be the parent state of the
                # item the tree pointer points to.
                #
                children = tree[want]
                if len(children) > 1:
                    child = self.ambiguity(children)
                else:
                    child = children[0]

                tokpos = self.buildTree_r(stack,
                                          tokens, tokpos,
                                          tree, child)
                pos = pos - 1
                (crule, cpos, cparent), cstate = child
                state = cparent

        lhs, rhs = rule
        result = self.rule2func[rule](stack[:len(rhs)])
        stack[:len(rhs)] = [result]
        return tokpos

    def ambiguity(self, children):
        """Pick one of several competing derivations via resolve()."""
        #
        # XXX - problem here and in collectRules() if the same
        # rule appears in >1 method.  But in that case the
        # user probably gets what they deserve :-)  Also
        # undefined results if rules causing the ambiguity
        # appear in the same method.
        #
        sortlist = []
        name2index = {}
        for i in range(len(children)):
            ((rule, pos, parent), index) = children[i]
            lhs, rhs = rule
            name = self.rule2name[rule]
            sortlist.append((len(rhs), name))
            name2index[name] = i
        sortlist.sort()
        names = [name for _, name in sortlist]
        return children[name2index[self.resolve(names)]]

    def resolve(self, list):
        #
        # Resolve ambiguity in favor of the shortest RHS.
        # Since we walk the tree from the top down, this
        # should effectively resolve in favor of a "shift".
        #
        return list[0]
401 402 # 403 # GenericASTBuilder automagically constructs a concrete/abstract syntax tree 404 # for a given input. The extra argument is a class (not an instance!) 405 # which supports the "__setslice__" and "__len__" methods. 406 # 407 # XXX - silently overrides any user code in methods. 408 # 409
class GenericASTBuilder(GenericParser):
    """Parser subclass that automatically builds a syntax tree.

    Every reduction constructs an AST node for the rule's LHS instead of
    running the user's p_* method body.
    """

    def __init__(self, AST, start):
        GenericParser.__init__(self, start)
        self.AST = AST

    def preprocess(self, rule, func):
        # Replace the user's action with one that builds an AST node for
        # this rule's LHS; the LHS is captured via a default argument.
        lhs = rule[0]

        def build(args, lhs=lhs, self=self):
            return self.buildASTNode(args, lhs)

        return rule, build

    def buildASTNode(self, args, lhs):
        """Wrap non-AST args as terminals, then build the LHS node."""
        children = [arg if isinstance(arg, self.AST) else self.terminal(arg)
                    for arg in args]
        return self.nonterminal(lhs, children)

    def terminal(self, token):
        # Hook: wrap a raw token; default is the identity.
        return token

    def nonterminal(self, type, args):
        """Create an AST node of *type* with *args* as its children."""
        rv = self.AST(type)
        rv[:len(args)] = args
        return rv
437 438 # 439 # GenericASTTraversal is a Visitor pattern according to Design Patterns. For 440 # each node it attempts to invoke the method n_<node type>, falling 441 # back onto the default() method if the n_* can't be found. The preorder 442 # traversal also looks for an exit hook named n_<node type>_exit (no default 443 # routine is called if it's not found). To prematurely halt traversal 444 # of a subtree, call the prune() method -- this only makes sense for a 445 # preorder traversal. Node type is determined via the typestring() method. 446 # 447
class GenericASTTraversalPruningException(Exception):
    """Raised by GenericASTTraversal.prune() to halt traversal of a subtree.

    Must derive from Exception so it can be raised and caught (classes
    not derived from BaseException cannot be raised in Python 3).
    """
    pass
450
class GenericASTTraversal:
    """Visitor over an AST: each node dispatches to n_<type>, else default().

    preorder() additionally honors an n_<type>_exit hook and may be
    halted for a subtree by calling prune() from within a visit method;
    postorder() visits children first.  Node type comes from typestring().
    """

    def __init__(self, ast):
        self.ast = ast

    def typestring(self, node):
        # Hook: nodes are expected to carry a .type attribute.
        return node.type

    def prune(self):
        """Stop traversal of the current subtree (preorder only)."""
        # NOTE: the original statement on this line was lost in extraction;
        # restored per the class docstring's contract: prune() signals
        # preorder() via the pruning exception, which preorder() catches.
        raise GenericASTTraversalPruningException

    def preorder(self, node=None):
        """Visit *node* (default: the root), then its children, depth-first."""
        if node is None:
            node = self.ast

        try:
            name = 'n_' + self.typestring(node)
            if hasattr(self, name):
                func = getattr(self, name)
                func(node)
            else:
                self.default(node)
        except GenericASTTraversalPruningException:
            # prune() was called: skip this node's children and exit hook.
            return

        for kid in node:
            self.preorder(kid)

        # Optional exit hook; no default fallback here.
        name = name + '_exit'
        if hasattr(self, name):
            func = getattr(self, name)
            func(node)

    def postorder(self, node=None):
        """Visit children first, then *node* (default: the root)."""
        if node is None:
            node = self.ast

        for kid in node:
            self.postorder(kid)

        name = 'n_' + self.typestring(node)
        if hasattr(self, name):
            func = getattr(self, name)
            func(node)
        else:
            self.default(node)

    def default(self, node):
        # Fallback visit method when no n_<type> handler exists.
        pass
500 501 # 502 # GenericASTMatcher. AST nodes must have "__getitem__" and "__cmp__" 503 # implemented. 504 # 505 # XXX - makes assumptions about how GenericParser walks the parse tree. 506 # 507
class GenericASTMatcher(GenericParser):
    """Parser subclass that matches rule patterns against an AST."""

    def __init__(self, start, ast):
        GenericParser.__init__(self, start)
        self.ast = ast

    def preprocess(self, rule, func):
        # The tree is linearized back-to-front by match_r(), so every
        # rule's RHS must be reversed to match that ordering.
        lhs, rhs = rule
        flipped = tuple(reversed(rhs))

        def fire(args, func=func, self=self):
            return self.foundMatch(args, func)

        return (lhs, flipped), fire

    def foundMatch(self, args, func):
        """Run the user's handler on the matched node and propagate it."""
        func(args[-1])
        return args[-1]

    def match_r(self, node):
        """Linearize *node* (prepending) as: node '(' children... ')'."""
        self.input.insert(0, node)
        kid_count = 0

        for kid in node:
            if not kid_count:
                self.input.insert(0, '(')
            kid_count += 1
            self.match_r(kid)

        if kid_count:
            self.input.insert(0, ')')

    def match(self, ast=None):
        """Linearize *ast* (default: the stored tree) and parse it."""
        self.input = []
        self.match_r(self.ast if ast is None else ast)
        self.parse(self.input)

    def resolve(self, list):
        # Resolve ambiguity in favor of the longest RHS.
        return list[-1]
553
554 -def _dump(tokens, states):
555 for i in range(len(states)): 556 print 'state', i 557 for (lhs, rhs), pos, parent in states[i]: 558 print '\t', lhs, '::=', 559 print string.join(rhs[:pos]), 560 print '.', 561 print string.join(rhs[pos:]), 562 print ',', parent 563 if i < len(tokens): 564 print 565 print 'token', str(tokens[i]) 566 print
567