Package nltk_lite :: Package contrib :: Package mit :: Package six863 :: Package kimmo :: Module fsa
[hide private]
[frames] | no frames]

Source Code for Module nltk_lite.contrib.mit.six863.kimmo.fsa

  1  # Natural Language Toolkit: Finite State Automata 
  2  # 
  3  # Copyright (C) 2001-2006 University of Pennsylvania 
  4  # Authors: Steven Bird <sb@ldc.upenn.edu> 
  5  #          Rob Speer <rspeer@mit.edu> 
  6  # URL: <http://nltk.sf.net> 
  7  # For license information, see LICENSE.TXT 
  8   
  9  """ 
 10  A module for finite state automata.  
 11  Operations are based on Aho, Sethi & Ullman (1986) Chapter 3. 
 12  """ 
 13   
 14  from nltk_lite import tokenize 
 15  from nltk_lite.parse.tree import Tree 
 16  from nltk_lite.parse import cfg, pcfg, pchart 
 17  import yaml 
 18   
 19  epsilon = None 
 20   
 21  # some helper functions 
 22   
 23  # TODO - check that parse was complete, and report error otherwise 
 24   
25 -class FSA(yaml.YAMLObject):
26 """ 27 A class for finite state automata. In general, it represents 28 nondetermnistic finite state automata, with DFAs being a special case. 29 """ 30 yaml_tag = '!FSA'
31 - def __init__(self, sigma='', transitions=None, start=0, finals=None):
32 """Set up the FSA. 33 34 @param sigma: the alphabet of the FSA 35 @type sigma: sequence 36 @param transitions: A dictionary representing the states and 37 transitions in the FSA. The keys are state identifiers (any hashable 38 object), and the values are dictionaries that map input symbols to the 39 sets of states they lead to. 40 @type transitions: dict 41 @param start: The identifier of the start state 42 @type start: hashable object 43 @param finals: The identifiers of the accept states 44 @type finals: sequence 45 """ 46 self._transitions = transitions or {} 47 self._start = start 48 self._reverse = {} 49 self._build_reverse_transitions() 50 if finals: self._finals = set(finals) 51 else: self._finals = set([0]) 52 self._sigma = set(sigma) 53 assert isinstance(self._transitions, dict) 54 self._next_state_num = 0
55
57 for state in self._transitions: 58 self._reverse.setdefault(state, {}) 59 for (state, symbol, target) in self.generate_transitions(): 60 self._add_transition(self._reverse, target, symbol, state)
61
62 - def generate_transitions(self):
63 """ 64 A generator that yields each transition arrow in the FSA in the form 65 (source, label, target). 66 """ 67 for (state, map) in self._transitions.items(): 68 for (symbol, targets) in map.items(): 69 for target in targets: 70 yield (state, symbol, target)
71
72 - def labels(self, s1, s2):
73 """ 74 A generator for all possible labels taking state s1 to state s2. 75 """ 76 map = self._transitions.get(s1, {}) 77 for (symbol, targets) in map.items(): 78 if s2 in targets: yield symbol
79
80 - def sigma(self):
81 "The alphabet of the FSA." 82 return self._sigma
83 alphabet = sigma 84
85 - def check_in_sigma(self, label):
86 "Check whether a given object is in the alphabet." 87 if label and label not in self._sigma: 88 raise ValueError('Label "%s" not in alphabet: %s' % (label, str(self._sigma)))
89
90 - def __len__(self):
91 "The number of states in the FSA." 92 return len(self._transitions)
93
94 - def new_state(self):
95 """ 96 Add a new state to the FSA. 97 @returns: the ID of the new state (a sequentially-assigned number). 98 @rtype: int 99 """ 100 while self._next_state_num in self._transitions: 101 self._next_state_num += 1 102 self._transitions[self._next_state_num] = {} 103 self._reverse[self._next_state_num] = {} 104 return self._next_state_num
105
106 - def add_state(self, name):
107 self._transitions[name] = {} 108 self._reverse[name] = {} 109 return name
110
111 - def start(self):
112 """ 113 @returns: the ID of the FSA's start state. 114 """ 115 return self._start
116
117 - def finals(self):
118 """ 119 @returns: the IDs of all accept states. 120 @rtype: set 121 """ 122 # was a tuple before 123 return self._finals
124
125 - def nonfinals(self):
126 """ 127 @returns: the IDs of all accept states. 128 @rtype: set 129 """ 130 # was a tuple before 131 return set(self.states()).difference(self._finals)
132
133 - def states(self):
134 """ 135 @returns: a list of all states in the FSA. 136 @rtype: list 137 """ 138 return self._transitions.keys()
139
140 - def add_final(self, state):
141 """ 142 Make a state into an accept state. 143 """ 144 self._finals.add(state)
145
146 - def delete_final(self, state):
147 """ 148 Make an accept state no longer be an accept state. 149 """ 150 self._finals = self._finals.difference(set([state]))
151 # del self._finals[state] 152
153 - def set_final(self, states):
154 """ 155 Set the list of accept states. 156 """ 157 self._finals = set(states)
158
159 - def set_start(self, start):
160 """ 161 Set the start state of the FSA. 162 """ 163 self._start = start
164
165 - def in_finals(self, list):
166 """ 167 Check whether a sequence contains any final states. 168 """ 169 return [state for state in list 170 if state in self.finals()] != []
171
172 - def insert_safe(self, s1, label, s2):
173 if s1 not in self.states(): 174 self.add_state(s1) 175 if s2 not in self.states(): 176 self.add_state(s2) 177 self.insert(s1, label, s2)
178
179 - def insert(self, s1, label, s2):
180 """ 181 Add a new transition to the FSA. 182 183 @param s1: the source of the transition 184 @param label: the element of the alphabet that labels the transition 185 @param s2: the destination of the transition 186 """ 187 if s1 not in self.states(): 188 raise ValueError, "State %s does not exist in %s" % (s1, 189 self.states()) 190 if s2 not in self.states(): 191 raise ValueError, "State %s does not exist in %s" % (s2, 192 self.states()) 193 self._add_transition(self._transitions, s1, label, s2) 194 self._add_transition(self._reverse, s2, label, s1)
195
196 - def _add_transition(self, map, s1, label, s2):
197 mapping = map[s1] 198 targets = mapping.setdefault(label, []) 199 targets.append(s2)
200
201 - def _del_transition(self, map, s1, label, s2):
202 mapping = map[s1] 203 targets = mapping.setdefault(label, []) 204 targets.remove(s2) 205 if len(targets) == 0: del mapping[label]
206
207 - def delete(self, s1, label, s2):
208 """ 209 Removes a transition from the FSA. 210 211 @param s1: the source of the transition 212 @param label: the element of the alphabet that labels the transition 213 @param s2: the destination of the transition 214 """ 215 if s1 not in self.states(): 216 raise ValueError, "State %s does not exist" % s1 217 if s2 not in self.states(): 218 raise ValueError, "State %s does not exist" % s1 219 self._del_transition(self._transitions, s1, label, s2) 220 self._del_transition(self._reverse, s2, label, s1)
221
222 - def delete_state(self, state):
223 "Removes a state and all its transitions from the FSA." 224 if state not in self.states(): 225 raise ValueError, "State %s does not exist" % state 226 for (s1, label, s2) in self.incident_transitions(state): 227 self.delete(s1, label, s2) 228 del self._transitions[state] 229 del self._reverse[state]
230
231 - def incident_transitions(self, state):
232 """ 233 @returns: a set of transitions into or out of a state. 234 @rtype: set 235 """ 236 result = set() 237 forward = self._transitions[state] 238 backward = self._reverse[state] 239 for label, targets in forward.items(): 240 for target in targets: 241 result.add((state, label, target)) 242 for label, targets in backward.items(): 243 for target in targets: 244 result.add((target, label, state)) 245 return result
246
247 - def relabel_state(self, old, new):
248 """ 249 Assigns a state a new identifier. 250 """ 251 if old not in self.states(): 252 raise ValueError, "State %s does not exist" % old 253 if new in self.states(): 254 raise ValueError, "State %s already exists" % new 255 changes = [] 256 for (s1, symbol, s2) in self.generate_transitions(): 257 if s1 == old and s2 == old: 258 changes.append((s1, symbol, s2, new, symbol, new)) 259 elif s1 == old: 260 changes.append((s1, symbol, s2, new, symbol, s2)) 261 elif s2 == old: 262 changes.append((s1, symbol, s2, s1, symbol, new)) 263 for (leftstate, symbol, rightstate, newleft, newsym, newright)\ 264 in changes: 265 print leftstate, symbol, rightstate, newleft, newsym, newright 266 self.delete(leftstate, symbol, rightstate) 267 self.insert_safe(newleft, newsym, newright) 268 del self._transitions[old] 269 del self._reverse[old]
270
271 - def next(self, state, symbol):
272 "The set of states reached from a certain state via a given symbol." 273 return self.e_closure(self._transitions[state].get(symbol, set()))
274 nextStates = next 275
276 - def move(self, states, symbol):
277 "The set of states reached from a set of states via a given symbol." 278 result = set() 279 for state in states: 280 result = result.union(self.next(state, symbol)) 281 return self.e_closure(result)
282
283 - def is_deterministic(self):
284 """ 285 Return whether this is a DFA 286 (every symbol leads from a state to at most one target state). 287 """ 288 for map in self._transitions.values(): 289 for targets in map.values(): 290 if len(targets) > 1: return False 291 return True
292
293 - def nextState(self, state, symbol):
294 """ 295 The single state reached from a state via a given symbol. 296 If there is more than one such state, raises a ValueError. 297 If there is no such state, returns None. 298 """ 299 next = self.next(state, symbol) 300 if len(next) > 1: 301 raise ValueError, "This FSA is nondeterministic -- use nextStates instead." 302 elif len(next) == 1: return list(next)[0] 303 else: return None
304
305 - def forward_traverse(self, state):
306 "All states reachable by following transitions from a given state." 307 result = set() 308 for (symbol, targets) in self._transitions[state].items(): 309 result = result.union(targets) 310 return result
311
312 - def reverse_traverse(self, state):
313 """All states from which a given state is reachable by following 314 transitions.""" 315 result = set() 316 for (symbol, targets) in self._reverse[state].items(): 317 result = result.union(targets) 318 return result
319
320 - def _forward_accessible(self, s1, visited):
321 for s2 in self.forward_traverse(s1): 322 if not s2 in visited: 323 visited.add(s2) 324 self._forward_accessible(s2, visited) 325 return visited
326
327 - def _reverse_accessible(self, s1, visited):
328 for s2 in self.reverse_traverse(s1): 329 if not s2 in visited: 330 visited.add(s2) 331 self._reverse_accessible(s2, visited) 332 return visited
333 334 # delete inaccessible nodes and unused transitions
335 - def prune(self):
336 """ 337 Modifies an FSA to remove inaccessible states and unused transitions. 338 """ 339 acc = self.accessible() 340 for state in self.states(): 341 if state not in acc: 342 self.delete_state(state) 343 else: 344 self._clean_map(self._transitions[state]) 345 self._clean_map(self._reverse[state])
346
347 - def _clean_map(self, map):
348 for (key, value) in map.items(): 349 if len(value) == 0: 350 del map[key]
351 352 # mark accessible nodes
353 - def accessible(self):
354 acc = set() 355 for final in self.finals(): 356 reverse_acc = set([final]) 357 self._reverse_accessible(final, reverse_acc) 358 acc = acc.union(reverse_acc) 359 360 forward_acc = set([self.start()]) 361 self._forward_accessible(self.start(), forward_acc) 362 363 acc = acc.intersection(forward_acc) 364 return acc
365
366 - def e_closure(self, states):
367 """ 368 Given a set of states, return the set of states reachable from 369 those states by following epsilon transitions. 370 371 @param states: the initial set of states 372 @type states: sequence 373 @returns: a superset of the given states, reachable by epsilon 374 transitions 375 @rtype: set 376 """ 377 stack = list(states) 378 closure = list(states) 379 while stack: 380 s1 = stack.pop() 381 for s2 in self.next(s1, epsilon): 382 if s2 not in closure: 383 closure.append(s2) 384 stack.append(s2) 385 return set(closure)
386 387 # return the corresponding DFA using subset construction (ASU p118) 388 # NB representation of (a*) still isn't minimal; should have 1 state not 2
389 - def dfa(self):
390 "Return a DFA that is equivalent to this FSA." 391 dfa = FSA(self.sigma()) 392 dfa_initial = dfa.start() 393 nfa_initial = tuple(self.e_closure((self.start(),))) 394 map = {} 395 map[dfa_initial] = nfa_initial 396 map[nfa_initial] = dfa_initial 397 if nfa_initial in self.finals(): 398 dfa.add_final(dfa_initial) 399 unmarked = [dfa_initial] 400 marked = [] 401 while unmarked: 402 dfa_state = unmarked.pop() 403 marked.append(dfa_state) 404 # is a final state accessible via epsilon transitions? 405 if self.in_finals(self.e_closure(map[dfa_state])): 406 dfa.add_final(dfa_state) 407 for label in self.sigma(): 408 nfa_next = tuple(self.e_closure(self.move(map[dfa_state], 409 label))) 410 if map.has_key(nfa_next): 411 dfa_next = map[nfa_next] 412 else: 413 dfa_next = dfa.new_state() 414 map[dfa_next] = nfa_next 415 map[nfa_next] = dfa_next 416 if self.in_finals(nfa_next): 417 dfa.add_final(dfa_next) 418 unmarked.append(dfa_next) 419 dfa.insert(dfa_state, label, dfa_next) 420 return dfa
421
422 - def generate(self, maxlen, state=0, prefix=""):
423 "Generate all accepting sequences of length at most maxlen." 424 if maxlen > 0: 425 if state in self._finals: 426 print prefix 427 for (s1, labels, s2) in self.outgoing_transitions(state): 428 for label in labels(): 429 self.generate(maxlen-1, s2, prefix+label)
430
431 - def pp(self):
432 """ 433 Print a representation of this FSA (in human-readable YAML format). 434 """ 435 print yaml.dump(self)
436 437 @classmethod
438 - def from_yaml(cls, loader, node):
439 map = loader.construct_mapping(node) 440 result = cls(map.get('sigma', []), {}, map.get('finals', [])) 441 for (s1, map1) in map['transitions'].items(): 442 for (symbol, targets) in map1.items(): 443 for s2 in targets: 444 result.insert(s1, symbol, s2) 445 return result
446 447 @classmethod
448 - def to_yaml(cls, dumper, data):
449 sigma = data.sigma() 450 transitions = {} 451 for (s1, symbol, s2) in data.generate_transitions(): 452 map1 = transitions.setdefault(s1, {}) 453 map2 = map1.setdefault(symbol, []) 454 map2.append(s2) 455 try: sigma = "".join(sigma) 456 except: sigma = list(sigma) 457 node = dumper.represent_mapping(cls.yaml_tag, dict( 458 sigma = sigma, 459 finals = list(data.finals()), 460 start = data._start, 461 transitions = transitions)) 462 return node
463
464 - def show_pygraph(self, title='FSA', outfile=None, labels=True, root=None):
465 from pygraph import pygraph, tkgraphview 466 graph = pygraph.Grapher('directed') 467 468 for state in self.states(): 469 color = '#eee' 470 if state in self.finals(): 471 shape = 'oval' 472 else: 473 shape = 'rect' 474 if state == self.start(): 475 color = '#afa' 476 term = '' 477 if state == self.start(): term = 'start' 478 elif state == 'End': term = 'end' 479 if state in [0, '0', 'reject', 'Reject']: color='#e99' 480 481 graph.addNode(state, state, color, shape, term) 482 483 #for source, trans in self._transitions.items(): 484 for source, label, target in self.generate_transitions(): 485 if not labels: label = '' 486 graph.addEdge(source, target, label, color='black', dup=False) 487 488 if outfile is None: outfile = title 489 490 return tkgraphview.tkGraphView(graph, title, outfile, root=root, 491 startTk=(not root))
492
493 - def __str__(self):
494 return yaml.dump(self)
495 496 ### FUNCTIONS TO BUILD FSA FROM REGEXP 497 498 # the grammar of regular expressions 499 # (probabilities ensure that unary operators 500 # have stronger associativity than juxtaposition) 501
502 -def grammar(terminals):
503 (S, Expr, Star, Plus, Qmk, Paren) = [cfg.Nonterminal(s) for s in 'SE*+?('] 504 rules = [pcfg.WeightedProduction(Expr, [Star], prob=0.2), 505 pcfg.WeightedProduction(Expr, [Plus], prob=0.2), 506 pcfg.WeightedProduction(Expr, [Qmk], prob=0.2), 507 pcfg.WeightedProduction(Expr, [Paren], prob=0.2), 508 pcfg.WeightedProduction(S, [Expr], prob=0.5), 509 pcfg.WeightedProduction(S, [S, Expr], prob=0.5), 510 pcfg.WeightedProduction(Star, [Expr, '*'], prob=1), 511 pcfg.WeightedProduction(Plus, [Expr, '+'], prob=1), 512 pcfg.WeightedProduction(Qmk, [Expr, '?'], prob=1), 513 pcfg.WeightedProduction(Paren, ['(', S, ')'], prob=1)] 514 515 prob_term = 0.2/len(terminals) # divide remaining pr. mass 516 for terminal in terminals: 517 rules.append(pcfg.WeightedProduction(Expr, [terminal], prob=prob_term)) 518 519 return pcfg.WeightedGrammar(S, rules)
520 521 _parser = pchart.InsideParse(grammar('abcde')) 522 523 # create NFA from regexp (Thompson's construction) 524 # assumes unique start and final states 525
526 -def re2nfa(fsa, re):
527 tokens = tokenize.regexp(re, pattern=r'.') 528 tree = _parser.parse(tokens) 529 if tree is None: raise ValueError('Bad Regexp') 530 state = re2nfa_build(fsa, fsa.start(), tree) 531 fsa.set_final([state])
532 # fsa.minimize() 533
534 -def re2nfa_build(fsa, node, tree):
535 # Terminals. 536 if not isinstance(tree, Tree): 537 return re2nfa_char(fsa, node, tree) 538 elif len(tree) == 1: 539 return re2nfa_build(fsa, node, tree[0]) 540 elif tree.node == '(': 541 return re2nfa_build(fsa, node, tree[1]) 542 elif tree.node == '*': return re2nfa_star(fsa, node, tree[0]) 543 elif tree.node == '+': return re2nfa_plus(fsa, node, tree[0]) 544 elif tree.node == '?': return re2nfa_qmk(fsa, node, tree[0]) 545 else: 546 node = re2nfa_build(fsa, node, tree[0]) 547 return re2nfa_build(fsa, node, tree[1])
548
549 -def re2nfa_char(fsa, node, char):
550 new = fsa.new_state() 551 fsa.insert(node, char, new) 552 return new
553
554 -def re2nfa_qmk(fsa, node, tree):
555 node1 = fsa.new_state() 556 node2 = re2nfa_build(fsa, node1, tree) 557 node3 = fsa.new_state() 558 fsa.insert(node, epsilon, node1) 559 fsa.insert(node, epsilon, node3) 560 fsa.insert(node2, epsilon, node3) 561 return node3
562
563 -def re2nfa_plus(fsa, node, tree):
564 node1 = re2nfa_build(fsa, node, tree[0]) 565 fsa.insert(node1, epsilon, node) 566 return node1
567
568 -def re2nfa_star(fsa, node, tree):
569 node1 = fsa.new_state() 570 node2 = re2nfa_build(fsa, node1, tree) 571 node3 = fsa.new_state() 572 fsa.insert(node, epsilon, node1) 573 fsa.insert(node, epsilon, node3) 574 fsa.insert(node2, epsilon, node1) 575 fsa.insert(node2, epsilon, node3) 576 return node3
577 578 ################################################################# 579 # Demonstration 580 ################################################################# 581
582 -def demo():
583 """ 584 A demonstration showing how FSAs can be created and used. 585 """ 586 # Define an alphabet. 587 alphabet = "abcd" 588 589 # Create a new FSA. 590 fsa = FSA(alphabet) 591 592 # Use a regular expression to initialize the FSA. 593 re = 'abcd' 594 print 'Regular Expression:', re 595 re2nfa(fsa, re) 596 print "NFA:" 597 fsa.pp() 598 599 # Convert the (nondeterministic) FSA to a deterministic FSA. 600 dfa = fsa.dfa() 601 print "DFA:" 602 dfa.pp() 603 604 # Prune the DFA 605 dfa.prune() 606 print "PRUNED DFA:" 607 dfa.pp()
608 609 # Use the FSA to generate all strings of length less than 3 610 # (broken) 611 #fsa.generate(3) 612 613 if __name__ == '__main__': demo() 614