Package nltk_lite :: Package contrib :: Package mit :: Package six863 :: Package kimmo :: Module rules
[hide private]
[frames] | no frames]

Source Code for Module nltk_lite.contrib.mit.six863.kimmo.rules

  1  from nltk_lite.parse import Tree 
  2  from fsa import FSA 
  3  from nltk_lite import tokenize 
  4  from pairs import KimmoPair, sort_subsets 
  5  from copy import deepcopy 
  6  import re, yaml 
  7   
  8  _kimmo_terminal_regexp    = '[a-zA-Z0-9\+\'\-\#\@\$\%\!\^\`\}\{]+' # \}\{\<\>\,\.\~ # (^|\s)?\*(\s|$) !!! * is already covered in the re tokenizer 
  9  _kimmo_terminal_regexp_fsa    = '[^:\s]+' # for FSA, only invalid chars are whitespace and : 
 10                                            # '[a-zA-Z0-9\+\'\-\#\@\$\%\!\^\`\}\{\<\>\,\.\~\*]+' 
 11  _kimmo_terminal_regexp_ext= '~?' + _kimmo_terminal_regexp 
 12   
 13  _kimmo_defaults           = _kimmo_terminal_regexp + '|\:' 
 14  _kimmo_defaults_fsa       = _kimmo_terminal_regexp_fsa + '|\:' 
 15  _kimmo_rule               = _kimmo_terminal_regexp_ext + '|[\:\(\)\[\]\?\&\*\_]|<=>|==>|<==|/<=' 
 16  _arrows = ['==>', '<=>', '<==', '/<='] 
 17   
 18   
 19  _special_tokens = ['(', ')', '[', ']', '*', '&', '_', ':'] 
 20  _special_tokens.extend(_arrows) 
 21  _non_list_initial_special_tokens = [')', ']', '*', '&', '_', ':'] 
 22  _non_list_initial_special_tokens.extend(_arrows) 
 23  epsilon = None 
 24   
25 -class KimmoFSARule(object):
26 """ 27 A rule for two-level morphology, expressed as a deterministic finite 28 automaton. 29 """
30 - def __init__(self, name, fsa, subsets):
31 self._name = name 32 self._fsa = fsa 33 self._pairs = set() 34 self._subsets = subsets 35 for (start, pair, finish) in self._fsa.generate_transitions(): 36 self._pairs.add(pair)
37
38 - def fsa(self): return self._fsa
39 - def pairs(self): return self._pairs
40 - def name(self): return self._name
41
42 - def show_pygraph(self, root=None):
43 return self.fsa().show_pygraph(self.name(), root=root)
44
45 - def complete_fsa(self, fsa, fail_state=None):
46 fsa = deepcopy(fsa) 47 if fail_state is None: 48 fail_state = fsa.add_state('Fail') 49 fsa.insert('Fail', KimmoPair.make('@'), 'Fail') 50 sorted_pairs = sort_subsets(self._pairs, self._subsets) 51 for state in fsa.states(): 52 trans = fsa._transitions[state] 53 for pair in self._pairs: 54 if pair not in trans: 55 for sp in sorted_pairs: 56 if sp in trans and sp.includes(pair, self._subsets): 57 trans[pair] = trans[sp] 58 break 59 trans[pair] = [fail_state] 60 if trans[pair] == []: trans[pair] = [fail_state] 61 fsa._build_reverse_transitions() 62 return fsa
63 64 @staticmethod
65 - def parse_table(name, table, subsets):
66 lines = table.split('\n') 67 if len(lines) < 4: 68 raise ValueError,\ 69 "Rule %s has too few lines to be an FSA table." % name 70 pairs1 = lines[1].strip().split() 71 pairs2 = lines[2].strip().split() 72 if len(pairs1) != len(pairs2): 73 raise ValueError,\ 74 "Rule %s has pair definitions that don't line up." % name 75 pairs = [KimmoPair(p1, p2) for p1, p2 in zip(pairs1, pairs2)] 76 finals = [] 77 fsa = FSA() 78 for line in lines[3:]: 79 line = line.strip() 80 if not line: continue 81 groups = re.match(r'(\w+)(\.|:)\s*(.*)', line) 82 if groups is None: 83 raise ValueError,\ 84 "Can't parse this line of the state table for rule %s:\n%s"\ 85 % (name, line) 86 state, char, morestates = groups.groups() 87 if fsa.start() == 0: fsa.set_start(state) 88 if char == ':': finals.append(state) 89 fsa.add_state(state) 90 morestates = morestates.split() 91 if len(morestates) != len(pairs): 92 raise ValueError,\ 93 "Rule %s has a row of the wrong length:\n%s\ngot %d items, should be %d"\ 94 % (name, line, len(morestates), len(pairs)) 95 for pair, nextstate in zip(pairs, morestates): 96 fsa.insert_safe(state, pair, nextstate) 97 fsa.set_final(finals) 98 return KimmoFSARule(name, fsa, subsets)
99 100 @staticmethod
101 - def from_dfa_dict(name, states, subsets):
102 fsa = FSA() 103 pairs = set([KimmoPair.make('@')]) 104 for (statename, trans) in states.items(): 105 for label in trans: 106 if label != 'others': 107 pairs.add(KimmoPair.make(label)) 108 for (statename, trans) in states.items(): 109 parts = statename.split() 110 source = parts[-1] 111 if not parts[0].startswith('rej'): 112 fsa.add_final(source) 113 114 if fsa.start() == 0 and source in ['begin', 'Begin', '1', 1]: 115 fsa.set_start(source) 116 if source in ['start', 'Start']: 117 fsa.set_start(source) 118 119 used_pairs = set() 120 for label in trans: 121 if label != 'others': 122 used_pairs.add(KimmoPair.make(label)) 123 for label, target in trans.items(): 124 if label.lower() == 'others': 125 fsa.insert_safe(source, KimmoPair.make('@'), target) 126 for pair in pairs.difference(used_pairs): 127 fsa.insert_safe(source, pair, target) 128 else: 129 fsa.insert_safe(source, KimmoPair.make(label), target) 130 return KimmoFSARule(name, fsa, subsets)
131
132 -class KimmoArrowRule(KimmoFSARule):
133 - def arrow(self): return self._arrow
134 - def lhpair(self): return self._lhpair
135
136 - def __init__(self, name, description, subsets):
137 self._name = name 138 self._description = description 139 self._negated = False 140 self._pairs = set() 141 self._subsets = subsets 142 desc = list(tokenize.regexp(description, _kimmo_rule)) 143 self._parse(desc)
144
145 - def __repr__(self):
146 return '<KimmoArrowRule %s: %s>' % (self._name, self._description)
147
148 - def _parse(self, tokens):
149 (end_pair, tree) = self._parse_pair(tokens, 0) 150 lhpair = self._pair_from_tree(tree) 151 self._lhpair = lhpair 152 self._pairs.add(lhpair) 153 154 end_arrow = self._parse_arrow(tokens, end_pair) 155 (end_left, lfsa) = self._parse_context(tokens, end_arrow, True) 156 end_slot = self._parse_slot(tokens, end_left) 157 (end_right, rfsa) = self._parse_context(tokens, end_slot, False) 158 if not(end_right == len(tokens)): 159 raise ValueError('unidentified tokens') 160 161 self._left_fsa = lfsa 162 self._right_fsa = rfsa 163 self._merge_fsas()
164
165 - def _next_token(self, tokens, i, raise_error=False):
166 if i >= len(tokens): 167 if raise_error: 168 raise ValueError('ran off end of input') 169 else: 170 return None 171 return tokens[i]
172
173 - def _pair_from_tree(self, tree):
174 if (tree.node != 'Pair'): raise RuntimeException('expected Pair, got ' + str(tree)) 175 if len(tree) == 1: 176 return KimmoPair(tree[0], tree[0]) 177 else: 178 return KimmoPair(tree[0], tree[2])
179
180 - def _parse_pair(self, tokens, i):
181 # print 'parsing pair at ' + str(i) 182 t1 = self._next_token(tokens, i, True) 183 if t1 in _special_tokens: raise ValueError('expected identifier, not ' + t1) 184 t2 = t1 185 j = i + 1 186 if self._next_token(tokens, j) == ':': 187 t2 = self._next_token(tokens, j+1, True) 188 if t2 in _special_tokens: raise ValueError('expected identifier, not ' + t2) 189 j = j + 2 190 tree = Tree('Pair', tokens[i:j]) 191 else: 192 tree = Tree('Pair', [tokens[i]]) 193 #print str(self._pair_from_tree(tree)) + ' from ' + str(i) + ' to ' + str(j) 194 return (j, tree)
195 196
197 - def _parse_arrow(self, tokens, i):
198 self._arrow = self._next_token(tokens, i, True) 199 if not(self.arrow() in _arrows): 200 raise ValueError('expected arrow, not ' + self.arrow()) 201 #print 'arrow from ' + str(i) + ' to ' + str(i+1) 202 return i + 1
203 204
205 - def _parse_slot(self, tokens, i):
206 slot = self._next_token(tokens, i, True) 207 if slot != '_': 208 raise ValueError('expected _, not ' + slot) 209 # print 'slot from ' + str(i) + ' to ' + str(i+1) 210 return i + 1
211 212
213 - def _parse_context(self, tokens, i, reverse):
214 (j, tree) = self._parse_list(tokens, i) 215 if j == i: return (i, None) 216 217 sigma = set() 218 self._collect_alphabet(tree, sigma) 219 fsa = FSA(sigma) 220 final_state = self._build_fsa(fsa, fsa.start(), tree, reverse) 221 fsa.set_final([final_state]) 222 #fsa.pp() 223 dfa = fsa.dfa() 224 #dfa.pp() 225 dfa.prune() 226 #dfa.pp() 227 return (j, dfa)
228 229
230 - def _collect_alphabet(self, tree, sigma):
231 if tree.node == 'Pair': 232 pair = self._pair_from_tree(tree) 233 sigma.add(pair) 234 self._pairs.add(pair) 235 else: 236 for d in tree: self._collect_alphabet(d, sigma)
237 238
239 - def _parse_list(self, tokens, i, type='Cons'):
240 # print 'parsing list at ' + str(i) 241 t = self._next_token(tokens, i) 242 if t == None or t in _non_list_initial_special_tokens: 243 # print ' failing immediately ' 244 return (i, None) 245 (j, s) = self._parse_singleton(tokens, i) 246 (k, r) = self._parse_list(tokens, j, type) 247 # print (k,r) 248 if r == None: 249 # print ' returning (%d, %s)' % (j, s) 250 return (j, s) 251 tree = Tree(type, [s, r]) 252 # print ' returning (%d, %s)' % (k, tree) 253 return (k, tree)
254 255
256 - def _parse_singleton(self, tokens, i):
257 # print 'parsing singleton at ' + str(i) 258 t = self._next_token(tokens, i, True) 259 j = i 260 result = None 261 if t == '(': 262 (j, result) = self._parse_list(tokens, i + 1, 'Cons') 263 if result == None: raise ValueError('missing contents of (...)') 264 t = self._next_token(tokens, j, True) 265 if t != ')': raise ValueError('missing final parenthesis, instead found ' + t) 266 j = j + 1 267 elif t == '[': 268 (j, result) = self._parse_list(tokens, i + 1, 'Or') 269 if result == None: raise ValueError('missing contents of [...]') 270 t = self._next_token(tokens, j, True) 271 if t != ']': raise ValueError('missing final bracket, instead found ' + t) 272 j = j + 1 273 elif t in _special_tokens: 274 raise ValueError('expected identifier, found ' + t) 275 else: 276 (j, tree) = self._parse_pair(tokens, i) 277 result = tree 278 t = self._next_token(tokens, j) 279 if t in ['*', '&', '?']: 280 j = j + 1 281 result = Tree(t, [result]) 282 return (j, result)
283 284
285 - def _build_fsa(self, fsa, entry_node, tree, reverse):
286 if tree.node == 'Pair': 287 return self._build_terminal(fsa, entry_node, self._pair_from_tree(tree)) 288 elif tree.node == 'Cons': 289 return self._build_seq(fsa, entry_node, tree[0], tree[1], reverse) 290 elif tree.node == 'Or': 291 return self._build_or(fsa, entry_node, tree[0], tree[1], reverse) 292 elif tree.node == '*': 293 return self._build_star(fsa, entry_node, tree[0], reverse) 294 elif tree.node == '&': 295 return self._build_plus(fsa, entry_node, tree[0], reverse) 296 elif tree.node == '?': 297 return self._build_qmk(fsa, entry_node, tree[0], reverse) 298 else: 299 raise RuntimeError('unknown tree node'+tree.node)
300 301
302 - def _build_terminal(self, fsa, entry_node, terminal):
303 new_exit_node = fsa.new_state() 304 fsa.insert(entry_node, terminal, new_exit_node) 305 #print '_build_terminal(%d,%s) -> %d' % (entry_node, terminal, new_exit_node) 306 return new_exit_node
307 308
309 - def _build_plus(self, fsa, node, tree, reverse):
310 node1 = self._build_fsa(fsa, node, tree[0], reverse) 311 fsa.insert(node1, epsilon, node) 312 return node1
313 314
315 - def _build_qmk(self, fsa, node, tree, reverse):
316 node1 = fsa.new_state() 317 node2 = self._build_fsa(fsa, node1, tree, reverse) 318 node3 = fsa.new_state() 319 fsa.insert(node, epsilon, node1) 320 fsa.insert(node, epsilon, node3) 321 fsa.insert(node2, epsilon, node3) 322 return node3
323 324
325 - def _build_star(self, fsa, node, tree, reverse):
326 node1 = fsa.new_state() 327 node2 = self._build_fsa(fsa, node1, tree, reverse) 328 node3 = fsa.new_state() 329 fsa.insert(node, epsilon, node1) 330 fsa.insert(node, epsilon, node3) 331 fsa.insert(node2, epsilon, node1) 332 fsa.insert(node2, epsilon, node3) 333 return node3
334 335
336 - def _build_seq(self, fsa, node, tree0, tree1, reverse):
337 (d0, d1) = (tree0, tree1) 338 if reverse: (d0, d1) = (d1, d0) 339 node1 = self._build_fsa(fsa, node, d0, reverse) 340 node2 = self._build_fsa(fsa, node1, d1, reverse) 341 # print '_build_seq(%d,%s,%s) -> %d,%d' % (node, tree0, tree1, node1, node2) 342 return node2
343
344 - def _build_or(self, fsa, node, tree0, tree1, reverse):
345 node0 = fsa.new_state() 346 node1 = fsa.new_state() 347 node2 = self._build_fsa(fsa, node0, tree0, reverse) 348 node3 = self._build_fsa(fsa, node1, tree1, reverse) 349 node4 = fsa.new_state() 350 fsa.insert(node, epsilon, node0) 351 fsa.insert(node, epsilon, node1) 352 fsa.insert(node2, epsilon, node4) 353 fsa.insert(node3, epsilon, node4) 354 return node4
355
356 - def left_arrow(self):
357 working = deepcopy(self._left_fsa) 358 right = self._right_fsa.dfa() 359 lstates = left.states() 360 for state in lstates: 361 working.relabel_state(state, 'L%s' % state) 362 working.add_state('Trash') 363 for state in working.finals(): 364 workng.insert_safe(source, KimmoPair)
365
366 -def demo():
367 rule = KimmoArrowRule("elision-e", "e:0 <== CN u _ +:@ VO", {'@': 368 'aeiouhklmnpw', 'VO': 'aeiou', 'CN': 'hklmnpw'}) 369 print rule 370 print rule._left_fsa 371 print rule._right_fsa 372 print 373 print rule._fsa
374 375 if __name__ == '__main__': 376 demo() 377 378 # vim:et:ts=4:sts=4:sw=4: 379