1
2
3
4
5
6
7
8
9
10
11
12
13 from nltk_lite.semantics import logic
14 from cfg import *
15 from kimmo import kimmo
16
17 from featurelite import *
18 from copy import deepcopy
19 import yaml
20
21
23 """
24 Given a variable representation such as C{?x}, construct a corresponding
25 Variable object.
26 """
27 return Variable(varname[1:])
28
30 """
31 A C{Category} is a wrapper for feature dictionaries, intended for use in
32 parsing. It can act as a C{Nonterminal}.
33
34 A C{Category} acts like a dictionary, except in the following ways:
35
36 - Categories can be "frozen" so that they can act as hash keys;
37 before they are frozen, they are mutable.
38
39 - In addition to being hashable, Categories have consistent str()
40 representations.
41
42 - Categories have one feature marked as the 'head', which prints
43 differently than other features if it has a value. For example,
44 in the C{repr()} representation of a Category, the head goes to the
45 left, on the outside of the brackets. Subclasses of C{Category}
46 may change the feature name that is designated as the head, which is
47 _head by default.
48
49 Categories can contain any kind of object as their values, and can be
50 recursive and even re-entrant. Categories are not necessarily "categories
51 all the way down"; they can contain plain dictionaries as their values, and
52 converting inner dictionaries to categories would probably lead to messier
53 code for no gain.
54
55 Because Categories can contain any kind of object, they do not try to
56 keep control over what their inner objects do. If you freeze a Category
57 but mutate its inner objects, undefined behavior will occur.
58 """
59
60 headname = 'head'
61
62 - def __init__(self, features=None, **morefeatures):
63 if features is None: features = {}
64 self._features = unify(features, morefeatures)
65 self._hash = None
66 self._frozen = False
67 self._memostr = None
68
71
73 """
74 @return: A new Category based on this one, with its C{/} feature set to
75 C{other}.
76 """
77 return unify(self, {'/': other})
78
80 """
81 Compare Categories for equality. This relies on Python's built-in
82 __eq__ for dictionaries, which is fairly thorough in checking for
83 recursion and reentrance.
84
85 @return: True if C{self} and C{other} assign the same value to
86 every feature. In particular, return true if
87 C{self[M{p}]==other[M{p}]} for every feature path M{p} such
88 that C{self[M{p}]} or C{other[M{p}]} is a base value (i.e.,
89 not a nested Category).
90 @rtype: C{bool}
91 """
92 if not other.__class__ == self.__class__: return False
93 return self._features == other._features
94
96 return not (self == other)
97
99 if self._hash is not None: return self._hash
100 return hash(str(self))
101
103 """
104 Freezing a Category memoizes its hash value, to make comparisons on it
105 faster. After freezing, the Category and all its values are immutable.
106
107 @return: self
108 """
109 self._memostr = str(self)
110 self._hash = hash(self)
111 self._frozen = True
112 return self
113
115 """
116 Returns whether this Category is frozen (immutable).
117
118 @rtype: C{bool}
119 """
120 return self._frozen
121
122 - def get(self, key):
123 return self._features.get(key)
124
126 return self._features.get(key)
127
129 if self._frozen: raise "Cannot modify a frozen Category"
130 self._features[key] = value
131
133 return self._features.items()
134
136 return self._features.keys()
137
139 return self._features.values()
140
143
145 """
146 @return: The node value corresponding to this C{Category}.
147 @rtype: C{Category}
148 """
149 return self
150
152 """
153 @return: The head of this category (the value shown outside the
154 brackets in its string representation). If there is no head, returns
155 None.
156 @rtype: C{str} or C{None}
157 """
158 return self.get(self.__class__.headname)
159
161 """
162 @return: A deep copy of C{self}.
163 """
164
165 return deepcopy(self)
166
168 """
169 @return: a list of all features that have values.
170 """
171 return self._features.keys()
172
173 has_feature = has_key
174
175
176
177
178
183
184 @staticmethod
191
192
193
194
195
197 """
198 @return: A string representation of this feature structure.
199 """
200 return str(self)
201
203 """
204 @return: A string representation of this feature structure.
205 """
206 if self._memostr is not None: return self._memostr
207 return self.__class__._str(self, {}, {})
208
209 @classmethod
210 - def _str(cls, obj, reentrances, reentrance_ids):
211 segments = []
212
213 keys = obj.keys()
214 keys.sort()
215 for fname in keys:
216 if fname == cls.headname: continue
217 fval = obj[fname]
218 if isinstance(fval, bool):
219 if fval: segments.append('+%s' % fname)
220 else: segments.append('-%s' % fname)
221 elif not isinstance(fval, dict):
222 segments.append('%s=%r' % (fname, fval))
223 else:
224 fval_repr = cls._str(fval, reentrances, reentrance_ids)
225 segments.append('%s=%s' % (fname, fval_repr))
226
227 head = obj.get(cls.headname)
228 if head is None: head = ''
229 if head and not len(segments): return str(head)
230 return '%s[%s]' % (head, ', '.join(segments))
231
232 yaml_tag = '!parse.Category'
233
234 @classmethod
238
239 @classmethod
241 features = loader.construct_mapping(node, deep=True)
242 return cls(features)
243
244
245
246
247
248
249 _PARSE_RE = {'name': re.compile(r'\s*([^\s\(\)"\'=,\[\]/\?]+)\s*'),
250 'ident': re.compile(r'\s*\((\d+)\)\s*'),
251 'reentrance': re.compile(r'\s*->\s*'),
252 'assign': re.compile(r'\s*=?\s*'),
253 'bracket': re.compile(r'\s*]\s*'),
254 'comma': re.compile(r'\s*,\s*'),
255 'none': re.compile(r'None(?=\s|\]|,)'),
256 'int': re.compile(r'-?\d+(?=\s|\]|,)'),
257 'var': re.compile(r'\?[a-zA-Z_][a-zA-Z0-9_]*'+'|'+
258 r'\?<[a-zA-Z_][a-zA-Z0-9_]*'+
259 r'(=[a-zA-Z_][a-zA-Z0-9_]*)*>'),
260 'symbol': re.compile(r'\w+'),
261 'stringmarker': re.compile("['\"\\\\]"),
262
263 'categorystart':re.compile(r'\s*([^\s\(\)"\'\-=,\[\]/\?]*)\s*\['),
264 'bool': re.compile(r'\s*([-\+])'),
265 'arrow': re.compile(r'\s*->\s*'),
266 'disjunct': re.compile(r'\s*\|\s*'),
267 'whitespace': re.compile(r'\s*'),
268 'semantics': re.compile(r'<([^>]+)>'),
269 'application': re.compile(r'<(app)\((\?[a-z][a-z]*)\s*,\s*(\?[a-z][a-z]*)\)>'),
270 'slash': re.compile(r'\s*/\s*'),
271 }
272
273 @classmethod
275 parsed, position = cls._parse(s, 0)
276 if position != len(s):
277 raise ValueError('end of string', position)
278 return cls(parsed)
279
280 @classmethod
285
286 @classmethod
287 - def _parse(cls, s, position=0, reentrances=None):
288 """
289 Helper function that parses a Category.
290 @param s: The string to parse.
291 @param position: The position in the string to start parsing.
292 @param reentrances: A dictionary from reentrance ids to values.
293 @return: A tuple (val, pos) of the feature structure created
294 by parsing and the position where the parsed feature
295 structure ends.
296 """
297
298 _PARSE_RE = cls._PARSE_RE
299
300 features = {}
301
302
303 match = _PARSE_RE['name'].match(s, position)
304 if match is not None:
305 features[cls.headname] = match.group(1)
306 position = match.end()
307 else:
308 match = _PARSE_RE['var'].match(s, position)
309 if match is not None:
310 features[cls.headname] = makevar(match.group(0))
311 position = match.end()
312
313
314
315
316 if position < len(s) and s[position] == '[':
317 position += 1
318
319
320
321
322
323
324 while True:
325 if not position < len(s):
326 raise ValueError('close bracket', position)
327
328
329 name = target = val = None
330
331
332 match = _PARSE_RE['bracket'].match(s, position)
333 if match is not None:
334 position = match.end()
335 break
336
337
338 match = _PARSE_RE['bool'].match(s, position)
339 if match is not None:
340 if match.group(1) == '+': val = True
341 else: val = False
342 position = match.end()
343
344
345 match = _PARSE_RE['name'].match(s, position)
346 if match is None: raise ValueError('feature name', position)
347 name = match.group(1)
348 position = match.end()
349
350
351 if val is None:
352 match = _PARSE_RE['assign'].match(s, position)
353 if match is None: raise ValueError('equals sign', position)
354 position = match.end()
355
356 val, position = cls._parseval(s, position, reentrances)
357 features[name] = val
358
359
360 match = _PARSE_RE['bracket'].match(s, position)
361 if match is not None:
362 position = match.end()
363 break
364
365
366 match = _PARSE_RE['comma'].match(s, position)
367 if match is None: raise ValueError('comma', position)
368 position = match.end()
369
370 return features, position
371
372 @classmethod
373 - def _parseval(cls, s, position, reentrances):
374 """
375 Helper function that parses a feature value. Currently
376 supports: None, bools, integers, variables, strings, nested feature
377 structures.
378 @param s: The string to parse.
379 @param position: The position in the string to start parsing.
380 @param reentrances: A dictionary from reentrance ids to values.
381 @return: A tuple (val, pos) of the value created by parsing
382 and the position where the parsed value ends.
383 """
384
385
386 _PARSE_RE = cls._PARSE_RE
387
388
389 if position == len(s): raise ValueError('value', position)
390
391
392 match = _PARSE_RE['application'].match(s, position)
393 if match is not None:
394 fun = ParserSubstitute(match.group(2)).next()
395 arg = ParserSubstitute(match.group(3)).next()
396 return ApplicationExpressionSubst(fun, arg), match.end()
397
398
399 match = _PARSE_RE['semantics'].match(s, position)
400 if match is not None:
401 return ParserSubstitute(match.group(1)).next(), match.end()
402
403
404 if s[position] in "'\"":
405 start = position
406 quotemark = s[position:position+1]
407 position += 1
408 while 1:
409 match = cls._PARSE_RE['stringmarker'].search(s, position)
410 if not match: raise ValueError('close quote', position)
411 position = match.end()
412 if match.group() == '\\': position += 1
413 elif match.group() == quotemark:
414 return s[start+1:position-1], position
415
416
417 if _PARSE_RE['categorystart'].match(s, position) is not None:
418 return cls._parse(s, position, reentrances)
419
420
421 match = _PARSE_RE['var'].match(s, position)
422 if match is not None:
423 return makevar(match.group()), match.end()
424
425
426 match = _PARSE_RE['none'].match(s, position)
427 if match is not None:
428 return None, match.end()
429
430
431 match = _PARSE_RE['int'].match(s, position)
432 if match is not None:
433 return int(match.group()), match.end()
434
435
436 match = _PARSE_RE['symbol'].match(s, position)
437 if match is not None:
438 return match.group(), match.end()
439
440
441 raise ValueError('value', position)
442
443 @classmethod
445 """
446 Parse a L{CFG} line involving C{Categories}. A line has this form:
447
448 C{lhs -> rhs | rhs | ...}
449
450 where C{lhs} is a Category, and each C{rhs} is a sequence of
451 Categories.
452
453 @returns: a list of C{Productions}, one for each C{rhs}.
454 """
455 _PARSE_RE = cls._PARSE_RE
456 position = 0
457 try:
458 lhs, position = cls.inner_parse(s, position)
459 lhs = cls(lhs)
460 except ValueError, e:
461 estr = ('Error parsing field structure\n\n\t' +
462 s + '\n\t' + ' '*e.args[1] + '^ ' +
463 'Expected %s\n' % e.args[0])
464 raise ValueError, estr
465 lhs.freeze()
466
467 match = _PARSE_RE['arrow'].match(s, position)
468 if match is None:
469 raise ValueError('expected arrow', s, s[position:])
470 else: position = match.end()
471 rules = []
472 while position < len(s):
473 rhs = []
474 while position < len(s) and _PARSE_RE['disjunct'].match(s, position) is None:
475 try:
476 val, position = cls.inner_parse(s, position, {})
477 if isinstance(val, dict): val = cls(val)
478 except ValueError, e:
479 estr = ('Error parsing field structure\n\n\t' +
480 s + '\n\t' + ' '*e.args[1] + '^ ' +
481 'Expected %s\n' % e.args[0])
482 raise ValueError, estr
483 if isinstance(val, Category): val.freeze()
484 rhs.append(val)
485 position = _PARSE_RE['whitespace'].match(s, position).end()
486 rules.append(Production(lhs, rhs))
487
488 if position < len(s):
489 match = _PARSE_RE['disjunct'].match(s, position)
490 position = match.end()
491
492
493
494 if len(rules) == 0: rules = [Production(lhs, ())]
495 return rules
496
498 """
499 A class of C{Category} for use in parsing.
500
501 The name of the head feature in a C{GrammarCategory} is C{pos} (for "part
502 of speech").
503
504 In addition, GrammarCategories are displayed and parse differently, to be
505 consistent with NLP teaching materials: the value of the C{/} feature can
506 be written with a slash after the right bracket, so that the string
507 representation looks like: C{head[...]/value}.
508
509 Every GrammarCategory has a / feature implicitly present; if it is not
510 explicitly written, it has the value False. This is so that "slashed"
511 features cannot unify with "unslashed" ones.
512
513 An example of a C{GrammarCategory} is C{VP[+fin]/NP}, for a verb phrase
514 that is finite and has an omitted noun phrase inside it.
515 """
516
517 headname = 'pos'
518 yaml_tag = '!parse.GrammarCategory'
519
520 @classmethod
521 - def _str(cls, obj, reentrances, reentrance_ids):
522 segments = []
523
524 keys = obj.keys()
525 keys.sort()
526 for fname in keys:
527 if fname == cls.headname: continue
528 if isinstance(obj, GrammarCategory) and fname == '/': continue
529 fval = obj[fname]
530 if isinstance(fval, bool):
531 if fval: segments.append('+%s' % fname)
532 else: segments.append('-%s' % fname)
533 elif not isinstance(fval, dict):
534 segments.append('%s=%r' % (fname, fval))
535 else:
536 fval_repr = cls._str(fval, reentrances, reentrance_ids)
537 segments.append('%s=%s' % (fname, fval_repr))
538
539 if segments: features = '[%s]' % ', '.join(segments)
540 else: features = ''
541 head = obj.get(cls.headname)
542 if head is None: head = ''
543 slash = None
544 if isinstance(obj, GrammarCategory): slash = obj.get('/')
545 if not slash: slash = ''
546 else:
547 if isinstance(slash, dict):
548 slash = '/%s' % cls._str(slash, reentrances, reentrance_ids)
549 else:
550 slash = '/%r' % slash
551
552
553 return '%s%s%s' % (head, features, slash)
554
555 @staticmethod
556 - def parse(s, position=0):
558
559 @classmethod
561 if reentrances is None: reentrances = {}
562 if s[position] in "'\"":
563 start = position
564 quotemark = s[position:position+1]
565 position += 1
566 while 1:
567 match = cls._PARSE_RE['stringmarker'].search(s, position)
568 if not match: raise ValueError('close quote', position)
569 position = match.end()
570 if match.group() == '\\': position += 1
571 elif match.group() == quotemark:
572 return s[start+1:position-1], position
573
574 body, position = GrammarCategory._parse(s, position, reentrances)
575 slash_match = Category._PARSE_RE['slash'].match(s, position)
576 if slash_match is not None:
577 position = slash_match.end()
578 slash, position = GrammarCategory._parseval(s, position, reentrances)
579 if isinstance(slash, basestring): slash = {'pos': slash}
580 body['/'] = unify(body.get('/'), slash)
581 elif not body.has_key('/'):
582 body['/'] = False
583 return cls(body), position
584
586 """
587 An interface for classes that can perform substitutions for feature
588 variables.
589 """
591 """
592 @return: The object that is obtained by replacing
593 each variable bound by C{bindings} with its values.
594 @rtype: (any)
595 """
596 raise NotImplementedError
597
599 """
600 A lambda calculus expression parser, extended to create application
601 expressions which support the SubstituteBindingsI interface.
602 """
605
607 """
608 A lambda application expression, extended to implement the
609 SubstituteBindingsI interface.
610 """
612 newval = self
613 for semvar in self.variables():
614 varstr = str(semvar)
615
616 if varstr.startswith('?'):
617 var = makevar(varstr)
618 if bindings.is_bound(var):
619 newval = newval.replace(semvar, bindings.lookup(var))
620 return newval
621
622
623
624
625
632
634 return Grammar(self.start, self.grammatical_productions +\
635 self.lexical_productions)
636
638 return Grammar(self.start, self.grammatical_productions)
639
646 return lookup
647
652 return lookup
653
661
692
698
699 @staticmethod
704
705 yaml.add_representer(Category, Category.to_yaml)
706 yaml.add_representer(GrammarCategory, GrammarCategory.to_yaml)
707
709 print "Category(pos='n', agr=dict(number='pl', gender='f')):"
710 print
711 print Category(pos='n', agr=dict(number='pl', gender='f'))
712 print repr(Category(pos='n', agr=dict(number='pl', gender='f')))
713 print
714 print "GrammarCategory.parse('NP/NP'):"
715 print
716 print GrammarCategory.parse('NP/NP')
717 print repr(GrammarCategory.parse('NP/NP'))
718 print
719 print "GrammarCategory.parse('?x/?x'):"
720 print
721 print GrammarCategory.parse('?x/?x')
722 print repr(GrammarCategory.parse('?x/?x'))
723 print
724 print "GrammarCategory.parse('VP[+fin, agr=?x, tense=past]/NP[+pl, agr=?x]'):"
725 print
726 print GrammarCategory.parse('VP[+fin, agr=?x, tense=past]/NP[+pl, agr=?x]')
727 print repr(GrammarCategory.parse('VP[+fin, agr=?x, tense=past]/NP[+pl, agr=?x]'))
728 print
729 g = GrammarFile.read_file("speer.cfg")
730 print g.grammar()
731
732 if __name__ == '__main__':
733 demo()
734