1
2
3
4
5
6
7
8
9 from types import *
10 from nltk_lite import tokenize
11 from nltk_lite.parse import *
12 import string
13
14
15
16
18 """
19 A simple top-down CFG parser that parses texts by recursively
20 expanding the fringe of a C{Tree}, and matching it against a
21 text.
22
23 C{RecursiveDescent} uses a list of tree locations called a
24 X{frontier} to remember which subtrees have not yet been expanded
25 and which leaves have not yet been matched against the text. Each
26 tree location consists of a list of child indices specifying the
27 path from the root of the tree to a subtree or a leaf; see the
28 reference documentation for C{Tree} for more information
29 about tree locations.
30
31 When the parser begins parsing a text, it constructs a tree
32 containing only the start symbol, and a frontier containing the
33 location of the tree's root node. It then extends the tree to
34 cover the text, using the following recursive procedure:
35
36 - If the frontier is empty, and the text is covered by the tree,
37 then return the tree as a possible parse.
38 - If the frontier is empty, and the text is not covered by the
39 tree, then return no parses.
40 - If the first element of the frontier is a subtree, then
41 use CFG productions to X{expand} it. For each applicable
42 production, add the expanded subtree's children to the
43 frontier, and recursively find all parses that can be
44 generated by the new tree and frontier.
45 - If the first element of the frontier is a token, then X{match}
46 it against the next token from the text. Remove the token
47 from the frontier, and recursively find all parses that can be
48 generated by the new tree and frontier.
49
50 @see: C{nltk.cfg}
51 """
53 """
54 Create a new C{RecursiveDescent}, that uses C{grammar}
55 to parse texts.
56
57 @type grammar: C{Grammar}
58 @param grammar: The grammar used to parse texts.
59 @type trace: C{int}
60 @param trace: The level of tracing that should be used when
61 parsing a text. C{0} will generate no tracing output;
62 and higher numbers will produce more verbose tracing
63 output.
64 """
65 self._grammar = grammar
66 self._trace = trace
67 AbstractParse.__init__(self)
68
84
85 - def _parse(self, remaining_text, tree, frontier):
86 """
87 Recursively expand and match each elements of C{tree}
88 specified by C{frontier}, to cover C{remaining_text}. Return
89 a list of all parses found.
90
91 @return: A list of all parses that can be generated by
92 matching and expanding the elements of C{tree}
93 specified by C{frontier}.
94 @rtype: C{list} of C{Tree}
95 @type tree: C{Tree}
96 @param tree: A partial structure for the text that is
97 currently being parsed. The elements of C{tree}
98 that are specified by C{frontier} have not yet been
99 expanded or matched.
100 @type remaining_text: C{list} of C{String}s
101 @param remaining_text: The portion of the text that is not yet
102 covered by C{tree}.
103 @type frontier: C{list} of C{tuple} of C{int}
104 @param frontier: A list of the locations within C{tree} of
105 all subtrees that have not yet been expanded, and all
106 leaves that have not yet been matched. This list sorted
107 in left-to-right order of location within the tree.
108 """
109
110
111
112 if len(remaining_text) == 0 and len(frontier) == 0:
113 if self._trace:
114 self._trace_succeed(tree, frontier)
115 return [tree]
116
117
118 elif len(frontier) == 0:
119 if self._trace:
120 self._trace_backtrack(tree, frontier)
121 return []
122
123
124 elif isinstance(tree[frontier[0]], Tree):
125 return self._expand(remaining_text, tree, frontier)
126
127
128 else:
129 return self._match(remaining_text, tree, frontier)
130
131 - def _match(self, rtext, tree, frontier):
132 """
133 @rtype: C{list} of C{Tree}
134 @return: a list of all parses that can be generated by
135 matching the first element of C{frontier} against the
136 first token in C{rtext}. In particular, if the first
137 element of C{frontier} has the same type as the first
138 token in C{rtext}, then substitute the token into
139 C{tree}; and return all parses that can be generated by
140 matching and expanding the remaining elements of
141 C{frontier}. If the first element of C{frontier} does not
142 have the same type as the first token in C{rtext}, then
143 return empty list.
144
145 @type tree: C{Tree}
146 @param tree: A partial structure for the text that is
147 currently being parsed. The elements of C{tree}
148 that are specified by C{frontier} have not yet been
149 expanded or matched.
150 @type rtext: C{list} of C{String}s
151 @param rtext: The portion of the text that is not yet
152 covered by C{tree}.
153 @type frontier: C{list} of C{tuple} of C{int}
154 @param frontier: A list of the locations within C{tree} of
155 all subtrees that have not yet been expanded, and all
156 leaves that have not yet been matched.
157 """
158
159 tree_leaf = tree[frontier[0]]
160 if (len(rtext) > 0 and tree_leaf == rtext[0]):
161
162
163 newtree = tree.copy(deep=True)
164 newtree[frontier[0]] = rtext[0]
165 if self._trace:
166 self._trace_match(newtree, frontier[1:], rtext[0])
167 return self._parse(rtext[1:], newtree, frontier[1:])
168 else:
169
170 if self._trace:
171 self._trace_backtrack(tree, frontier, rtext[:1])
172 return []
173
174 - def _expand(self, remaining_text, tree, frontier, production=None):
175 """
176 @rtype: C{list} of C{Tree}
177 @return: A list of all parses that can be generated by
178 expanding the first element of C{frontier} with
179 C{production}. In particular, if the first element of
180 C{frontier} is a subtree whose node type is equal to
181 C{production}'s left hand side, then add a child to that
182 subtree for each element of C{production}'s right hand
183 side; and return all parses that can be generated by
184 matching and expanding the remaining elements of
185 C{frontier}. If the first element of C{frontier} is not a
186 subtree whose node type is equal to C{production}'s left
187 hand side, then return an empty list. If C{production} is
188 not specified, then return a list of all parses that can
189 be generated by expanding the first element of C{frontier}
190 with I{any} CFG production.
191
192 @type tree: C{Tree}
193 @param tree: A partial structure for the text that is
194 currently being parsed. The elements of C{tree}
195 that are specified by C{frontier} have not yet been
196 expanded or matched.
197 @type remaining_text: C{list} of C{String}s
198 @param remaining_text: The portion of the text that is not yet
199 covered by C{tree}.
200 @type frontier: C{list} of C{tuple} of C{int}
201 @param frontier: A list of the locations within C{tree} of
202 all subtrees that have not yet been expanded, and all
203 leaves that have not yet been matched.
204 """
205
206 if production is None: productions = self._grammar.productions()
207 else: productions = [production]
208
209 parses = []
210 for production in productions:
211 lhs = production.lhs().symbol()
212 if lhs == tree[frontier[0]].node:
213 subtree = self._production_to_tree(production)
214 if frontier[0] == ():
215 newtree = subtree
216 else:
217 newtree = tree.copy(deep=True)
218 newtree[frontier[0]] = subtree
219 new_frontier = [frontier[0]+(i,) for i in
220 range(len(production.rhs()))]
221 if self._trace:
222 self._trace_expand(newtree, new_frontier, production)
223 parses += self._parse(remaining_text, newtree,
224 new_frontier + frontier[1:])
225 return parses
226
228 """
229 @rtype: C{Tree}
230 @return: The C{Tree} that is licensed by C{production}.
231 In particular, given the production::
232
233 C{[M{lhs} -> M{elt[1]} ... M{elt[n]}]}
234
235 Return a tree token that has a node C{M{lhs}.symbol}, and
236 C{M{n}} children. For each nonterminal element
237 C{M{elt[i]}} in the production, the tree token has a
238 childless subtree with node value C{M{elt[i]}.symbol}; and
239 for each terminal element C{M{elt[j]}}, the tree token has
240 a leaf token with type C{M{elt[j]}}.
241
242 @param production: The CFG production that licenses the tree
243 token that should be returned.
244 @type production: C{Production}
245 """
246 children = []
247 for elt in production.rhs():
248 if isinstance(elt, cfg.Nonterminal):
249 children.append(Tree(elt.symbol(), []))
250 else:
251
252 children.append(elt)
253 return Tree(production.lhs().symbol(), children)
254
255 - def trace(self, trace=2):
256 """
257 Set the level of tracing output that should be generated when
258 parsing a text.
259
260 @type trace: C{int}
261 @param trace: The trace level. A trace level of C{0} will
262 generate no tracing output; and higher trace levels will
263 produce more verbose tracing output.
264 @rtype: C{None}
265 """
266 self._trace = trace
267
269 """
270 Print trace output displaying the fringe of C{tree}. The
271 fringe of C{tree} consists of all of its leaves and all of
272 its childless subtrees.
273
274 @rtype: C{None}
275 """
276
277 if treeloc == (): print "*",
278 if isinstance(tree, Tree):
279 if len(tree) == 0: print `cfg.Nonterminal(tree.node)`,
280 for i in range(len(tree)):
281 if treeloc is not None and i == treeloc[0]:
282 self._trace_fringe(tree[i], treeloc[1:])
283 else:
284 self._trace_fringe(tree[i])
285 else:
286 print `tree`,
287
289 """
290 Print trace output displaying the parser's current state.
291
292 @param operation: A character identifying the operation that
293 generated the current state.
294 @rtype: C{None}
295 """
296 if self._trace == 2: print ' %c [' % operation,
297 else: print ' [',
298 if len(frontier) > 0: self._trace_fringe(tree, frontier[0])
299 else: self._trace_fringe(tree)
300 print ']'
301
306
308 if self._trace > 2: print 'Expand: %s' % production
309 if self._trace > 1: self._trace_tree(tree, frontier, 'E')
310
314
316 if self._trace > 2: print 'GOOD PARSE:'
317 if self._trace == 1: print 'Found a parse:\n%s' % tree
318 if self._trace > 1: self._trace_tree(tree, frontier, '+')
319
321 if self._trace > 2:
322 if toks: print 'Backtrack: %r match failed' % toks[0]
323 else: print 'Backtrack'
324
325
326
327
329 """
330 A C{RecursiveDescent} that allows you to step through the
331 parsing process, performing a single operation at a time.
332
333 The C{initialize} method is used to start parsing a text.
334 C{expand} expands the first element on the frontier using a single
335 CFG production, and C{match} matches the first element on the
336 frontier against the next text token. C{backtrack} undoes the most
337 recent expand or match operation. C{step} performs a single
338 expand, match, or backtrack operation. C{parses} returns the set
339 of parses that have been found by the parser.
340
341 @ivar _history: A list of C{(rtext, tree, frontier)} tripples,
342 containing the previous states of the parser. This history is
343 used to implement the C{backtrack} operation.
344 @ivar _tried_e: A record of all productions that have been tried
345 for a given tree. This record is used by C{expand} to perform
346 the next untried production.
347 @ivar _tried_m: A record of what tokens have been matched for a
348 given tree. This record is used by C{step} to decide whether
349 or not to match a token.
350 @see: C{nltk.cfg}
351 """
353 self._grammar = grammar
354 self._trace = trace
355 self._rtext = None
356 self._tree = None
357 self._frontier = [()]
358 self._tried_e = {}
359 self._tried_m = {}
360 self._history = []
361 self._parses = []
362 AbstractParse.__init__(self)
363
364
365
371
373 tokens = list(tokens)
374 self.initialize(tokens)
375 while self.step() is not None: pass
376
377 return self.parses()
378
380 """
381 Start parsing a given text. This sets the parser's tree to
382 the start symbol, its frontier to the root node, and its
383 remaining text to C{token['SUBTOKENS']}.
384 """
385
386 self._rtext = tokens
387 start = self._grammar.start().symbol()
388 self._tree = Tree(start, [])
389 self._frontier = [()]
390 self._tried_e = {}
391 self._tried_m = {}
392 self._history = []
393 self._parses = []
394 if self._trace:
395 self._trace_start(self._tree, self._frontier, self._rtext)
396
397 - def remaining_text(self):
398 """
399 @return: The portion of the text that is not yet covered by the
400 tree.
401 @rtype: C{list} of C{String}
402 """
403 return self._rtext
404
406 """
407 @return: A list of the tree locations of all subtrees that
408 have not yet been expanded, and all leaves that have not
409 yet been matched.
410 @rtype: C{list} of C{tuple} of C{int}
411 """
412 return self._frontier
413
415 """
416 @return: A partial structure for the text that is
417 currently being parsed. The elements specified by the
418 frontier have not yet been expanded or matched.
419 @rtype: C{Tree}
420 """
421 return self._tree
422
424 """
425 Perform a single parsing operation. If an untried match is
426 possible, then perform the match, and return the matched
427 token. If an untried expansion is possible, then perform the
428 expansion, and return the production that it is based on. If
429 backtracking is possible, then backtrack, and return 1.
430 Otherwise, return 0.
431
432 @return: 0 if no operation was performed; a token if a match
433 was performed; a production if an expansion was performed;
434 and 1 if a backtrack operation was performed.
435 @rtype: C{Production} or C{String} or C{boolean}
436 """
437
438 if self.untried_match():
439 token = self.match()
440 if token is not None: return token
441
442
443 production = self.expand()
444 if production is not None: return production
445
446
447 if self.backtrack():
448 self._trace_backtrack(self._tree, self._frontier)
449 return 1
450
451
452 return None
453
454 - def expand(self, production=None):
455 """
456 Expand the first element of the frontier. In particular, if
457 the first element of the frontier is a subtree whose node type
458 is equal to C{production}'s left hand side, then add a child
459 to that subtree for each element of C{production}'s right hand
460 side. If C{production} is not specified, then use the first
461 untried expandable production. If all expandable productions
462 have been tried, do nothing.
463
464 @return: The production used to expand the frontier, if an
465 expansion was performed. If no expansion was performed,
466 return C{None}.
467 @rtype: C{Production} or C{None}
468 """
469
470
471 if len(self._frontier) == 0:
472 return None
473 if not isinstance(self._tree[self._frontier[0]], Tree):
474 return None
475
476
477 if production is None:
478 productions = self.untried_expandable_productions()
479 else: productions = [production]
480
481 parses = []
482 for prod in productions:
483
484 self._tried_e.setdefault(self._freeze(self._tree), []).append(prod)
485
486
487 if self._expand(self._rtext, self._tree, self._frontier, prod):
488 return prod
489
490
491 return None
492
494 """
495 Match the first element of the frontier. In particular, if
496 the first element of the frontier has the same type as the
497 next text token, then substitute the text token into the tree.
498
499 @return: The token matched, if a match operation was
500 performed. If no match was performed, return C{None}
501 @rtype: C{String} or C{None}
502 """
503
504
505 tok = self._rtext[0]
506 self._tried_m.setdefault(self._freeze(self._tree), []).append(tok)
507
508
509 if len(self._frontier) == 0:
510 return None
511 if isinstance(self._tree[self._frontier[0]], Tree):
512 return None
513
514 if self._match(self._rtext, self._tree, self._frontier):
515
516 return self._history[-1][0][0]
517 else:
518 return None
519
521 """
522 Return the parser to its state before the most recent
523 match or expand operation. Calling C{undo} repeatedly return
524 the parser to successively earlier states. If no match or
525 expand operations have been performed, C{undo} will make no
526 changes.
527
528 @return: true if an operation was successfully undone.
529 @rtype: C{boolean}
530 """
531 if len(self._history) == 0: return 0
532 (self._rtext, self._tree, self._frontier) = self._history.pop()
533 return 1
534
536 """
537 @return: A list of all the productions for which expansions
538 are available for the current parser state.
539 @rtype: C{list} of C{Production}
540 """
541
542 if len(self._frontier) == 0: return []
543 frontier_child = self._tree[self._frontier[0]]
544 if (len(self._frontier) == 0 or
545 not isinstance(frontier_child, Tree)):
546 return []
547
548 return [p for p in self._grammar.productions()
549 if p.lhs().symbol() == frontier_child.node]
550
552 """
553 @return: A list of all the untried productions for which
554 expansions are available for the current parser state.
555 @rtype: C{list} of C{Production}
556 """
557
558 tried_expansions = self._tried_e.get(self._freeze(self._tree), [])
559 return [p for p in self.expandable_productions()
560 if p not in tried_expansions]
561
563 """
564 @return: Whether the first element of the frontier is a token
565 that has not yet been matched.
566 @rtype: C{boolean}
567 """
568
569 if len(self._rtext) == 0: return 0
570 tried_matches = self._tried_m.get(self._freeze(self._tree), [])
571 return (self._rtext[0] not in tried_matches)
572
574 """
575 @return: Whether the parser's current state represents a
576 complete parse.
577 @rtype: C{boolean}
578 """
579 return (len(self._frontier) == 0 and len(self._rtext) == 0)
580
581 - def _parse(self, remaining_text, tree, frontier):
582 """
583 A stub version of C{_parse} that sets the parsers current
584 state to the given arguments. In C{RecursiveDescent},
585 the C{_parse} method is used to recursively continue parsing a
586 text. C{SteppingRecursiveDescent} overrides it to
587 capture these recursive calls. It records the parser's old
588 state in the history (to allow for backtracking), and updates
589 the parser's new state using the given arguments. Finally, it
590 returns C{[1]}, which is used by C{match} and C{expand} to
591 detect whether their operations were successful.
592
593 @return: C{[1]}
594 @rtype: C{list} of C{int}
595 """
596 self._history.append( (self._rtext, self._tree, self._frontier) )
597 self._rtext = remaining_text
598 self._tree = tree
599 self._frontier = frontier
600
601
602 if (len(frontier) == 0 and len(remaining_text) == 0):
603 self._parses.append(tree)
604 self._trace_succeed(self._tree, self._frontier)
605
606 return [1]
607
609 """
610 @return: A list of the parses that have been found by this
611 parser so far.
612 @rtype: C{list} of C{Tree}
613 """
614 return self._parses
615
616
617
618
620 """
621 Change the grammar used to parse texts.
622
623 @param grammar: The new grammar.
624 @type grammar: C{CFG}
625 """
626 self._grammar = grammar
627
628
629
630
631
659
660 if __name__ == '__main__': demo()
661