Package nltk_lite :: Package parse :: Module pcfg
[hide private]
[frames] | no frames]

Source Code for Module nltk_lite.parse.pcfg

  1  # Natural Language Toolkit: Probabilistic Context Free Grammars 
  2  # 
  3  # Copyright (C) 2001-2007 University of Pennsylvania 
  4  # Author: Steven Bird <sb@csse.unimelb.edu.au> 
  5  #         Edward Loper <edloper@ldc.upenn.edu> (minor additions) 
  6  #         Nathan Bodenstab <bodenstab@cslu.ogi.edu> (induction) 
  7  # URL: <http://nltk.sf.net> 
  8  # For license information, see LICENSE.TXT 
  9   
 10  import re 
 11  from nltk_lite.parse import * 
 12  from nltk_lite.probability import ImmutableProbabilisticMixIn 
 13   
14 -class WeightedProduction(Production, ImmutableProbabilisticMixIn):
15 """ 16 A probabilistic context free grammar production. 17 PCFG C{WeightedProduction}s are essentially just C{Production}s that 18 have probabilities associated with them. These probabilities are 19 used to record how likely it is that a given production will 20 be used. In particular, the probability of a C{WeightedProduction} 21 records the likelihood that its right-hand side is the correct 22 instantiation for any given occurance of its left-hand side. 23 24 @see: L{Production} 25 """
26 - def __init__(self, lhs, rhs, **prob):
27 """ 28 Construct a new C{WeightedProduction}. 29 30 @param lhs: The left-hand side of the new C{WeightedProduction}. 31 @type lhs: L{Nonterminal} 32 @param rhs: The right-hand side of the new C{WeightedProduction}. 33 @type rhs: sequence of (C{Nonterminal} and (terminal)) 34 @param **prob: Probability parameters of the new C{WeightedProduction}. 35 """ 36 ImmutableProbabilisticMixIn.__init__(self, **prob) 37 Production.__init__(self, lhs, rhs)
38
39 - def __str__(self):
40 return Production.__str__(self) + ' [%s]' % self.prob()
41
42 - def __eq__(self, other):
43 return (isinstance(other, self.__class__) and 44 self._lhs == other._lhs and 45 self._rhs == other._rhs and 46 self.prob() == other.prob())
47
48 - def __hash__(self):
49 return hash((self._lhs, self._rhs, self.prob()))
50
51 -class WeightedGrammar(Grammar):
52 """ 53 A probabilistic context-free grammar. A Weighted Grammar consists of a start 54 state and a set of weighted productions. The set of terminals and 55 nonterminals is implicitly specified by the productions. 56 57 PCFG productions should be C{WeightedProduction}s. C{WeightedGrammar}s impose 58 the constraint that the set of productions with any given 59 left-hand-side must have probabilities that sum to 1. 60 61 If you need efficient key-based access to productions, you can use 62 a subclass to implement it. 63 64 @type EPSILON: C{float} 65 @cvar EPSILON: The acceptable margin of error for checking that 66 productions with a given left-hand side have probabilities 67 that sum to 1. 68 """ 69 EPSILON = 0.01 70
71 - def __init__(self, start, productions):
72 """ 73 Create a new context-free grammar, from the given start state 74 and set of C{WeightedProduction}s. 75 76 @param start: The start symbol 77 @type start: L{Nonterminal} 78 @param productions: The list of productions that defines the grammar 79 @type productions: C{list} of C{Production} 80 @raise ValueError: if the set of productions with any left-hand-side 81 do not have probabilities that sum to a value within 82 EPSILON of 1. 83 """ 84 Grammar.__init__(self, start, productions) 85 86 # Make sure that the probabilities sum to one. 87 probs = {} 88 for production in productions: 89 probs[production.lhs()] = (probs.get(production.lhs(), 0) + 90 production.prob()) 91 for (lhs, p) in probs.items(): 92 if not ((1-WeightedGrammar.EPSILON) < p < (1+WeightedGrammar.EPSILON)): 93 raise ValueError("Productions for %r do not sum to 1" % lhs)
94
95 -def induce(start, productions):
96 """ 97 Induce a PCFG grammar from a list of productions. 98 99 The probability of a production A -> B C in a PCFG is: 100 101 | count(A -> B C) 102 | P(B, C | A) = --------------- where * is any right hand side 103 | count(A -> *) 104 105 @param start: The start symbol 106 @type start: L{Nonterminal} 107 @param productions: The list of productions that defines the grammar 108 @type productions: C{list} of L{Production} 109 """ 110 111 pcount = {} # Production count: the number of times a given production occurs 112 lcount = {} # LHS-count: counts the number of times a given lhs occurs 113 114 for prod in productions: 115 lcount[prod.lhs()] = lcount.get(prod.lhs(), 0) + 1 116 pcount[prod] = pcount.get(prod, 0) + 1 117 118 prods = [WeightedProduction(p.lhs(), p.rhs(), prob=float(pcount[p]) / lcount[p.lhs()])\ 119 for p in pcount] 120 return WeightedGrammar(start, prods)
121 122 ################################################################# 123 # Parsing PCFGs 124 ################################################################# 125 126 _PARSE_RE = re.compile(r'''^\s* # leading whitespace 127 (\w+(?:/\w+)?)\s* # lhs 128 (?:[-=]+>)\s* # arrow 129 (?:( # rhs: 130 "[^"]+" # doubled-quoted terminal 131 | '[^']+' # single-quoted terminal 132 | \w+(?:/\w+)? # non-terminal 133 | \[[01]?\.\d+\] # probability 134 | \| # disjunction 135 ) 136 \s*) # trailing space 137 *$''', # zero or more copies 138 re.VERBOSE) 139 _SPLIT_RE = re.compile(r'''(\w+(?:/\w+)?|\[[01]?\.\d+\]|[-=]+>|"[^"]+"|'[^']+'|\|)''') 140
141 -def parse_pcfg_production(s):
142 """ 143 Returns a list of PCFG productions 144 """ 145 # Use _PARSE_RE to check that it's valid. 146 if not _PARSE_RE.match(s): 147 raise ValueError, 'Bad production string' 148 # Use _SPLIT_RE to process it. 149 pieces = _SPLIT_RE.split(s) 150 pieces = [p for i,p in enumerate(pieces) if i%2==1] 151 lhside = Nonterminal(pieces[0]) 152 rhsides = [[]] 153 probabilities = [0.0] 154 found_terminal = found_non_terminal = False 155 for piece in pieces[2:]: 156 if piece == '|': 157 rhsides.append([]) # Vertical bar 158 probabilities.append(0.0) 159 found_terminal = found_non_terminal = False 160 elif piece[0] in ('"', "'"): 161 rhsides[-1].append(piece[1:-1]) # Terminal 162 found_terminal = True 163 elif piece[0] in "[": 164 probabilities[-1] = float(piece[1:-1]) # Probability 165 else: 166 rhsides[-1].append(Nonterminal(piece)) # Nonterminal 167 found_non_terminal = True 168 if found_terminal and found_non_terminal: 169 raise ValueError, 'Bad right-hand-side: do not mix terminals and non-terminals' 170 return [WeightedProduction(lhside, rhside, prob=probability) 171 for (rhside, probability) in zip(rhsides, probabilities)]
172
173 -def parse_pcfg(s):
174 productions = [] 175 for linenum, line in enumerate(s.split('\n')): 176 line = line.strip() 177 if line.startswith('#') or line=='': continue 178 try: productions += parse_pcfg_production(line) 179 except ValueError: 180 raise ValueError, 'Unable to parse line %s: %s' % (linenum, line) 181 if len(productions) == 0: 182 raise ValueError, 'No productions found!' 183 start = productions[0].lhs() 184 return WeightedGrammar(start, productions)
185 186 187 toy1 = parse_pcfg(""" 188 S -> NP VP [1.0] 189 NP -> Det N [0.5] | NP PP [0.25] | 'John' [0.1] | 'I' [0.15] 190 Det -> 'the' [0.8] | 'my' [0.2] 191 N -> 'man' [0.5] | 'telescope' [0.5] 192 VP -> VP PP [0.1] | V NP [0.7] | V [0.2] 193 V -> 'ate' [0.35] | 'saw' [0.65] 194 PP -> P NP [1.0] 195 P -> 'with' [0.61] | 'under' [0.39] 196 """) 197 198 toy2 = parse_pcfg(""" 199 S -> NP VP [1.0] 200 VP -> V NP [.59] 201 VP -> V [.40] 202 VP -> VP PP [.01] 203 NP -> Det N [.41] 204 NP -> Name [.28] 205 NP -> NP PP [.31] 206 PP -> P NP [1.0] 207 V -> 'saw' [.21] 208 V -> 'ate' [.51] 209 V -> 'ran' [.28] 210 N -> 'boy' [.11] 211 N -> 'cookie' [.12] 212 N -> 'table' [.13] 213 N -> 'telescope' [.14] 214 N -> 'hill' [.5] 215 Name -> 'Jack' [.52] 216 Name -> 'Bob' [.48] 217 P -> 'with' [.61] 218 P -> 'under' [.39] 219 Det -> 'the' [.41] 220 Det -> 'a' [.31] 221 Det -> 'my' [.28] 222 """) 223 224 ################################################################# 225 # Demonstration 226 ################################################################# 227
228 -def demo():
229 """ 230 A demonstration showing how PCFG C{Grammar}s can be created and used. 231 """ 232 233 from nltk_lite.corpora import treebank, extract 234 from nltk_lite.parse import cfg, pcfg, pchart, treetransforms 235 from itertools import islice 236 237 pcfg_prods = pcfg.toy1.productions() 238 239 pcfg_prod = pcfg_prods[2] 240 print 'A PCFG production:', `pcfg_prod` 241 print ' pcfg_prod.lhs() =>', `pcfg_prod.lhs()` 242 print ' pcfg_prod.rhs() =>', `pcfg_prod.rhs()` 243 print ' pcfg_prod.prob() =>', `pcfg_prod.prob()` 244 print 245 246 grammar = pcfg.toy2 247 print 'A PCFG grammar:', `grammar` 248 print ' grammar.start() =>', `grammar.start()` 249 print ' grammar.productions() =>', 250 # Use string.replace(...) is to line-wrap the output. 251 print `grammar.productions()`.replace(',', ',\n'+' '*26) 252 print 253 254 # extract productions from three trees and induce the PCFG 255 print "Induce PCFG grammar from treebank data:" 256 257 productions = [] 258 for tree in islice(treebank.parsed(),3): 259 # perform optional in-place tree transformations, e.g.: 260 # treetransforms.collapseUnary(tree, collapsePOS = False) 261 # treetransforms.chomskyNormalForm(tree, horzMarkov = 2) 262 263 productions += tree.productions() 264 265 S = Nonterminal('S') 266 grammar = pcfg.induce(S, productions) 267 print grammar 268 print 269 270 print "Parse sentence using induced grammar:" 271 272 parser = pchart.InsideParse(grammar) 273 parser.trace(3) 274 275 sent = extract(0, treebank.raw()) 276 print sent 277 for parse in parser.get_parse_list(sent): 278 print parse
279 280 if __name__ == '__main__': 281 demo() 282