
Source Code for Module nltk_lite.tag.unigram

# Natural Language Toolkit: Unigram Taggers
#
# Copyright (C) 2001-2007 University of Pennsylvania
# Author: Edward Loper <edloper@gradient.cis.upenn.edu>
#         Steven Bird <sb@csse.unimelb.edu.au> (minor additions)
# URL: <http://nltk.sf.net>
# For license information, see LICENSE.TXT

"""
Classes and interfaces for tagging each token of a document with
supplementary information, such as its part of speech or its WordNet
synset tag.  This task, which is known as X{tagging}, is defined by
the L{TagI} interface.
"""

from nltk_lite.probability import FreqDist, ConditionalFreqDist

##############################################################
# UNIGRAM TAGGERS: only use information about the current word
##############################################################

from nltk_lite.tag import *
import re
class Unigram(SequentialBackoff):
    """
    A unigram stochastic tagger.  Before C{tag.Unigram} can be
    used, it should be trained on a tagged corpus.  Using this
    training data, it will find the most likely tag for each word
    type.  It will then use this information to assign the most
    frequent tag to each word.  If C{tag.Unigram} encounters a
    word for which it has no data, it will assign it the tag
    C{None}.
    """
    yaml_tag = '!tag.Unigram'

    def __init__(self, cutoff=1, backoff=None):
        """
        Construct a new unigram stochastic tagger.  The new tagger
        should be trained, using the L{train()} method, before it is
        used to tag data.
        """
        self._model = {}
        self._cutoff = cutoff
        self._backoff = backoff
        self._history = None

    def train(self, tagged_corpus, verbose=False):
        """
        Train C{tag.Unigram} using the given training data.

        @param tagged_corpus: A tagged corpus.  Each item should be
            a C{list} of tagged tokens, where each token consists of
            C{text} and a C{tag}.
        @type tagged_corpus: C{list} or C{iter(list)}
        """
        if self.size() != 0:
            raise ValueError, 'Tagger is already trained'
        token_count = hit_count = 0
        fd = ConditionalFreqDist()

        # Allow a single tagged sentence to be passed in directly.
        if isinstance(tagged_corpus, list) and isinstance(tagged_corpus[0], tuple):
            tagged_corpus = [tagged_corpus]

        # Count how often each word type occurs with each tag.
        for sentence in tagged_corpus:
            for (token, tag) in sentence:
                token_count += 1
                fd[token].inc(tag)

        for token in fd.conditions():
            best_tag = fd[token].max()
            backoff_tag = self._backoff_tag_one(token)
            hits = fd[token].count(best_tag)

            # Only store an entry if the tag we would assign differs
            # from the backoff tagger's, and we have sufficient evidence.
            if best_tag != backoff_tag and hits > self._cutoff:
                self._model[token] = best_tag
                hit_count += hits

        # Generate stats.
        if verbose:
            size = len(self._model)
            backoff = 100 - (hit_count * 100.0) / token_count
            pruning = 100 - (size * 100.0) / len(fd.conditions())
            print "[Trained Unigram tagger:",
            print "size=%d, backoff=%.2f%%, pruning=%.2f%%]" % (
                size, backoff, pruning)

    def tag_one(self, token, history=None):
        if self.size() == 0:
            raise ValueError, 'Tagger is not trained'
        if token in self._model:
            return self._model[token]
        if self._backoff:
            return self._backoff.tag_one(token, history)
        return None

    def size(self):
        return len(self._model)

    def __repr__(self):
        return '<Unigram Tagger: size=%d, cutoff=%d>' % (
            self.size(), self._cutoff)
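
# Usage sketch (illustrative; not part of the original module).  This
# assumes SequentialBackoff provides the usual tag() iterator over a
# list of tokens.  With cutoff=0, a single occurrence is sufficient
# evidence; words the model never stored fall through to the backoff.
#
#     >>> from nltk_lite import tag
#     >>> train = [[('the', 'dt'), ('dog', 'nn'), ('barks', 'vbz')],
#     ...          [('the', 'dt'), ('cat', 'nn'), ('sleeps', 'vbz')]]
#     >>> t = tag.Unigram(cutoff=0, backoff=tag.Default('nn'))
#     >>> t.train(train)
#     >>> list(t.tag(['the', 'dog', 'sleeps']))
#     [('the', 'dt'), ('dog', 'nn'), ('sleeps', 'vbz')]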

# Affix tagger, based on code by Tiago Tresoldi <tresoldi@users.sf.net>
class Affix(SequentialBackoff):
    """
    A unigram tagger that assigns tags to tokens based on leading or
    trailing substrings (note that these substrings are not
    necessarily "true" morphological affixes).  Before C{tag.Affix}
    can be used, it should be trained on a tagged corpus.  Using this
    training data, it will find the most likely tag for each affix.
    It will then use this information to assign the most frequent tag
    to each word.  If C{tag.Affix} encounters a prefix or suffix in a
    word for which it has no data, it will assign the tag C{None}.
    """

    def __init__(self, length, minlength, cutoff=1, backoff=None):
        """
        Construct a new affix stochastic tagger.  The new tagger
        should be trained, using the L{train()} method, before it is
        used to tag data.

        @type length: C{number}
        @param length: The length of the affix to be considered during
            training and tagging (negative for suffixes)
        @type minlength: C{number}
        @param minlength: The minimum length for a word to be
            considered during training and tagging.  It must be longer
            than C{length}.
        """
        self._model = {}

        assert minlength > 0

        self._length = length
        self._minlength = minlength
        self._cutoff = cutoff
        self._backoff = backoff
        self._history = None

    def _get_affix(self, token):
        # A positive length selects a prefix; a negative length
        # selects a suffix.
        if self._length > 0:
            return token[:self._length]
        else:
            return token[self._length:]

    def train(self, tagged_corpus, verbose=False):
        """
        Train C{tag.Affix} using the given training data.

        @param tagged_corpus: A tagged corpus.  Each item should be
            a C{list} of tagged tokens, where each token consists of
            C{text} and a C{tag}.
        @type tagged_corpus: C{list} or C{iter(list)}
        """
        if self.size() != 0:
            raise ValueError, 'Tagger is already trained'
        token_count = hit_count = 0
        fd = ConditionalFreqDist()

        for sentence in tagged_corpus:
            for (token, tag) in sentence:
                token_count += 1
                # Only consider tokens that are long enough.
                if len(token) >= self._minlength:
                    backoff_tag = self._backoff_tag_one(token)
                    if tag != backoff_tag:
                        # Get the affix and record its tag.
                        affix = self._get_affix(token)
                        hit_count += 1
                        fd[affix].inc(tag)
        for affix in fd.conditions():
            best_tag = fd[affix].max()
            if fd[affix].count(best_tag) > self._cutoff:
                self._model[affix] = best_tag

        # Generate stats.
        if verbose:
            size = len(self._model)
            backoff = 100 - (hit_count * 100.0) / token_count
            pruning = 100 - (size * 100.0) / len(fd.conditions())
            print "[Trained Affix tagger:",
            print "size=%d, backoff=%.2f%%, pruning=%.2f%%]" % (
                size, backoff, pruning)

    def tag_one(self, token, history=None):
        if self.size() == 0:
            raise ValueError, 'Tagger is not trained'
        affix = self._get_affix(token)
        if len(token) >= self._minlength and affix in self._model:
            return self._model[affix]
        if self._backoff:
            return self._backoff.tag_one(token, history)
        return None

    def size(self):
        return len(self._model)

    def __repr__(self):
        return '<Affix Tagger: size=%d, cutoff=%d>' % (
            self.size(), self._cutoff)
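
# Usage sketch (illustrative; not part of the original module).  A
# suffix tagger keyed on the last three characters of words at least
# five characters long.  This assumes _backoff_tag_one() returns None
# when no backoff tagger is given, so every training tag is recorded.
#
#     >>> from nltk_lite import tag
#     >>> train = [[('running', 'vbg'), ('jumping', 'vbg'),
#     ...           ('walking', 'vbg')]]
#     >>> t = tag.Affix(-3, 5, cutoff=0)
#     >>> t.train(train)
#     >>> t.tag_one('sleeping')
#     'vbg'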

class Regexp(SequentialBackoff):
    """
    A tagger that assigns tags to words based on regular expressions.
    """
    yaml_tag = '!tag.Regexp'

    def __init__(self, regexps, backoff=None):
        """
        Construct a new regexp tagger.

        @type regexps: C{list} of C{(string,string)}
        @param regexps: A list of C{(regexp, tag)} pairs, each of
            which indicates that a word matching C{regexp} should
            be tagged with C{tag}.  The pairs are evaluated in
            order.  If none of the regexps match a word, then the
            optional backoff tagger is invoked; if no backoff tagger
            is given, the word is assigned the tag C{None}.
        """
        self._regexps = regexps
        self._backoff = backoff
        self._history = None

    def tag_one(self, token, history=None):
        for regexp, tag in self._regexps:
            if re.match(regexp, token):  # ignore history
                return tag
        if self._backoff:
            return self._backoff.tag_one(token, history)
        return None

    def __repr__(self):
        return '<Regexp Tagger: size=%d>' % len(self._regexps)
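
# Usage sketch (illustrative; not part of the original module).  A
# pattern tagger needs no training: patterns are tried in order, and
# re.match() anchors each pattern at the start of the word.
#
#     >>> from nltk_lite import tag
#     >>> t = tag.Regexp([(r'^-?[0-9]+(\.[0-9]+)?$', 'cd'),
#     ...                 (r'.*ed$', 'vbd')])
#     >>> t.tag_one('42')
#     'cd'
#     >>> t.tag_one('walked')
#     'vbd'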

class Lookup(SequentialBackoff):
    """
    A tagger that assigns tags to words based on a lookup table.
    """

    def __init__(self, table, backoff=None):
        """
        Construct a new lookup tagger.

        @type table: C{dict} from C{string} to C{string}
        @param table: A dictionary mapping each C{word} to the C{tag}
            it should be assigned.  If a word is not in the table,
            then the optional backoff tagger is invoked; if no backoff
            tagger is given, the word is assigned the tag C{None}.
        """
        self._table = table
        self._backoff = backoff
        self._history = None

    def tag_one(self, token, history=None):
        if token in self._table:
            return self._table[token]
        if self._backoff:
            return self._backoff.tag_one(token, history)
        return None

    def __repr__(self):
        return '<Lookup Tagger: size=%d>' % len(self._table)
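
# Usage sketch (illustrative; not part of the original module).  A
# small table of closed-class words, backing off to a Default tagger
# for everything else.
#
#     >>> from nltk_lite import tag
#     >>> t = tag.Lookup({'the': 'dt', 'of': 'in'},
#     ...                backoff=tag.Default('nn'))
#     >>> t.tag_one('the')
#     'dt'
#     >>> t.tag_one('dog')
#     'nn'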

##//////////////////////////////////////////////////////
## Demonstration
##//////////////////////////////////////////////////////

def _demo_tagger(tagger, gold):
    from nltk_lite.tag import accuracy
    acc = accuracy(tagger, gold)
    print 'Accuracy = %4.1f%%' % (100.0 * acc)

def demo():
    """
    A simple demonstration function for the C{Tagger} classes.  It
    constructs a default tagger, plus unigram, affix, regexp, and
    lookup taggers that back off to it, trains the trainable taggers
    on section 'a' of the Brown corpus, and evaluates all five on
    section 'b'.
    """
    from nltk_lite.corpora import brown
    from nltk_lite import tag
    import sys

    print 'Training taggers.'

    # Create a default tagger
    t0 = tag.Default('nn')

    t1 = tag.Unigram(cutoff=1, backoff=t0)
    t1.train(brown.tagged('a'), verbose=True)

    t2 = tag.Affix(-3, 5, cutoff=2, backoff=t0)
    t2.train(brown.tagged('a'), verbose=True)

    t3 = tag.Regexp([(r'.*ed', 'vbd')], backoff=t0)  # no training

    t4 = tag.Lookup({'the': 'dt'}, backoff=t0)

    print '=' * 75
    print 'Running the taggers on test data...'
    print '  Default (nn) tagger: ',
    sys.stdout.flush()
    _demo_tagger(t0, brown.tagged('b'))

    print '  Unigram tagger:      ',
    sys.stdout.flush()
    _demo_tagger(t1, list(brown.tagged('b'))[:1000])

    print '  Affix tagger:        ',
    sys.stdout.flush()
    _demo_tagger(t2, list(brown.tagged('b'))[:1000])

    print '  Regexp tagger:       ',
    sys.stdout.flush()
    _demo_tagger(t3, list(brown.tagged('b'))[:1000])

    print '  Lookup tagger:       ',
    sys.stdout.flush()
    _demo_tagger(t4, list(brown.tagged('b'))[:1000])

if __name__ == '__main__':
    demo()