
Source Code for Module nltk_lite.tokenize.regexp

# Natural Language Toolkit: Tokenizers
#
# Copyright (C) 2001-2007 University of Pennsylvania
# Author: Edward Loper <edloper@gradient.cis.upenn.edu>
#         Steven Bird <sb@csse.unimelb.edu.au>
#         Trevor Cohn <tacohn@csse.unimelb.edu.au>
# URL: <http://nltk.sourceforge.net>
# For license information, see LICENSE.TXT

"""
Functions for tokenizing a text, based on a regular expression
which matches tokens or gaps.
"""

import re, sre_parse, sre_constants, sre_compile

WHITESPACE = r'\s+'
NEWLINE    = r'\n'
BLANKLINE  = r'\s*\n\s*\n\s*'
WORD       = r'\w+'
WORDPUNCT  = r'[a-zA-Z]+|[^a-zA-Z\s]+'
SHOEBOXSEP = r'^\\'
TREEBANK   = r'^\(.*?(?=^\(|\Z)'
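
# Illustration (not part of the original module): what a couple of these
# patterns match when used directly with the re module.  The outputs below
# are what re.findall should return under Python 2; they are shown only as
# a sketch.
#
#     >>> re.findall(WORDPUNCT, "She said 'hello.'")
#     ['She', 'said', "'", 'hello', ".'"]
#     >>> re.findall(BLANKLINE, 'para one\n\npara two')
#     ['\n\n']
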
def _remove_group_identifiers(parsed_re):
    """
    Modifies the given parsed regular expression, replacing all groupings
    (as indicated by parentheses in the regular expression string) with
    non-grouping variants (indicated with '(?:...)'). This works on the
    output of sre_parse.parse, setting the group identifier in
    SUBPATTERN structures to None.

    @param parsed_re: the output of sre_parse.parse(string)
    @type parsed_re: C{SubPattern}
    """
    if isinstance(parsed_re, sre_parse.SubPattern):
        # If it's a SubPattern, replace each item with its processed
        # equivalent. These classes are mutable, so that in-place
        # modification is allowed.
        for i in range(len(parsed_re)):
            parsed_re[i] = _remove_group_identifiers(parsed_re[i])
        return parsed_re
    elif isinstance(parsed_re, list) or isinstance(parsed_re, tuple):
        # Otherwise, if it's a sequence, check for the tell-tale
        # SUBPATTERN item and repair the sub item if needed
        to_process = list(parsed_re)
        if to_process[0] == sre_constants.SUBPATTERN:
            # replace next int with None
            sub_item = list(to_process[1])
            sub_item[0] = None
            to_process[1] = tuple(sub_item)

        # Process each item, in the case of nested SUBPATTERNS
        processed = map(_remove_group_identifiers, to_process)

        # Coerce back into the original type
        if isinstance(parsed_re, list):
            return processed
        else:
            return tuple(processed)
    else:
        # Don't need to do anything to other types
        return parsed_re
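
# Illustration (not part of the original module): after this pass, a
# pattern parsed from r'(\w+)-(\w+)' behaves as if it had been written
# r'(?:\w+)-(?:\w+)': the parentheses still delimit the subexpressions
# but no longer capture.  This sketch assumes the Python 2 sre_parse
# layout used above, where each SUBPATTERN entry is a (group, pattern)
# pair.
#
#     >>> parsed = sre_parse.parse(r'(\w+)-(\w+)')
#     >>> parsed = _remove_group_identifiers(parsed)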

# Replace any grouping parentheses with non-grouping ones. We
# need to do this, because the list returned by re.split will
# contain an element corresponding to every set of grouping
# parentheses. We must not touch escaped parentheses, and
# need to handle the case of escaped escapes (e.g. "\\(").
# We also need to handle nested parentheses, which means our
# regexp contexts must be zero-width. There are also issues with
# parentheses appearing in bracketed contexts, hence we
# operate on the intermediate parse structure from sre_parse.

def _compile(regexp):
    """
    Compile C{regexp}, replacing any capturing groups with non-capturing
    equivalents and wrapping the whole pattern in a single capturing group.
    """
    parsed = sre_parse.parse(regexp)
    parsed = _remove_group_identifiers(parsed)

    # Add grouping parentheses around the regexp; this will allow
    # us to access the material that was split on.
    # Need to set the Pattern to expect a single group

    pattern = sre_parse.Pattern()
    pattern.groups += 1
    grouped = sre_parse.SubPattern(pattern)
    grouped.append((sre_constants.SUBPATTERN, (1, parsed)))

    return sre_compile.compile(grouped, re.UNICODE | re.MULTILINE | re.DOTALL)
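
# Illustration (not part of the original module): _compile(r'a(b)c')
# should behave like re.compile(r'(a(?:b)c)', re.UNICODE | re.MULTILINE
# | re.DOTALL): the caller's groups are neutralised and the whole
# pattern becomes group 1.  A sketch, assuming the Python 2 sre
# internals used above:
#
#     >>> _compile(r'a(b)c').match('abc').groups()
#     ('abc',)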

def token_split(text, pattern, advanced=False):
    """
    @return: An iterator that generates tokens and the gaps between them,
        strictly alternating (gap, token, gap, token, ...), beginning with
        the possibly empty gap before the first match
    """

    if advanced:
        regex = _compile(pattern)    # pattern contains ()
    else:
        regex = re.compile(pattern, re.UNICODE | re.MULTILINE | re.DOTALL)

    # If it's a single string, then convert it to a tuple
    # (which we can iterate over, just like an iterator.)
    if isinstance(text, (str, unicode)):
        text = (text,)

    # Process each substring returned by the iterator, in turn.
    # "leftover" is used to record any leftover material when we
    # move on to a new substring.
    leftover = ''
    offset = 0
    for substring in text:
        position = 0  # The position within the substring

        # Handle any matching material at the start of the substring:
        match = regex.match(substring)
        if match:
            yield leftover+substring[position:match.start()]
            yield substring[match.start():match.end()]
            position = match.end()
            leftover = ''

        # Walk through the substring, looking for matches.
        while position < len(substring):
            match = regex.search(substring, position)
            if match:
                yield leftover+substring[position:match.start()]
                yield substring[match.start():match.end()]
                position = match.end()
                leftover = ''
            else:
                leftover = substring[position:]
                break

        # Update the offset
        offset += position

    # If the last string had leftover, then return it.
    if leftover:
        yield leftover
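
# Illustration (not part of the original module): token_split yields the
# gap before each match followed by the match itself, so callers can keep
# either stream.  Expected output under Python 2, shown as a sketch:
#
#     >>> list(token_split("Good muffins", WHITESPACE))
#     ['Good', ' ', 'muffins']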

def regexp(text, pattern, gaps=False, advanced=False):
    """
    Tokenize the text according to the regular expression pattern.

    @param text: the string or string iterator to be tokenized
    @type text: C{string} or C{iter(string)}
    @param pattern: the regular expression
    @type pattern: C{string}
    @param gaps: set to True if the pattern matches material between tokens
    @type gaps: C{boolean}
    @param advanced: set to True if the pattern is complex, making use of () groups
    @type advanced: C{boolean}
    @return: An iterator over tokens
    """

    for (i, token) in enumerate(token_split(text, pattern, advanced)):
        if (i % 2 == 0) == gaps and token != '':
            yield token
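
# Illustration (not part of the original module): with gaps=False the
# matches themselves are the tokens; with gaps=True the pattern is treated
# as a separator and the material between matches is returned.  Expected
# output under Python 2, shown as a sketch:
#
#     >>> list(regexp("Hello, world!", pattern=r'\w+'))
#     ['Hello', 'world']
#     >>> list(regexp("Hello, world!", pattern=r'\W+', gaps=True))
#     ['Hello', 'world']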

def whitespace(s):
    """
    Tokenize the text at whitespace.

    @param s: the string or string iterator to be tokenized
    @type s: C{string} or C{iter(string)}
    @return: An iterator over tokens
    """
    return regexp(s, pattern=WHITESPACE, gaps=True)
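
# Illustration (not part of the original module), using part of the sample
# sentence from demo() below; expected output under Python 2:
#
#     >>> list(whitespace("Good muffins cost $3.88\nin New York."))
#     ['Good', 'muffins', 'cost', '$3.88', 'in', 'New', 'York.']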

def line(s):
    """
    Tokenize the text into lines.

    @param s: the string or string iterator to be tokenized
    @type s: C{string} or C{iter(string)}
    @return: An iterator over tokens
    """
    return regexp(s, pattern=NEWLINE, gaps=True)
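
# Illustration (not part of the original module); expected output under
# Python 2:
#
#     >>> list(line("one\ntwo\nthree"))
#     ['one', 'two', 'three']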

def blankline(s):
    """
    Tokenize the text into paragraphs (separated by blank lines).

    @param s: the string or string iterator to be tokenized
    @type s: C{string} or C{iter(string)}
    @return: An iterator over tokens
    """
    return regexp(s, pattern=BLANKLINE, gaps=True)
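
# Illustration (not part of the original module); expected output under
# Python 2:
#
#     >>> list(blankline("Para one.\n\nPara two."))
#     ['Para one.', 'Para two.']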

def word(s):
    """
    Tokenize the text into sequences of word characters (letters, digits
    and underscore).

    @param s: the string or string iterator to be tokenized
    @type s: C{string} or C{iter(string)}
    @return: An iterator over tokens
    """
    return regexp(s, pattern=WORD)
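
# Illustration (not part of the original module); expected output under
# Python 2 ('$' and '.' are not word characters):
#
#     >>> list(word("Good muffins cost $3.88"))
#     ['Good', 'muffins', 'cost', '3', '88']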

def wordpunct(s):
    """
    Tokenize the text into sequences of alphabetic and non-alphabetic
    characters. E.g. "She said 'hello.'" would be tokenized to
    ["She", "said", "'", "hello", ".'"]

    @param s: the string or string iterator to be tokenized
    @type s: C{string} or C{iter(string)}
    @return: An iterator over tokens
    """
    return regexp(s, pattern=WORDPUNCT)

def shoebox(s):
    """
    Tokenize a Shoebox entry into its fields (separated by backslash markers).

    @param s: the string or string iterator to be tokenized
    @type s: C{string} or C{iter(string)}
    @return: An iterator over tokens
    """
    return regexp(s, pattern=SHOEBOXSEP, gaps=True)
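
# Illustration (not part of the original module): a Shoebox/Toolbox entry
# marks each field with a backslash code at the start of a line.  The
# field values below are made up for the sketch; expected output under
# Python 2:
#
#     >>> list(shoebox("\\lx kaa\n\\ps N\n"))
#     ['lx kaa\n', 'ps N\n']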

def treebank(s):
    """
    Tokenize a Treebank file into its tree strings.

    @param s: the string or string iterator to be tokenized
    @type s: C{string} or C{iter(string)}
    @return: An iterator over tokens
    """
    return regexp(s, pattern=TREEBANK, advanced=True)
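
# Illustration (not part of the original module): TREEBANK matches from an
# opening parenthesis at the start of a line up to (but not including) the
# next such parenthesis or the end of the text, so a file holding several
# bracketed parses yields one string per tree.  advanced=True is used, as
# in the module itself, because the pattern contains parentheses.  A
# sketch, with made-up trees, under Python 2:
#
#     >>> trees = "(S (NP I) (VP saw))\n(S (NP you))\n"
#     >>> [t.strip() for t in treebank(trees)]
#     ['(S (NP I) (VP saw))', '(S (NP you))']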

##//////////////////////////////////////////////////////
##  Demonstration
##//////////////////////////////////////////////////////

def _display(tokens):
    """
    A helper function for L{demo} that displays a list of tokens.
    """

    str = ' '+`list(tokens)`+' '    # an indented string representation
    str = re.sub(r"(.{,70})\s", r'\1\n ', str).rstrip()    # wrap at 70 characters

    # Truncate after three lines:
    str = re.sub(r'(.+\n.+\n.+)\s\S+\n[\s\S]+(?!$)', r'\1 ...]', str)

    print str

def demo():
    """
    A demonstration that shows the output of several different
    tokenizers on the same string.
    """

    from nltk_lite import tokenize

    # Define the test string.
    s = "Good muffins cost $3.88\nin New York. Please buy me\ntwo of them.\n\nThanks."
    print 'Input text:'
    print `s`
    print
    print 'Tokenize using whitespace:'
    _display(tokenize.whitespace(s))
    print
    print 'Tokenize sequences of alphanumeric characters:'
    _display(tokenize.regexp(s, pattern=r'\w+', gaps=False))
    print
    print 'Tokenize sequences of letters and sequences of nonletters:'
    _display(tokenize.wordpunct(s))
    print
    print 'Tokenize by lines:'
    _display(tokenize.line(s))
    print
    print 'Tokenize by blank lines:'
    _display(tokenize.blankline(s))
    print
    print 'A simple sentence tokenizer:'
    _display(tokenize.regexp(s, pattern=r'\.(\s+|$)', gaps=True))
    print

if __name__ == '__main__':
    demo()