
Source Code for Module nltk_lite.contrib.concord

# Natural Language Toolkit: Concordance System
#
# Copyright (C) 2005 University of Melbourne
# Author: Peter Spiller
# URL: <http://nltk.sf.net>
# For license information, see LICENSE.TXT

from nltk_lite.corpora import brown
from math import *
import re, string
from nltk_lite.probability import *

class SentencesIndex(object):
    """Class implementing an index of a collection of sentences.

    Given a list of sentences, where each sentence is a list of words,
    this class generates an index of the list. Each word should be a (word,
    POS tag) pair. The index is stored as a dictionary, with the hashable
    items as keys and a list of (sentence number, word number) tuples as
    values. This class also generates a list of sentence lengths.
    """

    def __init__(self, sentences):
        """ Constructor. Takes the list of sentences to index.

        @type sentences: list
        @param sentences: List of sentences to index. Sentences should be
            lists of (string, string) pairs.
        """
        sentenceCount = 0
        self.index = {}
        self.lengths = []

        # for each sentence:
        for sentence in sentences:
            # add the sentence's length to the list of sentence lengths
            self.lengths.append(len(sentence))
            wordCount = 0
            for word in sentence:
                self.index[word] = self.index.get(word, []) + [(sentenceCount, wordCount)]
                wordCount += 1
            sentenceCount += 1

    def getIndex(self):
        """ Returns the index dictionary.

        @rtype: dictionary
        @returns: The dictionary containing the index.
        """
        return self.index

    def getSentenceLengths(self):
        """ Returns the list of sentence lengths.

        Element 0 is the length of the first sentence, element 1 the second,
        etc.

        @rtype: list
        @returns: List of lengths of sentences.
        """
        return self.lengths
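
# A minimal usage sketch (not part of the original module): build an index
# over two tagged sentences and look a word up. The toy sentences are
# invented for illustration.
#
#     sents = [[("the", "at"), ("dog", "nn")],
#              [("the", "at"), ("cat", "nn")]]
#     idx = SentencesIndex(sents)
#     idx.getIndex()[("the", "at")]    # -> [(0, 0), (1, 0)]
#     idx.getSentenceLengths()         # -> [2, 2]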

class IndexConcordance(object):
    """ Class that generates concordances from a list of sentences.

    Uses an index for efficiency. If a SentencesIndex object is provided,
    it will be used; otherwise one will be constructed from the list of
    sentences. When generating a concordance, the supplied regular
    expression is used to filter the list of words in the index. Any that
    match are looked up in the index, and their lists of (sentence number,
    word number) pairs are used to extract the correct amount of context
    from the sentences.

    Although this class also allows regular expressions to be specified for
    the left and right context, they are not used on the index. If only
    left/right regexps are provided, the class will essentially generate a
    concordance for every word in the corpus, then filter it with the
    regexps. This is not very efficient and requires a very large amount of
    memory.

    @cvar SORT_WORD: Constant for sorting by target word.
    @cvar SORT_POS: Constant for sorting by target word's POS tag.
    @cvar SORT_NUM: Constant for sorting by sentence number.
    @cvar SORT_RIGHT_CONTEXT: Constant for sorting by the first word of the
        right context.
    """

    # constants for different types of sort
    SORT_WORD = 0
    SORT_POS = 1
    SORT_NUM = 2
    SORT_RIGHT_CONTEXT = 3

    def __init__(self, sentences, index=None):
        """ Constructor.

        @type sentences: list
        @param sentences: List of sentences to create a concordance for.
            Sentences should be lists of (string, string) pairs.
        @type index: SentencesIndex
        @param index: SentencesIndex object to use as an index. If this is
            not provided, one will be generated.
        """
        self.sentences = sentences
        self.index = index
        # generate an index if one wasn't provided
        if self.index is None:
            self.index = SentencesIndex(self.sentences)

    def formatted(self, leftRegexp=None, middleRegexp=".*", rightRegexp=None,
                  leftContextLength=3, rightContextLength=3, contextInSentences=False,
                  contextChars=50, maxKeyLength=0, showWord=True,
                  sort=0, showPOS=True, flipWordAndPOS=False, verbose=False):
        """Generates and displays keyword-in-context formatted concordance data.

        This is a convenience method that combines raw() and format()'s
        options. Unless you need raw output, this is probably the most
        useful method.

        @type leftRegexp: string
        @param leftRegexp: Regular expression applied to the left context
            to filter output. Defaults to None.
        @type middleRegexp: string
        @param middleRegexp: Regular expression applied to the target word
            to filter output. Defaults to ".*" (ie everything).
        @type rightRegexp: string
        @param rightRegexp: Regular expression applied to the right context
            to filter output. Defaults to None.
        @type leftContextLength: number
        @param leftContextLength: Length of left context. Defaults to 3.
        @type rightContextLength: number
        @param rightContextLength: Length of right context. Defaults to 3.
        @type contextInSentences: boolean
        @param contextInSentences: Determines whether the context length
            arguments are in words or sentences. If false, the context
            lengths are in words - a rightContextLength argument of 2
            results in two words of right context. If true, a
            rightContextLength argument of 2 results in a right context
            consisting of the portion of the target word's sentence to the
            right of the target, plus the two sentences to the right of that
            sentence. Defaults to False.
        @type contextChars: number
        @param contextChars: Amount of context to show. If set to less than
            0, does not limit the amount of context shown (may look ugly).
            Defaults to 50.
        @type maxKeyLength: number
        @param maxKeyLength: Max number of characters to show for the target
            word. If 0 or less, this value is calculated so as to fully show
            all target words. Defaults to 0.
        @type showWord: boolean
        @param showWord: Whether to show words. Defaults to True.
        @type sort: integer
        @param sort: Should be set to one of the provided SORT constants. If
            SORT_WORD, the output is sorted on the target word. If SORT_POS,
            the output is sorted on the target word's POS tag. If SORT_NUM,
            the output is sorted by sentence number. If SORT_RIGHT_CONTEXT,
            the output is sorted on the first word of the right context.
            Defaults to SORT_WORD.
        @type showPOS: boolean
        @param showPOS: Whether to show POS tags. Defaults to True.
        @type flipWordAndPOS: boolean
        @param flipWordAndPOS: If true, displays POS tags first instead of
            words (ie prints 'cc/and' instead of 'and/cc'). Defaults to
            False.
        @type verbose: boolean
        @param verbose: Displays some extra status information. Defaults
            to False.
        """
        self.format(self.raw(leftRegexp, middleRegexp, rightRegexp, leftContextLength,
                             rightContextLength, contextInSentences, sort, verbose),
                    contextChars, maxKeyLength, showWord, showPOS, flipWordAndPOS, verbose)
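
    # Illustrative call (a sketch; the 'ic' variable is invented, the corpus
    # reader is the one imported above): show every "must" tagged "md" in
    # Brown section 'a', sorted by the first word of the right context.
    #
    #     ic = IndexConcordance(list(brown.tagged('a')))
    #     ic.formatted(middleRegexp="^must/md$",
    #                  sort=IndexConcordance.SORT_RIGHT_CONTEXT)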

    def raw(self, leftRegexp=None, middleRegexp=".*", rightRegexp=None,
            leftContextLength=3, rightContextLength=3, contextInSentences=False,
            sort=0, verbose=False):
        """ Generates and returns raw concordance data.

        Regular expressions supplied are evaluated over the appropriate part
        of each line of the concordance. For the purposes of evaluating the
        regexps, the lists of (word, POS tag) tuples are flattened into a
        space-separated list of word/POS tokens (ie the word followed by '/'
        followed by the POS tag). A regexp like '^must/.*' matches the word
        'must' with any POS tag, while one like '.*/nn$' matches any word
        with a POS tag of 'nn'. All regexps are evaluated over lowercase
        versions of the text.

        @type leftRegexp: string
        @param leftRegexp: Regular expression applied to the left context
            to filter output. Defaults to None.
        @type middleRegexp: string
        @param middleRegexp: Regular expression applied to the target word
            to filter output. Defaults to ".*" (ie everything).
        @type rightRegexp: string
        @param rightRegexp: Regular expression applied to the right context
            to filter output. Defaults to None.
        @type leftContextLength: number
        @param leftContextLength: Length of left context. Defaults to 3.
        @type rightContextLength: number
        @param rightContextLength: Length of right context. Defaults to 3.
        @type contextInSentences: boolean
        @param contextInSentences: Determines whether the context length
            arguments are in words or sentences. If false, the context
            lengths are in words - a rightContextLength argument of 2
            results in two words of right context. If true, a
            rightContextLength argument of 2 results in a right context
            consisting of the portion of the target word's sentence to the
            right of the target, plus the two sentences to the right of that
            sentence. Defaults to False.
        @type sort: integer
        @param sort: Should be set to one of the provided SORT constants. If
            SORT_WORD, the output is sorted on the target word. If SORT_POS,
            the output is sorted on the target word's POS tag. If SORT_NUM,
            the output is sorted by sentence number. If SORT_RIGHT_CONTEXT,
            the output is sorted on the first word of the right context.
            Defaults to SORT_WORD.
        @type verbose: boolean
        @param verbose: Displays some extra status information. Defaults
            to False.
        @rtype: list
        @return: Raw concordance output. Returned as a list of
            ([left context], target word, [right context], target word
            sentence number) tuples.
        """
        # compile the middle regexp
        reg = re.compile(middleRegexp)

        if verbose:
            print "Matching the following target words:"
        wordLocs = []
        # get the list of (sentence, word) pairs to get context for
        for item in self.index.getIndex().iteritems():
            if reg.match("/".join([item[0][0].lower(), item[0][1]])):
                if verbose:
                    print "/".join(item[0])
                wordLocs.append(item[1])
        if verbose:
            print ""

        items = []
        # if context lengths are specified in words:
        if not contextInSentences:
            # for each list of (sentence, word offset in sentence) pairs:
            for wordList in wordLocs:
                # for each (sentence, word offset in sentence) pair:
                for sentenceNum, offset in wordList:
                    # set pointers to the left- and rightmost sentences to be
                    # looked at to the sentence the target word is in
                    leftCorpusIndex = sentenceNum
                    rightCorpusIndex = sentenceNum
                    # the number of words to include in the left context is
                    # initially everything in the sentence up to the target
                    leftLength = offset
                    # the number of words to include in the right context is
                    # initially everything in the sentence after the target
                    rightLength = self.index.getSentenceLengths()[sentenceNum] - offset - 1

                    # while the length of the left context is less than what
                    # we need, keep decreasing the left corpus index (ie
                    # adding sentences to the left context)
                    while leftLength < leftContextLength:
                        leftCorpusIndex -= 1
                        # if the new corpus index would fall off the end of
                        # the list, stop at 0
                        if leftCorpusIndex < 0:
                            leftCorpusIndex = 0
                            break
                        # adjust length and offset
                        leftLength += self.index.getSentenceLengths()[leftCorpusIndex]
                        offset += self.index.getSentenceLengths()[leftCorpusIndex]

                    # while the length of the right context is less than what
                    # we need, keep increasing the right corpus index (ie
                    # adding sentences to the right context)
                    while rightLength < rightContextLength:
                        rightCorpusIndex += 1
                        try:
                            rightLength += self.index.getSentenceLengths()[rightCorpusIndex]
                        # if the new corpus index falls off the end of the
                        # list, stop at the end
                        except IndexError:
                            rightCorpusIndex -= 1
                            break

                    # grab all sentences from the left to the right corpus
                    # index, then flatten them into a single list of words
                    sents = self.sentences[leftCorpusIndex:rightCorpusIndex+1]
                    words = []
                    for sentence in sents:
                        for word in sentence:
                            words.append(word)

                    # select the appropriate sections of context from the
                    # list of words (the left boundary is clamped at 0 so a
                    # target near the start of the corpus doesn't wrap the
                    # slice around)
                    left = words[max(offset-leftContextLength, 0):offset]
                    target = words[offset]
                    right = words[offset+1:offset+1+rightContextLength]
                    items.append((left, target, right, sentenceNum))
        # if context lengths are specified in sentences:
        else:
            # for each list of (sentence, word offset in sentence) pairs:
            for wordList in wordLocs:
                # for each (sentence, word offset in sentence) pair:
                for sentenceNum, offset in wordList:
                    # set pointers to the left- and rightmost sentences to be
                    # looked at to the sentence the target word is in
                    leftCorpusIndex = sentenceNum
                    rightCorpusIndex = sentenceNum
                    # the number of words to include in the left context is
                    # initially everything in the sentence up to the target
                    leftLength = offset
                    # the number of words to include in the right context is
                    # initially everything in the sentence after the target
                    rightLength = self.index.getSentenceLengths()[sentenceNum] - offset - 1
                    # keep track of the number of sentences included in the
                    # left/right context
                    leftSents = 0
                    rightSents = 0

                    # while we don't have enough sentences in the left
                    # context, keep decreasing the left corpus index
                    while leftSents < leftContextLength:
                        leftCorpusIndex -= 1
                        # if the new corpus index would fall off the end of
                        # the list, stop at 0
                        if leftCorpusIndex < 0:
                            leftCorpusIndex = 0
                            break
                        leftLength += self.index.getSentenceLengths()[leftCorpusIndex]
                        offset += self.index.getSentenceLengths()[leftCorpusIndex]
                        leftSents += 1

                    # while we don't have enough sentences in the right
                    # context, keep increasing the right corpus index
                    while rightSents < rightContextLength:
                        rightCorpusIndex += 1
                        try:
                            rightLength += self.index.getSentenceLengths()[rightCorpusIndex]
                            rightSents += 1
                        # if the new corpus index falls off the end of the
                        # list, stop at the end
                        except IndexError:
                            rightCorpusIndex -= 1
                            break

                    # grab all sentences from the left to the right corpus
                    # index, then flatten them into a single list of words
                    sents = self.sentences[leftCorpusIndex:rightCorpusIndex+1]
                    words = []
                    for sentence in sents:
                        for word in sentence:
                            words.append(word)

                    # select the appropriate sections of context from the
                    # list of words
                    left = words[0:offset]
                    target = words[offset]
                    right = words[offset+1:]
                    items.append((left, target, right, sentenceNum))

        if verbose:
            print "Found %d matches for target word..." % len(items)

        # sort the concordance
        if sort == self.SORT_WORD:
            if verbose:
                print "Sorting by target word..."
            items.sort(key=lambda i: i[1][0].lower())
        elif sort == self.SORT_POS:
            if verbose:
                print "Sorting by target word POS tag..."
            items.sort(key=lambda i: i[1][1].lower())
        elif sort == self.SORT_NUM:
            if verbose:
                print "Sorting by sentence number..."
            items.sort(key=lambda i: i[3])
        elif sort == self.SORT_RIGHT_CONTEXT:
            if verbose:
                print "Sorting by first word of right context..."
            items.sort(key=lambda i: i[2][0][0])

        # if any regular expressions have been given for the context, filter
        # the concordance using them
        filtered = []
        filterBool = leftRegexp is not None or rightRegexp is not None
        if filterBool:
            leftRe = None
            rightRe = None
            if leftRegexp is not None:
                if verbose:
                    print "Filtering on left context..."
                leftRe = re.compile(leftRegexp)
            if rightRegexp is not None:
                if verbose:
                    print "Filtering on right context..."
                rightRe = re.compile(rightRegexp)

            for item in items:
                if self._matches(item, leftRe, rightRe):
                    filtered.append(item)

        if filterBool:
            source = filtered
        else:
            source = items

        return source
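
    # Shape of the return value (an illustrative sketch): each element is
    #     ([(word, tag), ...], (word, tag), [(word, tag), ...], sentNum)
    # so a line for "must" might look like (values invented):
    #     ([('we', 'ppss')], ('must', 'md'), [('be', 'be')], 42)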

    def format(self, source, contextChars=55, maxKeyLength=0, showWord=True,
               showPOS=True, flipWordAndPOS=False, verbose=False):
        """Formats raw concordance output produced by raw().

        Displays a concordance in keyword-in-context style format.

        @type source: list
        @param source: Raw concordance output to format. Expects a list of
            ([left context], target word, [right context], target word
            sentence number) tuples.
        @type contextChars: number
        @param contextChars: Amount of context to show. If set to less than
            0, does not limit the amount of context shown (may look ugly).
            Defaults to 55.
        @type maxKeyLength: number
        @param maxKeyLength: Max number of characters to show for the target
            word. If 0 or less, this value is calculated so as to fully show
            all target words. Defaults to 0.
        @type showWord: boolean
        @param showWord: Whether to show words. Defaults to True.
        @type showPOS: boolean
        @param showPOS: Whether to show POS tags. Defaults to True.
        @type flipWordAndPOS: boolean
        @param flipWordAndPOS: If true, displays POS tags first instead of
            words (ie prints 'cc/and' instead of 'and/cc'). Defaults to
            False.
        @type verbose: boolean
        @param verbose: Displays some extra status information. Defaults
            to False.
        """
        # flatten lists of tokens into strings
        lines = []
        maxMiddleLength = -1

        # generate an intermediate list of string tuples
        for line in source:
            # flatten left context tokens into a single string, joining
            # words and their POS tag with a '/' (if both are shown)
            left = ""
            for item in line[0]:
                if item[0] == "" and item[1] == "":
                    left = ""
                elif showWord and not showPOS:
                    left += item[0] + " "
                elif not showWord and showPOS:
                    left += item[1] + " "
                elif flipWordAndPOS:
                    left += item[1] + "/" + item[0] + " "
                else:
                    left += "/".join(item) + " "

            # flatten the target word into a single string, joining the word
            # and its POS tag with a '/' (if both are shown)
            if showWord and not showPOS:
                middle = line[1][0]
            elif not showWord and showPOS:
                middle = line[1][1]
            elif flipWordAndPOS:
                middle = line[1][1] + "/" + line[1][0]
            else:
                middle = "/".join(line[1])

            if len(middle) > maxMiddleLength:
                maxMiddleLength = len(middle)

            # flatten right context tokens into a single string, joining
            # words and their POS tag with a '/' (if both are shown)
            right = ""
            for item in line[2]:
                if item[0] == "" and item[1] == "":
                    right = ""
                elif showWord and not showPOS:
                    right += item[0] + " "
                elif not showWord and showPOS:
                    right += item[1] + " "
                elif flipWordAndPOS:
                    right += item[1] + "/" + item[0] + " "
                else:
                    right += "/".join(item) + " "

            num = line[3]
            lines.append((middle, left, right, num))

        # crop and justify strings to generate KWIC-format output
        count = 0
        for middle, left, right, num in lines:
            # calculate the amount of left padding needed
            leftPaddingLength = contextChars - len(left)
            if leftPaddingLength < 0:
                leftPaddingLength = 0
            if len(left) > contextChars and contextChars > -1:
                left = left[-contextChars:]
            left = " "*leftPaddingLength + left
            if contextChars > -1:
                right = right[0:contextChars]

            # add sentence numbers
            left = str(num) + ": " + left[len(str(num))+2:]

            # calculate the amount of middle padding needed
            if maxKeyLength > 0:
                maxMiddleLength = maxKeyLength
            lPad = int(ceil(max(maxMiddleLength - len(middle), 0) / 2.0))
            rPad = int(floor(max(maxMiddleLength - len(middle), 0) / 2.0))
            middle = " "*lPad + middle + " "*rPad

            print left + "| " + middle + " | " + right
            count += 1

        if verbose:
            print "\n" + repr(count) + " lines"

    def _matches(self, item, leftRe, rightRe):
        """ Private method that runs the given regexps over a raw concordance
        item and returns whether they match it.
        """
        left = item[0]
        right = item[2]

        # flatten the left and right contexts, lowercasing them so that
        # regexps are evaluated over lowercase text (as documented in raw())
        leftString = ""
        for token in left:
            leftString += "/".join(token).lower() + " "
        rightString = ""
        for token in right:
            rightString += "/".join(token).lower() + " "

        # see if the regexps match
        ok = True
        if leftRe is not None and leftRe.match(leftString) is None:
            ok = False
        if rightRe is not None and rightRe.match(rightString) is None:
            ok = False
        return ok
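
# Filtering sketch (illustrative; 'ic' is assumed from the earlier sketch):
# because context regexps run over the flattened, lowercased
# "word/tag word/tag ..." strings, a right-context regexp can select on the
# tag of the first following word, e.g. lines where "must" is followed by a
# base-form verb:
#
#     ic.formatted(middleRegexp="^must/md$", rightContextLength=1,
#                  rightRegexp="^\w+/vb")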

class Aggregator(object):
    """ Class for aggregating and summarising corpus concordance data.

    This class allows one or more sets of concordance data to be summarised
    and displayed. This is useful for corpus linguistic tasks like counting
    the number of occurrences of a particular word and its different POS
    tags in a given corpus, or comparing these frequencies across different
    corpora. It creates a FreqDist for each set of concordance data,
    counting how often each unique entry appears in it.

    An example of how to use this class to show the frequency of the five
    most common digrams of the form "must/md X/Y" in the Brown Corpus
    sections a and g::

        concA = IndexConcordance(list(brown.tagged('a')))
        rawA = concA.raw(middleRegexp="^must/md$", leftContextLength=0, rightContextLength=1)
        concG = IndexConcordance(list(brown.tagged('g')))
        rawG = concG.raw(middleRegexp="^must/md$", leftContextLength=0, rightContextLength=1)
        agg = Aggregator()
        agg.add(rawA, "Brown Corpus A")
        agg.add(rawG, "Brown Corpus G")
        agg.formatted(showFirstX=5)

    Output::

        Brown Corpus A
        ------------------------------
        must/md be/be 17
        must/md have/hv 5
        must/md not/* 3
        must/md play/vb 2
        must/md ''/'' 1

        Brown Corpus G
        ------------------------------
        must/md be/be 38
        must/md have/hv 21
        must/md ,/, 6
        must/md not/* 5
        must/md always/rb 3
    """

    # text for the 'other' row in output tables
    _OTHER_TEXT = "<OTHER>"
    # text for the 'total' row in output tables
    _TOTAL_TEXT = "<TOTAL>"

    def __init__(self, inputList=None):
        """ Constructor.

        @type inputList: list
        @param inputList: List of (raw concordance data, name) tuples to be
            entered into the aggregator. Defaults to None.
        """
        self._outputSets = []
        if inputList is not None:
            for (item, n) in inputList:
                self.add(item, name=n)

    def add(self, raw, name):
        """ Adds the given set of raw concordance output to the aggregator.

        @type raw: list
        @param raw: Raw concordance data (produced by IndexConcordance.raw()).
            Expects a list of ([left context], target word,
            [right context], target word sentence number) tuples.
        @type name: string
        @param name: Name to associate with the set of data.
        """
        self._outputSets.append((raw, name))

    def remove(self, name):
        """ Removes all sets of raw concordance output with the given name.

        @type name: string
        @param name: Name of the data set to remove.
        """
        # rebuild the list rather than removing items while iterating over
        # it, which would skip entries
        self._outputSets = [item for item in self._outputSets if item[1] != name]

    def formatted(self, useWord=True, usePOS=True, normalise=False,
                  threshold=-1, showFirstX=-1, decimalPlaces=4,
                  countOther=False, showTotal=False):
        """ Displays formatted concordance summary information.

        This is a convenience method that combines raw() and format()'s
        options. Unless you need raw output, this is probably the most
        useful method.

        @type useWord: boolean
        @param useWord: Include the words in the count. Defaults to True.
        @type usePOS: boolean
        @param usePOS: Include the POS tags in the count. Defaults to True.
        @type normalise: boolean
        @param normalise: If true, normalises the frequencies for each set
            of concordance output by dividing each key's frequency by the
            total number of samples in that concordance's FreqDist. Allows
            easier comparison of results between data sets. Care must be
            taken when combining this option with the threshold option, as
            any threshold of 1 or more will prevent any output being
            displayed. Defaults to False.
        @type threshold: number
        @param threshold: Frequency display threshold. Results below this
            frequency will not be displayed. If less than 0, everything will
            be displayed. Defaults to -1.
        @type showFirstX: number
        @param showFirstX: Only show this many results, starting with the
            most frequent. If less than 0, everything will be displayed.
            Defaults to -1.
        @type decimalPlaces: integer
        @param decimalPlaces: Number of decimal places of accuracy to
            display. Used when displaying non-integers with the normalise
            option. Defaults to 4.
        @type countOther: boolean
        @param countOther: If true, any samples not shown (because their
            frequency is below the given threshold, or because they fall
            after the number of results specified by the showFirstX
            argument) will be combined into one sample. This sample's
            frequency is the sum of all unshown samples' frequencies.
            Defaults to False.
        @type showTotal: boolean
        @param showTotal: If true, prints the sum of all frequencies (of
            the entire FreqDist, not just of the samples displayed).
            Defaults to False.
        """
        output, maxKeyLength = self.raw(useWord, usePOS)
        self.format(output, maxKeyLength, threshold, showFirstX,
                    decimalPlaces, normalise, countOther, showTotal)

    def raw(self, useWord=True, usePOS=True):
        """ Generates raw summary information.

        Creates a FreqDist for each set of concordance output and uses it to
        count the frequency of each line in it. The concordance output is
        flattened from lists of tokens to strings, as lists cannot be
        hashed. The list of FreqDists is returned, as well as the length of
        the longest string (used for formatted display).

        @type useWord: boolean
        @param useWord: Include the words in the count. Defaults to True.
        @type usePOS: boolean
        @param usePOS: Include the POS tags in the count. Defaults to True.
        @rtype: list, number
        @returns: A list of (FreqDist, name) pairs, and the length of the
            longest key in all the FreqDists.
        """
        output = []
        maxKeyLength = 0

        # for each set of raw concordance data:
        for (rawConcOutput, name) in self._outputSets:
            # initialise a FreqDist
            dist = FreqDist()
            # for each item in the raw concordance output:
            for (left, middle, right, num) in rawConcOutput:
                # flatten the lists of tokens so they can be hashed in
                # the FreqDist
                leftList = []
                for word in left:
                    if useWord and not usePOS:
                        leftList.append(word[0].lower())
                    elif usePOS and not useWord:
                        leftList.append(word[1].lower())
                    else:
                        leftList.append(word[0].lower() + "/" + word[1].lower())

                try:
                    if useWord and not usePOS:
                        midString = middle[0].lower()
                    elif usePOS and not useWord:
                        midString = middle[1].lower()
                    else:
                        midString = middle[0].lower() + "/" + middle[1].lower()
                except IndexError:
                    midString = ""

                rightList = []
                for word in right:
                    if useWord and not usePOS:
                        rightList.append(word[0].lower())
                    elif usePOS and not useWord:
                        rightList.append(word[1].lower())
                    else:
                        rightList.append(word[0].lower() + "/" + word[1].lower())

                # join the tokens together to form a key string
                key = string.join(leftList) + " " + midString + " " + string.join(rightList)
                # keep track of the longest key length
                if len(key) > maxKeyLength:
                    maxKeyLength = len(key)
                # increment the FreqDist's count for this key
                dist.inc(key)

            # add this FreqDist and name to the output
            output.append((dist, name))

        # return the output and the maximum key length
        return output, maxKeyLength
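
    # Sketch of consuming raw() directly instead of going through format():
    # print the three most frequent keys of each distribution. Assumes an
    # 'agg' built as in the class docstring example, and the nltk_lite
    # FreqDist API used elsewhere in this module (sorted_samples(), count()).
    #
    #     dists, maxLen = agg.raw(usePOS=False)
    #     for dist, name in dists:
    #         for key in dist.sorted_samples()[:3]:
    #             print name, key, dist.count(key)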

    def format(self, output, maxKeyLength=20, threshold=-1, showFirstX=-1,
               decimalPlaces=4, normalise=False, countOther=False,
               showTotal=False):
        """ Displays concordance summary information.

        Formats and displays information produced by raw().

        @type output: list
        @param output: List of (FreqDist, name) pairs (as produced by raw()).
        @type maxKeyLength: number
        @param maxKeyLength: Length of the longest key. Defaults to 20.
        @type normalise: boolean
        @param normalise: If true, normalises the frequencies for each set
            of concordance output by dividing each key's frequency by the
            total number of samples in that concordance's FreqDist. Allows
            easier comparison of results between data sets. Care must be
            taken when combining this option with the threshold option, as
            any threshold of 1 or more will prevent any output being
            displayed. Defaults to False.
        @type threshold: number
        @param threshold: Frequency display threshold. Results below this
            frequency will not be displayed. If less than 0, everything will
            be displayed. Defaults to -1.
        @type showFirstX: number
        @param showFirstX: Only show this many results, starting with the
            most frequent. If less than 0, everything will be displayed.
            Defaults to -1.
        @type decimalPlaces: integer
        @param decimalPlaces: Number of decimal places of accuracy to
            display. Used when displaying non-integers with the normalise
            option. Defaults to 4.
        @type countOther: boolean
        @param countOther: If true, any samples not shown (because their
            frequency is below the given threshold, or because they fall
            after the number of results specified by the showFirstX
            argument) will be combined into one sample. This sample's
            frequency is the sum of all unshown samples' frequencies.
            Defaults to False.
        @type showTotal: boolean
        @param showTotal: If true, prints the sum of all frequencies (of
            the entire FreqDist, not just of the samples displayed).
            Defaults to False.
        """
        # for each FreqDist:
        for (dist, name) in output:
            x = 0
            other = 0
            total = 0
            print name
            print "-"*(maxKeyLength + 7)
            # for each key:
            for key in dist.sorted_samples():
                # get and format the sample's frequency
                if normalise:
                    count = 1.0 * dist.count(key) / dist.N()
                    countString = str(count)[0:decimalPlaces + 2]
                else:
                    count = dist.count(key)
                    countString = str(count)

                total += count

                # if the count is less than the threshold value, or we've
                # already shown X samples, add this sample's frequency to
                # the 'other' bin
                if count < threshold or (showFirstX > 0 and x >= showFirstX):
                    other += count
                else:
                    print key + " "*(maxKeyLength - len(key) + 1) + countString
                    x += 1

            if countOther:
                if normalise:
                    count = 1.0 * other
                    countString = str(count)[0:decimalPlaces + 2]
                else:
                    count = other
                    countString = str(count)
                print self._OTHER_TEXT + " "*(maxKeyLength - len(self._OTHER_TEXT) + 1) + countString
            if showTotal:
                if normalise:
                    count = 1.0 * total
                    countString = str(count)[0:decimalPlaces + 2]
                else:
                    count = total
                    countString = str(count)
                print self._TOTAL_TEXT + " "*(maxKeyLength - len(self._TOTAL_TEXT) + 1) + countString
            print ""

def demo():
    """
    Demonstrates how to use IndexConcordance and Aggregator.
    """
    print "Reading Brown Corpus into memory..."
    corpus = list(brown.tagged(('a','j')))
    print "Generating index..."
    ic = IndexConcordance(corpus)
    print "Showing all occurrences of 'plasma' in the Brown Corpus..."
    ic.formatted(middleRegexp="^plasma/.*", verbose=True)

    print "Investigating the collocates of 'deal' and derivatives..."
    agg = Aggregator()
    agg.add(ic.raw(middleRegexp="^deal", leftContextLength=1, rightContextLength=0,
                   leftRegexp="^(\w|\s|/)*$"), "Brown Corpus 'deal' left collocates")
    agg.add(ic.raw(middleRegexp="^deal", leftContextLength=0, rightContextLength=1,
                   rightRegexp="^(\w|\s|/)*$"), "Brown Corpus 'deal' right collocates")
    agg.formatted(showFirstX=5, usePOS=False)

if __name__ == '__main__':
    demo()