Package Martel :: Module Expression
[hide private]
[frames] | no frames]

Source Code for Module Martel.Expression

   1  # Copyright 2000-2001, Dalke Scientific Software, LLC 
   2  # Distributed under the Biopython License Agreement (see the LICENSE file). 
   3   
   4  """Classes for nodes in the Expression tree. 
   5   
   6  Expression 
   7   |--- Any           - match (or don't match) a set of characters 
   8   |--- AnyEol        - match any newline representation ("\n", "\r" or "\r\n") 
   9   |--- Assert        - used for positive and negative lookahead assertions  
  10   |--- AtBeginning   - match the beginning of a line 
  11   |--- AtEnd         - match the end of a line 
  12   |--- Debug         - print a debug message 
  13   |--- Dot           - match any character except newline 
  14   |--- Group         - give a group name to an expression 
  15   |--- GroupRef      - match a previously identified expression 
  16   |--- Literal       - match (or don't match) a single character 
  17   |--- MaxRepeat     - greedy repeat of an expression, within min/max bounds 
  18   |--- NullOp        - does nothing (useful as an initial seed) 
  19   |--- PassThrough   - used when overriding 'make_parser'; match its subexp 
  20   |      |--- FastFeature  - keeps information about possibly optional tags 
  21   |      |--- HeaderFooter - files with a header, records and a footer 
  22   |      `--- ParseRecords - parse a record at a time 
  23   |--- Str           - match a given string 
  24   `--- ExpressionList  - expressions containing several subexpressions 
  25          |--- Alt    - subexp1 or subexp2 or subexp3 or ... 
  26          `--- Seq    - subexp1 followed by subexp2 followed by subexp3 ... 
  27  """ 
  28  import re, string 
  29  from xml.sax import xmlreader 
  30  import msre_parse  # Modified version of Secret Labs' sre_parse 
  31  import Parser 
  32   
  33  MAXREPEAT = msre_parse.MAXREPEAT 
  34   
  35  try: 
  36      import IterParser 
  37  except SyntaxError: 
  38      IterParser = None 
  39   
class Expression:
    """Root class of the Expression tree.

    Provides the '+' (sequence) and '|' (alternation) operators and the
    default behaviour for a node without children or group names.
    """

    def __add__(self, other):
        """return a Seq matching this Expression followed by 'other'"""
        assert isinstance(other, Expression), "RHS of '+' not an Expression"
        pair = (self, other)
        return Seq(pair)

    def __or__(self, other):
        """return an Alt which tries this Expression and then 'other'"""
        assert isinstance(other, Expression), "RHS of '|' not an Expression"
        choices = (self, other)
        return Alt(choices)

    def group_names(self):
        """names of the groups used by this node and its children (none)"""
        return ()

    def _find_groups(self, tag):
        """groups matching 'tag'; a plain Expression has none"""
        return []

    def features(self):
        """all features below this node; a plain Expression has none"""
        return []

    def _select_names(self, names):
        """internal helper for 'select_names'.

        Do not call this directly; it may be removed in a later version.
        """
        # Subtrees may be shared between expressions, so callers have to
        # copy the tree before selecting names on it.
        return None

    def copy(self):
        """deep copy of this Expression tree (subclasses must implement)"""
        raise NotImplementedError

    def __str__(self):
        """the pattern string for this node (subclasses must implement)"""
        raise NotImplementedError

    def make_parser(self, debug_level = 0):
        """build a SAX compliant parser for this expression"""
        import Generate
        generated = Generate.generate(self, debug_level)
        tagtable, want_flg, attrlookup = generated
        parser_args = (want_flg, debug_level, attrlookup)
        return Parser.Parser(tagtable, parser_args)

    def make_iterator(self, tag = "record", debug_level = 0):
        """build an iterator over this expression; 'tag' names a record"""
        import Iterator
        return Iterator.Iterator(self.make_parser(debug_level), tag)

    def _modify_leaves(self, func):
        """internal hook for rewriting the leaves of an expression

        A leaf hands itself to 'func' and returns whatever comes back.
        This is a stand-in for a proper visitor and MAY CHANGE.
        """
        replacement = func(self)
        return replacement
98 99 # Any character in a given set: '[abc]'
class Any(Expression):
    """A character set such as '[abc]' or '[^abc]'."""

    def __init__(self, chars, invert = 0):
        """(chars, invert = 0)

        Match one character taken from the 'chars' string.  When
        'invert' is true, match one character which is NOT in 'chars'.
        """
        self.invert = invert
        self.chars = chars

    def __str__(self):
        """the corresponding pattern string"""
        members = _minimize_any_range(self.chars)
        if self.invert:
            return '[^%s]' % members
        return '[%s]' % members

    def copy(self):
        """do a deep copy on this Expression tree"""
        duplicate = Any(self.chars, self.invert)
        return duplicate
120 121 # Lookahead assertions: '(?=...)' 122 # '(?!...)'
class Assert(Expression):
    """Lookahead assertion: '(?=...)' or, when inverted, '(?!...)'."""

    def __init__(self, expression, invert = 0):
        """(expression, invert = 0)

        A non-consuming assertion using the given expression.
        The default is a positive lookahead, which matches if the expression
        matches at the current position, but does not affect the character
        position.

        If 'invert' is true, this is a negative lookahead assertion,
        and matches if the expression does not match.  Again, the character
        position is not affected.
        """
        self.expression = expression
        self.invert = invert

    def copy(self):
        """do a deep copy on this Expression tree"""
        return Assert(self.expression.copy(), self.invert)

    def __str__(self):
        """the corresponding pattern string"""
        # A true 'invert' selects the negative form '(?!...)'.
        if self.invert:
            return '(?!%s)' % str(self.expression)
        return '(?=%s)' % str(self.expression)

    def _modify_leaves(self, func):
        """apply 'func' to the leaves of the subexpression; returns self"""
        exp = self.expression._modify_leaves(func)
        assert exp is not None
        self.expression = exp
        return self
# At the beginning of the string: '^' in multiline mode
#
# There should be no reason to use this object because the Martel
# grammars all use explicit newlines anyway.
class AtBeginning(Expression):
    """Zero-width match at the start of a line (pattern '^')"""

    def __str__(self):
        """the corresponding pattern string"""
        return '^'

    def copy(self):
        """do a deep copy on this Expression tree"""
        # The node carries no state, so a fresh instance is a full copy.
        return AtBeginning()
# At the end of the string: '$' in multiline mode
#
# There should be no reason to use this object because the Martel
# grammars all use explicit newlines anyway.
class AtEnd(Expression):
    """Zero-width match at the end of a line (pattern '$')"""

    def __str__(self):
        """the corresponding pattern string"""
        return '$'

    def copy(self):
        """do a deep copy on this Expression tree"""
        # The node carries no state, so a fresh instance is a full copy.
        return AtEnd()
182 183 # Print a message when there is a match at this point. 184 # Helpful for debugging
class Debug(Expression):
    """Carries a debugging message; contributes nothing to the pattern."""

    def __init__(self, msg):
        """(msg) - remember the message associated with this node"""
        self.msg = msg

    def copy(self):
        """do a deep copy on this Expression"""
        message = self.msg
        return Debug(message)

    def __str__(self):
        """the corresponding pattern string (always empty)"""
        # A Debug node has no regular expression syntax of its own.
        return ""
194 195 # Any character except newline: '.'
class Dot(Expression):
    """Any single character other than a newline (pattern '.')"""

    def __str__(self):
        """the corresponding pattern string"""
        return '.'

    def copy(self):
        """do a deep copy on this Expression tree"""
        # The node carries no state, so a fresh instance is a full copy.
        return Dot()
204 205 # Read any one of the newline conventions
class AnyEol(Expression):
    r"""Match one newline in any convention: "\n", "\r" or "\r\n"."""

    def __str__(self):
        """the corresponding pattern string"""
        return r"(\n|\r\n?)"

    def copy(self):
        """do a deep copy on this Expression tree"""
        # The node carries no state, so a fresh instance is a full copy.
        return AnyEol()
214 215 # A grouping: '(?P<name>expression)' 216 # '(?:expression)' 217 # '(expression)' -- same as (?:...) since I don't track 218 # or use the group number 219 220 # Group names must be valid XML identifiers
221 -def _verify_name(s):
222 assert s, "Group name can not be the empty string" 223 if not msre_parse.isname(s): 224 raise AssertionError, "Illegal character in group name %s" % repr(s)
225 226 _fast_quote_lookup = None
227 -def _make_fast_lookup():
228 global _fast_quote_lookup 229 230 safe = ('ABCDEFGHIJKLMNOPQRSTUVWXYZ' 231 'abcdefghijklmnopqrstuvwxyz' 232 '0123456789' '_.-') 233 lookup = {} 234 for c in range(256): 235 lookup[chr(c)] = '%%%02X' % c 236 for c in safe: 237 lookup[c] = c 238 239 _fast_quote_lookup = lookup
240
def _quote(s):
    """Return 's' with every character replaced by its quoted form.

    The table is built on first use.  A false 's' (empty string or
    None) gives the empty string.
    """
    if _fast_quote_lookup is None:
        _make_fast_lookup()
    if not s:
        return ""
    table = _fast_quote_lookup
    return "".join([table[ch] for ch in s])
250
251 -def _make_group_pattern(name, expression, attrs):
252 if name is None: 253 return '(%s)' % str(expression) 254 255 elif attrs: 256 # Convert them to the proper URL-encoded form 257 terms = [] 258 for k, v in attrs.items(): 259 terms.append("%s=%s" % (_quote(k), _quote(v))) 260 attrname = name + "?" + string.join(terms, "&") 261 return '(?P<%s>%s)' % (attrname, str(expression)) 262 else: 263 return '(?P<%s>%s)' % (name, str(expression))
264
class Group(Expression):
    """A (possibly named) group: '(?P<name>expression)' or '(expression)'."""

    def __init__(self, name, expression, attrs = None):
        """(name, expression, attrs = None)

        Create a group called 'name' which matches the given expression.
        'name' may be None for an unnamed group.  'attrs' may be None, a
        dictionary, or an attributes object; None and dictionaries are
        wrapped in an xmlreader.AttributesImpl.
        """
        if name is not None:
            _verify_name(name)
        if attrs is None:
            attrs = xmlreader.AttributesImpl({})
        elif isinstance(attrs, type({})):
            attrs = xmlreader.AttributesImpl(attrs)
        self.name = name
        self.expression = expression
        self.attrs = attrs

    def group_names(self):
        """the list of group names used by this Expression and its children"""
        inherited = self.expression.group_names()
        if self.name is None or self.name in inherited:
            return inherited
        return (self.name, ) + inherited

    def _find_groups(self, tag):
        """return a list of all groups matching the given tag"""
        matches = []
        if self.name == tag:
            matches.append(self)
        below = self.expression._find_groups(tag)
        return matches + below

    def features(self):
        """return a list of all features"""
        return self.expression.features()

    def _select_names(self, names):
        """internal function: do not use"""
        unwanted = self.name is not None and self.name not in names
        if unwanted:
            # Turn this into an unnamed group and drop its attributes.
            self.name = None
            self.attrs = xmlreader.AttributesImpl({})
        self.expression._select_names(names)

    def _modify_leaves(self, func):
        """apply 'func' to the leaves of the subexpression; returns self"""
        changed = self.expression._modify_leaves(func)
        assert changed is not None
        self.expression = changed
        return self

    def copy(self):
        """do a deep copy on this Expression tree"""
        inner = self.expression.copy()
        return Group(self.name, inner, self.attrs.copy())

    def __str__(self):
        """the corresponding pattern string"""
        return _make_group_pattern(self.name, self.expression, self.attrs)
322 323 324 # group reference: '(?P<name>.)(?P=name)'
class GroupRef(Expression):
    """Back reference to a named group: '(?P=name)'."""

    def __init__(self, name):
        """(name)

        Match the same text previously found by the given named group.
        The name is checked with _verify_name.
        """
        _verify_name(name)
        self.name = name

    def __str__(self):
        """the corresponding pattern string"""
        return "(?P=%s)" % self.name

    def copy(self):
        """do a deep copy on this Expression tree"""
        referenced = self.name
        return GroupRef(referenced)
339 340 341 342 # A single character: 'a' 343 # This exists to simplify the sre conversion, but is not directly exposed 344 # as part of Martel's public expression API
class Literal(Expression):
    """A single character, or (inverted) anything but that character."""

    def __init__(self, char, invert = 0):
        """(char, invert = 0)

        Match 'char'.  When 'invert' is true, match one character which
        is not 'char'.
        """
        self.invert = invert
        self.char = char

    def __str__(self):
        """the corresponding pattern string"""
        escaped = escape(self.char)
        if not self.invert:
            return escaped
        return '[^%s]' % escaped

    def copy(self):
        """do a deep copy on this Expression tree"""
        return Literal(self.char, self.invert)
365 366 367 # Greedy repeat: 'a*' 368 # 'a{3,5}' 369 # 'a+'
class MaxRepeat(Expression):
    """Greedy repeat of an expression: 'a*', 'a+', 'a?', 'a{3,5}', ..."""

    def __init__(self, expression, min_count=0,
                 max_count=MAXREPEAT):
        r"""(expression, min_count = 0, max_count = MAXREPEAT)

        Match the expression at least 'min_count' times and no more
        than 'max_count' times.  If max_count == MAXREPEAT then
        there is no fixed upper limit.

        min_count and max_count can be strings, in which case they are
        used as "named group repeats."  That is, they are taken to be
        group names and used to find the repeat counts during
        evaluation time.  The current implementation only understands
        named group repeats when min_count == max_count.

        The grouping is greedy.

        WARNING: Nothing guards against a zero-sized match being
        repeated indefinitely, as with "(a?)*" against the string "b".
        This will loop forever.

        WARNING: The current implementation does not support
        backtracking in MaxRepeats, so ".*\n" will not match "\n".
        Use a more explicit construct instead, like "[^\n]*\n".
        """
        self.expression = expression

        # Range checks apply only to plain integer counts.
        int_type = type(0)
        min_is_int = type(min_count) == int_type
        max_is_int = type(max_count) == int_type
        if min_is_int:
            assert 0 <= min_count, \
                   "min_count must be non-negative, not %d" % min_count
            if max_is_int:
                assert min_count <= max_count, \
                       "min_count (%d) must be <= max_count (%d)" % (min_count, max_count)
        elif max_is_int:
            assert max_count >= 0, \
                   "max_count must be non-negative, not %d" % max_count

        self.min_count = min_count
        self.max_count = max_count

    def group_names(self):
        """the list of group names used by this Expression and its children"""
        # These are the names created by this Expression, not the names
        # *used* by it.
        return self.expression.group_names()

    def _find_groups(self, tag):
        """return a list of all groups matching the given tag"""
        return self.expression._find_groups(tag)

    def features(self):
        """return a list of all features"""
        return self.expression.features()

    def copy(self):
        """do a deep copy on this Expression tree"""
        inner = self.expression.copy()
        return MaxRepeat(inner, self.min_count, self.max_count)

    def _select_names(self, names):
        """internal function: do not use"""
        self.expression._select_names(names)

    def _modify_leaves(self, func):
        """apply 'func' to the leaves of the subexpression; returns self"""
        changed = self.expression._modify_leaves(func)
        assert changed is not None
        self.expression = changed
        return self

    def _repeat_suffix(self):
        """the repeat notation placed after the subexpression pattern"""
        low = self.min_count
        high = self.max_count

        text_type = type("")
        if type(low) == text_type or type(high) == text_type:
            # Named group repeats
            if low == high:
                return "{%s}" % low
            return "{%s,%s}" % (low, high)

        # Prefer the short regexp notations where one exists
        unbounded = (high == MAXREPEAT)
        if low == 0 and unbounded:
            return "*"
        if low == 1 and unbounded:
            return "+"
        if low == 0 and high == 1:
            return "?"
        if low == high:
            if high == 1:
                return ""
            return "{%d}" % high
        if low == 0:
            return "{,%d}" % high
        if unbounded:
            return "{%d,}" % low
        return "{%d,%d}" % (low, high)

    def __str__(self):
        """the corresponding pattern string"""
        subexp = self.expression
        pattern = str(subexp)

        # An Alt or Seq needs its own group, since "(a|b)*" is not the
        # same as "a|b*" and "ab*" is not the same as "(ab)*".  A Str is
        # treated the same way because it stands for a sequence of
        # single characters.
        if isinstance(subexp, ExpressionList) or isinstance(subexp, Str):
            pattern = "(%s)" % pattern

        return pattern + self._repeat_suffix()
491 492 # does nothing
class NullOp(Expression):
    """A null operation with an empty pattern string."""

    def __init__(self):
        """()

        A null operation.  Useful as a valid initial object from which
        to build up a sequence, as in:

          exp = NullOp()
          for c in string.split(line):
            exp = exp + Str(c)

        (That's contrived -- see Time.py for a real use.)
        """

    def __add__(self, other):
        """adding anything to a NullOp gives back that other object"""
        return other

    def __or__(self, other):
        """alternation with a NullOp is not supported"""
        raise TypeError("Cannot 'or' a NullOp with anything (only 'and')")

    def __str__(self):
        """the corresponding pattern string (always empty)"""
        return ""

    def copy(self):
        """do a deep copy on this Expression tree"""
        return NullOp()

    def _select_names(self, names):
        """internal function: nothing to select on a NullOp"""
        return None
517 518 519 # Match the subexpression.
class PassThrough(Expression):
    """Wrapper which forwards everything to its single subexpression."""

    def __init__(self, expression):
        """(expression)

        Match the given subexpression.  This class should not be used
        directly.  It is meant for generating specialized parsers which
        read a record at a time.
        """
        self.expression = expression

    def __str__(self):
        """the corresponding pattern string"""
        wrapped = self.expression
        return str(wrapped)

    def copy(self):
        """do a deep copy on this Expression tree"""
        inner = self.expression.copy()
        return PassThrough(inner)

    def group_names(self):
        """the group names used by the subexpression"""
        return self.expression.group_names()

    def _find_groups(self, tag):
        """return a list of all groups matching the given tag"""
        return self.expression._find_groups(tag)

    def features(self):
        """return a list of all features"""
        return self.expression.features()

    def _select_names(self, names):
        """internal function: do not use"""
        self.expression._select_names(names)

    def _modify_leaves(self, func):
        """apply 'func' to the leaves of the subexpression; returns self"""
        changed = self.expression._modify_leaves(func)
        assert changed is not None
        self.expression = changed
        return self
549
class FastFeature(PassThrough):
    """PassThrough which also records a feature and its removable tags."""

    def __init__(self, expression, feature, remove_tags):
        """(expression, feature, remove_tags)"""
        PassThrough.__init__(self, expression)
        self.remove_tags = remove_tags
        self.feature = feature

    def features(self):
        """return a one element list: [(feature, remove_tags)]"""
        entry = (self.feature, self.remove_tags)
        return [entry]

    def copy(self):
        """do a deep copy on this Expression tree"""
        # remove_tags is sliced so the copy does not share the sequence.
        tags = self.remove_tags[:]
        return FastFeature(self.expression.copy(), self.feature, tags)
561
class HeaderFooter(PassThrough):
    """A format made of an optional header, one or more records and an
    optional footer.

    The wrapped expression is Group(format_name, header + records + footer).
    The three parts and their reader factories/arguments are also kept
    separately; they are handed on unchanged to the parser and iterator
    classes.
    """

    def __init__(self, format_name, attrs,
                 header_expression, make_header_reader, header_args,
                 record_expression, make_record_reader, record_args,
                 footer_expression, make_footer_reader, footer_args):
        """Build the combined expression and remember the parts.

        'header_expression' and 'footer_expression' may be None, in which
        case the matching reader factory and arguments must be None too.
        'attrs' may be None, a dictionary or an attributes object.

        Raises TypeError if 'attrs' is an Expression, which indicates
        that the caller left out the attrs argument.
        """
        # I added attrs to the parameter list but couldn't make it
        # backwards compatible.  Without this check, it's possible to
        # have the object constructed seemingly okay then have the
        # error appear downstream, making it hard to track down.
        if isinstance(attrs, Expression):
            raise TypeError("Looks like you need an attrs between the format_name and the record_expression")

        if header_expression is None:
            assert make_header_reader is None and header_args is None
            exp = MaxRepeat(record_expression, 1)
        else:
            exp = header_expression + MaxRepeat(record_expression, 1)
        if footer_expression is not None:
            exp = exp + footer_expression
        else:
            assert make_footer_reader is None and footer_args is None
        PassThrough.__init__(self, Group(format_name, exp, attrs))

        self.format_name = format_name
        if attrs is None:
            attrs = xmlreader.AttributesImpl({})
        elif isinstance(attrs, type({})):
            attrs = xmlreader.AttributesImpl(attrs)
        self.attrs = attrs
        self.header_expression = header_expression
        self.make_header_reader = make_header_reader
        self.header_args = header_args
        self.record_expression = record_expression
        self.make_record_reader = make_record_reader
        self.record_args = record_args
        self.footer_expression = footer_expression
        self.make_footer_reader = make_footer_reader
        self.footer_args = footer_args

    def copy(self):
        """do a deep copy on this Expression tree"""
        header_exp = self.header_expression
        if header_exp is not None: header_exp = header_exp.copy()

        record_exp = self.record_expression
        if record_exp is not None: record_exp = record_exp.copy()

        footer_exp = self.footer_expression
        if footer_exp is not None: footer_exp = footer_exp.copy()

        return HeaderFooter(
            self.format_name, self.attrs.copy(),
            header_exp, self.make_header_reader, self.header_args,
            record_exp, self.make_record_reader, self.record_args,
            footer_exp, self.make_footer_reader, self.footer_args)

    def _modify_leaves(self, func):
        """apply 'func' to the leaves of the header, record and footer
        expressions; returns self

        NOTE(review): the combined self.expression built in __init__ is
        not rebuilt here; it only sees changes made in place to nodes it
        shares with the three parts.
        """
        # The hook defined on Expression is '_modify_leaves'; there is
        # no 'modify_leaves' attribute.
        header_exp = self.header_expression
        if header_exp is not None:
            header_exp = header_exp._modify_leaves(func)
            assert header_exp is not None
            self.header_expression = header_exp
        record_exp = self.record_expression
        if record_exp is not None:
            record_exp = record_exp._modify_leaves(func)
            assert record_exp is not None
            self.record_expression = record_exp
        footer_exp = self.footer_expression
        if footer_exp is not None:
            footer_exp = footer_exp._modify_leaves(func)
            assert footer_exp is not None
            self.footer_expression = footer_exp
        return self

    def make_parser(self, debug_level = 0):
        """create a Parser.HeaderFooterParser for this format

        Tag tables are generated separately for the header, record and
        footer expressions.  Their attribute lookups are merged into one
        dictionary and their flags (the second value returned by
        Generate.generate) are or'ed together.
        """
        import Generate, RecordReader
        if self.header_expression is not None:
            header_tagtable, header_want, attrlookup = \
                             Generate.generate(self.header_expression,
                                               debug_level = debug_level)
            make_header_reader = self.make_header_reader
            header_args = self.header_args
        else:
            header_tagtable = ()
            header_want = 0
            attrlookup = {}
            # A plain None, not the one element tuple '(None,)'.
            make_header_reader = None
            header_args = None

        record_tagtable, record_want, tmp_attrlookup = \
                         Generate.generate(self.record_expression,
                                           debug_level = debug_level)
        make_record_reader = self.make_record_reader
        record_args = self.record_args
        attrlookup.update(tmp_attrlookup)

        if self.footer_expression is not None:
            footer_tagtable, footer_want, tmp_attrlookup = \
                             Generate.generate(self.footer_expression,
                                               debug_level = debug_level)
            make_footer_reader = self.make_footer_reader
            footer_args = self.footer_args
            attrlookup.update(tmp_attrlookup)
        else:
            footer_tagtable = ()
            footer_want = 0
            make_footer_reader = None
            footer_args = None

        # Each part keeps its own flag so none of them is lost.
        want = header_want or record_want or footer_want

        return Parser.HeaderFooterParser(
            self.format_name, self.attrs,
            make_header_reader, header_args, header_tagtable,
            make_record_reader, record_args, record_tagtable,
            make_footer_reader, footer_args, footer_tagtable,
            (want, debug_level, attrlookup))

    def make_iterator(self, tag, debug_level = 0):
        """create an iterator for this regexp; the 'tag' defines a record"""
        import Iterator
        if tag == self.format_name:
            # Iterate over the whole format using the wrapped Group.
            return self.expression.make_iterator(tag, debug_level)

        if self.header_expression is None:
            header_parser = None
        else:
            header_parser = self.header_expression.make_parser(debug_level)

        assert self.record_expression is not None
        record_parser = self.record_expression.make_parser(debug_level)

        if self.footer_expression is None:
            footer_parser = None
        else:
            footer_parser = self.footer_expression.make_parser(debug_level)

        if isinstance(self.record_expression, Group) and \
           self.record_expression.name == tag and \
           IterParser is not None:
            # There's an optimization for this case
            return IterParser.IterHeaderFooter(
                header_parser, self.make_header_reader, self.header_args,
                record_parser, self.make_record_reader, self.record_args,
                footer_parser, self.make_footer_reader, self.footer_args,
                tag
                )

        return Iterator.IteratorHeaderFooter(
            header_parser, self.make_header_reader, self.header_args,
            record_parser, self.make_record_reader, self.record_args,
            footer_parser, self.make_footer_reader, self.footer_args,
            tag
            )

    def group_names(self):
        """the group names used by this Expression and its children

        Returns a tuple, like the other group_names implementations, with
        each name listed once; the format name comes first.
        """
        names = [self.format_name]
        parts = (self.header_expression, self.expression,
                 self.footer_expression)
        for part in parts:
            if part is None:
                continue
            for name in part.group_names():
                if name not in names:
                    names.append(name)
        return tuple(names)

    def _find_groups(self, tag):
        """return a list of all groups matching the given tag

        NOTE(review): the header and footer are searched both directly
        and through self.expression, so a group reachable both ways is
        listed more than once.
        """
        assert tag != self.format_name, "can't handle that case"
        x = []
        if self.header_expression is not None:
            x.extend(self.header_expression._find_groups(tag))
        x.extend(self.expression._find_groups(tag))
        if self.footer_expression is not None:
            x.extend(self.footer_expression._find_groups(tag))
        return x

    def features(self):
        """return a list of all features"""
        x = []
        if self.header_expression is not None:
            x.extend(self.header_expression.features())
        x.extend(self.expression.features())
        if self.footer_expression is not None:
            x.extend(self.footer_expression.features())
        return x
748 749 # Might be useful to allow a minimum record count (likely either 0 or 1)
class ParseRecords(PassThrough):
    """A format made of one or more records, parsed a record at a time.

    The wrapped expression is Group(format_name, MaxRepeat(record, 1)).
    'make_reader' and 'reader_args' are handed on unchanged to the
    parser and iterator classes.
    """

    def __init__(self, format_name, attrs, record_expression,
                 make_reader, reader_args = ()):
        """(format_name, attrs, record_expression, make_reader, reader_args = ())

        'attrs' may be None, a dictionary or an attributes object.
        Raises TypeError if 'attrs' is an Expression, which indicates
        that the caller left out the attrs argument.
        """
        # I added attrs to the parameter list but couldn't make it
        # backwards compatible.  Without this check, it's possible to
        # have the object constructed seemingly okay then have the
        # error appear downstream, making it hard to track down.
        # Checked before anything is built, as in HeaderFooter.
        if isinstance(attrs, Expression):
            raise TypeError("Looks like you need an attrs between the format_name and the record_expression")

        PassThrough.__init__(self, Group(format_name,
                                         MaxRepeat(record_expression, 1),
                                         attrs))

        self.format_name = format_name
        if attrs is None:
            attrs = xmlreader.AttributesImpl({})
        elif isinstance(attrs, type({})):
            attrs = xmlreader.AttributesImpl(attrs)
        self.attrs = attrs
        self.record_expression = record_expression
        self.make_reader = make_reader
        self.reader_args = reader_args

    def copy(self):
        """do a deep copy on this Expression tree"""
        # The attributes are copied too (as HeaderFooter.copy does) so
        # the copy does not share them with the original.
        return ParseRecords(self.format_name, self.attrs.copy(),
                            self.record_expression.copy(),
                            self.make_reader, self.reader_args)

    def make_parser(self, debug_level = 0):
        """create a Parser.RecordParser for this format"""
        import Generate
        tagtable, want_flg, attrlookup = Generate.generate(
            self.record_expression, debug_level)

        return Parser.RecordParser(self.format_name, self.attrs,
                                   tagtable, (want_flg, debug_level, attrlookup),
                                   self.make_reader, self.reader_args)

    def make_iterator(self, tag, debug_level = 0):
        """create an iterator for this regexp; the 'tag' defines a record"""
        import Iterator
        if tag == self.format_name:
            # Iterate over the whole format using the wrapped Group.
            return self.expression.make_iterator(tag, debug_level)

        if isinstance(self.record_expression, Group) and \
           self.record_expression.name == tag and \
           IterParser is not None:
            # There's an optimization for this case
            return IterParser.IterRecords(
                self.record_expression.make_parser(debug_level),
                self.make_reader, self.reader_args, tag)

        return Iterator.IteratorRecords(
            self.record_expression.make_parser(debug_level),
            self.make_reader, self.reader_args, tag)

    def group_names(self):
        """the group names used by this Expression and its children

        Returns a tuple which always contains the format name.
        """
        names = tuple(self.expression.group_names())
        if self.format_name in names:
            return names
        # Concatenate tuples; a bare string cannot be added to a tuple.
        return (self.format_name, ) + names

    def _find_groups(self, tag):
        """return a list of all groups matching the given tag"""
        assert tag != self.format_name, "can't handle that case"
        return self.expression._find_groups(tag)

    def features(self):
        """return a list of all features"""
        return self.expression.features()

    def _modify_leaves(self, func):
        """apply 'func' to the leaves of the wrapped expression; returns self

        NOTE(review): self.record_expression is not reassigned here; it
        only sees changes made in place to the nodes it shares with
        self.expression.
        """
        # The hook defined on Expression is '_modify_leaves'; there is
        # no 'modify_leaves' attribute.
        exp = self.expression._modify_leaves(func)
        assert exp is not None
        self.expression = exp
        return self
821 822 823 # A sequence of characters: 'abcdef'
class Str(Expression):
    """A literal run of characters such as 'abcdef'."""

    def __init__(self, s):
        """(s)

        Match the string 's' exactly; it is not treated as a regexp
        pattern.
        """
        self.string = s

    def __str__(self):
        """the corresponding pattern string"""
        text = self.string
        return escape(text)

    def copy(self):
        """do a deep copy on this Expression tree"""
        return Str(self.string)
839 840 841
class ExpressionList(Expression):
    """common implementation for Expressions holding several subexpressions

    Subclasses keep their children in 'self.expressions'.
    """

    def group_names(self):
        """the list of group names used by this Expression or its children"""
        seen = {}
        for child in self.expressions:
            for name in child.group_names():
                seen[name] = 1
        return tuple(seen.keys())

    def _find_groups(self, tag):
        """return a list of all groups matching the given tag"""
        found = []
        for child in self.expressions:
            found.extend(child._find_groups(tag))
        return found

    def features(self):
        """return a list of all features"""
        collected = []
        for child in self.expressions:
            collected.extend(child.features())
        return collected

    def _select_names(self, names):
        """internal function.  Do not use."""
        for child in self.expressions:
            child._select_names(names)

    def copy(self):
        """do a deep copy on this Expression tree"""
        duplicates = [child.copy() for child in self.expressions]
        return self.__class__(duplicates)

    def _modify_leaves(self, func):
        """apply 'func' to the leaves of every child; returns self"""
        replaced = [child._modify_leaves(func) for child in self.expressions]
        assert None not in replaced
        self.expressions = tuple(replaced)
        return self
877 878 # A set of expressions: 'a|b|c'
class Alt(ExpressionList):
    """An Expression tree holding alternatives: 'a|b|c'."""

    def __init__(self, expressions):
        """(expressions)

        Match one of several alternate expressions, tried in the order
        given.

        For example, Alt( (exp1, exp2, exp3) ) tries exp1 first, then
        exp2 if that fails, then exp3.  If all of them fail, the match
        fails.

        Raises TypeError when handed a single Expression instead of a
        sequence of them.
        """
        if isinstance(expressions, type( [] )):
            # Lists are stored as tuples; see __or__ for the reason.
            expressions = tuple(expressions)
        elif isinstance(expressions, Expression):
            raise TypeError("Must pass in a list of expressions, not just a single one (put it inside of ()s")
        self.expressions = expressions

    def __or__(self, other):
        """return an Alt trying these alternatives and then 'other'"""
        # Another Alt is merged in instead of nested.  Storing tuples
        # everywhere gives one sequence type to concatenate and helps
        # enforce the idea that Expressions are not changed once created.
        if isinstance(other, Alt):
            extra = other.expressions
        else:
            extra = (other,)
        return Alt(self.expressions + extra)

    def __str__(self):
        """the corresponding pattern string"""
        branches = [str(exp) for exp in self.expressions]
        return '|'.join(branches)
915 916 917 # A sequence of expressions: '[ab][cd][ef]'
class Seq(ExpressionList):
    """An Expression matching its subexpressions one after the other"""

    def __init__(self, expressions):
        """(expressions)

        Match the given expressions in order.  Each one starts matching
        where the previous match ended.

        Raises TypeError when handed a single Expression instead of a
        sequence of them.
        """
        if isinstance(expressions, type( [] )):
            # Lists are stored as tuples, as in Alt
            expressions = tuple(expressions)
        elif isinstance(expressions, Expression):
            raise TypeError("Must pass in a list of expressions, not just a single one (put it inside of ()s")
        self.expressions = expressions

    def __add__(self, other):
        """return a Seq matching this sequence followed by 'other'"""
        # Another Seq is merged in instead of nested.
        if isinstance(other, Seq):
            extra = other.expressions
        else:
            extra = (other,)
        return Seq(self.expressions + extra)

    def __str__(self):
        """the corresponding pattern string"""
        # Alternation binds more loosely than sequencing, so an Alt
        # child has to be wrapped in parentheses.
        pieces = []
        for exp in self.expressions:
            text = str(exp)
            if isinstance(exp, Alt):
                text = '(%s)' % text
            pieces.append(text)
        return "".join(pieces)
951 952 953 ########## Support code for making the pattern string for an expression 954 955 # taken from re.escape, except also don't escape " ="
def escape(pattern):
    """Escape all non-alphanumeric characters in pattern.

    ASCII letters, digits, '_', ' ' and '=' are left alone.  The NUL
    character becomes the four characters '\\000'.  Every other
    character is prefixed with a backslash.  Returns the new string.
    """
    # The safe set is spelled out so that it does not change with the
    # process locale.
    safe = ('ABCDEFGHIJKLMNOPQRSTUVWXYZ'
            'abcdefghijklmnopqrstuvwxyz'
            '0123456789' '_ =')
    pieces = []
    for char in pattern:
        if char in safe:
            pieces.append(char)
        elif char == '\000':
            pieces.append('\\000')
        else:
            pieces.append('\\' + char)
    return ''.join(pieces)
# Escapes for the common control characters, plus the characters which
# are special inside of a [] character class ('[', ']', '\' and '^').
# Every value is a raw string so that each backslash is spelled
# explicitly instead of relying on an unrecognised escape sequence.
_minimize_escape_chars = {
    "\a": r"\a",
    "\b": r"\b",
    "\n": r"\n",
    "\r": r"\r",
    "\t": r"\t",
    "\f": r"\f",
    "\v": r"\v",
    "[": r"\[",
    "]": r"\]",
    "\\": r"\\",
    "^": r"\^",
    }

def _minimize_escape_char(c):
    """(c) -> the escaped pattern text for a single character

    Characters listed in _minimize_escape_chars use the escape given
    there.  Any other character with an ordinal below 32 is returned
    with a backslash in front of it; everything else is returned as is.
    """
    try:
        return _minimize_escape_chars[c]
    except KeyError:
        pass
    if ord(c) < 32:
        # A control character without a symbolic escape in the table
        return '\\' + c
    return c
990
def _minimize_escape_range(c1, c2):
    """(c1, c2) -> the pattern for the range bounded by those two characters

    Used for a run of two or more successive characters; c1 is the
    first character of the run and c2 is the last.
    """
    if ord(c2) - ord(c1) == 1:
        # Only two characters: listing both is no longer than "x-y"
        return "".join([_minimize_escape_char(c1), _minimize_escape_char(c2)])

    if (c1, c2) == ('0', '9'):
        # The digits have their own shorthand
        return r"\d"

    # The general case is spelled with the '-' range notation
    return "%s-%s" % (_minimize_escape_char(c1), _minimize_escape_char(c2))
1007 1008
def _minimize_any_range(s):
    """s -> a string useable inside [] which matches all the characters in s

    For example, passing in "0123456789" returns "\\d".

    A value which is not a string is converted with str() first, and an
    empty string is returned unchanged.  The result is not guaranteed
    to be the shortest possible pattern.
    """
    if not isinstance(s, type("")):
        s = str(s)
    if not s:
        return s

    # The hyphen is held back and written last, where it cannot be
    # taken for a range operator.  The cost is that a run which spans
    # the hyphen gets split in two: '!"#....xyz' becomes '!-,.-z-'.
    has_hyphen = '-' in s

    # The distinct characters other than the hyphen, in sorted order
    seen = {}
    for c in s:
        if c != '-':
            seen[c] = 1
    ordered = list(seen.keys())
    ordered.sort()

    # Collect runs of characters with consecutive ordinals, so that
    # "abcdef" becomes the single run ['a', 'f'].
    runs = []
    for c in ordered:
        if runs and ord(runs[-1][1]) + 1 == ord(c):
            runs[-1][1] = c
        else:
            runs.append([c, c])

    pieces = []
    for first, last in runs:
        if first == last:
            # A lone character stands for itself
            pieces.append(_minimize_escape_char(first))
        else:
            # Two or more characters in a row define a range
            pieces.append(_minimize_escape_range(first, last))

    # Put the hyphen back on the end
    if has_hyphen:
        pieces.append('-')
    t = "".join(pieces)

    # Simple fix for one especially long and common result
    if t == "\\dA-Z_a-z\xc0-\xd6\xd8-\xf6\xf8-\xff":
        t = r"\w"

    return t
1092
def _make_no_case(node):
    """return a case-insensitive replacement for an expression node

    Str     -> a chain, seeded with NullOp(), of Str pieces for the
               caseless text and Any(upper + lower) for each cased
               character
    Any     -> an Any whose characters also include the upper and lower
               case forms of the original ones (same 'invert' value)
    Literal -> the node itself if its character has no case, otherwise
               an Any of both forms (same 'invert' value)

    Every other node is returned unchanged.
    """
    if isinstance(node, Str):
        result = NullOp()
        pending = ""   # caseless text not yet added to the result
        for c in node.string:
            upper = c.upper()
            lower = c.lower()
            assert c in (upper, lower), "how can this be?"
            if upper == lower:
                pending = pending + c
                continue
            # A cased character: flush the caseless text, then allow
            # either form of this character.
            if pending:
                result = result + Str(pending)
                pending = ""
            result = result + Any(upper + lower)
        if pending:
            result = result + Str(pending)
        return result

    if isinstance(node, Any):
        original = node.chars
        extended = original
        seen = {}
        for c in original:
            seen[c] = 1
        # Append each case variant that is not already present
        for c in original.upper() + original.lower():
            if c not in seen:
                seen[c] = 1
                extended = extended + c
        return Any(extended, node.invert)

    if isinstance(node, Literal):
        upper = node.char.upper()
        lower = node.char.lower()
        if upper == lower:
            return node
        return Any(upper + lower, node.invert)

    return node
1136
def NoCase(expr):
    """expression -> expression where the text is case insensitive

    Works on a copy of expr, so the expression passed in is left alone,
    and returns whatever the copy's _modify_leaves method gives back
    when it is handed _make_no_case.
    """
    duplicate = expr.copy()
    result = duplicate._modify_leaves(_make_no_case)
    return result
1141