Package Bio :: Module SGMLExtractor
[hide private]
[frames] | no frames]

Source Code for Module Bio.SGMLExtractor

  1  # Copyright 2002 by Katharine Lindner.  All rights reserved. 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license.  Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5   
  6  """Code for more fancy file handles. 
  7   
  8   
  9  Classes: 
 10  SGMLExtractorHandle     File object that strips tags and returns content from specified 
 11  tags blocks. 
 12   
 13  SGMLExtractor   Object that scans for specified SGML tag pairs, removes any inner tags 
 14  and returns the raw content. 
 15  For example, the object SGMLExtractor( [ 'h1' ] ) applied to the following html file would return
 16  'House that Jack Built'
 17  SGMLExtractor( [ 'dt' ] ) would return 'ratcatdogcowmaiden'
 18  SGMLExtractor( [ 'dt', 'dd' ] ) would return 'ratate the maltcatthat ate the rat' etc
 19   
 20  <h1>House that Jack Built</h1> 
 21  <dl> 
 22    <dt><big>rat</big></dt> 
 23      <dd><big>ate the malt</big></dd> 
 24    <dt><big>cat</big></dt> 
 25      <dd><big>that ate the rat</big></dd> 
 26    <dt><big>dog</big></dt> 
 27      <dd><big>that worried the dats</big></dd> 
 28    <dt><big>cow</big></dt> 
 29      <dd><big>with crumpled horn</big></dd> 
 30    <dt><big>maiden</big></dt> 
 31      <dd><big>all forlorns</big></dd> 
 32  </dl> 
 33  """ 
 34   
 35  import warnings 
 36  warnings.warn("Bio.SGMLExtractor was deprecated, as all Biopython modules that use Bio.SGMLExtractor have been deprecated. If you do use this module, please contact the Biopython developers at biopython-dev@biopython.org to avoid permanent removal of this module") 
 37   
 38   
 39  import os 
 40  import string 
 41  import StringIO 
 42  import sgmllib 
 43   
 44   
class SGMLExtractorHandle:
    """File-like wrapper that returns only the content of selected SGML tags.

    Every read operation is forwarded to the wrapped handle and the result
    is passed through an SGMLExtractor before being returned.  Any other
    attribute access is delegated to the wrapped handle unchanged.

    """

    def __init__(self, handle, tags_of_interest=[]):
        """SGMLExtractorHandle(handle, tags_of_interest)

        handle is a file handle to SGML-formatted data.
        tags_of_interest is a list of root names for pairs of start and end
        tags.

        """
        # The default list is only handed on to SGMLExtractor, never
        # modified here.
        self._handle = handle
        self._stripper = SGMLExtractor(tags_of_interest)

    def read(self, *args, **keywds):
        """Read from the wrapped handle and return the extracted content."""
        data = self._handle.read(*args, **keywds)
        return self._stripper.strip(data)

    def readline(self, *args, **keywds):
        """Read one line from the wrapped handle and return its extracted content."""
        line = self._handle.readline(*args, **keywds)
        return self._stripper.strip(line)

    def readlines(self, *args, **keywds):
        """Read all lines from the wrapped handle; return them extracted, in order."""
        lines = self._handle.readlines(*args, **keywds)
        # Strip each line itself; the previous code passed the builtin
        # ``str`` type to the stripper instead of the line.
        return [self._stripper.strip(line) for line in lines]

    def __getattr__(self, attr):
        """Delegate any attribute not defined here to the wrapped handle."""
        return getattr(self._handle, attr)
76 77
def is_empty(items):
    """Return 1 when items has length zero, otherwise 0."""
    return int(len(items) == 0)
83
class SGMLExtractor:
    """Return the text found between selected SGML start/end tag pairs.

    Tags that are not of interest are dropped; text is kept only while at
    least one tag of interest is open.

    """

    class LocalParser(sgmllib.SGMLParser):
        """SGML parser that accumulates text seen inside tags of interest."""

        def __init__(self, tags_of_interest=[]):
            """Set up the parser; tag names are compared in lower case."""
            sgmllib.SGMLParser.__init__(self)
            # Text collected since the owner last cleared it.
            self.data = ''
            # Stack of currently open tags of interest (lower-cased).
            self._instack = []
            self._tags_of_interest = [tag.lower() for tag in tags_of_interest]

        def handle_data(self, data):
            """Keep data only while a tag of interest is open."""
            if not is_empty(self._instack):
                self.data = self.data + data

        def unknown_starttag(self, tag, attrs):
            """Push tag on the stack when it is one of the tags of interest."""
            lower_tag = tag.lower()
            if lower_tag in self._tags_of_interest:
                self._instack.append(lower_tag)

        def unknown_endtag(self, tag):
            """Pop the innermost open tag of interest when tag closes it.

            End tags that do not match the top of the stack are ignored, so
            inner tags that are not of interest leave the stack untouched.

            """
            if is_empty(self._instack):
                return
            # Peek before popping: the earlier pop / re-push sequence sat
            # inside a bare ``except`` that printed the tag and could leave
            # the stack popped when the comparison failed.
            if self._instack[-1] == tag.lower():
                self._instack.pop()

    def __init__(self, tags_of_interest=[]):
        """Create an extractor for the given list of tag root names."""
        self._parser = SGMLExtractor.LocalParser(tags_of_interest)

    def strip(self, str):
        """S.strip(str) -> string

        Feed str to the parser and return the text collected from inside
        the tags of interest.

        Only the collected text is cleared between calls; the parser itself
        is not reset, so its stack of open tags carries over from one call
        to the next.

        An empty str yields ''.  When nothing is collected and str ends
        with a newline or carriage return, '\\n' is returned so that a
        non-empty input never produces an empty result.

        """
        if not str:  # empty string, don't do anything.
            return ''
        # A newline embedded within a tag may be stripped away; remember
        # whether the input ended with one so it can be added back.
        is_newline = str[-1] in ['\n', '\r']

        self._parser.data = ''  # clear the parser's data (don't reset)
        self._parser.feed(str)
        if self._parser.data:
            return self._parser.data
        if is_newline:
            return '\n'
        # NOTE(review): with nothing collected and no trailing newline the
        # input is returned unchanged, markup included.  Presumably this
        # keeps a non-empty chunk from looking like end-of-file; confirm
        # with callers before changing it.
        return str
138