1
2
3
4
5
6 """Code for more fancy file handles.
7
8
9 Classes:
10 SGMLExtractorHandle File object that strips tags and returns content from specified
11 tag blocks.
12
13 SGMLExtractor Object that scans for specified SGML tag pairs, removes any inner tags
14 and returns the raw content.
15 For example, the object SGMLExtractor( [ 'h1' ] ) on the following HTML file would return
16 'House that Jack Built'.
17 SGMLExtractor( [ 'dt' ] ) would return 'ratcatdogcowmaiden'.
18 SGMLExtractor( [ 'dt', 'dd' ] ) would return 'ratate the maltcatthat ate the rat', etc.
19
20 <h1>House that Jack Built</h1>
21 <dl>
22 <dt><big>rat</big></dt>
23 <dd><big>ate the malt</big></dd>
24 <dt><big>cat</big></dt>
25 <dd><big>that ate the rat</big></dd>
26 <dt><big>dog</big></dt>
27 <dd><big>that worried the cat</big></dd>
28 <dt><big>cow</big></dt>
29 <dd><big>with crumpled horn</big></dd>
30 <dt><big>maiden</big></dt>
31 <dd><big>all forlorn</big></dd>
32 </dl>
33 """
34
35 import warnings
36 warnings.warn("Bio.SGMLExtractor was deprecated, as all Biopython modules that use Bio.SGMLExtractor have been deprecated. If you do use this module, please contact the Biopython developers at biopython-dev@biopython.org to avoid permanent removal of this module")
37
38
39 import os
40 import string
41 import StringIO
42 import sgmllib
43
44
46 """A Python handle that automatically strips SGML tags and returns data from
47 specified tag start and end pairs.
48
49 """
def __init__(self, handle, tags_of_interest=None):
    """Wrap handle so that its data is passed through an SGMLExtractor.

    handle is a file handle to SGML-formatted data.
    tags_of_interest is a list of root names for pairs of start and end
    tags. Leaving it out (or passing None) is the same as passing an
    empty list.

    """
    # Build a fresh list per call instead of sharing one mutable default
    # list object between every instance created without this argument.
    if tags_of_interest is None:
        tags_of_interest = []
    self._handle = handle
    self._stripper = SGMLExtractor(tags_of_interest)
59
# NOTE(review): only the signature of read() survives in this listing. The
# embedded numbering jumps from 60 to 63, so the body is not shown here;
# the statements further down presumably belong to other methods - confirm
# against the full file.
60 - def read(self, *args, **keywds):
63
# NOTE(review): the def line for these two statements is not present in this
# listing (the embedded numbering skips 64); presumably a readline wrapper.
# Reads one line from the wrapped handle, forwarding every positional and
# keyword argument, and returns it after running it through
# self._stripper.strip().
65 line = self._handle.readline( *args, **keywds)
66 return self._stripper.strip(line)
67
def readlines(self, *args, **keywds):
    """Read lines from the wrapped handle and strip each of them.

    All positional and keyword arguments are forwarded unchanged to the
    handle's own readlines(). The list returned by the handle is updated
    in place, each entry being replaced by the result of
    self._stripper.strip() on that entry, and the same list is returned.

    """
    lines = self._handle.readlines(*args, **keywds)
    for i, line in enumerate(lines):
        # Strip the text of the line itself, not the builtin ``str`` type.
        lines[i] = self._stripper.strip(line)
    return lines
73
# NOTE(review): the def line is missing from this listing (number 74 is
# skipped); presumably this is the body of __getattr__(self, attr) - confirm.
# Delegates the lookup of attr to the wrapped handle.
75 return getattr(self._handle, attr)
76
77
# NOTE(review): the def line is missing from this listing (number 78 is
# skipped). The name `items` is the only input used here, and the code
# further down calls is_empty( self._instack ), so this is presumably the
# body of is_empty(items) - confirm.
# Returns the integer 0 when items has at least one element and the
# integer 1 when its length is zero (integers, not booleans).
79 if( len( items ) > 0 ):
80 return 0
81 else:
82 return 1
83
# NOTE(review): the def line for these statements is not present in this
# listing (numbers 84-86 are skipped). They read like the body of a parser
# initialiser that receives tags_of_interest - confirm against the full file.
# Initialise the sgmllib.SGMLParser base class first.
87 sgmllib.SGMLParser.__init__(self)
# Text buffer, starts out empty.
88 self.data = ''
# List used as a stack (append/pop) by the tag-handling code further down.
89 self._instack = []
# Lower-cased copies of the requested tag names; the caller's list itself is
# only iterated, never stored or modified.
90 self._tags_of_interest = []
91 for tag in tags_of_interest:
92 self._tags_of_interest.append( tag.lower() )
93
97
# NOTE(review): the def line is missing from this listing; `tag` is
# presumably the name of a start tag handed over by the parser - confirm.
# Pushes the lower-cased tag name onto self._instack, but only when it is
# one of the names in self._tags_of_interest; other tags are ignored.
99 lower_tag = tag.lower()
100 if( lower_tag in self._tags_of_interest ):
101 self._instack.append( lower_tag )
102
# NOTE(review): the def line is missing from this listing; `tag` is
# presumably the name of an end tag handed over by the parser - confirm.
# Nothing happens when the stack is empty.
104 if( not is_empty( self._instack ) ):
# Tentatively remove the most recently pushed entry ...
105 open_tag = self._instack.pop()
106 try:
# ... and push it back when it does not match this tag (compared in
# lower case), so only a matching tag actually shortens the stack.
107 if( open_tag != tag.lower() ):
108 self._instack.append( open_tag )
# NOTE(review): bare except catches every exception. If tag.lower() raises,
# the entry popped above is not pushed back and the tag is written to stdout
# with the Python 2 print statement.
109 except:
110 print tag
111
112
115
def strip(self, str):
    """S.strip(str) -> string

    Feed str to the internal parser and return the text it collected.

    Returns '' for empty input. When the parser collects nothing, the
    result is '\\n' if str ended in a newline or carriage return, and ''
    otherwise, so raw markup is never handed back to the caller.

    """
    # The parameter name shadows the builtin ``str``; it is kept because it
    # is part of the method's signature.
    if not str:
        return ''

    # Remember the line ending before parsing, so a line that yields no
    # text still comes back as a bare newline.
    ends_with_newline = str[-1] in ('\n', '\r')

    # Clear the text collected by any earlier call before feeding new input.
    self._parser.data = ''
    self._parser.feed(str)

    extracted = self._parser.data
    if extracted:
        return extracted
    if ends_with_newline:
        return '\n'
    # Nothing collected and no line ending: return an empty string rather
    # than the unprocessed input.
    return ''
138