1
2
3
4
5
6 """Code to support writing parsers.
7
8
9
10 Classes:
11 AbstractParser Base class for parsers.
12 AbstractConsumer Base class of all Consumers.
13 TaggingConsumer Consumer that tags output with its event. For debugging
14 SGMLStrippingConsumer Consumer that strips SGML tags from output.
15 EventGenerator Generate Biopython Events from Martel XML output
16
17 Functions:
18 safe_readline Read a line from a handle, with check for EOF.
19 safe_peekline Peek at next line, with check for EOF.
20 read_and_call Read a line from a handle and pass it to a method.
21 read_and_call_while Read many lines, as long as a condition is met.
22 read_and_call_until Read many lines, until a condition is met.
23 attempt_read_and_call Like read_and_call, but forgiving of errors.
24 is_blank_line Test whether a line is blank.
25
26 """
27
28 import sys
29 import string
30 import traceback
31 from types import *
32
33 from Bio import File
34
35
36 try:
37 from xml.sax import handler
38 xml_support = 1
39 except ImportError:
40 sys.stderr.write("Warning: Could not import SAX for dealing with XML.\n" +
41 "This causes problems with some ParserSupport modules\n")
42 xml_support = 0
43
45 """Base class for other parsers.
46
47 """
49 raise NotImplementedError, "Please implement in a derived class"
50
52 return self.parse(File.StringHandle(string))
53
55 h = open(filename)
56 try:
57 retval = self.parse(h)
58 finally:
59 h.close()
60 return retval
61
63 """Base class for other Consumers.
64
65 Derive Consumers from this class and implement appropriate
66 methods for each event that you want to receive.
67
68 """
79
81 """A Consumer that tags the data stream with the event and
82 prints it to a handle. Useful for debugging.
83
84 """
def __init__(self, handle=None, colwidth=15, maxwidth=80):
    """TaggingConsumer(handle=sys.stdout, colwidth=15, maxwidth=80)

    Set up a consumer that writes every event it receives to a handle.

    Arguments:
    o handle - Object with a write() method that receives the output.
    None (the default) selects sys.stdout.

    o colwidth - Width of the column used for the event name.

    o maxwidth - Width to which each tagged data line is clipped.

    """
    # sys.stdout is deliberately not used as the default in the
    # signature: a signature default is evaluated only once, when the
    # function is defined, so it would pin whatever sys.stdout was at
    # that moment.  Resolving it here picks up the current sys.stdout
    # each time a consumer is created.
    if handle is None:
        handle = sys.stdout
    self._handle = handle
    self._colwidth = colwidth
    self._maxwidth = maxwidth
97
100
103
def _print_name(self, name, data=None):
    """Write one tagged line for the event called name.

    Without data, a marker line is written: a row of colwidth
    asterisks followed by the event name.  With data, the name is
    clipped and left-justified to colwidth and followed by ": " and
    the data.  The data is clipped so that the whole line fits in
    maxwidth, and its trailing whitespace is removed.

    """
    if data is None:
        # No payload: just announce the event.
        self._handle.write("%s %s\n" % ("*" * self._colwidth, name))
    else:
        # Two columns of every data line are used by the ": " separator.
        room = self._maxwidth - self._colwidth - 2
        # str.rstrip() replaces string.rstrip(), which no longer
        # exists in Python 3.
        self._handle.write("%-*s: %s\n" % (
            self._colwidth, name[:self._colwidth], data[:room].rstrip()))
113
115 if attr[:6] == 'start_' or attr[:4] == 'end_':
116 method = lambda a=attr, s=self: s._print_name(a)
117 else:
118 method = lambda x, a=attr, s=self: s._print_name(a, x)
119 return method
120
122 """A consumer that strips off SGML tags.
123
124 This is meant to be used as a decorator for other consumers.
125
126 """
128 if type(consumer) is not InstanceType:
129 raise ValueError, "consumer should be an instance"
130 self._consumer = consumer
131 self._prev_attr = None
132 self._stripper = File.SGMLStripper()
133
137
139 if name in ['_prev_attr', '_stripper']:
140 return getattr(self, name)
141 attr = getattr(self._consumer, name)
142
143 if type(attr) is not MethodType:
144 return attr
145
146 if name[:6] == 'start_' or name[:4] == 'end_':
147 return attr
148
149 self._prev_attr = attr
150 return self._apply_clean_data
151
152
153 if xml_support:
155 """Handler to generate events associated with a Martel parsed file.
156
157 This acts like a normal SAX handler, and accepts XML generated by
158 Martel during parsing. These events are then converted into
159 'Biopython events', which can then be caught by a standard
160 biopython consumer
161 """
def __init__(self, consumer, interest_tags, callback_finalizer=None,
             exempt_tags=None):
    """Initialize to begin catching and firing off events.

    Arguments:
    o consumer - The consumer that we'll send Biopython events to.

    o interest_tags - A listing of all the tags we are interested in.

    o callback_finalizer - A function to deal with the collected
    information before passing it on to the consumer. By default
    the collected information is a list of all of the lines read
    for a particular tag -- if there are multiple tags in a row
    like:

    <some_info>Spam</some_info>
    <some_info>More Spam</some_info>

    In this case the list of information would be:

    ['Spam', 'More Spam']

    This list of lines will be passed to the callback finalizer if
    it is present. Otherwise the consumer will be called with the
    list of content information.

    o exempt_tags - A listing of particular tags that are exempt from
    being processed by the callback_finalizer. This allows you to
    use a finalizer to deal with most tags, but leave those you don't
    want touched. Defaults to no exempt tags.
    """
    self._consumer = consumer
    self.interest_tags = interest_tags
    self._finalizer = callback_finalizer
    # A list literal in the signature would be one object shared by
    # every instance created without exempt_tags; build a fresh list
    # per instance instead.
    if exempt_tags is None:
        exempt_tags = []
    self._exempt_tags = exempt_tags

    # Collected text for each tag of interest, keyed by tag name.  Each
    # value holds one string per element seen since the last callback.
    self.info = {}
    for tag in self.interest_tags:
        self.info[tag] = []

    # Name of the most recently closed tag of interest whose content
    # has not been handed to the consumer yet ('' when none is pending).
    self._previous_tag = ''

    # Character chunks received for the element currently being read.
    self._cur_content = []
    # True only while inside an element we are collecting text for.
    self._collect_characters = 0
216
def startElement(self, name, attrs):
    """Determine if we should collect characters from this tag.

    SAX callback for an opening tag.  Collection is switched on when
    name is one of the interest tags.  attrs is accepted to match the
    SAX handler signature and is not used.
    """
    if name in self.interest_tags:
        self._collect_characters = 1
222
def characters(self, content):
    """Extract the information if we are interested in it.

    SAX callback for character data.  The text of one element may
    arrive in several pieces, so each piece is buffered and the pieces
    are joined when the element ends.
    """
    if self._collect_characters:
        self._cur_content.append(content)
228
def endElement(self, name):
    """Send the information to the consumer.

    Once we've got the end element we've collected up all of the
    character information we need, and we need to send this on to
    the consumer to do something with it.

    We have a delay of one tag on doing this, so that we can collect
    all of the info from multiple calls to the same element at once.
    """
    # Only tags we were collecting for are handled.  Other tags have no
    # entry in self.info and must not replace the pending tag, otherwise
    # the delayed callback would later be made for a tag without data.
    if not self._collect_characters:
        return

    # Store the text gathered inside this element as one string.
    self.info[name].append("".join(self._cur_content))

    # Reset the per-element state.
    self._cur_content = []
    self._collect_characters = 0

    # A different tag has started: the run of the previous tag is
    # complete, so pass its collected information on now.
    if self._previous_tag and self._previous_tag != name:
        self._make_callback(self._previous_tag)

    # This tag becomes the next one waiting to be passed on.
    self._previous_tag = name
254
def _make_callback(self, name):
    """Call the callback function with the info with the given name.

    The consumer method called name receives everything collected for
    that tag: the raw list of strings, or the result of running the
    finalizer over that list when a finalizer is set and the tag is
    not exempt.  The stored information for the tag is then cleared.
    """
    # The consumer method has the same name as the tag.
    callback_function = getattr(self._consumer, name)

    if self._finalizer is not None and name not in self._exempt_tags:
        # Let the finalizer process the collected list first.
        info_to_pass = self._finalizer(self.info[name])
    else:
        # No finalizer, or an exempt tag: pass the list untouched.
        info_to_pass = self.info[name]

    callback_function(info_to_pass)

    # Start over with a new list; the one just passed on is not reused.
    self.info[name] = []
273
def endDocument(self):
    """Make sure all of our information has been passed.

    This just flushes out any stored tags that need to be passed.
    Callbacks run one tag late, so the last tag of interest is still
    pending when the document ends.
    """
    if self._previous_tag:
        self._make_callback(self._previous_tag)
281
def read_and_call(uhandle, method, **keywds):
    """read_and_call(uhandle, method[, start][, end][, contains][, blank][, has_re])

    Read a line from uhandle, check it, and pass it to the method.
    Raises a ValueError if the line does not pass the checks.

    start, end, contains, blank, and has_re specify optional conditions
    that the line must pass.  start and end specifies what the line must
    begin or end with (not counting EOL characters).  contains
    specifies a substring that must be found in the line.  If blank
    is a true value, then the line must be blank.  has_re should be
    a regular expression object with a pattern that the line must match
    somewhere.

    """
    line = safe_readline(uhandle)
    # _fails_conditions returns None on success, else a failure message.
    errmsg = _fails_conditions(line, **keywds)
    if errmsg is not None:
        # Call form of raise: valid in both Python 2 and Python 3.
        raise ValueError(errmsg)
    method(line)
302
def read_and_call_while(uhandle, method, **keywds):
    """read_and_call_while(uhandle, method[, start][, end][, contains][, blank][, has_re]) -> number of lines

    Read a line from uhandle and pass it to the method as long as
    some condition is true.  The first line that does not pass the
    conditions is put back on uhandle with saveline() and reading
    stops.  Returns the number of lines that were passed to the method.

    See the docstring for read_and_call for a description of the parameters.

    """
    nlines = 0
    while True:
        line = safe_readline(uhandle)
        # A failure message (a true value) means the run of matching
        # lines is over; leave that line for the next reader.
        if _fails_conditions(line, **keywds):
            uhandle.saveline(line)
            break
        method(line)
        nlines += 1
    return nlines
322
def read_and_call_until(uhandle, method, **keywds):
    """read_and_call_until(uhandle, method[, start][, end][, contains][, blank][, has_re]) -> number of lines

    Read a line from uhandle and pass it to the method until
    some condition is true.  The first line that passes the conditions
    is put back on uhandle with saveline() and reading stops.  Returns
    the number of lines that were passed to the method.

    See the docstring for read_and_call for a description of the parameters.

    """
    nlines = 0
    while True:
        line = safe_readline(uhandle)
        # No failure message means this line meets the conditions, so
        # it is the terminator; leave it for the next reader.
        if not _fails_conditions(line, **keywds):
            uhandle.saveline(line)
            break
        method(line)
        nlines += 1
    return nlines
343
def attempt_read_and_call(uhandle, method, **keywds):
    """attempt_read_and_call(uhandle, method, **keywds) -> boolean

    Similar to read_and_call, but returns a boolean specifying
    whether the line has passed the checks instead of raising an
    exception when the checks fail.  A line that fails is put back on
    uhandle with saveline() and the method is not called.

    See docs for read_and_call for a description of the function
    arguments.

    """
    line = safe_readline(uhandle)
    passed = not _fails_conditions(line, **keywds)
    if passed:
        method(line)
    else:
        # Leave the unconsumed line available to the next reader.
        uhandle.saveline(line)
    return passed
362
def _fails_conditions(line, start=None, end=None, contains=None, blank=None,
                      has_re=None):
    """Check a line against the optional conditions.

    Returns None when every supplied condition holds, otherwise a
    message describing the first condition that failed.  Conditions
    left as None are not checked.

    o start - Text the line must begin with.

    o end - Text the line must end with, after trailing whitespace
    (including EOL characters) has been removed.

    o contains - Text that must appear somewhere in the line.

    o blank - A true value requires a blank line, a false value other
    than None requires a non-blank line.

    o has_re - Compiled regular expression that must match somewhere
    in the line.

    """
    if start is not None and not line.startswith(start):
        return "Line does not start with '%s':\n%s" % (start, line)
    # endswith() also handles end == '' correctly; slicing with
    # [-len(end):] would return the whole line in that case.
    if end is not None and not line.rstrip().endswith(end):
        return "Line does not end with '%s':\n%s" % (end, line)
    if contains is not None and contains not in line:
        return "Line does not contain '%s':\n%s" % (contains, line)
    if blank is not None:
        if blank:
            if not is_blank_line(line):
                return "Expected blank line, but got:\n%s" % line
        elif is_blank_line(line):
            return "Expected non-blank line, but got a blank one"
    if has_re is not None and has_re.search(line) is None:
        return "Line does not match regex '%s':\n%s" % (
            has_re.pattern, line)
    return None
386
def is_blank_line(line, allow_spaces=0):
    """is_blank_line(line, allow_spaces=0) -> boolean

    Return whether a line is blank.  allow_spaces specifies whether to
    allow whitespaces in a blank line.  A true value signifies that a
    line containing whitespaces as well as end-of-line characters
    should be considered blank.

    An empty string counts as blank.

    """
    if not line:
        return True
    if allow_spaces:
        # str.rstrip() replaces string.rstrip(), which no longer
        # exists in Python 3.
        return line.rstrip() == ''
    # Strict mode: the line must consist of its EOL characters only,
    # so its very first character already has to be one.
    return line[0] in '\r\n'
401
def safe_readline(handle):
    """safe_readline(handle) -> line

    Read a line from an UndoHandle and return it.  If there are no more
    lines to read, I will raise a ValueError.

    """
    line = handle.readline()
    # readline() gives back an empty string only at end of input; a
    # blank line still contains its EOL character.
    if not line:
        # Call form of raise: valid in both Python 2 and Python 3.
        raise ValueError("Unexpected end of stream.")
    return line
413
def safe_peekline(handle):
    """safe_peekline(handle) -> line

    Peek at the next line in an UndoHandle and return it.  If there are no
    more lines to peek, I will raise a ValueError.

    """
    line = handle.peekline()
    # An empty result is treated as end of input.
    if not line:
        # Call form of raise: valid in both Python 2 and Python 3.
        raise ValueError("Unexpected end of stream.")
    return line
425