Package Bio :: Package CDD
[hide private]
[frames | no frames]

Source Code for Package Bio.CDD

  1  # Copyright 2002 by Katharine Lindner.  All rights reserved. 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license.  Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5  """Deal with Conserved Domain Database (CDD) entries from NCBI. 
  6  """ 
  7   
  8  import warnings 
  9  warnings.warn("Bio.CDD was deprecated, as it cannot parse recent HTML files from the CDD database. If you want to continue to use this module, please get in contact with the Biopython developers at biopython-dev@biopython.org to avoid permanent removal of this module from Biopython", DeprecationWarning) 
 10   
 11   
 12  # standard library 
 13  import string 
 14  import array 
 15  import os 
 16  import re 
 17  import sgmllib 
 18  import urlparse 
 19   
 20   
 21  # XML from python 2.0 
 22  from xml.sax import handler 
 23   
 24  # Martel 
 25  import Martel 
 26  from Martel import RecordReader 
 27   
 28  from Bio.FilteredReader import FilteredReader 
 29  from Bio.FilteredReader import remove_empty_line 
 30  from Bio.FilteredReader import remove_leading_whitespace 
 31  from Bio.SGMLExtractor import SGMLExtractorHandle 
 32  from Bio import File 
 33  from Bio.Seq import Seq 
 34  from Martel.Dispatch import Dispatcher 
 35  import cdd_format 
 36  import Record 
 37   
class Iterator:
    """Iterate over a file of CDD entries, one record at a time.

    The iterator expects a handle to an SGML file.  It extracts the data
    bracketed by the 'title' and 'table' tag pairs, removes blank lines
    and leading whitespace, and hands the filtered text to the optional
    parser.
    """

    def __init__(self, handle, parser = None):
        """Initialize the iterator.

        Arguments:
        o handle - A handle with CDD entries to iterate through.
        o parser - An optional parser to pass the entries through before
        returning them. If None, then the raw entry will be returned.
        """
        record_handle = SGMLExtractorHandle( handle, [ 'title', 'table', ] )
        filtered_handle = FilteredReader( record_handle )
        filtered_handle.filter_chain = [ remove_empty_line, remove_leading_whitespace ]
        self.handle = File.UndoHandle( filtered_handle )
        self._reader = RecordReader.Everything( self.handle )
        self._parser = parser

    def next(self):
        """Return the next CDD record from the handle.

        When a parser was supplied and the reader produced data, the
        parsed record is returned; otherwise the reader's value is
        returned unchanged.  __iter__ treats a None result as the end of
        the records.
        """
        data = self._reader.next()

        # Nothing to parse: either the caller asked for raw records or the
        # reader produced no data, so pass its value straight through.
        if self._parser is None or not data:
            return data

        # NOTE: an earlier revision wrote every record to a file named
        # 'dump' in the current working directory at this point.  That was
        # a debugging leftover (it clobbered files and failed in read-only
        # directories) and has been removed.
        return self._parser.parse(File.StringHandle(data))

    def __iter__(self):
        """Iterate by calling next() until it returns None."""
        return iter(self.next, None)
77
78 -class _Scanner:
79 """Start up Martel to do the scanning of the file. 80 81 This initialzes the Martel based parser and connects it to a handler 82 that will generate events for a Feature Consumer. 83 """
84 - def __init__(self, debug_level = 0):
85 """Initialize the scanner by setting up our caches. 86 87 Creating the parser takes a long time, so we want to cache it 88 to reduce parsing time. 89 90 Arguments: 91 o debug - The level of debugging that the parser should 92 display. Level 0 is no debugging, Level 2 displays the most 93 debugging info (but is much slower). See Martel documentation 94 for more info on this. 95 """ 96 # a listing of all tags we are interested in scanning for 97 # in the MartelParser 98 self.interest_tags = [ "cd_tag", \ 99 "description_tag", \ 100 "status_tag", \ 101 "source_tag", \ 102 "date_tag", \ 103 "taxonomy_tag", \ 104 "aligned_tag", \ 105 "representative_tag", \ 106 "range_tag", \ 107 "sequence_tag", \ 108 "description_contents_multiline", \ 109 "status_contents_multiline", \ 110 "source_contents_multiline", \ 111 "date_contents_multiline", \ 112 "reference_contents_multiline", \ 113 "taxonomy_contents_multiline", \ 114 "aligned_contents_multiline", \ 115 "representative_contents_multiline", \ 116 "range_contents_multiline", \ 117 "cd_contents_multiline", \ 118 "sequence_contents_multiline", \ 119 "table_entry" ] 120 121 # make a parser that returns only the tags we are interested in 122 expression = Martel.select_names( cdd_format.cdd_record, self.interest_tags) 123 self._parser = expression.make_parser(debug_level )
124
125 - def feed(self, handle, consumer):
126 """Feeed a set of data into the scanner. 127 128 Arguments: 129 o handle - A handle with the information to parse. 130 o consumer - The consumer that should be informed of events. 131 """ 132 consumer.set_interest_tags( self.interest_tags ) 133 self._parser.setContentHandler( consumer ) 134 # self._parser.setErrorHandler(handle.ErrorHandler()) 135 136 self._parser.parseFile(handle)
137
class _RecordConsumer( Dispatcher ):
    """Create a CDD Record object from scanner generated information.

    Each start_* handler begins capturing character data and the matching
    end_* handler consumes what was captured.  A "*_tag" element supplies
    the key for the "*_contents_multiline" element that follows it.
    save_characters() and get_characters() are not defined here; they are
    inherited through Dispatcher.
    """

    def __init__(self):
        Dispatcher.__init__( self )
        # The record being built; returned to the caller by RecordParser.
        self.data = Record.Record()
        # Key captured from the most recent *_tag element, waiting for its
        # contents to arrive.
        self._pending_key = ''

    def set_interest_tags( self, interest_tags ):
        """Remember the list of tag names the scanner will report."""
        self.interest_tags = interest_tags

    # -- cd ---------------------------------------------------------------

    def start_cd_tag( self, line, attrs ):
        self.save_characters()

    def end_cd_tag( self, cdd_record ):
        self.save_key()

    def start_cd_contents_multiline( self, text, attrs ):
        self.save_characters()

    def end_cd_contents_multiline( self, cdd_record ):
        self.add_entry()

    # -- description ------------------------------------------------------

    def start_description_tag( self, text, attrs ):
        self.save_characters()

    def end_description_tag( self, cdd_record ):
        self.save_key()

    def start_description_contents_multiline( self, text, attrs ):
        self.save_characters()

    def end_description_contents_multiline( self, cdd_record ):
        self.add_entry()

    # -- status -----------------------------------------------------------

    def start_status_tag( self, text, attrs ):
        self.save_characters()

    def end_status_tag( self, cdd_record ):
        self.save_key()

    def start_status_contents_multiline( self, text, attrs ):
        self.save_characters()

    def end_status_contents_multiline( self, cdd_record ):
        self.add_entry()

    # -- source -----------------------------------------------------------

    def start_source_tag( self, text, attrs ):
        self.save_characters()

    def end_source_tag( self, cdd_record ):
        self.save_key()

    def start_source_contents_multiline( self, text, attrs ):
        self.save_characters()

    def end_source_contents_multiline( self, cdd_record ):
        self.add_entry()

    # -- date -------------------------------------------------------------

    def start_date_tag( self, text, attrs ):
        self.save_characters()

    def end_date_tag( self, cdd_record ):
        self.save_key()

    def start_date_contents_multiline( self, text, attrs ):
        self.save_characters()

    def end_date_contents_multiline( self, cdd_record ):
        self.add_entry()

    # -- reference (no tag element; every body is appended to a list) ------

    def start_reference_contents_multiline( self, text, attrs ):
        self.save_characters()

    def end_reference_contents_multiline( self, cdd_record ):
        reference = self.get_characters()
        self.data[ 'references' ].append( reference )

    # -- taxonomy ---------------------------------------------------------

    def start_taxonomy_tag( self, text, attrs ):
        self.save_characters()

    def end_taxonomy_tag( self, cdd_record ):
        self.save_key()

    def start_taxonomy_contents_multiline( self, text, attrs ):
        self.save_characters()

    def end_taxonomy_contents_multiline( self, cdd_record ):
        self.add_entry()

    # -- aligned ----------------------------------------------------------

    def start_aligned_tag( self, text, attrs ):
        self.save_characters()

    def end_aligned_tag( self, cdd_record ):
        self.save_key()

    def start_aligned_contents_multiline( self, text, attrs ):
        self.save_characters()

    def end_aligned_contents_multiline( self, cdd_record ):
        self.add_entry()

    # -- representative ---------------------------------------------------

    def start_representative_tag( self, text, attrs ):
        self.save_characters()

    def end_representative_tag( self, cdd_record ):
        self.save_key()

    def start_representative_contents_multiline( self, text, attrs ):
        self.save_characters()

    def end_representative_contents_multiline( self, cdd_record ):
        self.add_entry()

    # -- range ------------------------------------------------------------

    def start_range_tag( self, text, attrs ):
        self.save_characters()

    def end_range_tag( self, cdd_record ):
        self.save_key()

    def start_range_contents_multiline( self, text, attrs ):
        self.save_characters()

    def end_range_contents_multiline( self, cdd_record ):
        self.add_entry()

    # -- sequence ---------------------------------------------------------

    def start_sequence_tag( self, text, attrs ):
        self.save_characters()

    def end_sequence_tag( self, cdd_record ):
        self.save_key()

    def start_sequence_contents_multiline( self, text, attrs ):
        self.save_characters()

    def end_sequence_contents_multiline( self, cdd_record ):
        """Store the captured sequence as a Seq under the pending key.

        The captured text is split into lines, each line is stripped and
        the pieces are joined without separators.  Unlike add_entry(),
        the pending key is left in place afterwards.
        """
        text = self.get_characters()
        key = self._pending_key
        residues = "".join( [ piece.strip() for piece in text.splitlines() ] )
        self.data[ key ] = Seq( residues )

    # -- table entry ------------------------------------------------------

    def start_table_entry( self, text, attrs ):
        self.save_characters()

    def end_table_entry( self, cdd_record ):
        """Split a captured table entry into a key and a value.

        Every stripped line is classified on its own: a line ending in
        '[CD]' (case-insensitive) contributes to the value with that
        4-character suffix removed, a line longer than 60 characters
        contributes to the value as-is, and any other line contributes to
        the key.  The pieces are concatenated without separators and
        stored in the record's 'alignment_lookup' mapping.
        """
        key_parts = []
        val_parts = []
        for line in self.get_characters().splitlines():
            line = line.strip()
            if line.upper().endswith( '[CD]' ):
                val_parts.append( line[ :-4 ] )
            elif len( line ) > 60:
                val_parts.append( line )
            else:
                key_parts.append( line )
        self.data[ 'alignment_lookup' ][ "".join( key_parts ) ] = "".join( val_parts )

    # -- shared helpers ---------------------------------------------------

    def save_key( self ):
        """Remember the captured tag text as the key for the next entry.

        The final character of the captured text is dropped (presumably
        the ':' that ends a field label -- confirm against cdd_format).
        """
        key = self.get_characters()
        self._pending_key = key[ : -1 ]

    def add_entry( self ):
        """Store the captured text under the pending key, then clear it."""
        key = self._pending_key
        self._pending_key = ""
        self.data[ key ] = self.get_characters()
317
class RecordParser:
    """Parse CDD files into Record objects.
    """

    def __init__(self, debug_level = 0):
        """Initialize the parser.

        Arguments:
        o debug_level - An optional argument that specifies the amount of
        debugging information Martel should spit out. By default we have
        no debugging info (the fastest way to do things), but if you want
        you can set this as high as two and see exactly where a parse fails.
        """
        self._scanner = _Scanner( debug_level )

    def parse(self, handle):
        """Parse the specified handle into a CDD record.

        A fresh consumer is created for every call, fed by the scanner
        built in __init__, and the record it accumulated is returned.
        """
        consumer = _RecordConsumer()
        # Kept on the instance as well, as before, so the consumer of the
        # most recent parse remains reachable.
        self._consumer = consumer
        self._scanner.feed( handle, consumer )
        return self._consumer.data
338