Package Bio :: Package Ndb
[hide private]
[frames | no frames]

Source Code for Package Bio.Ndb

  1  # Copyright 2002 by Katharine Lindner.  All rights reserved. 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license.  Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5   
  6  """ 
  7  This module provided code to parse HTML files from NDB (DEPRECATED). 
  8   
  9  This module provides an HTML parser designed for the NDB website 
 10  http://ndbserver.rutgers.edu/ as it was circa 2002.  The site has since 
 11  been redesigned, breaking the parser.  Bio.Ndb is therefore deprecated, 
 12  and will be removed in a future release of Biopython. 
 13   
 14  Classes: 
 15  Record             Holds NDB sequence data. 
 16  NdbParser          Parses NDB sequence data into a Record object. 
 17   
 18  The algorithm is based on a state machine because the record has multiple  
 19  sections and the handling of tags varies depending on the section.   
 20  Citations have their own state machine. 
 21  """ 
 22  import warnings 
 23  warnings.warn("Bio.Ndb has been deprecated as the NDB website it used to"\ 
 24                " parse has been redesigned.", DeprecationWarning) 
 25   
 26  from types import * 
 27  from Bio import File 
 28  from Bio import Index 
 29  from Bio.Crystal import Hetero 
 30  from Bio.Crystal import Chain 
 31  from Bio.Crystal import Crystal 
 32  from Bio.SeqFeature import Reference 
 33  import urllib 
 34  import sgmllib 
 35  from Bio.ParserSupport import * 
 36  from Bio.SeqFeature import Reference 
 37   
 38   
class Record(dict):
    """Dictionary holding the fields parsed from one NDB entry.

    Every known field is pre-populated with an empty default, so a key can
    be read even when the parser never filled it in.
    """

    def __init__(self):
        """Create a record whose fields all hold their empty defaults."""
        dict.__init__(self)
        self['Id'] = ''
        self['Features'] = ''
        self['Name'] = ''
        self['Sequence'] = Crystal({})
        self['Citation'] = Reference()
        self['Space Group'] = ''
        self['Cell Constants'] = {}
        self['Crystallization Conditions'] = []
        self['Refinement'] = ''
        self['Coordinates'] = ''

    def __str__(self):
        """Return a multi-line text dump of the record, keys in sorted order.

        List values are written as a header line (the key) followed by one
        item per line.  Dict values are written as a header line followed by
        'subkey : value' lines in sorted subkey order.  Anything else is
        written as a single 'key: value' line.
        """
        # Collect fragments and join once instead of growing a string with
        # repeated concatenation.
        parts = []
        # sorted() is used rather than keys().sort() so this also works
        # where keys() does not return a list.
        for key in sorted(self.keys()):
            val = self[key]
            if isinstance(val, list):
                parts.append('\n%s\n' % key)
                for item in val:
                    parts.append('%s\n' % item)
            elif isinstance(val, dict):
                # A single isinstance test covers both plain dicts and dict
                # subclasses; the original had two identical branches.
                parts.append('\n%s\n' % key)
                for subkey in sorted(val.keys()):
                    parts.append('%s : %s\n' % (subkey, val[subkey]))
            else:
                parts.append('%s: %s\n' % (key, val))
        return ''.join(parts)
80
81 -def _parse_constants( text ):
82 items = text.split( '=' ) 83 constants = {} 84 key = '' 85 for i in range( 0, ( len( items ) - 1 ) ): 86 item = items[ i ] 87 item = item.strip() 88 separator = item.rfind( ' ' ) 89 if( separator < 0 ): 90 separator = 0 91 val = item[ :separator ] 92 val = val.strip() 93 if( key != '' ): 94 constants[ key ] = val 95 key = item[ separator: ] 96 key = key.strip() 97 constants[ key ] = items[ -1 ] 98 return constants
99 100 101 102 103
class NdbParser(sgmllib.SGMLParser):
    """Parses Ndb sequence data into a Record object.

    data available at: http://ndbserver.rutgers.edu/NDB/NDBATLAS/index.html

    The parser is a small state machine.  self._state names the section of
    the page currently being read; it starts as 'id' and is advanced by the
    text of each <h2> heading.  Character data is accumulated in self.text
    and consumed with _flush_text() when a tag that delimits a field is
    seen.  The citation section has its own sub-state,
    self._reference_state, which steps through
    'authors' -> 'title' -> 'journal' -> 'done'.
    """

    # Ordered (keyword, state) pairs used by end_h2 to pick the next section
    # from the lower-cased heading text.  The first keyword found wins, so
    # the order is significant.
    _SECTION_KEYWORDS = (
        ('feature', 'features'),
        ('name', 'name'),
        ('sequence', 'sequence'),
        ('citation', 'citation'),
        ('space', 'space'),
        ('constants', 'constants'),
        ('crystallization', 'crystallization'),
        ('refinement', 'refinement'),
        ('coordinates', 'coordinates'),
    )

    def reset(self):
        """Reset the SGML parser and start a fresh, empty Record."""
        sgmllib.SGMLParser.reset(self)
        self.ndb_dict = Record()
        self.text = ''
        self._space_group = ''
        self._state = 'id'
        self._reference_state = 'authors'
        self._current_reference = Reference()

    def parse(self, handle):
        """Parse the page readable from handle and return the Record."""
        self.reset()
        self.feed(handle)
        return self.ndb_dict

    def feed(self, handle):
        """feed(self, handle)

        Feed in ndb data for scanning.  handle is a file-like object
        containing ndb data; it is wrapped in a File.UndoHandle unless it
        already is one.

        Lines are read until end of input or until a line ending in
        '</HTML>' is met (that line itself is discarded), stripped, joined
        with single spaces and handed to the SGML parser in one piece.
        """
        if isinstance(handle, File.UndoHandle):
            uhandle = handle
        else:
            uhandle = File.UndoHandle(handle)
        pieces = []
        while 1:
            line = uhandle.readline()
            if not line:
                break
            line = line.strip()
            if line.endswith('</HTML>'):
                break
            # Each line is preceded by one space, exactly as the original
            # incremental concatenation produced.
            pieces.append(' ' + line)
        sgmllib.SGMLParser.feed(self, ''.join(pieces))

    def handle_data(self, newtext):
        """Append a stripped chunk of character data to the pending text."""
        self.text = self.text + newtext.strip()

    def start_h1(self, attrs):
        """Discard any text collected before the <h1> heading."""
        self._flush_text()

    def end_h1(self):
        """Take the entry id from the <h1> text while still looking for it."""
        text = self._flush_text()
        if self._state == 'id':
            self._store_id(text)

    def start_h2(self, attrs):
        """Store the text of the section that the new <h2> heading closes."""
        text = self._flush_text()
        state = self._state
        if state == 'features':
            self.ndb_dict['Features'] = text
        elif state == 'name':
            self.ndb_dict['Name'] = text
        elif state == 'citation':
            if self._reference_state == 'journal':
                self._current_reference.journal = text
                self.ndb_dict['Citation'] = self._current_reference
        elif state == 'space':
            self._space_group = self._space_group + text
            self.ndb_dict['Space Group'] = self._space_group
        elif state == 'constants':
            self.ndb_dict['Cell Constants'] = _parse_constants(text)
        elif state == 'refinement':
            self.ndb_dict['Refinement'] = text
        elif state == 'coordinates':
            self.ndb_dict['Coordinates'] = text
        # 'sequence' and 'crystallization' are filled in by the list
        # handlers, so there is nothing to store for them here.

    def end_h2(self):
        """Use the <h2> heading text to select the section being entered."""
        text = self._flush_text().lower()
        if self._state == 'id':
            # Until an id has been found, headings are only inspected for it.
            if 'id' in text:
                self._store_id(text)
            return
        for keyword, state in self._SECTION_KEYWORDS:
            if keyword in text:
                self._state = state
                break

    def start_ul(self, attrs):
        """Drop text preceding a list in the list-based sections."""
        if self._state in ('sequence', 'crystallization'):
            self._flush_text()

    def end_ul(self):
        """Finish the last list item, or the journal field of a citation."""
        if self._state == 'citation':
            if self._reference_state == 'journal':
                self._current_reference.journal = self._flush_text()
                self._reference_state = 'done'
                # Once the sub-state is 'done', start_h2 no longer copies
                # the reference into the record, so store it here.
                self.ndb_dict['Citation'] = self._current_reference
        else:
            self._flush_list_item()

    def start_sub(self, attrs):
        """Move text preceding a subscript into the space group string."""
        if self._state == 'space':
            self._space_group = self._space_group + self._flush_text()

    def end_sub(self):
        """Append the subscript text, in parentheses, to the space group."""
        if self._state == 'space':
            self._space_group = self._space_group + '(%s) ' % self._flush_text()

    def start_li(self, attrs):
        """Record whatever text was pending when a new list item opens."""
        self._flush_list_item()

    def end_li(self):
        """Record the text of the list item that just closed."""
        self._flush_list_item()

    def do_br(self, attrs):
        """Advance the citation sub-state at each line break.

        The text before the first <br> becomes the authors, the text before
        the second becomes the title; what follows is the journal.
        """
        if self._state == 'citation':
            if self._reference_state == 'authors':
                self._current_reference.authors = self._flush_text()
                self._reference_state = 'title'
            elif self._reference_state == 'title':
                self._current_reference.title = self._flush_text()
                self._reference_state = 'journal'

    def start_i(self, attrs):
        """Opening italics carry no information; nothing to do."""
        pass

    def end_i(self):
        """Close an italic span (currently has no effect, see note)."""
        # NOTE(review): no code in this class ever sets _state to
        # 'references' (the citation section uses 'citation'), so this
        # branch is never taken.  It is kept as-is because activating it
        # would change how titles containing italic spans are split.
        if self._state == 'references':
            if self._reference_state == 'title':
                text = self._flush_text()
                self._current_reference.title = text
                self._reference_state = 'journal'

    def _store_id(self, text):
        """Record the id found after the first ':' in a heading.

        A heading without ':' is ignored and the parser keeps looking,
        instead of failing with an IndexError.
        """
        cols = text.split(':')
        if len(cols) < 2:
            return
        self.ndb_dict['Id'] = cols[1].upper()
        self._state = 'id_found'

    def _flush_list_item(self):
        """Consume pending text as a chain or a crystallization condition."""
        if self._state == 'sequence':
            self._parse_chain()
        elif self._state == 'crystallization':
            text = self._flush_text()
            self.ndb_dict['Crystallization Conditions'].append(text)

    def _parse_chain(self):
        """Store pending text of the form 'chain <label>: <sequence>'.

        Text that does not start with 'chain' (case-insensitive) is
        discarded.  Text that starts with 'chain' but lacks a label or a
        ':' is also discarded rather than raising an IndexError.
        """
        text = self._flush_text()
        if not text.lower().startswith('chain'):
            return
        fields = text.split(':')
        words = fields[0].split()
        if len(fields) < 2 or len(words) < 2:
            return
        self.ndb_dict['Sequence'][words[1]] = fields[1]

    def _flush_text(self):
        """Return the pending text, stripped, and clear the buffer."""
        text = self.text.strip()
        self.text = ''
        return text
if __name__ == '__main__':
    # Ad-hoc smoke test: parse the sample page 'PR0004.htm' from the current
    # working directory and print the resulting record.
    handle = open('PR0004.htm')
    try:
        # Pass the UndoHandle that is built here; the original created it
        # but then handed the raw file object to the parser instead.
        undo_handle = File.UndoHandle(handle)
        ndb_parser = NdbParser()
        record = ndb_parser.parse(undo_handle)
    finally:
        # Always release the file, even if parsing fails.
        handle.close()
    # Parenthesised single argument: valid as both a print statement and a
    # print() call, with identical output.
    print(str(record))