Package Bio :: Package InterPro
[hide private]
[frames | no frames]

Source Code for Package Bio.InterPro

  1  # Copyright 2001 by Katharine Lindner.  All rights reserved. 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license.  Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5   
  6  """ 
  7  This module provides code to work with html files from InterPro, 
  8  and code to access resources at InterPro over the WWW. 
  9  http://www.ebi.ac.uk/interpro/ 
 10   
 11   
 12  Classes: 
 13  Record             Holds interpro sequence data. 
 14  InterProParser     Parses interpro sequence data into a Record object. 
 15   
 16  Functions: 
 17  get_interpro_entry 
 18   
 19  """ 
 20   
 21  from Bio import File 
 22  import sgmllib 
 23  from Bio.SeqFeature import Reference 
 24   
class Record(dict):
    """Dictionary of InterPro entry fields with a readable text rendering.

    Keys are field names such as 'Name', 'Abstract' or 'References';
    values are strings or lists.
    """

    def __str__(self):
        """Return the record as text, one field after another, sorted by key.

        - 'References': a header line with the key, one line per reference
          (converted with str()), then an extra blank line.
        - 'Abstract': a header line, then the first 80 characters of the
          value followed by '...'.
        - 'Examples' and any other list-valued field: a header line, then
          one line per item.
        - everything else: a single 'key: value' line.
        """
        parts = []
        for key in sorted(self):
            val = self[key]
            if key == 'References':
                parts.append('\n%s\n' % key)
                for reference in val:
                    parts.append('%s\n' % str(reference))
                # Only the reference block is closed by a blank line.
                parts.append('\n')
            elif key == 'Abstract':
                parts.append('\n%s\n' % key)
                # Abstracts can be long; show just a preview.
                parts.append('%s...\n' % val[:80])
            elif key == 'Examples' or isinstance(val, list):
                parts.append('\n%s\n' % key)
                for item in val:
                    parts.append('%s\n' % item)
            else:
                parts.append('%s: %s\n' % (key, val))
        return ''.join(parts)
53
class InterProParser(sgmllib.SGMLParser):
    """Parse an InterPro entry HTML page into a Record object.

    The parser is a small state machine driven by sgmllib tag callbacks:

    - ``_state`` is the region currently being read: 'title',
      'chugging_along', 'waiting_tag', 'waiting_inf', 'examples' or
      'references'.
    - ``_key_waiting`` is the field name read from the most recent
      ``<td class="tag">`` cell; the following ``<td class="inf">`` cell
      supplies its value.
    - ``_reference_state`` is the part of a literature reference expected
      next: 'pubmed_id', 'authors', 'title', 'journal' or 'medline_id'.
    """

    def reset(self):
        """Reset the SGML parser and all per-document parsing state.

        A new Record is created with every recognised field present:
        scalar fields start as '' and list fields start as [].
        """
        sgmllib.SGMLParser.reset(self)
        self.text = ''
        record = Record()
        for key in ('Database', 'Accession', 'Name', 'Dates', 'Type',
                    'Parent', 'Process', 'Function', 'Component'):
            record[key] = ''
        record['Signatures'] = []
        record['Abstract'] = ''
        record['Examples'] = []
        record['References'] = []
        record['Database links'] = []
        self.inter_pro_dict = record
        self._state = 'title'
        self._reference_state = ''
        self._key_waiting = ''
        # '' means "no reference started yet"; start_li replaces it with
        # a Reference object.
        self._current_reference = ''

    def parse(self, handle):
        """Parse the InterPro page read from handle and return its Record."""
        self.reset()
        self.feed(handle)
        return self.inter_pro_dict

    def feed(self, handle):
        """Read an InterPro HTML page from handle and run the SGML parser.

        handle is a file-like object (or a File.UndoHandle) containing the
        page.  Lines are stripped and joined with single spaces.  Reading
        stops at end of input or at the first line ending in '</HTML>';
        that line itself is not parsed.
        """
        if isinstance(handle, File.UndoHandle):
            uhandle = handle
        else:
            uhandle = File.UndoHandle(handle)
        lines = []
        while True:
            line = uhandle.readline()
            if not line:
                break
            line = line.strip()
            if line.endswith('</HTML>'):
                break
            lines.append(line)
        # Every line, including the first, is preceded by one space.
        text = ''.join([' ' + line for line in lines])

        sgmllib.SGMLParser.feed(self, text)

    def handle_data(self, newtext):
        """Accumulate stripped character data until the next flush."""
        self.text = self.text + newtext.strip()

    def start_table(self, attrs):
        """Accept <table>; no information is extracted from it."""
        # NOTE(review): presumably kept so that sgmllib treats <table> as
        # a known tag -- confirm before removing.
        pass

    def start_h2(self, attrs):
        """Accept <h2>; the state change happens in end_h2."""
        pass

    def end_h2(self):
        """Closing </h2> moves the parser to 'chugging_along'."""
        self._state = 'chugging_along'

    def start_td(self, attrs):
        """Enter a field-name ('tag') or field-value ('inf') table cell."""
        if self._state != 'chugging_along':
            return
        cell_class = dict(attrs).get('class')
        if cell_class == 'tag':
            self._state = 'waiting_tag'
            # Discard text collected before the cell.
            self._flush_text()
        elif cell_class == 'inf':
            self._state = 'waiting_inf'
            self._flush_text()

    def end_td(self):
        """Finish a cell: remember the field name or store the field value."""
        if self._state == 'waiting_tag':
            self._key_waiting = self._flush_text()
            self._state = 'chugging_along'
        elif self._state == 'waiting_inf':
            key = self._key_waiting
            if key in self.inter_pro_dict:
                val = self._flush_text()
                # 'Signatures' and 'Database links' are lists filled in by
                # do_br; the remaining cell text is dropped for them.
                if key not in ('Signatures', 'Database links'):
                    self.inter_pro_dict[key] = val
            # NOTE(review): nesting inferred from a flattened listing --
            # confirm these two resets belong to the 'waiting_inf' branch.
            self._key_waiting = ''
            self._state = 'chugging_along'

    def start_ul(self, attrs):
        """A <ul> inside the 'Examples' field starts the example list."""
        if self._key_waiting == 'Examples':
            self._state = 'examples'
            self._flush_text()

    def end_ul(self):
        """Closing </ul> ends any list and clears the pending field name."""
        self._key_waiting = ''
        self._state = 'chugging_along'

    def start_ol(self, attrs):
        """An <ol> inside the 'References' field starts the reference list."""
        if self._key_waiting == 'References':
            self._state = 'references'
            self._reference_state = 'pubmed_id'
            self._flush_text()
            self._references = []

    def end_ol(self):
        """Store the collected references when the reference list closes."""
        if self._state == 'references':
            # Skip the '' placeholder so an empty list yields no entry
            # (same guard as in start_li).
            if self._current_reference != '':
                self._references.append(self._current_reference)
            self.inter_pro_dict['References'] = self._references
        # NOTE(review): nesting inferred from a flattened listing --
        # confirm this reset is unconditional, as it is in end_ul.
        self._state = 'chugging_along'

    def start_li(self, attrs):
        """Begin a new reference, saving the previous one if there was one."""
        if self._state == 'references':
            self._reference_state = 'pubmed_id'
            self._flush_text()
            if self._current_reference != '':
                self._references.append(self._current_reference)
            self._current_reference = Reference()

    def end_li(self):
        """In the example list, each </li> closes one example."""
        if self._state == 'examples':
            text = self._flush_text()
            self.inter_pro_dict['Examples'].append(text)

    def start_a(self, attrs):
        """Handle anchors inside a reference.

        The first anchor carries the PubMed id in its 'name' attribute.
        A later anchor marks the end of the journal text and the start of
        the MEDLINE id.
        """
        if self._state != 'references':
            return
        attributes = dict(attrs)
        if self._reference_state == 'pubmed_id':
            if 'name' in attributes:
                self._current_reference.pubmed_id = attributes['name']
                # NOTE(review): nesting inferred from a flattened listing
                # -- confirm the move to 'authors' requires a 'name'.
                self._reference_state = 'authors'
        elif self._reference_state == 'journal':
            self._current_reference.journal = self._flush_text()
            self._reference_state = 'medline_id'

    def end_a(self):
        """Extract the MEDLINE id from the anchor text of a reference.

        The text is split on ':'; the second field, minus its final
        character, is the id.  Without a ':' the id is set to None.
        """
        if self._state == 'references':
            if self._reference_state == 'medline_id':
                fields = self._flush_text().split(':')
                if len(fields) > 1:
                    medline_id = fields[1][:-1]
                else:
                    medline_id = None
                self._current_reference.medline_id = medline_id

    def do_br(self, attrs):
        """Use <br> as an item separator.

        Inside a reference it ends the author list.  Otherwise, while the
        pending field is 'Signatures' or 'Database links', the text
        collected so far is appended to that list.
        """
        if self._state == 'references':
            if self._reference_state == 'authors':
                self._current_reference.authors = self._flush_text()
                self._reference_state = 'title'
        elif self._key_waiting == 'Signatures':
            self.inter_pro_dict['Signatures'].append(self._flush_text())
        elif self._key_waiting == 'Database links':
            self.inter_pro_dict['Database links'].append(self._flush_text())

    def start_i(self, attrs):
        """Accept <i>; the title is captured in end_i."""
        pass

    def end_i(self):
        """Closing </i> ends the title of the current reference."""
        if self._state == 'references':
            if self._reference_state == 'title':
                self._current_reference.title = self._flush_text()
                self._reference_state = 'journal'

    def handle_starttag(self, tag, method, attrs):
        """Dispatch a start tag, removing some tags from the open-tag stack.

        Inside the reference list, <li> and the PubMed <a> are popped off
        ``self.stack`` before their handler is called.
        """
        # NOTE(review): presumably these tags are not reliably closed in
        # the page, so they must not stay on sgmllib's stack -- confirm.
        if self._state == 'references':
            if tag == 'li':
                self.stack.pop()
            elif tag == 'a':
                if self._reference_state == 'pubmed_id':
                    self.stack.pop()
        method(attrs)

    def _flush_text(self):
        """Return the accumulated text, stripped, and clear the buffer."""
        text = self.text.strip()
        self.text = ''
        return text
249
def pairlist_to_dict( pairs ):
    """Deprecated: build a dictionary from a sequence of (key, value) pairs.

    Issues a warning and returns dict(pairs).  Use dict() directly.
    """
    import warnings
    # stacklevel=2 attributes the warning to the caller's line rather
    # than to this function.
    warnings.warn("pairlist_to_dict was deprecated. Please use dict() instead of pairlist_to_dict",
                  stacklevel=2)
    return dict(pairs)
254
def get_interpro_entry( id ):
    """Open the InterPro entry page for accession id and return the handle.

    The accession is appended to the IEntry query URL on www.ebi.ac.uk
    and the URL is opened with urllib.urlopen.  The resulting file-like
    object is returned unread.
    """
    import urllib
    entry_url = "http://www.ebi.ac.uk/interpro/IEntry?ac=" + id

    # XXX need to check to see if the entry exists!
    return urllib.urlopen(entry_url)
if __name__ == '__main__':
    # Ad-hoc check: parse a locally saved InterPro page and print it.
    import Bio.File
    handle = open('IPR001064.htm')
    try:
        undo_handle = Bio.File.UndoHandle(handle)
        interpro_parser = InterProParser()
        # Pass the wrapped handle that was built for the parser.
        record = interpro_parser.parse(undo_handle)
    finally:
        # Close the file even if parsing fails.
        handle.close()
    print(str(record))