6 """Code for dealing with lists of URLs (DEPRECATED).
7
8 This module is now deprecated, and will be removed in a future release of
9 Biopython.
10
11 NetCatch enables the user to scan a list of labelled urls and select
12 a subset to read into a file.
13
14 Functions:
15 get_urls_by_label
16 get_urls_by_index
17 get_urls_by_range
18 select_output_file
19 """
20
import warnings
warnings.warn("Bio.NetCatch is deprecated, and will be removed in a future"
              " release of Biopython. If you want to continue to use this"
              " code, please get in contact with the Biopython developers"
              " via the mailing lists to avoid its permanent removal from"
              " Biopython.", DeprecationWarning)
import os
import urllib
import sgmllib
from Bio import File


def is_absolute_url( candidate ):
    # An absolute url must carry both a scheme and a host.
    ( url_type, url ) = urllib.splittype( candidate )
    if( url_type == None ):
        return 0
    ( url_host, url ) = urllib.splithost( url )
    if( url_host == None ):
        return 0
    return 1
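
# For example (the URLs below are only illustrative):
#
#     is_absolute_url( 'http://www.example.org/index.html' )  returns 1
#     is_absolute_url( 'index.html' )                         returns 0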

"""
ExtractUrls.py


Scans a file in http format and builds a dictionary of urls
"""

class ExtractUrls( sgmllib.SGMLParser ):

    def __init__( self ):
        sgmllib.SGMLParser.__init__( self )
        self.reset()

    def reset( self ):
        sgmllib.SGMLParser.reset( self )
        self.urls = {}
        self._inlink = 0
        self._pending_url = ''
        self.text = ''

    def __str__( self ):
        output = ''
        for key in self.urls.keys():
            val = self.urls[ key ]
            output = output + '%s : %s\n' % ( key, val )
        return output

    def extract_urls( self, handle ):
        self.feed( handle )
        return self.urls

    def feed( self, handle ):
        """feed( self, handle )

        Feed in data for scanning; handle is a file-like object
        containing html.
        """
        if isinstance( handle, File.UndoHandle ):
            uhandle = handle
        else:
            uhandle = File.UndoHandle( handle )
        text = uhandle.read()
        sgmllib.SGMLParser.feed( self, text )

    def handle_data( self, data ):
        # Accumulate text only while inside an anchor tag, so the
        # link's label can be recovered in end_a.
        if self._inlink:
            self.text = self.text + data

    def start_a( self, attrs ):
        self._inlink = 1
        for key, val in attrs:
            if key.lower() == 'href':
                self._pending_url = val

    def end_a( self ):
        self._inlink = 0
        key = self.text
        self.text = ''
        if not key == '':
            key = key.replace( ' ', '_' )
            self.urls[ key ] = self._pending_url

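# A minimal usage sketch for ExtractUrls (the file name below is only
# illustrative):
#
#     extractor = ExtractUrls()
#     urls = extractor.extract_urls( open( 'links.html' ) )
#     for label, url in urls.items():
#         print label, url
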

class NetCatch:
    """
    Decorator for a dictionary of links. Each link is indexed by its label.
    Allows the user to select links of interest and read each selection into
    its own file. The filename is constructed by appending the label with an
    extension of '.htm'.

    Files can be selected by index, range or label. The destination directory
    defaults to the current directory. The user can specify another
    directory by passing a list of path segments to the constructor.

    net_catch = NetCatch()
    net_catch = NetCatch( [ 'amylase', 'species' ] )
    net_catch.get_all_urls()
    net_catch.get_urls_by_label( [ 'pig', 'dog', 'cow' ] )
    net_catch.get_urls_by_index( [ 1, 4, 6, 9 ] )
    net_catch.get_urls_by_range( 2, 5 )
    """

    def __init__( self, path_segments = [] ):
        self._urls = {}
        self._labels = []
        assert type( path_segments ) == type( [] )
        self.path_segments = path_segments
        self._build_path()

    def _build_path( self ):
        base_path = os.path.join( '' )
        for segment in self.path_segments:
            base_path = os.path.join( base_path, segment )
        self.base_path = base_path

    def __str__( self ):
        i = 0
        output = ''
        for label in self._labels:
            output = output + '%d %s: %s\n' % ( i, label, self._urls[ label ] )
            i = i + 1
        return output

    def import_dict( self, href_dict ):
        for ( key, val ) in href_dict.items():
            self.add_url( key, val )

    def add_url( self, label, url ):
        assert is_absolute_url( url )
        assert type( label ) == type( '' )
        self._labels.append( label )
        self._urls[ label ] = url

    def get_all_urls( self ):
        url_opener = urllib.URLopener()
        i = 0
        for label in self._labels:
            base_path = self.base_path
            name = '%s%d.htm' % ( label, i )
            full_path = os.path.join( base_path, name )
            out_handle = open( full_path, "wb" )
            i = i + 1
            url = self._urls[ label ]
            url_handle = url_opener.open( url )
            contents = url_handle.read()
            out_handle.write( contents )
            url_handle.close()
            out_handle.close()

    def get_urls_by_label( self, labels ):
        url_opener = urllib.URLopener()
        for label in labels:
            base_path = self.base_path
            name = '%s.htm' % label
            full_path = os.path.join( base_path, name )
            out_handle = open( full_path, "wb" )
            url = self._urls[ label ]
            url_handle = url_opener.open( url )
            contents = url_handle.read()
            out_handle.write( contents )
            url_handle.close()
            out_handle.close()

    def get_urls_by_index( self, indices ):
        url_opener = urllib.URLopener()
        for index in indices:
            label = self._labels[ index ]
            base_path = self.base_path
            name = '%s.htm' % label
            full_path = os.path.join( base_path, name )
            out_handle = open( full_path, "wb" )
            url = self._urls[ label ]
            url_handle = url_opener.open( url )
            contents = url_handle.read()
            out_handle.write( contents )
            url_handle.close()
            out_handle.close()

    def get_urls_by_range( self, low, hi ):
        url_opener = urllib.URLopener()
        for index in range( low, hi ):
            label = self._labels[ index ]
            base_path = self.base_path
            name = '%s.htm' % label
            full_path = os.path.join( base_path, name )
            out_handle = open( full_path, "wb" )
            url = self._urls[ label ]
            url_handle = url_opener.open( url )
            contents = url_handle.read()
            out_handle.write( contents )
            url_handle.close()
            out_handle.close()
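
# A minimal end-to-end sketch, feeding the links harvested by ExtractUrls
# into a NetCatch (the file name and directory segments below are only
# illustrative):
#
#     extractor = ExtractUrls()
#     urls = extractor.extract_urls( open( 'links.html' ) )
#     net_catch = NetCatch( [ 'amylase', 'species' ] )
#     net_catch.import_dict( urls )
#     net_catch.get_urls_by_range( 0, 3 )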