Package Bio :: Package PDB :: Module PDBList'
[hide private]
[frames | no frames]

Source Code for Module Bio.PDB.PDBList'

  1  #!/usr/bin/env python 
  2  # 
  3  # PDBList.py 
  4  # 
  5  # A tool for tracking changes in the PDB Protein Structure Database. 
  6  # 
  7  # Version 2.0 
  8  # 
  9  # (c) 2003 Kristian Rother 
 10  # This work was supported by the German Ministry of Education 
 11  # and Research (BMBF). Project http://www.bcbio.de 
 12  #  
 13  # Contact the author 
 14  #    homepage : http://www.rubor.de/bioinf 
 15  #    email    : krother@genesilico.pl 
 16  # 
 17  # 
 18  # This Code is released under the conditions of the Biopython license. 
 19  # It may be distributed freely with respect to the original author. 
 20  # Any maintainer of the BioPython code may change this notice 
 21  # when appropriate. 
 22  # 
 23  # Last modified on Fri, Oct 24th 2006, Warszawa 
 24  # 
 25  # Removed 'write' options from retrieve_pdb_file method: it is not used. 
 26  # Also added a 'dir' option (pdb file is put in this directory if given), 
 27  # and an 'exist' option (test if the file is already there). This method 
 28  # now returns the name of the downloaded uncompressed file. 
 29  # 
 30  # -Thomas, 1/06/04 
 31  # 
 32  # 
 33  # Including bugfixes from Sunjoong Lee (9/2006) 
 34  # 
 35   
 36  __doc__="Access the PDB over the internet (for example to download structures)." 
 37   
 38  #TODO - Use os.path.join(...) instead of adding strings with os.sep 
 39  import urllib, re, os 
 40  import warnings 
 41  import shutil 
 42   
class PDBList:
    """Quick access to the structure lists on the PDB server or its mirrors.

    The structure lists contain four-letter PDB codes, indicating that
    structures are new, have been modified or are obsolete. The lists are
    released on a weekly basis.

    The class also provides a function to retrieve PDB files from the
    server. To use it properly, prepare a directory /pdb or the like,
    where PDB files are stored.

    If You want to use this module from inside a proxy, add
    the proxy variable to Your environment, e.g. in Unix
    export HTTP_PROXY='http://realproxy.charite.de:888'
    (This can also be added to ~/.bashrc)
    """

    PDB_REF = """
    The Protein Data Bank: a computer-based archival file for macromolecular structures.
    F.C.Bernstein, T.F.Koetzle, G.J.B.Williams, E.F.Meyer Jr, M.D.Brice, J.R.Rodgers, O.Kennard, T.Shimanouchi, M.Tasumi
    J. Mol. Biol. 112 pp. 535-542 (1977)
    http://www.pdb.org/.
    """

    alternative_download_url = "http://www.rcsb.org/pdb/files/"
    # just append PDB code to this, and then it works.
    # (above URL verified with a XXXX.pdb appended on 2 Sept 2008)

    def __init__(self, server='ftp://ftp.wwpdb.org', pdb=None,
                 obsolete_pdb=None):
        """Initialize the class with the default server or a custom one.

        @param server: base URL of the remote PDB server
        @type server: string

        @param pdb: root of the local PDB file tree (default: the current
            working directory at the time of the call)
        @type pdb: string

        @param obsolete_pdb: root of the local tree for obsolete entries
            (default: an 'obsolete' directory below the local PDB tree,
            which is created if missing)
        @type obsolete_pdb: string
        """
        # remote pdb server
        self.pdb_server = server

        # local pdb file tree; resolved here rather than in the signature
        # so that the working directory is looked up per instance and not
        # frozen once at import time
        if pdb is None:
            pdb = os.getcwd()
        self.local_pdb = pdb

        # local file tree for obsolete pdb files
        if obsolete_pdb:
            self.obsolete_pdb = obsolete_pdb
        else:
            self.obsolete_pdb = os.path.join(self.local_pdb, 'obsolete')
            if not os.access(self.obsolete_pdb, os.F_OK):
                os.makedirs(self.obsolete_pdb)

        # variables for command-line options
        self.overwrite = 0
        self.flat_tree = 0

    @staticmethod
    def _open_url(url):
        """Open url and return the file-like response (Python 2 and 3)."""
        try:
            from urllib.request import urlopen  # Python 3
        except ImportError:
            from urllib import urlopen  # Python 2
        return urlopen(url)

    def _read_lines(self, url):
        """Return the resource behind url as a list of text lines.

        The connection is always closed. Lines keep their line endings.
        Byte lines (Python 3) are decoded so that the string comparisons
        done by the callers work on both Python versions.
        """
        handle = self._open_url(url)
        try:
            raw_lines = handle.readlines()
        finally:
            handle.close()
        lines = []
        for raw in raw_lines:
            if not isinstance(raw, str):
                raw = raw.decode('ascii', 'replace')
            lines.append(raw)
        return lines

    def get_status_list(self, url):
        """Retrieves a list of pdb codes in the weekly pdb status file
        from the given URL. Used by get_recent_changes.

        Typical contents of the list files parsed by this method;
        -rw-r--r--   1 rcsb     rcsb      330156 Oct 14  2003 pdb1cyq.ent
        -rw-r--r--   1 rcsb     rcsb      333639 Oct 14  2003 pdb1cz0.ent

        Only entries whose last column ends in '.ent' are used; blank
        lines are ignored.

        @return: four-letter PDB codes (characters 3-6 of the file name)
        @rtype: list of strings
        """
        names = [line.split()[-1]
                 for line in self._read_lines(url) if line.strip()]
        return [name[3:7] for name in names if name.endswith('.ent')]

    def get_recent_changes(self):
        """Returns three lists of the newest weekly files (added,mod,obsolete).

        Reads the directories with changed entries from the PDB server and
        returns a list of three lists with the PDB codes of new, modified
        and obsolete entries from the most recent list. The directory with
        the largest numerical name is used.
        Returns None if the listing cannot be interpreted or one of the
        status files cannot be read. A failure to open the status directory
        itself is raised to the caller.

        Contents of the data/status dir (20031013 would be used);
        drwxrwxr-x   2 1002     sysadmin     512 Oct  6 18:28 20031006
        drwxrwxr-x   2 1002     sysadmin     512 Oct 14 02:14 20031013
        -rw-r--r--   1 1002     sysadmin    1327 Mar 12  2001 README
        """
        status_url = self.pdb_server + '/pub/pdb/data/status/'
        listing = self._read_lines(status_url)

        try:
            names = [line.split()[-1] for line in listing if line.strip()]
            # pick the largest number, independent of the listing order
            recent = max([name for name in names if name.isdigit()],
                         key=int)

            path = status_url + '%s/' % (recent)
            # retrieve the lists
            added = self.get_status_list(path + 'added.pdb')
            modified = self.get_status_list(path + 'modified.pdb')
            obsolete = self.get_status_list(path + 'obsolete.pdb')
            return [added, modified, obsolete]
        except Exception:
            return None

    def get_all_entries(self):
        """Retrieves a big file containing all the
        PDB entries and some annotation to them.
        Returns a list of PDB codes in the index file.

        The first two lines of the index are skipped, as are lines that
        are too short to hold a four-letter code.
        """
        warnings.warn("retrieving index file. Takes about 5 MB.")
        lines = self._read_lines(
            self.pdb_server + '/pub/pdb/derived_data/index/entries.idx')
        # extract four-letter-codes
        return [line[:4] for line in lines[2:] if len(line) > 4]

    def get_all_obsolete(self):
        """Returns a list of all obsolete entries ever in the PDB.

        Returns a list of all obsolete pdb codes (lower case) that have
        ever been in the PDB.

        Gets and parses the file from the PDB server. Only lines starting
        with 'OBSLTE' are used, and the code is read from the fixed
        character positions 21-24 (the first pdb_code column), e.g.
         LIST OF OBSOLETE COORDINATE ENTRIES AND SUCCESSORS
        OBSLTE     30-SEP-03 1Q1D     1QZR
        OBSLTE     26-SEP-03 1DYV     1UN2
        """
        lines = self._read_lines(
            self.pdb_server + '/pub/pdb/data/status/obsolete.dat')
        # extract pdb codes
        return [line[21:25].lower()
                for line in lines if line[:6] == 'OBSLTE']

    def _gunzip(self, filename, final_file):
        """Unpack the gzip file filename into final_file, then delete it."""
        import gzip
        packed = gzip.open(filename, 'rb')
        try:
            data = packed.read()
        finally:
            packed.close()
        # written only after a complete read, so a corrupt archive never
        # leaves a truncated structure file behind
        with open(final_file, 'wb') as unpacked:
            unpacked.write(data)
        os.remove(filename)

    def retrieve_pdb_file(self, pdb_code, obsolete=0, compression='.gz',
                          uncompress="gunzip", pdir=None):
        """Retrieves a PDB structure file from the PDB server and
        stores it in a local file tree.
        The name of the uncompressed local file is returned.
        If obsolete is 1, the file will be by default saved in a special
        file tree.
        The compression should be '.Z' or '.gz'. 'uncompress' is
        the command called to uncompress the files. The default
        combination ('.gz' with 'gunzip') is unpacked with the gzip module,
        so no external program is needed; any other combination runs the
        given command (without a shell) on the downloaded file.

        If the file already exists and self.overwrite is not set, nothing
        is downloaded. A failure to uncompress is reported as a
        RuntimeWarning; the file name is returned nevertheless.

        @param pdir: put the file in this directory (default: create a PDB-style directory tree)
        @type pdir: string

        @return: filename
        @rtype: string
        """
        # get the structure
        code = pdb_code.lower()
        archive_name = "pdb%s.ent%s" % (code, compression)
        if not obsolete:
            remote_dir = '/pub/pdb/data/structures/divided/pdb'
        else:
            remote_dir = '/pub/pdb/data/structures/obsolete/pdb'
        url = '%s%s/%s/%s' % (self.pdb_server, remote_dir,
                              code[1:3], archive_name)

        # in which dir to put the pdb file?
        if pdir is None:
            if not obsolete:
                path = self.local_pdb
            else:
                path = self.obsolete_pdb
            if not self.flat_tree:
                # Put in PDB style directory tree
                path = os.path.join(path, code[1:3])
        else:
            # Put in specified directory
            path = pdir

        if not os.access(path, os.F_OK):
            os.makedirs(path)

        filename = os.path.join(path, archive_name)
        # the final uncompressed file
        final_file = os.path.join(path, "pdb%s.ent" % code)

        # check whether the file exists
        if not self.overwrite:
            if os.path.exists(final_file):
                warnings.warn("file exists, not retrieved %s" % final_file,
                              RuntimeWarning)
                return final_file

        # Retrieve the file
        warnings.warn('retrieving %s' % url)
        handle = self._open_url(url)
        try:
            data = handle.read()
        finally:
            handle.close()
        with open(filename, 'wb') as archive:
            archive.write(data)

        # uncompress the file
        import zlib
        try:
            if compression == '.gz' and uncompress == 'gunzip':
                self._gunzip(filename, final_file)
            else:
                import shlex
                import subprocess
                # argument list instead of a shell string: paths with
                # spaces or shell metacharacters are passed through intact
                subprocess.call(shlex.split(uncompress) + [filename])
        except (IOError, OSError, EOFError, zlib.error):
            warnings.warn("could not uncompress %s" % filename,
                          RuntimeWarning)

        return final_file

    def update_pdb(self):
        """
        I guess this is the 'most wanted' function from this module.
        It gets the weekly lists of new and modified pdb entries and
        automatically downloads the according PDB files.
        You can call this module as a weekly cronjob.

        Files of entries that became obsolete are moved from the local
        tree to the obsolete tree; obsolete codes without a local file
        are skipped. If the weekly lists cannot be read, a RuntimeWarning
        is issued and nothing is changed.
        """
        changes = self.get_recent_changes()
        if changes is None:
            warnings.warn('could not read the recent changes from %s'
                          % self.pdb_server, RuntimeWarning)
            return
        new, modified, obsolete = changes[0], changes[1], changes[2]

        for pdb_code in new + modified:
            try:
                warnings.warn('retrieving %s' % pdb_code)
                self.retrieve_pdb_file(pdb_code)
            except Exception:
                warnings.warn('error %s' % pdb_code, RuntimeWarning)
                # you can insert here some more log notes that
                # something has gone wrong.

        # move the obsolete files to a special folder
        for pdb_code in obsolete:
            entry = 'pdb%s.ent' % (pdb_code)
            if self.flat_tree:
                old_file = os.path.join(self.local_pdb, entry)
                new_file = os.path.join(self.obsolete_pdb, entry)
            else:
                old_file = os.path.join(self.local_pdb, pdb_code[1:3], entry)
                new_file = os.path.join(self.obsolete_pdb, pdb_code[1:3],
                                        entry)
            if not os.path.exists(old_file):
                # entry was never stored locally, nothing to move
                continue
            target_dir = os.path.dirname(new_file)
            if not os.access(target_dir, os.F_OK):
                os.makedirs(target_dir)
            shutil.move(old_file, new_file)

    def download_entire_pdb(self, listfile=None):
        """Retrieves all PDB entries not present in the local PDB copy.
        Writes a list file containing all PDB codes (optional, if listfile is given).
        """
        entries = self.get_all_entries()
        for pdb_code in entries:
            self.retrieve_pdb_file(pdb_code)

        # write the list
        if listfile:
            with open(listfile, 'w') as outfile:
                outfile.writelines([code + '\n' for code in entries])

    def download_obsolete_entries(self, listfile=None):
        """Retrieves all obsolete PDB entries not present in the local obsolete PDB copy.
        Writes a list file containing all PDB codes (optional, if listfile is given).
        """
        entries = self.get_all_obsolete()
        for pdb_code in entries:
            self.retrieve_pdb_file(pdb_code, obsolete=1)

        # write the list
        if listfile:
            with open(listfile, 'w') as outfile:
                outfile.writelines([code + '\n' for code in entries])

    #
    # this is actually easter egg code not used by any of the methods
    # maybe someone will find it useful.
    #
    def get_seqres_file(self, savefile='pdb_seqres.txt'):
        """Retrieves a (big) file containing all the sequences
        of PDB entries and writes it to a file."""
        warnings.warn("retrieving sequence file. Takes about 15 MB.")
        lines = self._read_lines(
            self.pdb_server + '/pub/pdb/derived_data/pdb_seqres.txt')
        with open(savefile, 'w') as outfile:
            outfile.writelines(lines)
if __name__ == '__main__':
    # Command-line entry point: always prints the usage text, then runs
    # the action named by the first argument (if any).

    import sys

    doc = """PDBList.py
    (c) Kristian Rother 2003, Contributed to BioPython

    Usage:
    PDBList.py update <pdb_path> [options]   - write weekly PDB updates to
                                               local pdb tree.
    PDBList.py all    <pdb_path> [options]   - write all PDB entries to
                                               local pdb tree.
    PDBList.py obsol  <pdb_path> [options]   - write all obsolete PDB
                                               entries to local pdb tree.
    PDBList.py <PDB-ID> <pdb_path> [options] - retrieve single structure

    Options:
       -d   A single directory will be used as <pdb_path>, not a tree.
       -o   Overwrite existing structure files.
    """
    # single-argument print(...) gives the same output on Python 2 and 3
    print(doc)

    if len(sys.argv) > 2:
        # explicit pdb path, optionally followed by option flags
        pdb_path = sys.argv[2]
        pl = PDBList(pdb=pdb_path)
        for option in sys.argv[3:]:
            if option == '-d':
                pl.flat_tree = 1
            elif option == '-o':
                pl.overwrite = 1
    else:
        # no path given: work in the current directory, without subtrees
        pdb_path = os.getcwd()
        pl = PDBList()
        pl.flat_tree = 1

    if len(sys.argv) > 1:
        action = sys.argv[1]
        if action == 'update':
            # update PDB
            print("updating local PDB at " + pdb_path)
            pl.update_pdb()

        elif action == 'all':
            # get the entire PDB
            pl.download_entire_pdb()

        elif action == 'obsol':
            # get all obsolete entries; the only parameter of
            # download_obsolete_entries is the name of a list FILE, so the
            # pdb directory must not be passed to it
            pl.download_obsolete_entries()

        elif re.search(r'^\d...$', action):
            # get single PDB entry
            pl.retrieve_pdb_file(action, pdir=pdb_path)