Package Bio :: Package PDB :: Module PDBList'
[hide private]
[frames] | no frames]

Source Code for Module Bio.PDB.PDBList'

  1  #!/usr/bin/env python 
  2  # 
  3  # PDBList.py 
  4  # 
  5  # A tool for tracking changes in the PDB Protein Structure Database. 
  6  # 
  7  # Version 2.0 
  8  # 
  9  # (c) 2003 Kristian Rother 
 10  # This work was supported by the German Ministry of Education 
 11  # and Research (BMBF). Project http://www.bcbio.de 
 12  #  
 13  # Contact the author 
 14  #    homepage : http://www.rubor.de/bioinf 
 15  #    email    : krother@genesilico.pl 
 16  # 
 17  # 
 18  # This Code is released under the conditions of the Biopython license. 
 19  # It may be distributed freely with respect to the original author. 
 20  # Any maintainer of the BioPython code may change this notice 
 21  # when appropriate. 
 22   
 23  """Access the PDB over the internet (for example to download structures).""" 
 24   
 25  import gzip 
 26  import os 
 27  import shutil 
 28  from urllib2 import urlopen as _urlopen 
 29  import warnings 
 30   
 31  from Bio import BiopythonDeprecationWarning 
 32   
 33   
class PDBList(object):
    """Quick access to the structure lists on the PDB server or its mirrors.

    The structure lists contain four-letter PDB codes, indicating that
    structures are new, have been modified or are obsolete. The lists are
    released on a weekly basis.

    The class also provides a method to retrieve PDB files from the server.
    To use it properly, prepare a directory /pdb or the like, where PDB
    files are stored.

    If you want to use this module from behind a proxy, add the proxy
    variable to your environment, e.g. in Unix:
        export HTTP_PROXY='http://realproxy.charite.de:888'
    (This can also be added to ~/.bashrc)
    """

    PDB_REF = """
    The Protein Data Bank: a computer-based archival file for macromolecular structures.
    F.C.Bernstein, T.F.Koetzle, G.J.B.Williams, E.F.Meyer Jr, M.D.Brice, J.R.Rodgers, O.Kennard, T.Shimanouchi, M.Tasumi
    J. Mol. Biol. 112 pp. 535-542 (1977)
    http://www.pdb.org/.
    """

    # Just append a PDB code to this, and then it works.
    alternative_download_url = "http://www.rcsb.org/pdb/files/"

    def __init__(self, server='ftp://ftp.wwpdb.org', pdb=None,
                 obsolete_pdb=None):
        """Initialize the class with the default server or a custom one.

        @param server: base URL of the remote PDB server
        @type server: string

        @param pdb: root of the local PDB file tree; None (the default)
            means the working directory at the time of the call
        @type pdb: string

        @param obsolete_pdb: root of the local tree for obsolete entries.
            When it is not given, an 'obsolete' directory below the local
            tree is used and created if it does not exist yet.
        @type obsolete_pdb: string
        """
        # remote pdb server
        self.pdb_server = server

        # local pdb file tree; resolved here rather than in the signature,
        # where os.getcwd() would be frozen at import time
        if pdb is None:
            pdb = os.getcwd()
        self.local_pdb = pdb

        # local file tree for obsolete pdb files
        if obsolete_pdb:
            self.obsolete_pdb = obsolete_pdb
        else:
            self.obsolete_pdb = os.path.join(self.local_pdb, 'obsolete')
            if not os.access(self.obsolete_pdb, os.F_OK):
                os.makedirs(self.obsolete_pdb)

        # variables for command-line options
        self.overwrite = 0
        self.flat_tree = 0

    def get_status_list(self, url):
        """Retrieve the list of pdb codes in a weekly pdb status file.

        Used by get_recent_changes. The status files are expected to hold
        one four-letter PDB name per line; blank lines are skipped.

        @param url: URL of the status file
        @type url: string

        @return: the PDB codes, in file order
        @rtype: list

        @raise AssertionError: if a non-blank line is not four characters
        """
        handle = _urlopen(url)
        try:
            answer = []
            for line in handle:
                pdb = line.strip()
                if not pdb:
                    continue
                # sanity check against mis-reading the remote data
                assert len(pdb) == 4, "Unexpected status entry %r" % pdb
                answer.append(pdb)
        finally:
            handle.close()
        return answer

    def get_recent_changes(self):
        """Return three lists of the newest weekly codes (added,mod,obsolete).

        Reads the directory listing of changed entries from the PDB server,
        picks the directory with the largest numerical name and downloads
        its three status files.

        Contents of the data/status dir (20031013 would be used);
        drwxrwxr-x   2 1002     sysadmin     512 Oct  6 18:28 20031006
        drwxrwxr-x   2 1002     sysadmin     512 Oct 14 02:14 20031013
        -rw-r--r--   1 1002     sysadmin    1327 Mar 12  2001 README

        @return: [added, modified, obsolete], each a list of PDB codes
        @rtype: list

        @raise IndexError: if the listing contains no all-digit entry
        """
        status_url = self.pdb_server + '/pub/pdb/data/status/'
        handle = _urlopen(status_url)
        try:
            listing = handle.readlines()
        finally:
            handle.close()

        # The entry name is the last whitespace-separated field of each
        # listing line; blank lines carry no name.
        names = [fields[-1]
                 for fields in (line.split() for line in listing)
                 if fields]
        dated = [name for name in names if name.isdigit()]
        # Sort numerically instead of trusting the server's listing order.
        recent = sorted(dated, key=int)[-1]
        path = status_url + '%s/' % recent

        # Retrieve the lists
        added = self.get_status_list(path + 'added.pdb')
        modified = self.get_status_list(path + 'modified.pdb')
        obsolete = self.get_status_list(path + 'obsolete.pdb')
        return [added, modified, obsolete]

    def get_all_entries(self):
        """Retrieve the big index file of all PDB entries.

        The first two lines of the index are skipped, as are lines of
        four characters or fewer.

        @return: the first four characters (the PDB code) of each
            remaining line
        @rtype: list
        """
        print("retrieving index file. Takes about 5 MB.")
        handle = _urlopen(self.pdb_server +
                          '/pub/pdb/derived_data/index/entries.idx')
        try:
            lines = handle.readlines()
        finally:
            handle.close()
        return [line[:4] for line in lines[2:] if len(line) > 4]

    def get_all_obsolete(self):
        """Return a list of all obsolete entries ever in the PDB.

        Gets and parses the file from the PDB server in the format
        (the first pdb_code column is the one used). The file looks
        like this:

         LIST OF OBSOLETE COORDINATE ENTRIES AND SUCCESSORS
        OBSLTE    31-JUL-94 116L     216L
        ...
        OBSLTE    29-JAN-96 1HFT     2HFT
        OBSLTE    21-SEP-06 1HFV     2J5X
        OBSLTE    21-NOV-03 1HG6
        OBSLTE    18-JUL-84 1HHB     2HHB 3HHB
        OBSLTE    08-NOV-96 1HID     2HID
        OBSLTE    01-APR-97 1HIU     2HIU
        OBSLTE    14-JAN-04 1HKE     1UUZ
        ...

        @return: the obsolete PDB codes, in file order
        @rtype: list

        @raise AssertionError: if an extracted code is not four characters
        """
        handle = _urlopen(self.pdb_server +
                          '/pub/pdb/data/status/obsolete.dat')
        try:
            # Extract pdb codes. A plain loop is used rather than a list
            # comprehension so the assert can check for mis-read data.
            obsolete = []
            for line in handle:
                if not line.startswith("OBSLTE "):
                    continue
                pdb = line.split()[2]
                assert len(pdb) == 4, "Unexpected obsolete entry %r" % pdb
                obsolete.append(pdb)
        finally:
            handle.close()
        return obsolete

    def retrieve_pdb_file(self, pdb_code, obsolete=0, compression=None,
                          uncompress=None, pdir=None):
        """Retrieve a PDB structure file and store it in a local file tree.

        The .gz archive is downloaded from the server, decompressed with
        the gzip module and then deleted. If the uncompressed file already
        exists and self.overwrite is false, nothing is downloaded.

        If obsolete is true, the file is fetched from the server's obsolete
        tree and saved in the local obsolete tree.

        compression and uncompress are deprecated and do nothing; passing
        either one only triggers a BiopythonDeprecationWarning.

        @param pdb_code: four-letter PDB code (case-insensitive)
        @type pdb_code: string

        @param pdir: put the file in this directory (default: create a
            PDB-style directory tree)
        @type pdir: string

        @return: filename of the uncompressed structure file
        @rtype: string
        """
        # Alert the user about deprecated parameters
        if compression is not None:
            warnings.warn("PDB file servers now only host .gz archives: "
                          "the compression parameter will not do anything",
                          BiopythonDeprecationWarning)
        if uncompress is not None:
            warnings.warn("Decompression is handled with the gzip module: "
                          "the uncompression parameter will not do anything",
                          BiopythonDeprecationWarning)

        # Build the remote location of the archive
        code = pdb_code.lower()
        archive_name = "pdb%s.ent.gz" % code
        if obsolete:
            subtree = 'obsolete'
        else:
            subtree = 'divided'
        url = (self.pdb_server +
               '/pub/pdb/data/structures/%s/pdb/%s/%s'
               % (subtree, code[1:3], archive_name))

        # In which dir to put the pdb file?
        if pdir is not None:
            # Put in specified directory
            path = pdir
        else:
            if obsolete:
                path = self.obsolete_pdb
            else:
                path = self.local_pdb
            if not self.flat_tree:
                # Put in PDB-style directory tree
                path = os.path.join(path, code[1:3])

        if not os.access(path, os.F_OK):
            os.makedirs(path)

        filename = os.path.join(path, archive_name)
        # the final uncompressed file
        final_file = os.path.join(path, "pdb%s.ent" % code)

        # Skip download if the file already exists
        if not self.overwrite and os.path.exists(final_file):
            print("Structure exists: '%s' " % final_file)
            return final_file

        # Retrieve the file
        print("Downloading PDB structure '%s'..." % pdb_code)
        self._download(url, filename)

        # Uncompress the file; the archive is removed whatever happens
        try:
            self._gunzip(filename, final_file)
        finally:
            os.remove(filename)

        return final_file

    def _download(self, url, filename):
        """Stream the resource at url into the local file filename."""
        remote = _urlopen(url)
        try:
            archive = open(filename, 'wb')
            try:
                shutil.copyfileobj(remote, archive)
            finally:
                archive.close()
        finally:
            remote.close()

    def _gunzip(self, source, target):
        """Decompress the gzip archive source into the file target.

        If decompression fails, the partial target is removed so that it
        is not later mistaken for an already-downloaded structure.
        """
        try:
            gz = gzip.open(source, 'rb')
            try:
                out = open(target, 'wb')
                try:
                    shutil.copyfileobj(gz, out)
                finally:
                    out.close()
            finally:
                gz.close()
        except Exception:
            if os.path.exists(target):
                os.remove(target)
            raise

    def update_pdb(self):
        """Download the weekly new and modified entries, move obsolete ones.

        I guess this is the 'most wanted' function from this module.
        It gets the weekly lists of new and modified pdb entries and
        automatically downloads the according PDB files. Local files of
        entries listed as obsolete are moved to the obsolete tree.
        You can call this module as a weekly cronjob.

        Download and move failures are reported on stdout and do not stop
        the update.

        @raise AssertionError: if the local or the obsolete tree is not an
            existing directory
        """
        assert os.path.isdir(self.local_pdb)
        assert os.path.isdir(self.obsolete_pdb)

        new, modified, obsolete = self.get_recent_changes()

        for pdb_code in new + modified:
            try:
                self.retrieve_pdb_file(pdb_code)
            except Exception:
                print('error %s\n' % pdb_code)
                # you can insert here some more log notes that
                # something has gone wrong.

        # Move the obsolete files to a special folder
        for pdb_code in obsolete:
            # retrieve_pdb_file stores files under the lower-case code, so
            # the same normalisation is needed to find them again
            code = pdb_code.lower()
            entry_name = 'pdb%s.ent' % code
            if self.flat_tree:
                old_file = os.path.join(self.local_pdb, entry_name)
                new_dir = self.obsolete_pdb
            else:
                old_file = os.path.join(self.local_pdb, code[1:3],
                                        entry_name)
                new_dir = os.path.join(self.obsolete_pdb, code[1:3])
            new_file = os.path.join(new_dir, entry_name)
            if os.path.isfile(old_file):
                try:
                    if not os.path.isdir(new_dir):
                        os.makedirs(new_dir)
                    shutil.move(old_file, new_file)
                except Exception:
                    print("Could not move %s to obsolete folder" % old_file)
            elif os.path.isfile(new_file):
                print("Obsolete file %s already moved" % old_file)
            else:
                print("Obsolete file %s is missing" % old_file)

    def _write_code_list(self, listfile, entries):
        """Write the PDB codes in entries to listfile, one per line."""
        outfile = open(listfile, 'w')
        try:
            outfile.writelines(x + '\n' for x in entries)
        finally:
            outfile.close()

    def download_entire_pdb(self, listfile=None):
        """Retrieve all PDB entries not present in the local PDB copy.

        @param listfile: if given, a file with all PDB codes (one per
            line) is written to this path after the downloads
        @type listfile: string
        """
        entries = self.get_all_entries()
        for pdb_code in entries:
            self.retrieve_pdb_file(pdb_code)
        # Write the list
        if listfile:
            self._write_code_list(listfile, entries)

    def download_obsolete_entries(self, listfile=None):
        """Retrieve all obsolete PDB entries not present locally.

        The files are stored in the local obsolete PDB copy.

        @param listfile: if given, a file with all obsolete PDB codes (one
            per line) is written to this path after the downloads
        @type listfile: string
        """
        entries = self.get_all_obsolete()
        for pdb_code in entries:
            self.retrieve_pdb_file(pdb_code, obsolete=1)
        # Write the list
        if listfile:
            self._write_code_list(listfile, entries)

    def get_seqres_file(self, savefile='pdb_seqres.txt'):
        """Retrieve the (big) file with the sequences of all PDB entries.

        The file is streamed to disk instead of being read into memory.

        @param savefile: path of the file to write
        @type savefile: string
        """
        print("retrieving sequence file. Takes about 15 MB.")
        handle = _urlopen(self.pdb_server +
                          '/pub/pdb/derived_data/pdb_seqres.txt')
        try:
            outfile = open(savefile, 'w')
            try:
                shutil.copyfileobj(handle, outfile)
            finally:
                outfile.close()
        finally:
            handle.close()
def _main(argv):
    """Run the PDBList command-line interface.

    Prints the usage text, builds a PDBList for the requested local tree
    and dispatches on the first argument (update, all, obsol or a PDB ID).
    Unrecognised commands are ignored.

    @param argv: the full argument vector, program name first
    @type argv: list
    """
    doc = """PDBList.py
    (c) Kristian Rother 2003, Contributed to BioPython

    Usage:
    PDBList.py update <pdb_path> [options]   - write weekly PDB updates to
                                               local pdb tree.
    PDBList.py all    <pdb_path> [options]   - write all PDB entries to
                                               local pdb tree.
    PDBList.py obsol  <pdb_path> [options]   - write all obsolete PDB
                                               entries to local pdb tree.
    PDBList.py <PDB-ID> <pdb_path> [options] - retrieve single structure

    Options:
       -d   A single directory will be used as <pdb_path>, not a tree.
       -o   Overwrite existing structure files.
    """
    print(doc)

    if len(argv) > 2:
        pdb_path = argv[2]
        pl = PDBList(pdb=pdb_path)
        for option in argv[3:]:
            if option == '-d':
                pl.flat_tree = 1
            elif option == '-o':
                pl.overwrite = 1
    else:
        # No path given: work flat in the current directory
        pdb_path = os.getcwd()
        pl = PDBList(pdb=pdb_path)
        pl.flat_tree = 1

    if len(argv) > 1:
        command = argv[1]
        if command == 'update':
            # update PDB
            print("updating local PDB at " + pdb_path)
            pl.update_pdb()

        elif command == 'all':
            # get the entire PDB
            pl.download_entire_pdb()

        elif command == 'obsol':
            # get all obsolete entries; the positional parameter of
            # download_obsolete_entries is a list *file*, so the pdb_path
            # directory must not be passed to it
            pl.download_obsolete_entries()

        elif len(command) == 4 and command[0].isdigit():
            # get single PDB entry
            pl.retrieve_pdb_file(command, pdir=pdb_path)


if __name__ == '__main__':

    import sys

    _main(sys.argv)