1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23 """Access the PDB over the internet (for example to download structures)."""
24
25 import gzip
26 import os
27 import shutil
28 from urllib2 import urlopen as _urlopen
29 import warnings
30
31 from Bio import BiopythonDeprecationWarning
32
33
35 """
36 This class provides quick access to the structure lists on the
37 PDB server or its mirrors. The structure lists contain
38 four-letter PDB codes, indicating that structures are
39 new, have been modified or are obsolete. The lists are released
40 on a weekly basis.
41
42 It also provides a function to retrieve PDB files from the server.
43 To use it properly, prepare a directory /pdb or the like,
44 where PDB files are stored.
45
If you want to use this module from behind a proxy, add
the proxy variable to your environment, e.g. on Unix:
export HTTP_PROXY='http://realproxy.charite.de:888'
(This can also be added to ~/.bashrc)
50 """
51
52 PDB_REF="""
53 The Protein Data Bank: a computer-based archival file for macromolecular structures.
54 F.C.Bernstein, T.F.Koetzle, G.J.B.Williams, E.F.Meyer Jr, M.D.Brice, J.R.Rodgers, O.Kennard, T.Shimanouchi, M.Tasumi
55 J. Mol. Biol. 112 pp. 535-542 (1977)
56 http://www.pdb.org/.
57 """
58
59 alternative_download_url = "http://www.rcsb.org/pdb/files/"
60
61
def __init__(self, server='ftp://ftp.wwpdb.org', pdb=None, obsolete_pdb=None):
    """Initialize the class with the default server or a custom one.

    server       -- base URL of the PDB server (or a mirror).
    pdb          -- root directory of the local PDB tree.  Defaults to the
                    working directory at the time of the call.
    obsolete_pdb -- directory for obsolete entries.  Defaults to an
                    'obsolete' directory below the local tree, which is
                    created if it does not exist yet.  A directory passed
                    in by the caller is used as is and is not created.
    """
    # Remote PDB server.
    self.pdb_server = server

    # Local PDB file tree.  The working directory is looked up here rather
    # than in the signature: a default argument is evaluated only once,
    # when the function is defined, and would ignore any later chdir.
    if pdb is None:
        pdb = os.getcwd()
    self.local_pdb = pdb

    # Local file tree for obsolete PDB files.
    if obsolete_pdb:
        self.obsolete_pdb = obsolete_pdb
    else:
        self.obsolete_pdb = os.path.join(self.local_pdb, 'obsolete')
        # isdir (not mere existence) so that a plain file squatting on the
        # name makes makedirs fail here instead of breaking later moves.
        if not os.path.isdir(self.obsolete_pdb):
            os.makedirs(self.obsolete_pdb)

    # Variables for the command-line options.
    self.overwrite = 0
    self.flat_tree = 0
81
82
# NOTE(review): the def line is missing from the reviewed text; this
# signature is inferred from the call self.get_status_list(<url>) - confirm.
def get_status_list(self, url):
    """Retrieve the list of PDB codes in one weekly PDB status file.

    The file at the given URL is expected to contain one PDB name per
    line.  Used when collecting the weekly added / modified / obsolete
    lists.

    url -- location of the status file, opened with _urlopen.

    Returns the codes as a list, in file order.  Blank lines are skipped;
    any other line that is not exactly four characters long after
    stripping fails the assertion.
    """
    handle = _urlopen(url)
    try:
        answer = []
        for line in handle:
            pdb = line.strip()
            if not pdb:
                # A stray empty line is not an entry.
                continue
            assert len(pdb) == 4, "unexpected status list entry %r" % pdb
            answer.append(pdb)
    finally:
        # Release the connection even if a malformed line aborts parsing.
        handle.close()
    return answer
98
99
# NOTE(review): the def line is missing from the reviewed text; this
# signature is inferred from the call self.get_recent_changes() - confirm.
def get_recent_changes(self):
    """Return three lists of the newest weekly files (added, mod, obsolete).

    Reads the listing of the status directory on the PDB server, picks
    the entry with the largest numerical name and fetches the lists of
    new, modified and obsolete PDB codes stored below it.

    Returns [added, modified, obsolete], each a list of PDB codes.
    Raises IndexError if the listing contains no all-digit entry; errors
    from opening the URLs are passed on unchanged.

    Contents of the data/status dir (20031013 would be used);
    drwxrwxr-x   2 1002     sysadmin     512 Oct  6 18:28 20031006
    drwxrwxr-x   2 1002     sysadmin     512 Oct 14 02:14 20031013
    -rw-r--r--   1 1002     sysadmin    1327 Mar 12  2001 README
    """
    status_url = self.pdb_server + '/pub/pdb/data/status/'
    listing = _urlopen(status_url)
    try:
        rows = listing.readlines()
    finally:
        listing.close()

    # The entry name is the last whitespace-separated field of each row;
    # the weekly directories are the names made up of digits only.
    names = [row.split()[-1] for row in rows if row.strip()]
    dated = [name for name in names if name.isdigit()]
    # Sort numerically instead of trusting the server's listing order.
    # Indexing the sorted list keeps the IndexError on an empty result.
    recent = sorted(dated, key=int)[-1]
    path = status_url + '%s/' % recent

    added = self.get_status_list(path + 'added.pdb')
    modified = self.get_status_list(path + 'modified.pdb')
    obsolete = self.get_status_list(path + 'obsolete.pdb')
    return [added, modified, obsolete]
124
# NOTE(review): the def line is missing from the reviewed text; this
# signature is inferred from the call self.get_all_entries() - confirm.
def get_all_entries(self):
    """Retrieve the big index file of all PDB entries and list its codes.

    Downloads derived_data/index/entries.idx from the PDB server, which
    holds all PDB entries and some annotation to them.

    Returns a list with the first four characters of every line after
    the first two, ignoring lines of four characters or fewer.
    """
    print("retrieving index file. Takes about 5 MB.")
    handle = _urlopen(self.pdb_server +
                      '/pub/pdb/derived_data/index/entries.idx')
    try:
        lines = handle.readlines()
    finally:
        # The original never closed the connection.
        handle.close()
    # The first two lines are skipped (presumably the column header).
    return [line[:4] for line in lines[2:] if len(line) > 4]
134
# NOTE(review): the def line is missing from the reviewed text; this
# signature is inferred from the call self.get_all_obsolete() - confirm.
def get_all_obsolete(self):
    """Return a list of all obsolete entries ever in the PDB.

    Gets and parses data/status/obsolete.dat from the PDB server.  Only
    lines starting with "OBSLTE " are considered, and of those the third
    whitespace-separated field (the first pdb_code column) is used.
    Records with fewer than three fields are skipped.  The file looks
    like this:

     LIST OF OBSOLETE COORDINATE ENTRIES AND SUCCESSORS
    OBSLTE    31-JUL-94 116L     216L
    ...
    OBSLTE    29-JAN-96 1HFT     2HFT
    OBSLTE    21-SEP-06 1HFV     2J5X
    OBSLTE    21-NOV-03 1HG6
    OBSLTE    18-JUL-84 1HHB     2HHB 3HHB
    OBSLTE    08-NOV-96 1HID     2HID
    OBSLTE    01-APR-97 1HIU     2HIU
    OBSLTE    14-JAN-04 1HKE     1UUZ
    ...

    Returns the list of obsolete PDB codes in file order.
    """
    handle = _urlopen(self.pdb_server +
                      '/pub/pdb/data/status/obsolete.dat')
    try:
        # Extract pdb codes.  They could be matched by position, but
        # taking the third field is easier.
        obsolete = []
        for line in handle:
            if not line.startswith("OBSLTE "):
                continue
            fields = line.split()
            if len(fields) < 3:
                # Truncated record without a code column.
                continue
            pdb = fields[2]
            assert len(pdb) == 4, "unexpected obsolete entry %r" % pdb
            obsolete.append(pdb)
    finally:
        # Release the connection even if a malformed record aborts parsing.
        handle.close()
    return obsolete
170
def retrieve_pdb_file(self, pdb_code, obsolete=0, compression=None,
                      uncompress=None, pdir=None):
    """Retrieve a PDB structure file from the server into a local file tree.

    The entry is downloaded as a .gz archive, decompressed with the gzip
    module and stored as pdb<code>.ent, with the code in lower case.  The
    archive is removed afterwards.  If the decompressed file already
    exists and self.overwrite is false, nothing is downloaded.

    Destination directory:
      - pdir, when given;
      - otherwise self.local_pdb (self.obsolete_pdb if obsolete is true),
        used directly when self.flat_tree is set, else a sub-directory
        named after the second and third character of the code.
    The directory is created if it does not exist.

    compression and uncompress are ignored; passing either one only
    triggers a BiopythonDeprecationWarning.

    @param pdb_code: PDB identifier of the entry (any case)
    @type pdb_code: string

    @param obsolete: if true, fetch from the server's obsolete tree
                     and store below self.obsolete_pdb

    @param pdir: put the file in this directory (default: create a PDB-style directory tree)
    @type pdir: string

    @return: filename
    @rtype: string
    """
    # Alert the user about deprecated parameters.
    if compression is not None:
        warnings.warn("PDB file servers now only host .gz archives: "
                      "the compression parameter will not do anything",
                      BiopythonDeprecationWarning)
    if uncompress is not None:
        warnings.warn("Decompression is handled with the gzip module: "
                      "the uncompression parameter will not do anything",
                      BiopythonDeprecationWarning)

    # Get the structure.
    code = pdb_code.lower()
    if obsolete:
        remote_name = '/pub/pdb/data/structures/obsolete/pdb/%s/pdb%s.ent.gz'
        local_root = self.obsolete_pdb
    else:
        remote_name = '/pub/pdb/data/structures/divided/pdb/%s/pdb%s.ent.gz'
        local_root = self.local_pdb
    url = self.pdb_server + remote_name % (code[1:3], code)

    # Where does the final PDB file get saved?
    if pdir is not None:
        # The caller chose the directory.
        path = pdir
    elif self.flat_tree:
        path = local_root
    else:
        # Put in PDB-style directory tree.
        path = os.path.join(local_root, code[1:3])
    if not os.access(path, os.F_OK):
        os.makedirs(path)

    filename = os.path.join(path, "pdb%s.ent.gz" % code)
    # The final uncompressed file.
    final_file = os.path.join(path, "pdb%s.ent" % code)

    # Skip the download if the file already exists.
    if not self.overwrite and os.path.exists(final_file):
        print("Structure exists: '%s' " % final_file)
        return final_file

    # Retrieve the archive, closing both ends whatever happens.
    print("Downloading PDB structure '%s'..." % pdb_code)
    remote = _urlopen(url)
    try:
        archive = open(filename, 'wb')
        try:
            shutil.copyfileobj(remote, archive)
        finally:
            archive.close()
    finally:
        remote.close()

    # Uncompress the archive, then delete it.
    partial = False
    try:
        gz = gzip.open(filename, 'rb')
        try:
            out = open(final_file, 'wb')
            partial = True
            try:
                shutil.copyfileobj(gz, out)
                partial = False
            finally:
                out.close()
        finally:
            gz.close()
    finally:
        os.remove(filename)
        if partial:
            # A truncated .ent file would be taken for a finished
            # download by the "already exists" check on the next call.
            os.remove(final_file)

    return final_file
253
254
# NOTE(review): the def line is missing from the reviewed text; this
# signature is inferred from the call pl.update_pdb() - confirm.
def update_pdb(self):
    """Bring the local PDB tree up to date with the newest weekly lists.

    Gets the weekly lists of new and modified pdb entries and downloads
    the corresponding PDB files; a failed download is reported and the
    run continues.  Local files of entries listed as obsolete are then
    moved into the obsolete tree.

    Both self.local_pdb and self.obsolete_pdb must be existing
    directories.  You can run this as a weekly cronjob.
    """
    assert os.path.isdir(self.local_pdb), \
        "local PDB directory does not exist"
    assert os.path.isdir(self.obsolete_pdb), \
        "obsolete PDB directory does not exist"

    new, modified, obsolete = self.get_recent_changes()

    for pdb_code in new + modified:
        try:
            self.retrieve_pdb_file(pdb_code)
        except Exception:
            # Best effort: one unavailable entry must not stop the update.
            print('error %s\n' % pdb_code)

    # Move the obsolete files to a special folder.
    for pdb_code in obsolete:
        # retrieve_pdb_file stores files under the lower-cased code, so
        # look them up the same way whatever case the list uses.
        code = pdb_code.lower()
        entry_name = 'pdb%s.ent' % code
        if self.flat_tree:
            old_file = os.path.join(self.local_pdb, entry_name)
            new_dir = self.obsolete_pdb
        else:
            old_file = os.path.join(self.local_pdb, code[1:3], entry_name)
            new_dir = os.path.join(self.obsolete_pdb, code[1:3])
        new_file = os.path.join(new_dir, entry_name)

        if os.path.isfile(old_file):
            if not os.path.isdir(new_dir):
                os.mkdir(new_dir)
            try:
                shutil.move(old_file, new_file)
            except Exception:
                print("Could not move %s to obsolete folder" % old_file)
        elif os.path.isfile(new_file):
            print("Obsolete file %s already moved" % old_file)
        else:
            print("Obsolete file %s is missing" % old_file)
297
298
# NOTE(review): the def line is missing from the reviewed text; name and
# parameter are inferred from pl.download_entire_pdb() and the use of
# listfile below, the None default from "if listfile:" - confirm.
def download_entire_pdb(self, listfile=None):
    """Retrieve all PDB entries not present in the local PDB copy.

    Every code returned by get_all_entries is passed to
    retrieve_pdb_file.

    listfile -- optional name of a file to which all PDB codes are
                written afterwards, one per line.
    """
    entries = self.get_all_entries()
    for pdb_code in entries:
        self.retrieve_pdb_file(pdb_code)

    # Write the list.
    if listfile:
        outfile = open(listfile, 'w')
        try:
            outfile.writelines((x + '\n' for x in entries))
        finally:
            # Close the file even if writing fails part-way.
            outfile.close()
313
# NOTE(review): the def line is missing from the reviewed text; name and
# parameter are inferred from pl.download_obsolete_entries(...) and the
# use of listfile below, the None default mirrors the sibling - confirm.
def download_obsolete_entries(self, listfile=None):
    """Retrieve all obsolete PDB entries not present in the local obsolete
    PDB copy.

    Every code returned by get_all_obsolete is passed to
    retrieve_pdb_file with obsolete=1.

    listfile -- optional name of a file to which all obsolete PDB codes
                are written afterwards, one per line.
    """
    entries = self.get_all_obsolete()
    for pdb_code in entries:
        self.retrieve_pdb_file(pdb_code, obsolete=1)

    # Write the list.
    if listfile:
        outfile = open(listfile, 'w')
        try:
            outfile.writelines((x + '\n' for x in entries))
        finally:
            # Close the file even if writing fails part-way.
            outfile.close()
330
332 """Retrieves a (big) file containing all the sequences of PDB entries
333 and writes it to a file.
334 """
335 print "retrieving sequence file. Takes about 15 MB."
336 handle = _urlopen(self.pdb_server +
337 '/pub/pdb/derived_data/pdb_seqres.txt')
338 lines = handle.readlines()
339 outfile = open(savefile, 'w')
340 outfile.writelines(lines)
341 outfile.close()
342 handle.close()
343
344
if __name__ == '__main__':
    # Command-line front end: print the usage text, build a PDBList for
    # the requested directory and run the command given in argv[1].

    import sys

    doc = """PDBList.py
    (c) Kristian Rother 2003, Contributed to BioPython

    Usage:
    PDBList.py update <pdb_path> [options]   - write weekly PDB updates to
                                               local pdb tree.
    PDBList.py all    <pdb_path> [options]   - write all PDB entries to
                                               local pdb tree.
    PDBList.py obsol  <pdb_path> [options]   - write all obsolete PDB
                                               entries to local pdb tree.
    PDBList.py <PDB-ID> <pdb_path> [options] - retrieve single structure

    Options:
       -d   A single directory will be used as <pdb_path>, not a tree.
       -o   Overwrite existing structure files.
    """
    print(doc)

    if len(sys.argv) > 2:
        pdb_path = sys.argv[2]
        pl = PDBList(pdb=pdb_path)
        # Everything after the path is an option flag; unknown flags are
        # ignored.
        for option in sys.argv[3:]:
            if option == '-d':
                pl.flat_tree = 1
            elif option == '-o':
                pl.overwrite = 1
    else:
        # No path given: work in the current directory, without sub-trees.
        pdb_path = os.getcwd()
        pl = PDBList()
        pl.flat_tree = 1

    if len(sys.argv) > 1:
        command = sys.argv[1]
        if command == 'update':
            # Update the PDB.
            print("updating local PDB at " + pdb_path)
            pl.update_pdb()

        elif command == 'all':
            # Get the entire PDB.
            pl.download_entire_pdb()

        elif command == 'obsol':
            # Get all obsolete entries.  The destination comes from the
            # PDBList built above; pdb_path must not be passed here, as
            # the argument is the name of a list file to write and
            # pdb_path is a directory.
            pl.download_obsolete_entries()

        elif len(command) == 4 and command[0].isdigit():
            # Get a single PDB entry.
            pl.retrieve_pdb_file(command, pdir=pdb_path)
397