1
2
3
4
5
6
7
8 """ASTRAL RAF (Rapid Access Format) Sequence Maps.
9
10 The ASTRAL RAF Sequence Maps record the relationship between the PDB SEQRES
11 records (representing the sequence of the molecule used in an experiment) to
12 the ATOM records (representing the atoms experimentally observed).
13
14 This data is derived from the Protein Data Bank CIF files. Known errors in the
15 CIF files are corrected manually, with the original PDB file serving as the
16 final arbiter in case of discrepancies.
17
18 Residues are referenced by residue ID. This consists of a the PDB residue
19 sequence number (upto 4 digits) and an optional PDB insertion code (an
20 ascii alphabetic character, a-z, A-Z). e.g. "1", "10A", "1010b", "-1"
21
22 See "ASTRAL RAF Sequence Maps":http://astral.stanford.edu/raf.html
23
24 to_one_letter_code -- A mapping from the 3-letter amino acid codes found
25 in PDB files to 1-letter codes. The 3-letter codes
26 include chemically modified residues.
27 """
28
29 from copy import copy
30
31 from Bio.SCOP.Residues import Residues
32
33
34
35 to_one_letter_code= {
36 'ALA':'A', 'VAL':'V', 'PHE':'F', 'PRO':'P', 'MET':'M',
37 'ILE':'I', 'LEU':'L', 'ASP':'D', 'GLU':'E', 'LYS':'K',
38 'ARG':'R', 'SER':'S', 'THR':'T', 'TYR':'Y', 'HIS':'H',
39 'CYS':'C', 'ASN':'N', 'GLN':'Q', 'TRP':'W', 'GLY':'G',
40 '2AS':'D', '3AH':'H', '5HP':'E', 'ACL':'R', 'AIB':'A',
41 'ALM':'A', 'ALO':'T', 'ALY':'K', 'ARM':'R', 'ASA':'D',
42 'ASB':'D', 'ASK':'D', 'ASL':'D', 'ASQ':'D', 'AYA':'A',
43 'BCS':'C', 'BHD':'D', 'BMT':'T', 'BNN':'A', 'BUC':'C',
44 'BUG':'L', 'C5C':'C', 'C6C':'C', 'CCS':'C', 'CEA':'C',
45 'CHG':'A', 'CLE':'L', 'CME':'C', 'CSD':'A', 'CSO':'C',
46 'CSP':'C', 'CSS':'C', 'CSW':'C', 'CXM':'M', 'CY1':'C',
47 'CY3':'C', 'CYG':'C', 'CYM':'C', 'CYQ':'C', 'DAH':'F',
48 'DAL':'A', 'DAR':'R', 'DAS':'D', 'DCY':'C', 'DGL':'E',
49 'DGN':'Q', 'DHA':'A', 'DHI':'H', 'DIL':'I', 'DIV':'V',
50 'DLE':'L', 'DLY':'K', 'DNP':'A', 'DPN':'F', 'DPR':'P',
51 'DSN':'S', 'DSP':'D', 'DTH':'T', 'DTR':'W', 'DTY':'Y',
52 'DVA':'V', 'EFC':'C', 'FLA':'A', 'FME':'M', 'GGL':'E',
53 'GLZ':'G', 'GMA':'E', 'GSC':'G', 'HAC':'A', 'HAR':'R',
54 'HIC':'H', 'HIP':'H', 'HMR':'R', 'HPQ':'F', 'HTR':'W',
55 'HYP':'P', 'IIL':'I', 'IYR':'Y', 'KCX':'K', 'LLP':'K',
56 'LLY':'K', 'LTR':'W', 'LYM':'K', 'LYZ':'K', 'MAA':'A',
57 'MEN':'N', 'MHS':'H', 'MIS':'S', 'MLE':'L', 'MPQ':'G',
58 'MSA':'G', 'MSE':'M', 'MVA':'V', 'NEM':'H', 'NEP':'H',
59 'NLE':'L', 'NLN':'L', 'NLP':'L', 'NMC':'G', 'OAS':'S',
60 'OCS':'C', 'OMT':'M', 'PAQ':'Y', 'PCA':'E', 'PEC':'C',
61 'PHI':'F', 'PHL':'F', 'PR3':'C', 'PRR':'A', 'PTR':'Y',
62 'SAC':'S', 'SAR':'G', 'SCH':'C', 'SCS':'C', 'SCY':'C',
63 'SEL':'S', 'SEP':'S', 'SET':'S', 'SHC':'C', 'SHR':'K',
64 'SOC':'C', 'STY':'Y', 'SVA':'S', 'TIH':'A', 'TPL':'W',
65 'TPO':'T', 'TPQ':'A', 'TRG':'K', 'TRO':'W', 'TYB':'Y',
66 'TYQ':'Y', 'TYS':'Y', 'TYY':'Y', 'AGM':'R', 'GL3':'G',
67 'SMC':'C', 'ASX':'B', 'CGU':'E', 'CSX':'C', 'GLX':'Z',
68 'PYX':'C',
69 'UNK':'X'
70 }
71
72
74 """Convert RAF one-letter amino acid codes into IUPAC standard codes.
75
76 Letters are uppercased, and "." ("Unknown") is converted to "X".
77 """
78 if one_letter_code == '.':
79 return 'X'
80 else:
81 return one_letter_code.upper()
82
84 """An RAF file index.
85
86 The RAF file itself is about 50 MB. This index provides rapid, random
87 access of RAF records without having to load the entire file into memory.
88
89 The index key is a concatenation of the PDB ID and chain ID. e.g
90 "2drcA", "155c_". RAF uses an underscore to indicate blank
91 chain IDs.
92 """
93
95 """
96 Arguments:
97
98 filename -- The file to index
99 """
100 dict.__init__(self)
101 self.filename = filename
102
103 f = open(self.filename, "rU")
104 try:
105 position = 0
106 while True:
107 line = f.readline()
108 if not line: break
109 key = line[0:5]
110 if key != None:
111 self[key]=position
112 position = f.tell()
113 finally:
114 f.close()
115
117 """ Return an item from the indexed file. """
118 position = dict.__getitem__(self,key)
119
120 f = open(self.filename, "rU")
121 try:
122 f.seek(position)
123 line = f.readline()
124 record = SeqMap(line)
125 finally:
126 f.close()
127 return record
128
129
131 """Get the sequence map for a collection of residues.
132
133 residues -- A Residues instance, or a string that can be converted into
134 a Residues instance.
135 """
136 if isinstance(residues, basestring):
137 residues = Residues(residues)
138
139 pdbid = residues.pdbid
140 frags = residues.fragments
141 if not frags: frags =(('_','',''),)
142
143 seqMap = None
144 for frag in frags:
145 chainid = frag[0]
146 if chainid=='' or chainid=='-' or chainid==' ' or chainid=='_':
147 chainid = '_'
148 id = pdbid + chainid
149
150
151 sm = self[id]
152
153
154 start = 0
155 end = len(sm.res)
156 if frag[1] : start = int(sm.index(frag[1], chainid))
157 if frag[2] : end = int(sm.index(frag[2], chainid)+1)
158
159 sm = sm[start:end]
160
161 if seqMap == None:
162 seqMap = sm
163 else:
164 seqMap += sm
165
166 return seqMap
167
168
170 """An ASTRAL RAF (Rapid Access Format) Sequence Map.
171
172 This is a list like object; You can find the location of particular residues
173 with index(), slice this SeqMap into fragments, and glue fragments back
174 together with extend().
175
176 pdbid -- The PDB 4 character ID
177
178 pdb_datestamp -- From the PDB file
179
180 version -- The RAF format version. e.g. 0.01
181
182 flags -- RAF flags. (See release notes for more information.)
183
184 res -- A list of Res objects, one for each residue in this sequence map
185 """
186
188 self.pdbid = ''
189 self.pdb_datestamp = ''
190 self.version = ''
191 self.flags = ''
192 self.res = []
193 if line:
194 self._process(line)
195
196
198 """Parses a RAF record into a SeqMap object.
199 """
200 header_len = 38
201
202 line = line.rstrip()
203
204 if len(line)<header_len:
205 raise ValueError("Incomplete header: "+line)
206
207 self.pdbid = line[0:4]
208 chainid = line[4:5]
209
210 self.version = line[6:10]
211
212
213 if(self.version != "0.01" and self.version !="0.02"):
214 raise ValueError("Incompatible RAF version: "+self.version)
215
216 self.pdb_datestamp = line[14:20]
217 self.flags = line[21:27]
218
219 for i in range(header_len, len(line), 7):
220 f = line[i : i+7]
221 if len(f)!=7:
222 raise ValueError("Corrupt Field: ("+f+")")
223 r = Res()
224 r.chainid = chainid
225 r.resid = f[0:5].strip()
226 r.atom = normalize_letters(f[5:6])
227 r.seqres = normalize_letters(f[6:7])
228
229 self.res.append(r)
230
231
232 - def index(self, resid, chainid="_"):
233 for i in range(0, len(self.res)):
234 if self.res[i].resid == resid and self.res[i].chainid == chainid:
235 return i
236 raise KeyError("No such residue "+chainid+resid)
237
239 if not isinstance(index, slice):
240 raise NotImplementedError
241 s = copy(self)
242 s.res = s.res[index]
243 return s
244
246 """Append another Res object onto the list of residue mappings."""
247 self.res.append(res)
248
250 """Append another SeqMap onto the end of self.
251
252 Both SeqMaps must have the same PDB ID, PDB datestamp and
253 RAF version. The RAF flags are erased if they are inconsistent. This
254 may happen when fragments are taken from different chains.
255 """
256 if not isinstance(other, SeqMap):
257 raise TypeError("Can only extend a SeqMap with a SeqMap.")
258 if self.pdbid != other.pdbid:
259 raise TypeError("Cannot add fragments from different proteins")
260 if self.version != other.version:
261 raise TypeError("Incompatible rafs")
262 if self.pdb_datestamp != other.pdb_datestamp:
263 raise TypeError("Different pdb dates!")
264 if self.flags != other.flags:
265 self.flags = ''
266 self.res += other.res
267
271
276
277 - def getAtoms(self, pdb_handle, out_handle):
278 """Extract all relevant ATOM and HETATOM records from a PDB file.
279
280 The PDB file is scanned for ATOM and HETATOM records. If the
281 chain ID, residue ID (seqNum and iCode), and residue type match
282 a residue in this sequence map, then the record is echoed to the
283 output handle.
284
285 This is typically used to find the coordinates of a domain, or other
286 residue subset.
287
288 pdb_handle -- A handle to the relevant PDB file.
289
290 out_handle -- All output is written to this file like object.
291 """
292
293
294
295 resSet = {}
296 for r in self.res:
297 if r.atom=='X' :
298 continue
299 chainid = r.chainid
300 if chainid == '_':
301 chainid = ' '
302 resid = r.resid
303 resSet[(chainid,resid)] = r
304
305 resFound = {}
306 for line in pdb_handle.xreadlines():
307 if line.startswith("ATOM ") or line.startswith("HETATM"):
308 chainid = line[21:22]
309 resid = line[22:27].strip()
310 key = (chainid, resid)
311 if key in resSet:
312 res = resSet[key]
313 atom_aa = res.atom
314 resName = line[17:20]
315 if resName in to_one_letter_code:
316 if to_one_letter_code[resName] == atom_aa:
317 out_handle.write(line)
318 resFound[key] = res
319
320 if len(resSet) != len(resFound):
321
322
323
324
325 raise RuntimeError('I could not find at least one ATOM or HETATM' \
326 +' record for each and every residue in this sequence map.')
327
328
330 """ A single residue mapping from a RAF record.
331
332 chainid -- A single character chain ID.
333
334 resid -- The residue ID.
335
336 atom -- amino acid one-letter code from ATOM records.
337
338 seqres -- amino acid one-letter code from SEQRES records.
339 """
341 self.chainid = ''
342 self.resid = ''
343 self.atom = ''
344 self.seqres = ''
345
346
348 """Iterates over a RAF file, returning a SeqMap object for each line
349 in the file.
350
351 Arguments:
352
353 handle -- file-like object.
354 """
355 for line in handle:
356 yield SeqMap(line)
357