Package Bio :: Package PDB :: Module Polypeptide
[hide private]
[frames] | no frames]

Source Code for Module Bio.PDB.Polypeptide

  1  # Copyright (C) 2002, Thomas Hamelryck (thamelry@binf.ku.dk) 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license.  Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5   
  6  """Polypeptide-related classes (construction and representation). 
  7   
  8  Simple example with multiple chains, 
  9   
 10      >>> from Bio.PDB.PDBParser import PDBParser 
 11      >>> from Bio.PDB.Polypeptide import PPBuilder 
 12      >>> structure = PDBParser().get_structure('2BEG', 'PDB/2BEG.pdb') 
 13      >>> ppb=PPBuilder() 
 14      >>> for pp in ppb.build_peptides(structure): 
 15      ...     print pp.get_sequence() 
 16      LVFFAEDVGSNKGAIIGLMVGGVVIA 
 17      LVFFAEDVGSNKGAIIGLMVGGVVIA 
 18      LVFFAEDVGSNKGAIIGLMVGGVVIA 
 19      LVFFAEDVGSNKGAIIGLMVGGVVIA 
 20      LVFFAEDVGSNKGAIIGLMVGGVVIA 
 21   
 22  Example with non-standard amino acids using HETATM lines in the PDB file, 
 23  in this case selenomethionine (MSE): 
 24   
 25      >>> from Bio.PDB.PDBParser import PDBParser 
 26      >>> from Bio.PDB.Polypeptide import PPBuilder 
 27      >>> structure = PDBParser().get_structure('1A8O', 'PDB/1A8O.pdb') 
 28      >>> ppb=PPBuilder() 
 29      >>> for pp in ppb.build_peptides(structure): 
 30      ...     print pp.get_sequence() 
 31      DIRQGPKEPFRDYVDRFYKTLRAEQASQEVKNW 
 32      TETLLVQNANPDCKTILKALGPGATLEE 
 33      TACQG 
 34   
 35  If you want to, you can include non-standard amino acids in the peptides: 
 36   
 37      >>> for pp in ppb.build_peptides(structure, aa_only=False): 
 38      ...     print pp.get_sequence() 
 39      ...     print pp.get_sequence()[0], pp[0].get_resname() 
 40      ...     print pp.get_sequence()[-7], pp[-7].get_resname() 
 41      ...     print pp.get_sequence()[-6], pp[-6].get_resname() 
 42      MDIRQGPKEPFRDYVDRFYKTLRAEQASQEVKNWMTETLLVQNANPDCKTILKALGPGATLEEMMTACQG 
 43      M MSE 
 44      M MSE 
 45      M MSE 
 46   
 47  In this case the selenomethionines (the first and also seventh and sixth from 
 48  last residues) have been shown as M (methionine) by the get_sequence method. 
 49  """ 
 50   
 51  import warnings 
 52   
 53  from Bio.Alphabet import generic_protein 
 54  from Bio.Seq import Seq 
 55  from Bio.SCOP.Raf import to_one_letter_code 
 56  from Bio.PDB.PDBExceptions import PDBException 
 57  from Bio.PDB.Residue import Residue, DisorderedResidue 
 58  from Bio.PDB.Vector import calc_dihedral, calc_angle 
 59   
 60   
 61  standard_aa_names=["ALA", "CYS", "ASP", "GLU", "PHE", "GLY", "HIS", "ILE", "LYS",  
 62                     "LEU", "MET", "ASN", "PRO", "GLN", "ARG", "SER", "THR", "VAL", 
 63                     "TRP", "TYR"] 
 64   
 65   
 66  aa1="ACDEFGHIKLMNPQRSTVWY" 
 67  aa3=standard_aa_names 
 68   
 69  d1_to_index={} 
 70  dindex_to_1={} 
 71  d3_to_index={} 
 72  dindex_to_3={} 
 73   
 74  # Create some lookup tables 
 75  for i in range(0, 20): 
 76      n1=aa1[i] 
 77      n3=aa3[i] 
 78      d1_to_index[n1]=i 
 79      dindex_to_1[i]=n1 
 80      d3_to_index[n3]=i 
 81      dindex_to_3[i]=n3 
 82   
83 -def index_to_one(index):
84 """Index to corresponding one letter amino acid name. 85 86 >>> index_to_one(0) 87 'A' 88 >>> index_to_one(19) 89 'Y' 90 """ 91 return dindex_to_1[index]
92
93 -def one_to_index(s):
94 """One letter code to index. 95 96 >>> one_to_index('A') 97 0 98 >>> one_to_index('Y') 99 19 100 """ 101 return d1_to_index[s]
102
103 -def index_to_three(i):
104 """Index to corresponding three letter amino acid name. 105 106 >>> index_to_three(0) 107 'ALA' 108 >>> index_to_three(19) 109 'TYR' 110 """ 111 return dindex_to_3[i]
112
113 -def three_to_index(s):
114 """Three letter code to index. 115 116 >>> three_to_index('ALA') 117 0 118 >>> three_to_index('TYR') 119 19 120 """ 121 return d3_to_index[s]
122
123 -def three_to_one(s):
124 """Three letter code to one letter code. 125 126 >>> three_to_one('ALA') 127 'A' 128 >>> three_to_one('TYR') 129 'Y' 130 131 For non-standard amino acids, you get a KeyError: 132 133 >>> three_to_one('MSE') 134 Traceback (most recent call last): 135 ... 136 KeyError: 'MSE' 137 """ 138 i=d3_to_index[s] 139 return dindex_to_1[i]
140
141 -def one_to_three(s):
142 """One letter code to three letter code. 143 144 >>> one_to_three('A') 145 'ALA' 146 >>> one_to_three('Y') 147 'TYR' 148 """ 149 i=d1_to_index[s] 150 return dindex_to_3[i]
151
152 -def is_aa(residue, standard=False):
153 """Return True if residue object/string is an amino acid. 154 155 @param residue: a L{Residue} object OR a three letter amino acid code 156 @type residue: L{Residue} or string 157 158 @param standard: flag to check for the 20 AA (default false) 159 @type standard: boolean 160 161 >>> is_aa('ALA') 162 True 163 164 Known three letter codes for modified amino acids are supported, 165 166 >>> is_aa('FME') 167 True 168 >>> is_aa('FME', standard=True) 169 False 170 """ 171 #TODO - What about special cases like XXX, can they appear in PDB files? 172 if not isinstance(residue, basestring): 173 residue=residue.get_resname() 174 residue=residue.upper() 175 if standard: 176 return residue in d3_to_index 177 else: 178 return residue in to_one_letter_code
179 180
181 -class Polypeptide(list):
182 """A polypeptide is simply a list of L{Residue} objects."""
183 - def get_ca_list(self):
184 """Get list of C-alpha atoms in the polypeptide. 185 186 @return: the list of C-alpha atoms 187 @rtype: [L{Atom}, L{Atom}, ...] 188 """ 189 ca_list=[] 190 for res in self: 191 ca=res["CA"] 192 ca_list.append(ca) 193 return ca_list
194
195 - def get_phi_psi_list(self):
196 """Return the list of phi/psi dihedral angles.""" 197 ppl=[] 198 lng=len(self) 199 for i in range(0, lng): 200 res=self[i] 201 try: 202 n=res['N'].get_vector() 203 ca=res['CA'].get_vector() 204 c=res['C'].get_vector() 205 except: 206 # Some atoms are missing 207 # Phi/Psi cannot be calculated for this residue 208 ppl.append((None, None)) 209 res.xtra["PHI"]=None 210 res.xtra["PSI"]=None 211 continue 212 # Phi 213 if i>0: 214 rp=self[i-1] 215 try: 216 cp=rp['C'].get_vector() 217 phi=calc_dihedral(cp, n, ca, c) 218 except: 219 phi=None 220 else: 221 # No phi for residue 0! 222 phi=None 223 # Psi 224 if i<(lng-1): 225 rn=self[i+1] 226 try: 227 nn=rn['N'].get_vector() 228 psi=calc_dihedral(n, ca, c, nn) 229 except: 230 psi=None 231 else: 232 # No psi for last residue! 233 psi=None 234 ppl.append((phi, psi)) 235 # Add Phi/Psi to xtra dict of residue 236 res.xtra["PHI"]=phi 237 res.xtra["PSI"]=psi 238 return ppl
239
240 - def get_tau_list(self):
241 """List of tau torsions angles for all 4 consecutive Calpha atoms.""" 242 ca_list=self.get_ca_list() 243 tau_list=[] 244 for i in range(0, len(ca_list)-3): 245 atom_list = (ca_list[i], ca_list[i+1], ca_list[i+2], ca_list[i+3]) 246 v1, v2, v3, v4 = [a.get_vector() for a in atom_list] 247 tau=calc_dihedral(v1, v2, v3, v4) 248 tau_list.append(tau) 249 # Put tau in xtra dict of residue 250 res=ca_list[i+2].get_parent() 251 res.xtra["TAU"]=tau 252 return tau_list
253
254 - def get_theta_list(self):
255 """List of theta angles for all 3 consecutive Calpha atoms.""" 256 theta_list=[] 257 ca_list=self.get_ca_list() 258 for i in range(0, len(ca_list)-2): 259 atom_list = (ca_list[i], ca_list[i+1], ca_list[i+2]) 260 v1, v2, v3 = [a.get_vector() for a in atom_list] 261 theta=calc_angle(v1, v2, v3) 262 theta_list.append(theta) 263 # Put tau in xtra dict of residue 264 res=ca_list[i+1].get_parent() 265 res.xtra["THETA"]=theta 266 return theta_list
267
268 - def get_sequence(self):
269 """Return the AA sequence as a Seq object. 270 271 @return: polypeptide sequence 272 @rtype: L{Seq} 273 """ 274 s="" 275 for res in self: 276 s += to_one_letter_code.get(res.get_resname(), 'X') 277 seq=Seq(s, generic_protein) 278 return seq
279
280 - def __repr__(self):
281 """Return string representation of the polypeptide. 282 283 Return <Polypeptide start=START end=END>, where START 284 and END are sequence identifiers of the outer residues. 285 """ 286 start=self[0].get_id()[1] 287 end=self[-1].get_id()[1] 288 s="<Polypeptide start=%s end=%s>" % (start, end) 289 return s
290
291 -class _PPBuilder:
292 """Base class to extract polypeptides. 293 294 It checks if two consecutive residues in a chain are connected. 295 The connectivity test is implemented by a subclass. 296 297 This assumes you want both standard and non-standard amino acids. 298 """
299 - def __init__(self, radius):
300 """ 301 @param radius: distance 302 @type radius: float 303 """ 304 self.radius=radius
305
306 - def _accept(self, residue, standard_aa_only):
307 """Check if the residue is an amino acid (PRIVATE).""" 308 if is_aa(residue, standard=standard_aa_only): 309 return True 310 elif not standard_aa_only and "CA" in residue.child_dict: 311 #It has an alpha carbon... 312 #We probably need to update the hard coded list of 313 #non-standard residues, see function is_aa for details. 314 warnings.warn("Assuming residue %s is an unknown modified " 315 "amino acid" % residue.get_resname()) 316 return True 317 else: 318 # not a standard AA so skip 319 return False
320
321 - def build_peptides(self, entity, aa_only=1):
322 """Build and return a list of Polypeptide objects. 323 324 @param entity: polypeptides are searched for in this object 325 @type entity: L{Structure}, L{Model} or L{Chain} 326 327 @param aa_only: if 1, the residue needs to be a standard AA 328 @type aa_only: int 329 """ 330 is_connected=self._is_connected 331 accept=self._accept 332 level=entity.get_level() 333 # Decide wich entity we are dealing with 334 if level=="S": 335 model=entity[0] 336 chain_list=model.get_list() 337 elif level=="M": 338 chain_list=entity.get_list() 339 elif level=="C": 340 chain_list=[entity] 341 else: 342 raise PDBException("Entity should be Structure, Model or Chain.") 343 pp_list=[] 344 for chain in chain_list: 345 chain_it=iter(chain) 346 try: 347 prev_res = chain_it.next() 348 while not accept(prev_res, aa_only): 349 prev_res = chain_it.next() 350 except StopIteration: 351 #No interesting residues at all in this chain 352 continue 353 pp=None 354 for next_res in chain_it: 355 if accept(prev_res, aa_only) \ 356 and accept(next_res, aa_only) \ 357 and is_connected(prev_res, next_res): 358 if pp is None: 359 pp=Polypeptide() 360 pp.append(prev_res) 361 pp_list.append(pp) 362 pp.append(next_res) 363 else: 364 #Either too far apart, or one of the residues is unwanted. 365 #End the current peptide 366 pp=None 367 prev_res=next_res 368 return pp_list
369 370
371 -class CaPPBuilder(_PPBuilder):
372 """Use CA--CA distance to find polypeptides."""
373 - def __init__(self, radius=4.3):
374 _PPBuilder.__init__(self, radius)
375
376 - def _is_connected(self, prev_res, next_res):
377 for r in [prev_res, next_res]: 378 if not r.has_id("CA"): 379 return False 380 n=next_res["CA"] 381 p=prev_res["CA"] 382 # Unpack disordered 383 if n.is_disordered(): 384 nlist=n.disordered_get_list() 385 else: 386 nlist=[n] 387 if p.is_disordered(): 388 plist=p.disordered_get_list() 389 else: 390 plist=[p] 391 for nn in nlist: 392 for pp in plist: 393 if (nn-pp)<self.radius: 394 return True 395 return False
396 397
398 -class PPBuilder(_PPBuilder):
399 """Use C--N distance to find polypeptides."""
400 - def __init__(self, radius=1.8):
401 _PPBuilder.__init__(self, radius)
402
403 - def _is_connected(self, prev_res, next_res):
404 if not prev_res.has_id("C"): 405 return False 406 if not next_res.has_id("N"): 407 return False 408 test_dist=self._test_dist 409 c=prev_res["C"] 410 n=next_res["N"] 411 # Test all disordered atom positions! 412 if c.is_disordered(): 413 clist=c.disordered_get_list() 414 else: 415 clist=[c] 416 if n.is_disordered(): 417 nlist=n.disordered_get_list() 418 else: 419 nlist=[n] 420 for nn in nlist: 421 for cc in clist: 422 # To form a peptide bond, N and C must be 423 # within radius and have the same altloc 424 # identifier or one altloc blank 425 n_altloc=nn.get_altloc() 426 c_altloc=cc.get_altloc() 427 if n_altloc==c_altloc or n_altloc==" " or c_altloc==" ": 428 if test_dist(nn, cc): 429 # Select the disordered atoms that 430 # are indeed bonded 431 if c.is_disordered(): 432 c.disordered_select(c_altloc) 433 if n.is_disordered(): 434 n.disordered_select(n_altloc) 435 return True 436 return False
437
438 - def _test_dist(self, c, n):
439 """Return 1 if distance between atoms<radius (PRIVATE).""" 440 if (c-n)<self.radius: 441 return 1 442 else: 443 return 0
444 445 446 if __name__=="__main__": 447 import sys 448 from Bio.PDB.PDBParser import PDBParser 449 450 p=PDBParser(PERMISSIVE=True) 451 452 s=p.get_structure("scr", sys.argv[1]) 453 454 ppb=PPBuilder() 455 456 print "C-N" 457 for pp in ppb.build_peptides(s): 458 print pp.get_sequence() 459 for pp in ppb.build_peptides(s[0]): 460 print pp.get_sequence() 461 for pp in ppb.build_peptides(s[0]["A"]): 462 print pp.get_sequence() 463 464 for pp in ppb.build_peptides(s): 465 for phi, psi in pp.get_phi_psi_list(): 466 print phi, psi 467 468 ppb=CaPPBuilder() 469 470 print "CA-CA" 471 for pp in ppb.build_peptides(s): 472 print pp.get_sequence() 473 for pp in ppb.build_peptides(s[0]): 474 print pp.get_sequence() 475 for pp in ppb.build_peptides(s[0]["A"]): 476 print pp.get_sequence() 477