Package Bio :: Package SeqUtils :: Module ProtParam
[hide private]
[frames] | no frames]

Source Code for Module Bio.SeqUtils.ProtParam

  1  # Copyright Yair Benita Y.Benita@pharm.uu.nl 
  2  # Biopython (http://biopython.org) license applies 
  3   
  4  """Simple protein analysis. 
  5   
  6  Example, 
  7   
  8  X = ProteinAnalysis("MAEGEITTFTALTEKFNLPPGNYKKPKLLYCSNGGHFLRILPDGTVDGTRDRSDQHIQLQLSAESVGEVYIKSTETGQYLAMDTSGLLYGSQTPSEECLFLERLEENHYNTYTSKKHAEKNWFVGLKKNGSCKRGPRTHYGQKAILFLPLPV") 
  9  print X.count_amino_acids() 
 10  print X.get_amino_acids_percent() 
 11  print X.molecular_weight() 
 12  print X.aromaticity() 
 13  print X.instability_index() 
 14  print X.flexibility() 
 15  print X.isoelectric_point() 
 16  print X.secondary_structure_fraction() 
 17  print X.protein_scale(ProtParamData.kd, 9, 0.4) 
 18  """ 
 19   
 20  import sys 
 21  import ProtParamData, IsoelectricPoint 
 22  from ProtParamData import kd  # Added by Iddo to enable the gravy method 
 23  from Bio.Seq import Seq 
 24  from Bio.Alphabet import IUPAC 
 25  from Bio.Data import IUPACData 
 26  #from BioModule import  
 27   
28 -class ProteinAnalysis(object):
29 """Class containing methods for protein analysis. 30 31 The class init method takes only one argument, the protein sequence as a 32 string and builds a sequence object using the Bio.Seq module. This is done 33 just to make sure the sequence is a protein sequence and not anything else. 34 35 """
36 - def __init__(self, ProtSequence):
37 if ProtSequence.islower(): 38 self.sequence = Seq(ProtSequence.upper(), IUPAC.protein) 39 else: 40 self.sequence = Seq(ProtSequence, IUPAC.protein) 41 self.amino_acids_content = None 42 self.amino_acids_percent = None 43 self.length = len(self.sequence)
44
45 - def count_amino_acids(self):
46 """Count standard amino acids, returns a dict. 47 48 Simply counts the number times an amino acid is repeated in the protein 49 sequence. Returns a dictionary {AminoAcid:Number} and also stores the 50 dictionary in self.amino_acids_content. 51 """ 52 ProtDic = dict([ (k, 0) for k in IUPACData.protein_letters]) 53 for i in ProtDic: 54 ProtDic[i]=self.sequence.count(i) 55 self.amino_acids_content = ProtDic 56 return ProtDic
57
58 - def get_amino_acids_percent(self):
59 """Calculate the amino acid content in percents. 60 61 The same as count_amino_acids only returns the Number in percentage of 62 entire sequence. Returns a dictionary and stores the dictionary in 63 self.amino_acids_content_percent. 64 65 input is the dictionary from CountAA. 66 output is a dictionary with AA as keys. 67 """ 68 if not self.amino_acids_content: 69 self.count_amino_acids() 70 71 PercentAA = {} 72 for i in self.amino_acids_content: 73 if self.amino_acids_content[i] > 0: 74 PercentAA[i]=self.amino_acids_content[i]/float(self.length) 75 else: 76 PercentAA[i] = 0 77 self.amino_acids_percent = PercentAA 78 return PercentAA
79
80 - def molecular_weight (self):
81 """Calculate MW from Protein sequence""" 82 # make local dictionary for speed 83 MwDict = {} 84 # remove a molecule of water from the amino acid weight. 85 for i in IUPACData.protein_weights: 86 MwDict[i] = IUPACData.protein_weights[i] - 18.02 87 MW = 18.02 # add just one water molecule for the whole sequence. 88 for i in self.sequence: 89 MW += MwDict[i] 90 return MW
91
92 - def aromaticity(self):
93 """Calculate the aromaticity according to Lobry, 1994. 94 95 Calculates the aromaticity value of a protein according to Lobry, 1994. 96 It is simply the relative frequency of Phe+Trp+Tyr. 97 """ 98 if not self.amino_acids_percent: 99 self.get_amino_acids_percent() 100 101 Arom= self.amino_acids_percent['Y']+self.amino_acids_percent['W']+self.amino_acids_percent['F'] 102 return Arom
103
104 - def instability_index(self):
105 """Calculate the instability index according to Guruprasad et al 1990. 106 107 Implementation of the method of Guruprasad et al. 1990 to test a 108 protein for stability. Any value above 40 means the protein is unstable 109 (has a short half life). 110 111 See: Guruprasad K., Reddy B.V.B., Pandit M.W. 112 Protein Engineering 4:155-161(1990). 113 """ 114 #make the dictionary local for speed. 115 DIWV=ProtParamData.DIWV.copy() 116 score=0.0 117 for i in range(self.length - 1): 118 DiPeptide=DIWV[self.sequence[i]][self.sequence[i+1]] 119 score += DiPeptide 120 return (10.0/self.length) * score
121
122 - def flexibility(self):
123 """Calculate the flexibility according to Vihinen, 1994. 124 125 No argument to change window size because parameters are specific for a 126 window=9. The parameters used are optimized for determining the flexibility. 127 """ 128 Flex = ProtParamData.Flex.copy() 129 Window=9 130 Weights=[0.25,0.4375,0.625,0.8125,1] 131 List=[] 132 for i in range(self.length - Window): 133 SubSeq=self.sequence[i:i+Window] 134 score = 0.0 135 for j in range(Window//2): 136 score += (Flex[SubSeq[j]]+Flex[SubSeq[Window-j-1]]) * Weights[j] 137 score += Flex[SubSeq[Window//2+1]] 138 List.append(score/5.25) 139 return List
140
141 - def gravy(self):
142 """Calculate the gravy according to Kyte and Doolittle.""" 143 ProtGravy=0.0 144 for i in self.sequence: 145 ProtGravy += kd[i] 146 147 return ProtGravy/self.length
148 149 # this method is used to make a list of relative weight of the 150 # window edges compared to the window center. The weights are linear. 151 # it actually generates half a list. For a window of size 9 and edge 0.4 152 # you get a list of [0.4, 0.55, 0.7, 0.85].
153 - def _weight_list(self, window, edge):
154 unit = ((1.0-edge)/(window-1))*2 155 list = [0.0]*(window//2) 156 for i in range(window//2): 157 list[i] = edge + unit * i 158 return list
159 160 # The weight list returns only one tail. If the list should be [0.4,0.7,1.0,0.7,0.4] 161 # what you actually get from _weights_list is [0.4,0.7]. The correct calculation is done 162 # in the loop.
163 - def protein_scale(self, ParamDict, Window, Edge=1.0):
164 """Compute a profile by any amino acid scale. 165 166 An amino acid scale is defined by a numerical value assigned to each type of 167 amino acid. The most frequently used scales are the hydrophobicity or 168 hydrophilicity scales and the secondary structure conformational parameters 169 scales, but many other scales exist which are based on different chemical and 170 physical properties of the amino acids. You can set several parameters that 171 control the computation of a scale profile, such as the window size and the 172 window edge relative weight value. WindowSize: The window size is the length 173 of the interval to use for the profile computation. For a window size n, we 174 use the i- ( n-1)/2 neighboring residues on each side of residue it compute 175 the score for residue i. The score for residue is the sum of the scale values 176 for these amino acids, optionally weighted according to their position in the 177 window. Edge: The central amino acid of the window always has a weight of 1. 178 By default, the amino acids at the remaining window positions have the same 179 weight, but you can make the residue at the center of the window have a 180 larger weight than the others by setting the edge value for the residues at 181 the beginning and end of the interval to a value between 0 and 1. For 182 instance, for Edge=0.4 and a window size of 5 the weights will be: 0.4, 0.7, 183 1.0, 0.7, 0.4. The method returns a list of values which can be plotted to 184 view the change along a protein sequence. Many scales exist. Just add your 185 favorites to the ProtParamData modules. 186 187 Similar to expasy's ProtScale: http://www.expasy.org/cgi-bin/protscale.pl 188 """ 189 # generate the weights 190 weight = self._weight_list(Window,Edge) 191 list = [] 192 # the score in each Window is divided by the sum of weights 193 sum_of_weights = 0.0 194 for i in weight: sum_of_weights += i 195 # since the weight list is one sided: 196 sum_of_weights = sum_of_weights*2+1 197 198 for i in range(self.length-Window+1): 199 subsequence = self.sequence[i:i+Window] 200 score = 0.0 201 for j in range(Window//2): 202 # walk from the outside of the Window towards the middle. 203 # Iddo: try/except clauses added to avoid raising an exception on a non-standad amino acid 204 try: 205 score += weight[j] * ParamDict[subsequence[j]] + weight[j] * ParamDict[subsequence[Window-j-1]] 206 except KeyError: 207 sys.stderr.write('warning: %s or %s is not a standard amino acid.\n' % 208 (subsequence[j],subsequence[Window-j-1])) 209 210 # Now add the middle value, which always has a weight of 1. 211 if subsequence[Window//2] in ParamDict: 212 score += ParamDict[subsequence[Window//2]] 213 else: 214 sys.stderr.write('warning: %s is not a standard amino acid.\n' % (subsequence[Window//2])) 215 216 list.append(score/sum_of_weights) 217 return list
218
219 - def isoelectric_point(self):
220 """Calculate the isoelectric point. 221 222 This method uses the module IsoelectricPoint to calculate the pI of a protein. 223 """ 224 if not self.amino_acids_content: 225 self.count_amino_acids() 226 X = IsoelectricPoint.IsoelectricPoint(self.sequence, self.amino_acids_content) 227 return X.pi()
228
230 """Calculate fraction of helix, turn and sheet. 231 232 This methods returns a list of the fraction of amino acids which tend 233 to be in Helix, Turn or Sheet. 234 235 Amino acids in helix: V, I, Y, F, W, L. 236 Amino acids in Turn: N, P, G, S. 237 Amino acids in sheet: E, M, A, L. 238 239 Returns a tuple of three integers (Helix, Turn, Sheet). 240 """ 241 if not self.amino_acids_percent: 242 self.get_amino_acids_percent() 243 Helix = self.amino_acids_percent['V'] + self.amino_acids_percent['I'] + self.amino_acids_percent['Y'] + self.amino_acids_percent['F'] + self.amino_acids_percent['W'] + self.amino_acids_percent['L'] 244 Turn = self.amino_acids_percent['N'] + self.amino_acids_percent['P'] + self.amino_acids_percent['G'] + self.amino_acids_percent['S'] 245 Sheet = self.amino_acids_percent['E'] + self.amino_acids_percent['M'] + self.amino_acids_percent['A'] + self.amino_acids_percent['L'] 246 return Helix, Turn, Sheet
247