
Source Code for Module Bio.MaxEntropy

# Copyright 2001 by Jeffrey Chang.  All rights reserved.
# This code is part of the Biopython distribution and governed by its
# license.  Please see the LICENSE file that should have been included
# as part of this package.

"""Maximum Entropy code.

Uses Improved Iterative Scaling:
XXX ref

XXX need to define terminology
"""

import numpy

class MaxEntropy(object):
    """Holds information for a Maximum Entropy classifier.

    Members:
    classes      List of the possible classes of data.
    alphas       List of the weights for each feature.
    feature_fns  List of the feature functions.

    """
    def __init__(self):
        self.classes = []
        self.alphas = []
        self.feature_fns = []

def calculate(me, observation):
    """calculate(me, observation) -> list of log probs

    Calculate the log of the probability for each class.  me is a
    MaxEntropy object that has been trained.  observation is a vector
    representing the observed data.  The return value is a list of
    unnormalized log probabilities for each class.

    """
    scores = []
    assert len(me.feature_fns) == len(me.alphas)
    for klass in me.classes:
        lprob = 0.0
        for fn, alpha in zip(me.feature_fns, me.alphas):
            lprob += fn(observation, klass) * alpha
        scores.append(lprob)
    return scores

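# Usage sketch for calculate (illustrative, not part of the original module):
# each score is the weighted sum alpha_i * f_i(observation, klass) over the
# feature functions, so with hand-built weights:
#
#     me = MaxEntropy()
#     me.classes = ["Yes", "No"]
#     me.feature_fns = [lambda x, c: 1 if c == "Yes" else 0,
#                       lambda x, c: 1]
#     me.alphas = [0.7, 0.2]
#     calculate(me, ["Red", "Sports"])  # -> [0.9, 0.2]
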
def classify(me, observation):
    """classify(me, observation) -> class

    Classify an observation into a class.

    """
    scores = calculate(me, observation)
    max_score, klass = scores[0], me.classes[0]
    for i in range(1, len(scores)):
        if scores[i] > max_score:
            max_score, klass = scores[i], me.classes[i]
    return klass

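# Illustrative follow-up (assumes the toy 'me' object sketched above): classify
# simply returns the class with the highest unnormalized score.
#
#     classify(me, ["Red", "Sports"])  # -> "Yes"
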
def _eval_feature_fn(fn, xs, classes):
    """_eval_feature_fn(fn, xs, classes) -> dict of values

    Evaluate a feature function on every instance of the training set
    and class.  fn is a callback function that takes two parameters: a
    training instance and a class.  Return a dictionary of (training
    set index, class index) -> non-zero value.  Values of 0 are not
    stored in the dictionary.

    """
    values = {}
    for i in range(len(xs)):
        for j in range(len(classes)):
            f = fn(xs[i], classes[j])
            if f != 0:
                values[(i, j)] = f
    return values

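# Sketch of the sparse representation (illustrative, not part of the original
# module): only non-zero evaluations are stored, keyed by
# (training instance index, class index).
#
#     xs = [["Red"], ["Yellow"]]
#     classes = ["No", "Yes"]
#     fn = lambda x, c: 1 if x[0] == "Red" and c == "Yes" else 0
#     _eval_feature_fn(fn, xs, classes)  # -> {(0, 1): 1}
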
def _calc_empirical_expects(xs, ys, classes, features):
    """_calc_empirical_expects(xs, ys, classes, features) -> list of expectations

    Calculate the expectation of each function from the data.  This is
    the constraint for the maximum entropy distribution.  Return a
    list of expectations, parallel to the list of features.

    """
    # E[f_i] = SUM_x,y P(x, y) f_i(x, y)
    #        = 1/N SUM_j f_i(x_j, y_j)
    class2index = {}
    for index, key in enumerate(classes):
        class2index[key] = index
    ys_i = [class2index[y] for y in ys]

    expect = []
    N = len(xs)
    for feature in features:
        s = 0
        for i in range(N):
            s += feature.get((i, ys_i[i]), 0)
        expect.append(float(s) / N)
    return expect

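# Worked example (illustrative, not part of the original module): each feature's
# empirical expectation is the average of its values on the observed
# (instance, label) pairs.
#
#     xs = [["Red"], ["Yellow"]]
#     ys = ["Yes", "No"]
#     classes = ["No", "Yes"]
#     features = [{(0, 1): 1}]
#     _calc_empirical_expects(xs, ys, classes, features)  # -> [0.5]
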
def _calc_model_expects(xs, classes, features, alphas):
    """_calc_model_expects(xs, classes, features, alphas) -> list of expectations.

    Calculate the expectation of each feature from the model.  This is
    not used in maximum entropy training, but provides a good function
    for debugging.

    """
    # E_model[f_i] = SUM_x P(x) SUM_y P(y|x) f_i(x, y)
    #              = 1/N SUM_x SUM_y P(y|x) f_i(x, y)
    p_yx = _calc_p_class_given_x(xs, classes, features, alphas)

    expects = []
    for feature in features:
        total = 0.0
        for (i, j), f in feature.items():
            total += p_yx[i][j] * f
        expects.append(total / len(xs))
    return expects

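# Debugging sketch (illustrative, not part of the original module): at
# convergence the maximum entropy constraints force the model expectations to
# match the empirical ones, so after training these should agree closely:
#
#     _calc_model_expects(xs, classes, features, alphas)
#     _calc_empirical_expects(xs, ys, classes, features)
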
def _calc_p_class_given_x(xs, classes, features, alphas):
    """_calc_p_class_given_x(xs, classes, features, alphas) -> matrix

    Calculate P(y|x), where y is the class and x is an instance from
    the training set.  Return an XS x CLASSES matrix of probabilities.

    """
    prob_yx = numpy.zeros((len(xs), len(classes)))

    # Calculate log P(y, x).
    assert len(features) == len(alphas)
    for feature, alpha in zip(features, alphas):
        for (x, y), f in feature.items():
            prob_yx[x][y] += alpha * f
    # Take an exponent to get P(y, x)
    prob_yx = numpy.exp(prob_yx)
    # Divide out the probability over each class, so we get P(y|x).
    for i in range(len(xs)):
        z = sum(prob_yx[i])
        prob_yx[i] = prob_yx[i] / z

    # prob_yx = []
    # for i in range(len(xs)):
    #     z = 0.0   # Normalization factor for this x, over all classes.
    #     probs = [0.0] * len(classes)
    #     for j in range(len(classes)):
    #         log_p = 0.0   # log of the probability of f(x, y)
    #         for k in range(len(features)):
    #             log_p += alphas[k] * features[k].get((i, j), 0.0)
    #         probs[j] = numpy.exp(log_p)
    #         z += probs[j]
    #     # Normalize the probabilities for this x.
    #     probs = map(lambda x, z=z: x/z, probs)
    #     prob_yx.append(probs)
    return prob_yx

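# Worked example of the normalization (illustrative, not part of the original
# module): if the accumulated log scores for one training instance are
# [0.7, 0.2], exponentiation gives roughly [2.014, 1.221] and dividing by the
# row sum yields P(y|x) ~= [0.622, 0.378].
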
def _calc_f_sharp(N, nclasses, features):
    """_calc_f_sharp(N, nclasses, features) -> matrix of f sharp values."""
    # f#(x, y) = SUM_i f_i(x, y)
    f_sharp = numpy.zeros((N, nclasses))
    for feature in features:
        for (i, j), f in feature.items():
            f_sharp[i][j] += f
    return f_sharp

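# Sketch (illustrative, not part of the original module): for binary features,
# f#(x, y) is just the number of features that fire on (x, y); it appears in
# the exponent of the IIS update solved below.
#
#     features = [{(0, 0): 1}, {(0, 0): 1, (1, 1): 1}]
#     _calc_f_sharp(2, 2, features)  # -> array([[2., 0.], [0., 1.]])
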
def _iis_solve_delta(N, feature, f_sharp, empirical, prob_yx,
                     max_newton_iterations, newton_converge):
    """Solve the IIS update delta for one feature using Newton's method (PRIVATE)."""
    # Find the root of:
    #     empirical - 1/N SUM_x SUM_c P(c|x) f_i(x, c) e^[delta_i * f#(x, c)] = 0
    delta = 0.0
    iters = 0
    while iters < max_newton_iterations:   # iterate for Newton's method
        f_newton = df_newton = 0.0         # evaluate the function and derivative
        for (i, j), f in feature.items():
            prod = prob_yx[i][j] * f * numpy.exp(delta * f_sharp[i][j])
            f_newton += prod
            df_newton += prod * f_sharp[i][j]
        f_newton, df_newton = empirical - f_newton / N, -df_newton / N

        ratio = f_newton / df_newton
        delta -= ratio
        if numpy.fabs(ratio) < newton_converge:   # converged
            break
        iters = iters + 1
    else:
        raise RuntimeError("Newton's method did not converge")
    return delta

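# Added note (not in the original module): when f#(x, y) is a constant C for
# every (x, y), the root has the closed form delta = (1/C) *
# log(empirical / model_expectation), which is the familiar GIS update;
# Newton's method handles the general case where f# varies.
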
def _train_iis(xs, classes, features, f_sharp, alphas, e_empirical,
               max_newton_iterations, newton_converge):
    """Do one iteration of hill climbing to find better alphas (PRIVATE)."""
    # This is a good function to parallelize.

    # Pre-calculate P(y|x)
    p_yx = _calc_p_class_given_x(xs, classes, features, alphas)

    N = len(xs)
    newalphas = alphas[:]
    for i in range(len(alphas)):
        delta = _iis_solve_delta(N, features[i], f_sharp, e_empirical[i], p_yx,
                                 max_newton_iterations, newton_converge)
        newalphas[i] += delta
    return newalphas


def train(training_set, results, feature_fns, update_fn=None,
          max_iis_iterations=10000, iis_converge=1.0e-5,
          max_newton_iterations=100, newton_converge=1.0e-10):
    """Train a maximum entropy classifier, returns MaxEntropy object.

    Train a maximum entropy classifier on a training set.
    training_set is a list of observations.  results is a list of the
    class assignments for each observation.  feature_fns is a list of
    the features.  These are callback functions that take an
    observation and class and return a 1 or 0.  update_fn is a
    callback function that is called at each training iteration.  It is
    passed a MaxEntropy object that encapsulates the current state of
    the training.

    The maximum number of iterations and the convergence criterion for IIS
    are given by max_iis_iterations and iis_converge, respectively, while
    max_newton_iterations and newton_converge are the maximum number
    of iterations and the convergence criterion for Newton's method.
    """
    if not training_set:
        raise ValueError("No data in the training set.")
    if len(training_set) != len(results):
        raise ValueError("training_set and results should be parallel lists.")

    # Rename variables for convenience.
    xs, ys = training_set, results

    # Get a list of all the classes that need to be trained.
    classes = sorted(set(results))

    # Cache values for all features.
    features = [_eval_feature_fn(fn, training_set, classes)
                for fn in feature_fns]
    # Cache values for f#.
    f_sharp = _calc_f_sharp(len(training_set), len(classes), features)

    # Pre-calculate the empirical expectations of the features.
    e_empirical = _calc_empirical_expects(xs, ys, classes, features)

    # Now train the alpha parameters to weigh each feature.
    alphas = [0.0] * len(features)
    iters = 0
    while iters < max_iis_iterations:
        nalphas = _train_iis(xs, classes, features, f_sharp,
                             alphas, e_empirical,
                             max_newton_iterations, newton_converge)
        diff = sum(numpy.fabs(x - y) for x, y in zip(alphas, nalphas))
        alphas = nalphas

        me = MaxEntropy()
        me.alphas, me.classes, me.feature_fns = alphas, classes, feature_fns
        if update_fn is not None:
            update_fn(me)

        if diff < iis_converge:   # converged
            break
        iters += 1
    else:
        raise RuntimeError("IIS did not converge")

    return me
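
# Sketch of the update_fn hook (illustrative, callback name assumed, not part of
# the original module): the callback receives the partially trained MaxEntropy
# object after each IIS iteration, e.g. for monitoring the weights, using the
# demo data and feature functions defined below.
#
#     def show_progress(model):
#         print(model.alphas)
#
#     model = train(xcar, ycar, user_functions, update_fn=show_progress)
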


if __name__ == "__main__":
    # Car data from the example Naive Bayes Classifier by Eric Meisner, 22 November 2003:
    # http://www.inf.u-szeged.hu/~ormandi/teaching/mi2/02-naiveBayes-example.pdf

    xcar = [
        ['Red', 'Sports', 'Domestic'],
        ['Red', 'Sports', 'Domestic'],
        ['Red', 'Sports', 'Domestic'],
        ['Yellow', 'Sports', 'Domestic'],
        ['Yellow', 'Sports', 'Imported'],
        ['Yellow', 'SUV', 'Imported'],
        ['Yellow', 'SUV', 'Imported'],
        ['Yellow', 'SUV', 'Domestic'],
        ['Red', 'SUV', 'Imported'],
        ['Red', 'Sports', 'Imported']
    ]

    ycar = [
        'Yes',
        'No',
        'Yes',
        'No',
        'Yes',
        'No',
        'Yes',
        'No',
        'No',
        'Yes'
    ]

    # Requires some rules or features
    def udf1(ts, cl):
        if ts[0] == 'Red':
            return 0
        else:
            return 1

    def udf2(ts, cl):
        if ts[1] == 'Sports':
            return 0
        else:
            return 1

    def udf3(ts, cl):
        if ts[2] == 'Domestic':
            return 0
        else:
            return 1

    user_functions = [udf1, udf2, udf3]  # must be an iterable type

    xe = train(xcar, ycar, user_functions)
    for xv, yv in zip(xcar, ycar):
        xc = classify(xe, xv)
        print('Pred:', xv, 'gives', xc, 'y is', yv)