
Source Code for Module Bio.NaiveBayes

# Copyright 2000 by Jeffrey Chang.  All rights reserved.
# This code is part of the Biopython distribution and governed by its
# license.  Please see the LICENSE file that should have been included
# as part of this package.

"""This provides code for a general Naive Bayes learner.

Naive Bayes is a supervised classification algorithm that uses Bayes
rule to compute the fit between a new observation and some previously
observed data.  The observations are discrete feature vectors, with
the Bayes assumption that the features are independent.  Although this
is hardly ever true, the classifier works well enough in practice.

Glossary:
observation    A feature vector of discrete data.
class          A possible classification for an observation.


Classes:
NaiveBayes     Holds information for a naive Bayes classifier.

Functions:
train          Train a new naive Bayes classifier.
calculate      Calculate the probabilities of each class, given an observation.
classify       Classify an observation into a class.

"""
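
# A quick sketch of the rule described above (illustrative numbers, not taken
# from any real data set): with two features assumed independent,
#
#     P(class | f1, f2)  is proportional to  P(f1|class) * P(f2|class) * P(class)
#
# so with P(f1|class) = 0.6, P(f2|class) = 0.8 and P(class) = 0.5 the
# unscaled score is 0.6 * 0.8 * 0.5 = 0.24.  The code below works with
# sums of logs rather than this product, to avoid numeric underflow on
# long feature vectors.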

import numpy


def _contents(items):
    term = 1.0 / len(items)
    counts = {}
    for item in items:
        counts[item] = counts.get(item, 0) + term
    return counts
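
# For example (hypothetical call):
#     _contents(['Red', 'Red', 'Yellow', 'Red'])
# returns {'Red': 0.75, 'Yellow': 0.25}, the relative frequency of each
# distinct item.  train() uses this helper both for the class priors and
# for the per-dimension conditional probabilities.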


class NaiveBayes(object):
    """Holds information for a NaiveBayes classifier.

    Members:
    classes         List of the possible classes of data.
    p_conditional   CLASS x DIM array of dicts of value -> P(value|class,dim)
    p_prior         List of the prior probabilities for every class.
    dimensionality  Dimensionality of the data.

    """
    def __init__(self):
        self.classes = []
        self.p_conditional = None
        self.p_prior = []
        self.dimensionality = None
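
# To make the members concrete: training on the car data in the demo at the
# bottom of this file gives (values rounded)
#     nb.classes        = ['No', 'Yes']
#     nb.dimensionality = 3
#     nb.p_prior        = array([0.5, 0.5])
#     nb.p_conditional  = [
#         [{'Red': 0.4, 'Yellow': 0.6},        # P(color|'No')
#          {'Sports': 0.4, 'SUV': 0.6},        # P(type|'No')
#          {'Domestic': 0.6, 'Imported': 0.4}],
#         [{'Red': 0.6, 'Yellow': 0.4},        # P(color|'Yes')
#          {'Sports': 0.8, 'SUV': 0.2},        # P(type|'Yes')
#          {'Domestic': 0.4, 'Imported': 0.6}],
#     ]
# i.e. nb.p_conditional[i][j][v] is P(value v | class i, dimension j).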


def calculate(nb, observation, scale=0):
    """calculate(nb, observation[, scale]) -> probability dict

    Calculate log P(class|observation) for each class.  nb is a NaiveBayes
    classifier that has been trained.  observation is a list representing
    the observed data.  scale is whether the probability should be
    scaled by P(observation).  By default, no scaling is done.  The return
    value is a dictionary where the keys are the classes and the values are
    the log probabilities of each class.

    """
    # P(class|observation) = P(observation|class)*P(class)/P(observation)
    # Taking the log:
    # lP(class|observation) = lP(observation|class)+lP(class)-lP(observation)

    # Make sure the observation has the right dimensionality.
    if len(observation) != nb.dimensionality:
        raise ValueError("observation in %d dimensions, but classifier in %d"
                         % (len(observation), nb.dimensionality))

    # Calculate log P(observation|class) for every class.
    n = len(nb.classes)
    lp_observation_class = numpy.zeros(n)  # array of log P(observation|class)
    for i in range(n):
        # log P(observation|class) = SUM_i log P(observation_i|class)
        probs = [None] * len(observation)
        for j in range(len(observation)):
            probs[j] = nb.p_conditional[i][j].get(observation[j], 0)
        # Clip to avoid taking the log of zero for unseen values.
        lprobs = numpy.log(numpy.clip(probs, 1.e-300, 1.e+300))
        lp_observation_class[i] = sum(lprobs)

    # Calculate log P(class).
    lp_prior = numpy.log(nb.p_prior)

    # Calculate log P(observation).
    lp_observation = 0.0  # P(observation)
    if scale:  # Only calculate this if requested.
        # log P(observation) = log SUM_i P(observation|class_i)P(class_i)
        obs = numpy.exp(numpy.clip(lp_prior + lp_observation_class, -700, +700))
        lp_observation = numpy.log(sum(obs))

    # Calculate log P(class|observation).
    lp_class_observation = {}  # Dict of class : log P(class|observation)
    for i in range(len(nb.classes)):
        lp_class_observation[nb.classes[i]] = \
            lp_observation_class[i] + lp_prior[i] - lp_observation

    return lp_class_observation
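
# Worked example, using the car model trained in the demo at the bottom of
# this file (numbers rounded): for observation ['Red', 'Sports', 'Domestic'],
#     log P for 'Yes' = log 0.6 + log 0.8 + log 0.4 + log 0.5 = log 0.096 ~ -2.34
#     log P for 'No'  = log 0.4 + log 0.4 + log 0.6 + log 0.5 = log 0.048 ~ -3.04
# With the default scale=0 these are really log P(observation|class) +
# log P(class); classify() below simply picks the class with the larger value.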


def classify(nb, observation):
    """classify(nb, observation) -> class

    Classify an observation into a class.

    """
    # The class is the one with the highest probability.
    probs = calculate(nb, observation, scale=0)
    max_prob = max_class = None
    for klass in nb.classes:
        if max_prob is None or probs[klass] > max_prob:
            max_prob, max_class = probs[klass], klass
    return max_class


def train(training_set, results, priors=None, typecode=None):
    """train(training_set, results[, priors]) -> NaiveBayes

    Train a naive Bayes classifier on a training set.  training_set is a
    list of observations.  results is a list of the class assignments
    for each observation.  Thus, training_set and results must be the same
    length.  priors is an optional dictionary specifying the prior
    probabilities for each type of result.  If not specified, the priors
    will be estimated from the training results.

    """
    if not len(training_set):
        raise ValueError("No data in the training set.")
    if len(training_set) != len(results):
        raise ValueError("training_set and results should be parallel lists.")

    # If no typecode is specified, try to pick a reasonable one.  If
    # training_set is a numpy array, then use that typecode.
    # Otherwise, choose a reasonable default.
    # XXX NOT IMPLEMENTED

    # Check to make sure each vector in the training set has the same
    # dimensionality.
    dimensions = [len(x) for x in training_set]
    if min(dimensions) != max(dimensions):
        raise ValueError("observations have different dimensionality")

    nb = NaiveBayes()
    nb.dimensionality = dimensions[0]

    # Get a list of all the classes, and
    # estimate the prior probabilities for the classes.
    if priors is not None:
        percs = priors
        nb.classes = list(set(results))
    else:
        class_freq = _contents(results)
        # list() is needed here: dict.keys() returns a view in Python 3,
        # which cannot be sorted in place.
        nb.classes = list(class_freq.keys())
        percs = class_freq
    nb.classes.sort()  # keep it tidy

    nb.p_prior = numpy.zeros(len(nb.classes))
    for i in range(len(nb.classes)):
        nb.p_prior[i] = percs[nb.classes[i]]

    # Collect all the observations in class.  For each class, make a
    # matrix of training instances versus dimensions.  I might be able
    # to optimize this with numpy, if the training_set parameter
    # were guaranteed to be a matrix.  However, this may not be the
    # case, because the client may be hacking up a sparse matrix or
    # something.
    c2i = {}  # class to index of class
    for index, key in enumerate(nb.classes):
        c2i[key] = index
    observations = [[] for c in nb.classes]  # separate observations by class
    for i in range(len(results)):
        klass, obs = results[i], training_set[i]
        observations[c2i[klass]].append(obs)
    # Now make the observations numpy matrices.
    for i in range(len(observations)):
        # XXX typecode must be specified!
        observations[i] = numpy.asarray(observations[i], typecode)

    # Calculate P(value|class,dim) for every class.
    # This is a good loop to optimize.
    nb.p_conditional = []
    for i in range(len(nb.classes)):
        class_observations = observations[i]  # observations for this class
        nb.p_conditional.append([None] * nb.dimensionality)
        for j in range(nb.dimensionality):
            # Collect all the values in this dimension.
            values = class_observations[:, j]

            # Add pseudocounts here.  This needs to be parameterized.
            # values = list(values) + range(len(nb.classes))  # XXX add 1

            # Estimate P(value|class,dim)
            nb.p_conditional[i][j] = _contents(values)
    return nb


if __name__ == "__main__":
    # Car data from example 'Naive Bayes Classifier example' by Eric Meisner,
    # November 22, 2003:
    # http://www.inf.u-szeged.hu/~ormandi/teaching/mi2/02-naiveBayes-example.pdf
    xcar = [
        ['Red', 'Sports', 'Domestic'],
        ['Red', 'Sports', 'Domestic'],
        ['Red', 'Sports', 'Domestic'],
        ['Yellow', 'Sports', 'Domestic'],
        ['Yellow', 'Sports', 'Imported'],
        ['Yellow', 'SUV', 'Imported'],
        ['Yellow', 'SUV', 'Imported'],
        ['Yellow', 'SUV', 'Domestic'],
        ['Red', 'SUV', 'Imported'],
        ['Red', 'Sports', 'Imported'],
    ]

    ycar = [
        'Yes',
        'No',
        'Yes',
        'No',
        'Yes',
        'No',
        'Yes',
        'No',
        'No',
        'Yes',
    ]

    carmodel = train(xcar, ycar)
    carresult = classify(carmodel, ['Red', 'Sports', 'Domestic'])
    print('Is Yes?', carresult)
    carresult = classify(carmodel, ['Red', 'SUV', 'Domestic'])
    print('Is No?', carresult)
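
Beyond the built-in demo above, a minimal usage sketch (the import path is
real; the feature vectors and priors below are invented for illustration):

    from Bio.NaiveBayes import train, calculate, classify

    # Hypothetical discrete feature vectors: [GC content class, length class].
    observations = [
        ['GC_rich', 'long'],
        ['GC_rich', 'short'],
        ['AT_rich', 'short'],
        ['AT_rich', 'long'],
    ]
    labels = ['coding', 'coding', 'noncoding', 'noncoding']

    # Supply explicit priors instead of estimating them from the labels.
    model = train(observations, labels,
                  priors={'coding': 0.3, 'noncoding': 0.7})

    print(classify(model, ['GC_rich', 'long']))  # expected: 'coding'

    # With scale=1 the result is normalized by log P(observation), so
    # exponentiating the returned log probabilities gives values summing to 1.
    print(calculate(model, ['GC_rich', 'long'], scale=1))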