
Source Code for Module Bio.HMM.Trainer

"""Provide trainers which estimate parameters based on training sequences.

These should be used to 'train' a Markov Model prior to actually using
it to decode state paths. When supplied training sequences and a model
to work from, these classes will estimate parameters of the model.

This aims to estimate two parameters:

* a_{kl} -- the transition probability from state k to state l, estimated
from the counts of k to l transitions in the training data.

* e_{k}(b) -- the emission probability of letter b from state k, estimated
from the counts of such emissions in the training data.
"""
# standard modules
import math

# local stuff
from DynamicProgramming import ScaledDPAlgorithms

class TrainingSequence(object):
    """Hold a training sequence with emissions and, optionally, a state path."""

    def __init__(self, emissions, state_path):
        """Initialize a training sequence.

        Arguments:

        o emissions -- A Seq object containing the sequence of emissions in
        the training sequence, and the alphabet of the sequence.

        o state_path -- A Seq object containing the sequence of states and
        the alphabet of the states. If there is no known state path, then
        the sequence of states should be an empty string.
        """
        if len(state_path) > 0:
            assert len(emissions) == len(state_path), \
                   "State path does not match associated emissions."
        self.emissions = emissions
        self.states = state_path

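# A minimal usage sketch for TrainingSequence. The DiceRollAlphabet and
# DiceTypeAlphabet classes below are illustrative assumptions, written
# against the old-style Bio.Seq/Bio.Alphabet API this module expects
# (alphabets carry a .letters attribute):
#
#     >>> from Bio.Seq import Seq
#     >>> from Bio.Alphabet import Alphabet
#     >>> class DiceRollAlphabet(Alphabet):
#     ...     letters = ['1', '2', '3', '4', '5', '6']
#     >>> class DiceTypeAlphabet(Alphabet):
#     ...     letters = ['F', 'L']
#     >>> rolls = Seq("246366", DiceRollAlphabet())
#     >>> dice = Seq("FFFLLL", DiceTypeAlphabet())
#     >>> known_path_seq = TrainingSequence(rolls, dice)
#     >>> # with no known state path, pass an empty state sequence
#     >>> unknown_path_seq = TrainingSequence(rolls, Seq("", DiceTypeAlphabet()))
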
class AbstractTrainer(object):
    """Provide generic functionality needed in all trainers."""

    def __init__(self, markov_model):
        self._markov_model = markov_model

    def log_likelihood(self, probabilities):
        """Calculate the log likelihood of the training sequences.

        Arguments:

        o probabilities -- A list of the probabilities of each training
        sequence under the current parameters, calculated using the forward
        algorithm.
        """
        total_likelihood = 0
        for probability in probabilities:
            total_likelihood += math.log(probability)

        return total_likelihood

    def estimate_params(self, transition_counts, emission_counts):
        """Get a maximum likelihood estimation of transitions and emissions.

        Arguments:

        o transition_counts -- A dictionary with the total number of counts
        of transitions between two states.

        o emission_counts -- A dictionary with the total number of counts
        of emissions of a particular emission letter by a state letter.

        This then returns the maximum likelihood estimators for the
        transitions and emissions, estimated using formula 3.18 in
        Durbin et al:

        a_{kl} = A_{kl} / sum(A_{kl'})
        e_{k}(b) = E_{k}(b) / sum(E_{k}(b'))

        Returns:
        Transition and emission dictionaries containing the maximum
        likelihood estimators.
        """
        # now calculate the information
        ml_transitions = self.ml_estimator(transition_counts)
        ml_emissions = self.ml_estimator(emission_counts)

        return ml_transitions, ml_emissions

    def ml_estimator(self, counts):
        """Calculate the maximum likelihood estimator.

        This can calculate maximum likelihoods for both transitions
        and emissions.

        Arguments:

        o counts -- A dictionary of the counts for each item.

        See estimate_params for a description of the formula used for
        calculation.
        """
        # get an ordered list of all items
        all_ordered = sorted(counts)

        ml_estimation = {}

        # the total counts for the current letter we are on
        cur_letter = None
        cur_letter_counts = 0

        for cur_item in all_ordered:
            # if we are on a new letter (ie. the first letter of the tuple)
            if cur_item[0] != cur_letter:
                # set the new letter we are working with
                cur_letter = cur_item[0]

                # count up the total counts for this letter
                cur_letter_counts = counts[cur_item]

                # add counts for all other items with the same first letter
                cur_position = all_ordered.index(cur_item) + 1

                # keep adding while we have the same first letter or until
                # we get to the end of the ordered list
                while (cur_position < len(all_ordered) and
                       all_ordered[cur_position][0] == cur_item[0]):
                    cur_letter_counts += counts[all_ordered[cur_position]]
                    cur_position += 1
            # otherwise we've already got the total counts for this letter
            else:
                pass

            # now calculate the ml and add it to the estimation
            cur_ml = float(counts[cur_item]) / float(cur_letter_counts)
            ml_estimation[cur_item] = cur_ml

        return ml_estimation

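# A worked sketch of the maximum likelihood estimation above: counts are
# keyed by (from_state, to_state) or (state, emission) tuples, and each
# estimate is normalized over all entries sharing the same first letter,
# per formula 3.18 in Durbin et al. The numbers here are made up.
#
#     >>> trainer = AbstractTrainer(None)  # the model is not used by ml_estimator
#     >>> counts = {('F', 'F'): 8, ('F', 'L'): 2,
#     ...           ('L', 'F'): 3, ('L', 'L'): 9}
#     >>> estimates = trainer.ml_estimator(counts)
#     >>> estimates[('F', 'F')], estimates[('F', 'L')]
#     (0.8, 0.2)
#     >>> estimates[('L', 'F')], estimates[('L', 'L')]
#     (0.25, 0.75)
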
class BaumWelchTrainer(AbstractTrainer):
    """Trainer that uses the Baum-Welch algorithm to estimate parameters.

    These should be used when a training sequence for an HMM has unknown
    paths for the actual states, and you need to make an estimation of the
    model parameters from the observed emissions.

    This uses the Baum-Welch algorithm, first described in
    Baum, L.E. 1972. Inequalities. 3:1-8
    This is based on the description in 'Biological Sequence Analysis' by
    Durbin et al. in section 3.3.

    This algorithm is guaranteed to converge to a local maximum, but not
    necessarily to the global maximum, so use with care!
    """

    def __init__(self, markov_model):
        """Initialize the trainer.

        Arguments:

        o markov_model -- The model we are going to estimate parameters for.
        This should have the parameters with some initial estimates, that
        we can build from.
        """
        AbstractTrainer.__init__(self, markov_model)

    def train(self, training_seqs, stopping_criteria,
              dp_method=ScaledDPAlgorithms):
        """Estimate the parameters using training sequences.

        The algorithm for this is taken from Durbin et al. p64, so this
        is a good place to go for a reference on what is going on.

        Arguments:

        o training_seqs -- A list of TrainingSequence objects to be used
        for estimating the parameters.

        o stopping_criteria -- A function that, when passed the change in
        log likelihood and the number of iterations performed, will indicate
        if we should stop the estimation iterations.

        o dp_method -- A class specifying the dynamic programming
        implementation we should use to calculate the forward and
        backward variables. By default, we use the scaling method.
        """
        prev_log_likelihood = None
        num_iterations = 1

        while True:
            transition_count = self._markov_model.get_blank_transitions()
            emission_count = self._markov_model.get_blank_emissions()

            # remember all of the sequence probabilities
            all_probabilities = []

            for training_seq in training_seqs:
                # calculate the forward and backward variables
                DP = dp_method(self._markov_model, training_seq)
                forward_var, seq_prob = DP.forward_algorithm()
                backward_var = DP.backward_algorithm()

                all_probabilities.append(seq_prob)

                # update the counts for transitions and emissions
                transition_count = self.update_transitions(transition_count,
                                                           training_seq,
                                                           forward_var,
                                                           backward_var,
                                                           seq_prob)
                emission_count = self.update_emissions(emission_count,
                                                       training_seq,
                                                       forward_var,
                                                       backward_var,
                                                       seq_prob)

            # update the markov model with the new probabilities
            ml_transitions, ml_emissions = \
                self.estimate_params(transition_count, emission_count)
            self._markov_model.transition_prob = ml_transitions
            self._markov_model.emission_prob = ml_emissions

            cur_log_likelihood = self.log_likelihood(all_probabilities)

            # if we have previously calculated the log likelihood (ie.
            # not the first round), see if we can finish
            if prev_log_likelihood is not None:
                # XXX log likelihoods are negative -- it is not clear that
                # comparing absolute values is the right way to measure the
                # change, so treat this with some caution
                log_likelihood_change = abs(abs(cur_log_likelihood) -
                                            abs(prev_log_likelihood))

                # check whether we have completed enough iterations to have
                # a good estimation
                if stopping_criteria(log_likelihood_change, num_iterations):
                    break

            # set up for another round of iterations
            prev_log_likelihood = cur_log_likelihood
            num_iterations += 1

        return self._markov_model

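    # A usage sketch for train(). Here 'initial_mm' is assumed to be a
    # Bio.HMM.MarkovModel.MarkovModel carrying the starting parameter
    # estimates, and 'training_seqs' a list of TrainingSequence objects
    # with empty state paths; the stopping criteria is any callable taking
    # the change in log likelihood and the iteration count:
    #
    #     >>> def stop_training(log_likelihood_change, num_iterations):
    #     ...     """Stop when improvement is small or we have looped enough."""
    #     ...     return log_likelihood_change < 0.01 or num_iterations >= 10
    #     >>> trainer = BaumWelchTrainer(initial_mm)
    #     >>> trained_mm = trainer.train(training_seqs, stop_training)
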
    def update_transitions(self, transition_counts, training_seq,
                           forward_vars, backward_vars, training_seq_prob):
        """Add the contribution of a new training sequence to the transitions.

        Arguments:

        o transition_counts -- A dictionary of the current counts for the
        transitions.

        o training_seq -- The training sequence we are working with.

        o forward_vars -- Probabilities calculated using the forward
        algorithm.

        o backward_vars -- Probabilities calculated using the backward
        algorithm.

        o training_seq_prob -- The probability of the current sequence.

        This calculates A_{kl} (the estimated transition counts from state
        k to state l) using formula 3.20 in Durbin et al.
        """
        # set up the transition and emission probabilities we are using
        transitions = self._markov_model.transition_prob
        emissions = self._markov_model.emission_prob

        # loop over the possible combinations of state path letters
        for k in training_seq.states.alphabet.letters:
            for l in self._markov_model.transitions_from(k):
                estimated_counts = 0
                # now loop over the entire training sequence
                for i in range(len(training_seq.emissions) - 1):
                    # the forward value of k at the current position
                    forward_value = forward_vars[(k, i)]

                    # the backward value of l in the next position
                    backward_value = backward_vars[(l, i + 1)]

                    # the probability of a transition from k to l
                    trans_value = transitions[(k, l)]

                    # the probability of getting the emission at the next pos
                    emm_value = emissions[(l, training_seq.emissions[i + 1])]

                    estimated_counts += (forward_value * trans_value *
                                         emm_value * backward_value)

                # update the transition approximation
                transition_counts[(k, l)] += (float(estimated_counts) /
                                              training_seq_prob)

        return transition_counts

    def update_emissions(self, emission_counts, training_seq,
                         forward_vars, backward_vars, training_seq_prob):
        """Add the contribution of a new training sequence to the emissions.

        Arguments:

        o emission_counts -- A dictionary of the current counts for the
        emissions.

        o training_seq -- The training sequence we are working with.

        o forward_vars -- Probabilities calculated using the forward
        algorithm.

        o backward_vars -- Probabilities calculated using the backward
        algorithm.

        o training_seq_prob -- The probability of the current sequence.

        This calculates E_{k}(b) (the estimated counts of emissions of
        letter b from state k) using formula 3.21 in Durbin et al.
        """
        # loop over the possible combinations of state path letters
        for k in training_seq.states.alphabet.letters:
            # now loop over all of the possible emissions
            for b in training_seq.emissions.alphabet.letters:
                expected_times = 0
                # finally loop over the entire training sequence
                for i in range(len(training_seq.emissions)):
                    # only count the forward and backward probability if the
                    # emission at the position is the same as b
                    if training_seq.emissions[i] == b:
                        # f_{k}(i) b_{k}(i)
                        expected_times += (forward_vars[(k, i)] *
                                           backward_vars[(k, i)])

                # add to E_{k}(b)
                emission_counts[(k, b)] += (float(expected_times) /
                                            training_seq_prob)

        return emission_counts

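# For reference, the expected counts accumulated by update_transitions and
# update_emissions are those of formulas 3.20 and 3.21 in Durbin et al.,
# with f and b the forward and backward variables of training sequence x^j:
#
#     A_{kl}   = sum_j 1/P(x^j) * sum_i f_k^j(i) * a_{kl} * e_l(x^j_{i+1}) * b_l^j(i+1)
#     E_{k}(b) = sum_j 1/P(x^j) * sum_{i : x^j_i = b} f_k^j(i) * b_k^j(i)
#
# Each call handles a single sequence j; dividing by training_seq_prob
# supplies the 1/P(x^j) factor.
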
class KnownStateTrainer(AbstractTrainer):
    """Estimate probabilities with known state sequences.

    This should be used for direct estimation of emission and transition
    probabilities when both the state path and emission sequence are
    known for the training examples.
    """

    def __init__(self, markov_model):
        AbstractTrainer.__init__(self, markov_model)

    def train(self, training_seqs):
        """Estimate the Markov Model parameters with known state paths.

        This trainer requires that both the states and the emissions are
        known for all of the training sequences in the list of
        TrainingSequence objects.
        This training will then count all of the transitions and emissions,
        and use this to estimate the parameters of the model.
        """
        # count up all of the transitions and emissions
        transition_counts = self._markov_model.get_blank_transitions()
        emission_counts = self._markov_model.get_blank_emissions()

        for training_seq in training_seqs:
            emission_counts = self._count_emissions(training_seq,
                                                    emission_counts)
            transition_counts = self._count_transitions(training_seq.states,
                                                        transition_counts)

        # update the markov model from the counts
        ml_transitions, ml_emissions = \
            self.estimate_params(transition_counts,
                                 emission_counts)
        self._markov_model.transition_prob = ml_transitions
        self._markov_model.emission_prob = ml_emissions

        return self._markov_model

    def _count_emissions(self, training_seq, emission_counts):
        """Add emissions from the training sequence to the current counts.

        Arguments:

        o training_seq -- A TrainingSequence with states and emissions
        to get the counts from.

        o emission_counts -- The current emission counts to add to.
        """
        for index in range(len(training_seq.emissions)):
            cur_state = training_seq.states[index]
            cur_emission = training_seq.emissions[index]

            try:
                emission_counts[(cur_state, cur_emission)] += 1
            except KeyError:
                raise KeyError("Unexpected emission (%s, %s)"
                               % (cur_state, cur_emission))
        return emission_counts

    def _count_transitions(self, state_seq, transition_counts):
        """Add transitions from the training sequence to the current counts.

        Arguments:

        o state_seq -- A Seq object with the states of the current training
        sequence.

        o transition_counts -- The current transition counts to add to.
        """
        for cur_pos in range(len(state_seq) - 1):
            cur_state = state_seq[cur_pos]
            next_state = state_seq[cur_pos + 1]

            try:
                transition_counts[(cur_state, next_state)] += 1
            except KeyError:
                raise KeyError("Unexpected transition (%s, %s)" %
                               (cur_state, next_state))

        return transition_counts
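
# A runnable sketch of KnownStateTrainer on fully labelled data. The trainer
# only needs an object providing get_blank_transitions()/get_blank_emissions()
# and writable transition_prob/emission_prob attributes, so a made-up ToyModel
# stands in for Bio.HMM.MarkovModel.MarkovModel, and plain strings stand in
# for Seq objects (both support len() and indexing, which is all the counting
# code uses):
#
#     >>> class ToyModel(object):
#     ...     def get_blank_transitions(self):
#     ...         return {('F', 'F'): 0, ('F', 'L'): 0,
#     ...                 ('L', 'F'): 0, ('L', 'L'): 0}
#     ...     def get_blank_emissions(self):
#     ...         return {('F', 'H'): 0, ('F', 'T'): 0,
#     ...                 ('L', 'H'): 0, ('L', 'T'): 0}
#     >>> trainer = KnownStateTrainer(ToyModel())
#     >>> labelled = TrainingSequence("HHTT", "FFLL")
#     >>> model = trainer.train([labelled])
#     >>> model.transition_prob[('F', 'F')]
#     0.5
#     >>> model.emission_prob[('L', 'T')]
#     1.0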