1 """Provide trainers which estimate parameters based on training sequences.
2
3 These should be used to 'train' a Markov Model prior to actually using
4 it to decode state paths. When supplied training sequences and a model
5 to work from, these classes will estimate paramters of the model.
6
7 This aims to estimate two parameters:
8
9 * a_{kl} -- the number of times there is a transition from k to l in the
10 training data.
11
12 * e_{k}(b) -- the number of emissions of the state b from the letter k
13 in the training data.
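
A typical workflow looks roughly like this (a sketch; markov_model is
assumed to be built elsewhere, e.g. with a MarkovModel builder, and
stop_function is a user-supplied stopping criterion):

    trainer = BaumWelchTrainer(markov_model)
    trained_model = trainer.train(training_seqs, stop_function)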
14 """

import math

from DynamicProgramming import ScaledDPAlgorithms


22 """Hold a training sequence with emissions and optionally, a state path.
23 """
24 - def __init__(self, emissions, state_path):
25 """Initialize a training sequence.
26
27 Arguments:
28
29 o emissions - A Seq object containing the sequence of emissions in
30 the training sequence, and the alphabet of the sequence.
31
32 o state_path - A Seq object containing the sequence of states and
33 the alphabet of the states. If there is no known state path, then
34 the sequence of states should be an empty string.
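
        For example (a sketch; the alphabet classes here are
        illustrative, following the old Bio.Alphabet style where an
        alphabet carries a .letters attribute):

            from Bio.Seq import Seq
            from Bio.Alphabet import Alphabet

            class DiceRollAlphabet(Alphabet):
                letters = ['1', '2', '3', '4', '5', '6']

            class DiceTypeAlphabet(Alphabet):
                letters = ['F', 'L']  # fair or loaded

            seq = TrainingSequence(Seq('436512', DiceRollAlphabet()),
                                   Seq('FFFLLL', DiceTypeAlphabet()))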
35 """
        if len(state_path) > 0:
            assert len(emissions) == len(state_path), \
                "State path does not match associated emissions."
        self.emissions = emissions
        self.states = state_path


class AbstractTrainer(object):
    """Provide generic functionality needed in all trainers."""

    def __init__(self, markov_model):
        self._markov_model = markov_model

49 """Calculate the log likelihood of the training seqs.
50
51 Arguments:
52
53 o probabilities -- A list of the probabilities of each training
54 sequence under the current paramters, calculated using the forward
55 algorithm.
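
        The total log likelihood is the sum of the logs of the
        individual sequence probabilities:

        log P = sum_j log P(x^j)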
56 """
        total_likelihood = 0
        for probability in probabilities:
            total_likelihood += math.log(probability)

        return total_likelihood

    def estimate_params(self, transition_counts, emission_counts):
        """Get a maximum likelihood estimation of transition and emission.

        Arguments:

        o transition_counts -- A dictionary with the total number of counts
        of transitions between two states.

        o emission_counts -- A dictionary with the total number of counts
        of emissions of a particular emission letter by a state letter.

        This then returns the maximum likelihood estimators for the
        transitions and emissions, estimated using formulas 3.18 in
        Durbin et al:

        a_{kl} = A_{kl} / sum(A_{kl'})
        e_{k}(b) = E_{k}(b) / sum(E_{k}(b'))

        Returns:
        Transition and emission dictionaries containing the maximum
        likelihood estimators.
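
        For example (a toy illustration): given transition counts
        {('k', 'k'): 2, ('k', 'l'): 6}, the estimates would be
        a_{kk} = 2 / 8 = 0.25 and a_{kl} = 6 / 8 = 0.75.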
84 """
        ml_transitions = self.ml_estimator(transition_counts)
        ml_emissions = self.ml_estimator(emission_counts)

        return ml_transitions, ml_emissions

    def ml_estimator(self, counts):
        """Calculate the maximum likelihood estimator.

        This can calculate maximum likelihoods for both transitions
        and emissions.

        Arguments:

        o counts -- A dictionary of the counts for each item.

        See estimate_params for a description of the formula used for
        calculation.
        """
        # get an ordered list of all of the keys, so that counts sharing
        # the same first letter (the 'from' state) are grouped together
        all_ordered = sorted(counts)

        ml_estimation = {}

        # the first letter of the keys we are currently totalling over
        cur_letter = None
        # the total counts of everything sharing that first letter
        cur_letter_counts = 0

        for cur_item in all_ordered:
            # if we are looking at a new first letter (ie. a new 'from'
            # state), recalculate the totals for it
            if cur_item[0] != cur_letter:
                # set the new letter we are working with
                cur_letter = cur_item[0]

                # start with the counts for the current item
                cur_letter_counts = counts[cur_item]

                # and add the counts for all other items sharing the
                # same first letter
                cur_position = all_ordered.index(cur_item) + 1

                while (cur_position < len(all_ordered) and
                       all_ordered[cur_position][0] == cur_item[0]):
                    cur_letter_counts += counts[all_ordered[cur_position]]
                    cur_position += 1

            # now calculate the ml estimate and add it to the results
            cur_ml = float(counts[cur_item]) / float(cur_letter_counts)
            ml_estimation[cur_item] = cur_ml

        return ml_estimation


class BaumWelchTrainer(AbstractTrainer):
    """Trainer that uses the Baum-Welch algorithm to estimate parameters.

    These should be used when a training sequence for an HMM has unknown
    paths for the actual states, and you need to make an estimation of the
    model parameters from the observed emissions.

    This uses the Baum-Welch algorithm, first described in
    Baum, L.E. 1972. An inequality and associated maximization technique
    in statistical estimation for probabilistic functions of Markov
    processes. Inequalities 3:1-8.
    This implementation is based on the description in 'Biological
    Sequence Analysis' by Durbin et al., section 3.3.

    This algorithm is guaranteed to converge to a local maximum, but not
    necessarily to the global maximum, so use with care!
    """
158 """Initialize the trainer.
159
160 Arguments:
161
162 o markov_model - The model we are going to estimate parameters for.
163 This should have the parameters with some initial estimates, that
164 we can build from.
165 """
166 AbstractTrainer.__init__(self, markov_model)

    def train(self, training_seqs, stopping_criteria,
              dp_method=ScaledDPAlgorithms):
        """Estimate the parameters using training sequences.

        The algorithm for this is taken from Durbin et al. p64, so this
        is a good place to go for a reference on what is going on.

        Arguments:

        o training_seqs -- A list of TrainingSequence objects to be used
        for estimating the parameters.

        o stopping_criteria -- A function that, when passed the change
        in log likelihood and the number of iterations so far, will
        indicate if we should stop the estimation iterations.

        o dp_method -- A class specifying the dynamic programming
        implementation we should use to calculate the forward and
        backward variables. By default, we use the scaling method.
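
        For example, a simple stopping function might look like this
        (a sketch; the cutoff values are arbitrary):

            def stop_training(log_likelihood_change, num_iterations):
                # stop when the change is small or after enough rounds
                return (log_likelihood_change < 0.01 or
                        num_iterations >= 10)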
187 """
        prev_log_likelihood = None
        num_iterations = 1

        while True:
            transition_count = self._markov_model.get_blank_transitions()
            emission_count = self._markov_model.get_blank_emissions()

            # remember the probability of each training sequence so
            # that we can calculate the log likelihood of this round
            all_probabilities = []

            for training_seq in training_seqs:
                # calculate the forward and backward variables
                DP = dp_method(self._markov_model, training_seq)
                forward_var, seq_prob = DP.forward_algorithm()
                backward_var = DP.backward_algorithm()

                all_probabilities.append(seq_prob)

                # update the counts for the transitions and emissions
                transition_count = self.update_transitions(transition_count,
                                                           training_seq,
                                                           forward_var,
                                                           backward_var,
                                                           seq_prob)
                emission_count = self.update_emissions(emission_count,
                                                       training_seq,
                                                       forward_var,
                                                       backward_var,
                                                       seq_prob)

            # update the markov model from the newly estimated counts
            ml_transitions, ml_emissions = \
                self.estimate_params(transition_count, emission_count)
            self._markov_model.transition_prob = ml_transitions
            self._markov_model.emission_prob = ml_emissions

            cur_log_likelihood = self.log_likelihood(all_probabilities)

            # if this is not the first round, check the stopping criteria
            if prev_log_likelihood is not None:
                # the log likelihoods are negative, so compare their
                # magnitudes to get the change between rounds
                log_likelihood_change = abs(abs(cur_log_likelihood) -
                                            abs(prev_log_likelihood))

                if stopping_criteria(log_likelihood_change, num_iterations):
                    break

            prev_log_likelihood = cur_log_likelihood
            num_iterations += 1

        return self._markov_model

    def update_transitions(self, transition_counts, training_seq,
                           forward_vars, backward_vars, training_seq_prob):
        """Add the contribution of a new training sequence to the transitions.

        Arguments:

        o transition_counts -- A dictionary of the current counts for the
        transitions.

        o training_seq -- The training sequence we are working with.

        o forward_vars -- Probabilities calculated using the forward
        algorithm.

        o backward_vars -- Probabilities calculated using the backward
        algorithm.

        o training_seq_prob -- The probability of the current sequence.

        This calculates A_{kl} (the estimated transition counts from state
        k to state l) using formula 3.20 in Durbin et al.
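
        Written out in the notation of this module, the contribution of
        a single sequence x with probability P(x) is:

        A_{kl} += (1 / P(x)) *
                  sum_i f_{k}(i) * a_{kl} * e_{l}(x_{i+1}) * b_{l}(i+1)

        where f and b are the forward and backward variables.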
267 """
        # set up the transition and emission probabilities we are using
        transitions = self._markov_model.transition_prob
        emissions = self._markov_model.emission_prob

        # loop over the possible combinations of state paths
        for k in training_seq.states.alphabet.letters:
            for l in self._markov_model.transitions_from(k):
                estimated_counts = 0
                # now loop over the entire training sequence
                for i in range(len(training_seq.emissions) - 1):
                    # f_{k}(i) -- the forward value for state k at
                    # position i
                    forward_value = forward_vars[(k, i)]

                    # b_{l}(i + 1) -- the backward value for state l at
                    # the next position
                    backward_value = backward_vars[(l, i + 1)]

                    # a_{kl} -- the transition probability from k to l
                    trans_value = transitions[(k, l)]

                    # e_{l}(x_{i + 1}) -- the probability of state l
                    # emitting the next letter in the sequence
                    emm_value = emissions[(l, training_seq.emissions[i + 1])]

                    estimated_counts += (forward_value * trans_value *
                                         emm_value * backward_value)

                # update the counts for the transition, normalized by
                # the probability of the entire sequence
                transition_counts[(k, l)] += (float(estimated_counts) /
                                              training_seq_prob)

        return transition_counts

    def update_emissions(self, emission_counts, training_seq,
                         forward_vars, backward_vars, training_seq_prob):
        """Add the contribution of a new training sequence to the emissions.

        Arguments:

        o emission_counts -- A dictionary of the current counts for the
        emissions.

        o training_seq -- The training sequence we are working with.

        o forward_vars -- Probabilities calculated using the forward
        algorithm.

        o backward_vars -- Probabilities calculated using the backward
        algorithm.

        o training_seq_prob -- The probability of the current sequence.

        This calculates E_{k}(b) (the expected number of emissions of
        letter b from state k) using formula 3.21 in Durbin et al.
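
        Written out in the notation of this module, the contribution of
        a single sequence x with probability P(x) is:

        E_{k}(b) += (1 / P(x)) * sum_{i : x_i = b} f_{k}(i) * b_{k}(i)

        where the sum runs over the positions at which letter b is
        emitted.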
320 """
        # loop over the possible combinations of states and emissions
        for k in training_seq.states.alphabet.letters:
            for b in training_seq.emissions.alphabet.letters:
                expected_times = 0
                # look through the entire sequence for emissions of b
                for i in range(len(training_seq.emissions)):
                    # only positions where the sequence emits the
                    # letter b contribute to E_{k}(b)
                    if training_seq.emissions[i] == b:
                        # add f_{k}(i) * b_{k}(i)
                        expected_times += (forward_vars[(k, i)] *
                                           backward_vars[(k, i)])

                # update the counts for the emission, normalized by
                # the probability of the entire sequence
                emission_counts[(k, b)] += (float(expected_times) /
                                            training_seq_prob)

        return emission_counts


class KnownStateTrainer(AbstractTrainer):
    """Estimate probabilities with known state sequences.

    This should be used for direct estimation of emission and transition
    probabilities when both the state path and emission sequence are
    known for the training examples.
    """

    def __init__(self, markov_model):
        AbstractTrainer.__init__(self, markov_model)

    def train(self, training_seqs):
        """Estimate the Markov Model parameters with known state paths.

        This trainer requires that both the states and the emissions are
        known for all of the training sequences in the list of
        TrainingSequence objects.
        This training will then count all of the transitions and emissions,
        and use these counts to estimate the parameters of the model.
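
        A usage sketch (assuming TrainingSequence objects that carry
        known state paths, as described above):

            trainer = KnownStateTrainer(markov_model)
            trained_model = trainer.train(known_training_seqs)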
359 """
        # count up all of the transitions and emissions from the
        # training sequences
        transition_counts = self._markov_model.get_blank_transitions()
        emission_counts = self._markov_model.get_blank_emissions()

        for training_seq in training_seqs:
            emission_counts = self._count_emissions(training_seq,
                                                    emission_counts)
            transition_counts = self._count_transitions(training_seq.states,
                                                        transition_counts)

        # update the markov model with the maximum likelihood estimates
        ml_transitions, ml_emissions = \
            self.estimate_params(transition_counts,
                                 emission_counts)
        self._markov_model.transition_prob = ml_transitions
        self._markov_model.emission_prob = ml_emissions

        return self._markov_model

    def _count_emissions(self, training_seq, emission_counts):
        """Add emissions from the training sequence to the current counts.

        Arguments:

        o training_seq -- A TrainingSequence with states and emissions
        to get the counts from.

        o emission_counts -- The current emission counts to add to.
        """
        for index in range(len(training_seq.emissions)):
            cur_state = training_seq.states[index]
            cur_emission = training_seq.emissions[index]

            try:
                emission_counts[(cur_state, cur_emission)] += 1
            except KeyError:
                raise KeyError("Unexpected emission (%s, %s)"
                               % (cur_state, cur_emission))
        return emission_counts

    def _count_transitions(self, state_seq, transition_counts):
        """Add transitions from the training sequence to the current counts.

        Arguments:

        o state_seq -- A Seq object with the states of the current training
        sequence.

        o transition_counts -- The current transition counts to add to.
        """
        for cur_pos in range(len(state_seq) - 1):
            cur_state = state_seq[cur_pos]
            next_state = state_seq[cur_pos + 1]

            try:
                transition_counts[(cur_state, next_state)] += 1
            except KeyError:
                raise KeyError("Unexpected transition (%s, %s)" %
                               (cur_state, next_state))

        return transition_counts