
Source Code for Module Bio.MarkovModel

"""
This is an implementation of a state-emitting MarkovModel.  I am using
terminology similar to Manning and Schutze.

Functions:
train_bw        Train a markov model using the Baum-Welch algorithm.
train_visible   Train a visible markov model using MLE.
find_states     Find a state sequence that explains some observations.

load            Load a MarkovModel.
save            Save a MarkovModel.

Classes:
MarkovModel     Holds the description of a markov model.
"""
import math

from Numeric import *
import RandomArray

import StringIO     # StringIO is in Numeric's namespace, so import this after.

from Bio import listfns


#RandomArray.seed(0, 0)   # use 0 for debugging
RandomArray.seed()

VERY_SMALL_NUMBER = 1E-300
LOG0 = log(VERY_SMALL_NUMBER)

MATCODE = Float64

class MarkovModel:
    def __init__(self, states, alphabet,
                 p_initial=None, p_transition=None, p_emission=None):
        self.states = states
        self.alphabet = alphabet
        self.p_initial = p_initial
        self.p_transition = p_transition
        self.p_emission = p_emission

    def __str__(self):
        handle = StringIO.StringIO()
        save(self, handle)
        handle.seek(0)
        return handle.read()
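
# Illustrative sketch (not part of the original source): constructing a tiny
# two-state model by hand.  The state names, alphabet, and probabilities below
# are invented for the example.  p_initial has shape (N,), p_transition (N,N),
# and p_emission (N,M); p_initial sums to 1.0 and each row of the other two
# matrices sums to 1.0.
def _example_build_model():
    states = ["coding", "noncoding"]       # hypothetical state names
    alphabet = ["A", "C", "G", "T"]        # hypothetical outputs
    p_initial = array([0.5, 0.5], MATCODE)
    p_transition = array([[0.9, 0.1],
                          [0.2, 0.8]], MATCODE)
    p_emission = array([[0.3, 0.2, 0.2, 0.3],
                        [0.25, 0.25, 0.25, 0.25]], MATCODE)
    return MarkovModel(states, alphabet, p_initial, p_transition, p_emission)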

def _readline_and_check_start(handle, start):
    line = handle.readline()
    if not line.startswith(start):
        raise ValueError, "I expected %r but got %r" % (start, line)
    return line

def load(handle):
    """load(handle) -> MarkovModel()"""
    # Load the states.
    line = _readline_and_check_start(handle, "STATES:")
    states = line.split()[1:]

    # Load the alphabet.
    line = _readline_and_check_start(handle, "ALPHABET:")
    alphabet = line.split()[1:]

    mm = MarkovModel(states, alphabet)
    N, M = len(states), len(alphabet)

    # Load the initial probabilities.
    mm.p_initial = zeros(N, MATCODE)
    line = _readline_and_check_start(handle, "INITIAL:")
    for i in range(len(states)):
        line = _readline_and_check_start(handle, " %s:" % states[i])
        mm.p_initial[i] = float(line.split()[-1])

    # Load the transition.
    mm.p_transition = zeros((N, N), MATCODE)
    line = _readline_and_check_start(handle, "TRANSITION:")
    for i in range(len(states)):
        line = _readline_and_check_start(handle, " %s:" % states[i])
        mm.p_transition[i,:] = map(float, line.split()[1:])

    # Load the emission.
    mm.p_emission = zeros((N, M), MATCODE)
    line = _readline_and_check_start(handle, "EMISSION:")
    for i in range(len(states)):
        line = _readline_and_check_start(handle, " %s:" % states[i])
        mm.p_emission[i,:] = map(float, line.split()[1:])

    return mm

def save(mm, handle):
    """save(mm, handle)"""
    # This will fail if there are spaces in the states or alphabet.
    w = handle.write
    w("STATES: %s\n" % ' '.join(mm.states))
    w("ALPHABET: %s\n" % ' '.join(mm.alphabet))
    w("INITIAL:\n")
    for i in range(len(mm.p_initial)):
        w(" %s: %g\n" % (mm.states[i], mm.p_initial[i]))
    w("TRANSITION:\n")
    for i in range(len(mm.p_transition)):
        x = map(str, mm.p_transition[i])
        w(" %s: %s\n" % (mm.states[i], ' '.join(x)))
    w("EMISSION:\n")
    for i in range(len(mm.p_emission)):
        x = map(str, mm.p_emission[i])
        w(" %s: %s\n" % (mm.states[i], ' '.join(x)))
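
# Illustrative sketch (not part of the original source): a save/load round
# trip through a StringIO handle.  It reuses the hypothetical model built by
# _example_build_model() above.  The file format is the plain-text layout
# written by save(): STATES:, ALPHABET:, INITIAL:, TRANSITION:, EMISSION:.
def _example_save_and_load():
    mm = _example_build_model()
    handle = StringIO.StringIO()
    save(mm, handle)        # write the model as text
    handle.seek(0)
    mm2 = load(handle)      # read it back into a new MarkovModel
    return mm2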

# XXX allow them to specify starting points
def train_bw(states, alphabet, training_data,
             pseudo_initial=None, pseudo_transition=None, pseudo_emission=None,
             update_fn=None,
             ):
    """train_bw(states, alphabet, training_data[, pseudo_initial]
    [, pseudo_transition][, pseudo_emission][, update_fn]) -> MarkovModel

    Train a MarkovModel using the Baum-Welch algorithm.  states is a list
    of strings that describe the names of each state.  alphabet is a
    list of objects that indicate the allowed outputs.  training_data
    is a list of observations.  Each observation is a list of objects
    from the alphabet.

    pseudo_initial, pseudo_transition, and pseudo_emission are
    optional parameters that you can use to assign pseudo-counts to
    different matrices.  They should be matrices of the appropriate
    size that contain numbers to add to each parameter matrix, before
    normalization.

    update_fn is an optional callback that takes parameters
    (iteration, log_likelihood).  It is called once per iteration.

    """
    N, M = len(states), len(alphabet)
    if not training_data:
        raise ValueError, "No training data given."
    pseudo_initial, pseudo_emission, pseudo_transition = map(
        _safe_asarray, (pseudo_initial, pseudo_emission, pseudo_transition))
    if pseudo_initial and shape(pseudo_initial) != (N,):
        raise ValueError, "pseudo_initial not shape len(states)"
    if pseudo_transition and shape(pseudo_transition) != (N,N):
        raise ValueError, "pseudo_transition not shape " + \
              "len(states) X len(states)"
    if pseudo_emission and shape(pseudo_emission) != (N,M):
        raise ValueError, "pseudo_emission not shape " + \
              "len(states) X len(alphabet)"

    # Training data is given as a list of members of the alphabet.
    # Replace those with indexes into the alphabet list for easier
    # computation.
    training_outputs = []
    indexes = listfns.itemindex(alphabet)
    for outputs in training_data:
        training_outputs.append([indexes[x] for x in outputs])

    # Do some sanity checking on the outputs.
    lengths = map(len, training_outputs)
    if min(lengths) == 0:
        raise ValueError, "I got training data with outputs of length 0"

    # Do the training with baum welch.
    x = _baum_welch(N, M, training_outputs,
                    pseudo_initial=pseudo_initial,
                    pseudo_transition=pseudo_transition,
                    pseudo_emission=pseudo_emission,
                    update_fn=update_fn)
    p_initial, p_transition, p_emission = x
    return MarkovModel(states, alphabet, p_initial, p_transition, p_emission)
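
# Illustrative sketch (not part of the original source): training a model with
# Baum-Welch from unlabeled observation sequences.  The states, alphabet, and
# data below are invented; only the emitted symbols are given, the state
# sequence is hidden.  Because _baum_welch starts from randomly initialized
# parameters, repeated runs can produce different models.
def _example_train_bw():
    states = ["0", "1"]
    alphabet = ["a", "b"]
    training_data = [
        ["a", "a", "b", "b"],
        ["a", "b", "b", "b"],
        ]
    def _report(iteration, log_likelihood):
        # Optional progress callback; called once per iteration.
        pass
    return train_bw(states, alphabet, training_data, update_fn=_report)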

MAX_ITERATIONS = 1000
def _baum_welch(N, M, training_outputs,
                p_initial=None, p_transition=None, p_emission=None,
                pseudo_initial=None, pseudo_transition=None,
                pseudo_emission=None, update_fn=None):
    # Returns (p_initial, p_transition, p_emission)
    p_initial = _safe_copy_and_check(p_initial, (N,)) or \
                _random_norm(N)
    p_transition = _safe_copy_and_check(p_transition, (N,N)) or \
                   _random_norm((N,N))
    p_emission = _safe_copy_and_check(p_emission, (N,M)) or \
                 _random_norm((N,M))

    # Do all the calculations in log space to avoid underflows.
    lp_initial, lp_transition, lp_emission = map(
        log, (p_initial, p_transition, p_emission))
    lpseudo_initial, lpseudo_transition, lpseudo_emission = map(
        _safe_log, (pseudo_initial, pseudo_transition, pseudo_emission))

    # Iterate through each sequence of output, updating the parameters
    # to the HMM.  Stop when the log likelihoods of the sequences
    # stop varying.
    prev_llik = None
    for i in range(MAX_ITERATIONS):
        llik = LOG0
        for outputs in training_outputs:
            x = _baum_welch_one(
                N, M, outputs,
                lp_initial, lp_transition, lp_emission,
                lpseudo_initial, lpseudo_transition, lpseudo_emission,)
            llik += x
        if update_fn is not None:
            update_fn(i, llik)
        if prev_llik is not None and fabs(prev_llik-llik) < 0.1:
            break
        prev_llik = llik
    else:
        raise RuntimeError, \
              "HMM did not converge in %d iterations" % MAX_ITERATIONS

    # Return everything back in normal space.
    return map(exp, (lp_initial, lp_transition, lp_emission))

def _baum_welch_one(N, M, outputs,
                    lp_initial, lp_transition, lp_emission,
                    lpseudo_initial, lpseudo_transition, lpseudo_emission):
    # Do one iteration of Baum-Welch based on a sequence of output.
    # NOTE: This will change the values of lp_initial, lp_transition,
    # and lp_emission in place.
    T = len(outputs)
    fmat = _forward(N, T, lp_initial, lp_transition, lp_emission, outputs)
    bmat = _backward(N, T, lp_transition, lp_emission, outputs)

    # Calculate the probability of traversing each arc for any given
    # transition.
    lp_arc = zeros((N, N, T), MATCODE)
    for t in range(T):
        k = outputs[t]
        lp_traverse = zeros((N, N), MATCODE)   # P going over one arc.
        for i in range(N):
            for j in range(N):
                # P(getting to this arc)
                # P(making this transition)
                # P(emitting this character)
                # P(going to the end)
                lp = fmat[i][t] + \
                     lp_transition[i][j] + \
                     lp_emission[i][k] + \
                     bmat[j][t+1]
                lp_traverse[i][j] = lp
        # Normalize the probability for this time step.
        lp_arc[:,:,t] = lp_traverse - _logsum(lp_traverse)

    # Sum of all the transitions out of state i at time t.
    lp_arcout_t = zeros((N, T), MATCODE)
    for t in range(T):
        for i in range(N):
            lp_arcout_t[i][t] = _logsum(lp_arc[i,:,t])

    # Sum of all the transitions out of state i.
    lp_arcout = zeros(N, MATCODE)
    for i in range(N):
        lp_arcout[i] = _logsum(lp_arcout_t[i,:])

    # UPDATE P_INITIAL.
    lp_initial[:] = lp_arcout_t[:,0]
    if lpseudo_initial:
        lp_initial[:] = _logvecadd(lp_initial, lpseudo_initial)
        lp_initial[:] = lp_initial - _logsum(lp_initial)

    # UPDATE P_TRANSITION.  p_transition[i][j] is the sum of all the
    # transitions from i to j, normalized by the sum of the
    # transitions out of i.
    for i in range(N):
        for j in range(N):
            lp_transition[i][j] = _logsum(lp_arc[i,j,:]) - lp_arcout[i]
        if lpseudo_transition:
            lp_transition[i] = _logvecadd(lp_transition[i],
                                          lpseudo_transition[i])
            lp_transition[i] = lp_transition[i] - _logsum(lp_transition[i])

    # UPDATE P_EMISSION.  lp_emission[i][k] is the sum of all the
    # transitions out of i when k is observed, divided by the sum of
    # the transitions out of i.
    for i in range(N):
        ksum = zeros(M, MATCODE)+LOG0   # ksum[k] is the sum of all i with k.
        for t in range(T):
            k = outputs[t]
            for j in range(N):
                ksum[k] = _logadd(ksum[k], lp_arc[i,j,t])
        ksum = ksum - _logsum(ksum)        # Normalize
        if lpseudo_emission:
            ksum = _logvecadd(ksum, lpseudo_emission[i])
            ksum = ksum - _logsum(ksum)    # Renormalize
        lp_emission[i,:] = ksum

    # Calculate the log likelihood of the output based on the forward
    # matrix.  Since the parameters of the HMM have changed, the log
    # likelihoods are going to be a step behind, and we might be doing
    # one extra iteration of training.  The alternative is to rerun
    # the _forward algorithm and calculate from the clean one, but
    # that may be more expensive than overshooting the training by one
    # step.
    return _logsum(fmat[:,T])
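
# Added note (not in the original source): lp_arc[i,j,t] in _baum_welch_one
# above is the log posterior probability of using the i -> j transition at
# time step t, given the entire output sequence.  This is the quantity usually
# written xi_t(i,j) in descriptions of Baum-Welch: it combines the forward
# score into i, the transition and emission terms, and the backward score out
# of j, normalized over all (i,j) pairs at each time step.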

def _forward(N, T, lp_initial, lp_transition, lp_emission, outputs):
    # Implement the forward algorithm.  This actually calculates a
    # Nx(T+1) matrix, where the last column is the total probability
    # of the output.

    matrix = zeros((N, T+1), MATCODE)

    # Initialize the first column to be the initial values.
    matrix[:,0] = lp_initial
    for t in range(1, T+1):
        k = outputs[t-1]
        for j in range(N):
            # The probability of the state is the sum of the
            # transitions from all the states from time t-1.
            lprob = LOG0
            for i in range(N):
                lp = matrix[i][t-1] + \
                     lp_transition[i][j] + \
                     lp_emission[i][k]
                lprob = _logadd(lprob, lp)
            matrix[j][t] = lprob
    return matrix
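
# Added note (not in the original source): in equation form, the loop in
# _forward computes the log-space forward recurrence
#     matrix[j][t] = log( sum_i exp( matrix[i][t-1]
#                                    + lp_transition[i][j]
#                                    + lp_emission[i][outputs[t-1]] ) )
# with column 0 set to the initial log probabilities.  As written in this
# module, the emission probability is attached to the source state i.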

def _backward(N, T, lp_transition, lp_emission, outputs):
    matrix = zeros((N, T+1), MATCODE)
    for t in range(T-1, -1, -1):
        k = outputs[t]
        for i in range(N):
            # The probability of the state is the sum of the
            # transitions from all the states from time t+1.
            lprob = LOG0
            for j in range(N):
                lp = matrix[j][t+1] + \
                     lp_transition[i][j] + \
                     lp_emission[i][k]
                lprob = _logadd(lprob, lp)
            matrix[i][t] = lprob
    return matrix
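
# Added note (not in the original source): _backward is the mirror image of
# _forward.  Column T is left at zeros (log 1), and the loop fills in
#     matrix[i][t] = log( sum_j exp( matrix[j][t+1]
#                                    + lp_transition[i][j]
#                                    + lp_emission[i][outputs[t]] ) )
# so matrix[i][t] is the log probability of emitting outputs[t:] given that
# the model is in state i at time t, again with the emission attached to the
# source state i.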

def train_visible(states, alphabet, training_data,
                  pseudo_initial=None, pseudo_transition=None,
                  pseudo_emission=None):
    """train_visible(states, alphabet, training_data[, pseudo_initial]
    [, pseudo_transition][, pseudo_emission]) -> MarkovModel

    Train a visible MarkovModel using maximum likelihood estimates
    for each of the parameters.  states is a list of strings that
    describe the names of each state.  alphabet is a list of objects
    that indicate the allowed outputs.  training_data is a list of
    (outputs, observed states) where outputs is a list of the emissions
    from the alphabet, and observed states is a list of states from
    states.

    pseudo_initial, pseudo_transition, and pseudo_emission are
    optional parameters that you can use to assign pseudo-counts to
    different matrices.  They should be matrices of the appropriate
    size that contain numbers to add to each parameter matrix.

    """
    N, M = len(states), len(alphabet)
    pseudo_initial, pseudo_emission, pseudo_transition = map(
        _safe_asarray, (pseudo_initial, pseudo_emission, pseudo_transition))
    if pseudo_initial and shape(pseudo_initial) != (N,):
        raise ValueError, "pseudo_initial not shape len(states)"
    if pseudo_transition and shape(pseudo_transition) != (N,N):
        raise ValueError, "pseudo_transition not shape " + \
              "len(states) X len(states)"
    if pseudo_emission and shape(pseudo_emission) != (N,M):
        raise ValueError, "pseudo_emission not shape " + \
              "len(states) X len(alphabet)"

    # Training data is given as a list of members of the alphabet.
    # Replace those with indexes into the alphabet list for easier
    # computation.
    training_states, training_outputs = [], []
    states_indexes = listfns.itemindex(states)
    outputs_indexes = listfns.itemindex(alphabet)
    for toutputs, tstates in training_data:
        if len(tstates) != len(toutputs):
            raise ValueError, "states and outputs not aligned"
        training_states.append([states_indexes[x] for x in tstates])
        training_outputs.append([outputs_indexes[x] for x in toutputs])

    x = _mle(N, M, training_outputs, training_states,
             pseudo_initial, pseudo_transition, pseudo_emission)
    p_initial, p_transition, p_emission = x

    return MarkovModel(states, alphabet, p_initial, p_transition, p_emission)
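
# Illustrative sketch (not part of the original source): training a visible
# model where the state sequence for each observation is known.  Each training
# pair is (outputs, observed states); the names and data below are invented
# for the example.
def _example_train_visible():
    states = ["rainy", "sunny"]
    alphabet = ["walk", "shop", "clean"]
    training_data = [
        (["walk", "shop", "walk"],   ["sunny", "sunny", "rainy"]),
        (["clean", "clean", "shop"], ["rainy", "rainy", "sunny"]),
        ]
    return train_visible(states, alphabet, training_data)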

def _mle(N, M, training_outputs, training_states, pseudo_initial,
         pseudo_transition, pseudo_emission):
    # p_initial is the probability that a sequence of states starts
    # off with a particular one.
    p_initial = zeros(N, MATCODE)
    if pseudo_initial:
        p_initial = p_initial + pseudo_initial
    for states in training_states:
        p_initial[states[0]] += 1
    p_initial = _normalize(p_initial)

    # p_transition is the probability that a state leads to the next
    # one.  C(i,j)/C(i) where i and j are states.
    p_transition = zeros((N,N), MATCODE)
    if pseudo_transition:
        p_transition = p_transition + pseudo_transition
    for states in training_states:
        for n in range(len(states)-1):
            i, j = states[n], states[n+1]
            p_transition[i, j] += 1
    for i in range(len(p_transition)):
        p_transition[i,:] = p_transition[i,:] / sum(p_transition[i,:])

    # p_emission is the probability of an output given a state.
    # C(s,o)/C(s) where o is an output and s is a state.  Start the
    # counts at one, then add any pseudo-counts on top of that.
    p_emission = ones((N,M), MATCODE)
    if pseudo_emission:
        p_emission = p_emission + pseudo_emission
    for outputs, states in zip(training_outputs, training_states):
        for o, s in zip(outputs, states):
            p_emission[s, o] += 1
    for i in range(len(p_emission)):
        p_emission[i,:] = p_emission[i,:] / sum(p_emission[i,:])

    return p_initial, p_transition, p_emission

def _argmaxes(vector, allowance=None):
    return [argmax(vector)]

def find_states(markov_model, output):
    """find_states(markov_model, output) -> list of (states, score)"""
    mm = markov_model
    N = len(mm.states)

    # _viterbi does calculations in log space.  Add a tiny bit to the
    # matrices so that the logs will not break.
    x = mm.p_initial + VERY_SMALL_NUMBER
    y = mm.p_transition + VERY_SMALL_NUMBER
    z = mm.p_emission + VERY_SMALL_NUMBER
    lp_initial, lp_transition, lp_emission = map(log, (x, y, z))
    # Change output into a list of indexes into the alphabet.
    indexes = listfns.itemindex(mm.alphabet)
    output = [indexes[x] for x in output]

    # Run the viterbi algorithm.
    results = _viterbi(N, lp_initial, lp_transition, lp_emission, output)

    for i in range(len(results)):
        states, score = results[i]
        results[i] = [mm.states[x] for x in states], exp(score)
    return results
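
# Illustrative sketch (not part of the original source): decoding the most
# likely state sequence for a new observation with a trained model.  It reuses
# the hypothetical _example_train_visible() model defined above, and the
# observation sequence is invented.
def _example_find_states():
    mm = _example_train_visible()
    observations = ["walk", "clean", "shop"]
    results = find_states(mm, observations)
    # Each result is a (list of state names, probability score) pair.
    states, score = results[0]
    return states, score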

def _viterbi(N, lp_initial, lp_transition, lp_emission, output):
    # The Viterbi algorithm finds the most likely set of states for a
    # given output.  Returns a list of states.

    T = len(output)
    # Store the backtrace in a NxT matrix.
    backtrace = []    # list of indexes of states in previous timestep.
    for i in range(N):
        backtrace.append([None] * T)

    # Store the best scores.
    scores = zeros((N, T), MATCODE)
    scores[:,0] = lp_initial + lp_emission[:,output[0]]
    for t in range(1, T):
        k = output[t]
        for j in range(N):
            # Find the most likely place it came from.
            i_scores = scores[:,t-1] + \
                       lp_transition[:,j] + \
                       lp_emission[j,k]
            indexes = _argmaxes(i_scores)
            scores[j,t] = i_scores[indexes[0]]
            backtrace[j][t] = indexes

    # Do the backtrace.  First, find a good place to start.  Then,
    # we'll follow the backtrace matrix to find the list of states.
    # In the event of ties, there may be multiple paths back through
    # the matrix, which implies a recursive solution.  We'll simulate
    # it by keeping our own stack.
    in_process = []    # list of (t, states, score)
    results = []       # return values.  list of (states, score)
    indexes = _argmaxes(scores[:,T-1])      # pick the first place
    for i in indexes:
        in_process.append((T-1, [i], scores[i][T-1]))
    while in_process:
        t, states, score = in_process.pop()
        if t == 0:
            results.append((states, score))
        else:
            indexes = backtrace[states[0]][t]
            for i in indexes:
                in_process.append((t-1, [i]+states, score))
    return results

def _normalize(matrix):
    # Make sure numbers add up to 1.0
    if len(shape(matrix)) == 1:
        matrix = matrix / float(sum(matrix))
    elif len(shape(matrix)) == 2:
        # Normalize by rows.
        for i in range(len(matrix)):
            matrix[i,:] = matrix[i,:] / sum(matrix[i,:])
    else:
        raise ValueError, "I cannot handle matrixes of that shape"
    return matrix

def _uniform_norm(shape):
    matrix = ones(shape, MATCODE)
    return _normalize(matrix)

def _random_norm(shape):
    matrix = asarray(RandomArray.random(shape), MATCODE)
    return _normalize(matrix)

def _copy_and_check(matrix, desired_shape):
    # Copy the matrix.
    matrix = array(matrix, MATCODE, copy=1)
    # Check the dimensions.
    if shape(matrix) != desired_shape:
        raise ValueError, "Incorrect dimension"
    # Make sure it's normalized.
    if len(shape(matrix)) == 1:
        if fabs(sum(matrix)-1.0) > 0.01:
            raise ValueError, "matrix not normalized to 1.0"
    elif len(shape(matrix)) == 2:
        for i in range(len(matrix)):
            if fabs(sum(matrix[i])-1.0) > 0.01:
                raise ValueError, "matrix %d not normalized to 1.0" % i
    else:
        raise ValueError, "I don't handle matrices > 2 dimensions"
    return matrix

def _safe_copy_and_check(matrix, desired_shape):
    if matrix is None:
        return None
    return _copy_and_check(matrix, desired_shape)

def _safe_log(n):
    if n is None:
        return None
    return log(n)

def _safe_asarray(a, typecode=None):
    if a is None:
        return None
    return asarray(a, typecode=typecode)

def _logadd(logx, logy):
    if logy - logx > 100:
        return logy
    elif logx - logy > 100:
        return logx
    minxy = min(logx, logy)
    return minxy + log(exp(logx-minxy) + exp(logy-minxy))
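
# Added note (not in the original source): _logadd computes
#     log(exp(logx) + exp(logy))
# without leaving log space.  For any offset m, the identity
#     log(exp(logx) + exp(logy)) = m + log(exp(logx-m) + exp(logy-m))
# holds exactly; when the two arguments differ by more than 100, the smaller
# term is negligible, so the larger value is returned directly.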

def _logsum(matrix):
    if len(shape(matrix)) > 1:
        vec = reshape(matrix, (product(shape(matrix)),))
    else:
        vec = matrix
    sum = LOG0
    for num in vec:
        sum = _logadd(sum, num)
    return sum

def _logvecadd(logvec1, logvec2):
    assert len(logvec1) == len(logvec2), "vectors aren't the same length"
    sumvec = zeros(len(logvec1), MATCODE)
    for i in range(len(logvec1)):
        sumvec[i] = _logadd(logvec1[i], logvec2[i])
    return sumvec

def _exp_logsum(numbers):
    sum = _logsum(numbers)
    return math.exp(sum)

try:
    import cMarkovModel
except ImportError, x:
    pass
else:
    import sys
    this_module = sys.modules[__name__]
    for name in cMarkovModel.__dict__.keys():
        if not name.startswith("__"):
            this_module.__dict__[name] = cMarkovModel.__dict__[name]