├── README.md
└── submission.py

/README.md:
--------------------------------------------------------------------------------
# Hybrid DNN-HMM model for isolated digit recognition
Python implementation of a hybrid DNN-HMM model for isolated digit recognition.

Forced alignments are obtained from a GMM-HMM model and used to train the DNN.
The DNN is a simple multi-layer perceptron (MLP) implemented using scikit-learn.

### How to run

```
python3 submission.py train test
```

* `train` is the training data
* `test` is the test data

The optional arguments are:
* `--mode`: Type of model (`mlp`, `hmm`). Default: `mlp`
* `--niter`: Number of iterations to train the HMM. Default: 10
* `--nstate`: Number of states in the HMM model. Default: 5
* `--nepoch`: Maximum number of epochs for training the MLP. Default: 10
* `--lr`: Learning rate for the MLP. Default: 0.01
* `--debug`: Uses only the first 200 utterances of the training and test data

### Training data format

I cannot upload the full training and test data (for copyright reasons), but a small sample of the training data can be found at this [Google Drive link](https://drive.google.com/file/d/1NhF7fuX54jau9iXxuitOfm9QRQPHNW2Q/view?usp=sharing). This should help in understanding the format of the data. A sketch of the expected layout is shown below.
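Each utterance is stored as a key followed by a matrix of frame-level feature vectors, opened by `[` on the key line and closed by `]` at the end of the last row (this is the layout that `get_data_dict` in `submission.py` parses). The keys and numbers below are purely illustrative placeholders — real utterance IDs and feature dimensions will differ — but the digit label is expected in the second `_`-separated field of the key:

```
spk1_1a  [
  1.21 -0.33 0.52
  0.98 -0.41 0.60
  1.05 -0.29 0.47 ]
spk2_9b  [
  0.11 0.87 -1.02
  0.09 0.91 -0.95 ]
```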
### Help

This code is based on a template provided by Shinji Watanabe (Johns Hopkins University), written for a course project.

For assistance, contact `draj@cs.jhu.edu`.

--------------------------------------------------------------------------------
/submission.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3

# Copyright 2018 Johns Hopkins University (Shinji Watanabe)
# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)

import argparse
import logging
import numpy as np
import pickle

# neural network related
from sklearn.neural_network import MLPClassifier

def elog(x):
    res = np.log(x, where=(x!=0))
    res[np.where(x==0)] = -(10.0**8)
    return (res)

def get_data_dict(data):
    data_dict = {}
    for line in data:
        if "[" in line:
            key = line.split()[0]
            mat = []
        elif "]" in line:
            line = line.split(']')[0]
            mat.append([float(x) for x in line.split()])
            data_dict[key]=np.array(mat)
        else:
            mat.append([float(x) for x in line.split()])
    return data_dict


def logSumExp(x, axis=None, keepdims=False):
    x_max = np.max(x, axis=axis, keepdims=keepdims)
    x_diff = x - x_max
    sumexp = np.exp(x_diff).sum(axis=axis, keepdims=keepdims)
    return (x_max + np.log(sumexp))

def exp_normalize(x, axis=None, keepdims=False):
    b = x.max(axis=axis, keepdims=keepdims)
    y = np.exp(x - b)
    return y / y.sum(axis=axis, keepdims=keepdims)

def compute_ll(data, mu, r):
    # Compute log-likelihood of a single n-dimensional data point, given a single
    # mean and variance
    ll = (- 0.5*elog(r) - np.divide(
        np.square(data - mu), 2*r) -0.5*np.log(2*np.pi)).sum()
    return ll

def forward(pi, a, o, mu, r):
    """
    Computes forward log-probabilities of all states
    at all time steps.
    Inputs:
    pi: initial probability over states
    a: transition matrix
    o: observed n-dimensional data sequence
    mu: means of Gaussians for each state
    r: variances of Gaussians for each state
    """
    T = o.shape[0]
    J = mu.shape[0]

    log_alpha = np.zeros((T,J))
    log_alpha[0] = elog(pi)

    log_alpha[0] += np.array([compute_ll(o[0],mu[j],r[j])
                              for j in range(J)])

    for t in range(1,T):
        for j in range(J):
            log_alpha[t,j] = compute_ll(o[t],mu[j],r[j]) + logSumExp(elog(a[:,j].T) + log_alpha[t-1])

    return log_alpha

def backward(a, o, mu, r):
    """
    Computes backward log-probabilities of all states
    at all time steps.
    Inputs:
    a: transition matrix
    o: observed n-dimensional data
    mu: means of Gaussians for each state
    r: variances of Gaussians for each state
    """
    T = o.shape[0]
    J = mu.shape[0]
    log_beta = np.zeros((T,J))

    log_a = elog(a)

    for t in reversed(range(T-1)):
        for i in range(J):
            x = []
            for j in range(J):
                x.append(compute_ll(o[t+1], mu[j], r[j]) + log_beta[t+1,j] + log_a[i,j])

            log_beta[t,i] = logSumExp(np.array(x))

    return log_beta

def getExpandedData(data):
    T = data.shape[0]

    data_0 = np.copy(data[0])
    data_T = np.copy(data[T-1])

    for i in range(3):
        data = np.insert(data, 0, data_0, axis=0)
        data = np.insert(data, -1, data_T, axis=0)

    data_expanded = np.zeros((T,7*data.shape[1]))
    for t in range(3, T+3):
        np.concatenate((data[t-3], data[t-2], data[t-1], data[t],
                        data[t+1], data[t+2], data[t+3]), out=data_expanded[t-3])

    return (data_expanded)
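
# A quick shape sketch for the ±3-frame context expansion above (the array
# values are hypothetical, only the shapes matter): for an utterance with
# 10 frames of 13-dimensional features,
#   o = np.random.rand(10, 13)
#   getExpandedData(o).shape   # -> (10, 91), i.e. 7 * 13 columns per frame
# where each output row is the concatenation of frames t-3 .. t+3, with the
# first and last frames repeated as padding at the utterance boundaries.
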
class SingleGauss():
    def __init__(self):
        # Basic class variable initialized, feel free to add more
        self.dim = None
        self.mu = None
        self.r = None

    def train(self, data):
        # Function for training single modal Gaussian
        T, self.dim = data.shape

        self.mu = np.mean(data, axis=0)
        self.r = np.mean(np.square(np.subtract(data, self.mu)), axis=0)
        return

    def loglike(self, data_mat):
        # Function for calculating log likelihood of single modal Gaussian
        lls = [compute_ll(frame, self.mu, self.r) for frame in data_mat.tolist()]
        ll = np.sum(np.array(lls))
        return ll


class HMM():

    def __init__(self, sg_model, nstate):
        # Basic class variable initialized, feel free to add more
        self.pi = np.zeros(nstate)
        self.pi[0] = 1
        self.nstate = nstate

        self.mu = np.tile(sg_model.mu, (nstate,1))
        self.r = np.tile(sg_model.r, (nstate,1))


    def initStates(self, data):
        self.states = []
        for data_u in data:
            T = data_u.shape[0]
            state_seq = np.array([self.nstate*t/T for t in range(T)], dtype=int)
            self.states.append(state_seq)

    def getStateSeq(self, data):
        T = data.shape[0]
        J = self.nstate
        s_hat = np.zeros(T, dtype=int)

        log_delta = np.zeros((T,J))
        psi = np.zeros((T,J))

        log_delta[0] = elog(self.pi)
        for j in range(J):
            log_delta[0,j] += compute_ll(data[0], self.mu[j], self.r[j])

        log_A = elog(self.A)

        for t in range(1,T):
            for j in range(J):
                temp = np.zeros(J)
                for i in range(J):
                    temp[i] = log_delta[t-1,i] + log_A[i,j] + compute_ll(data[t], self.mu[j],
                                                                         self.r[j])
                log_delta[t,j] = np.max(temp)
                psi[t,j] = np.argmax(log_delta[t-1]+log_A[:,j])


        s_hat[T-1] = np.argmax(log_delta[T-1])

        for t in reversed(range(T-1)):
            s_hat[t] = psi[t+1,s_hat[t+1]]

        return s_hat


    def viterbi(self, data):
        for u,data_u in enumerate(data):
            s_hat = self.getStateSeq(data_u)
            self.states[u] = s_hat


    def m_step(self, data):

        self.A = np.zeros((self.nstate,self.nstate))

        gamma_0 = np.zeros(self.nstate)
        gamma_1 = np.zeros((self.nstate, data[0].shape[1]))
        gamma_2 = np.zeros((self.nstate, data[0].shape[1]))

        for u, data_u in enumerate(data):
            T = data_u.shape[0]
            seq = self.states[u]
            gamma = np.zeros((T, self.nstate))

            for t,j in enumerate(seq[:-1]):
                self.A[j,seq[t+1]] += 1
                gamma[t,j] = 1

            gamma[T-1,self.nstate-1] = 1
            gamma_0 += np.sum(gamma, axis=0)

            for t in range(T):
                gamma_1[seq[t]] += data_u[t]
                gamma_2[seq[t]] += np.square(data_u[t])

        gamma_0 = np.expand_dims(gamma_0, axis=1)
        self.mu = gamma_1 / gamma_0
        self.r = (gamma_2 - np.multiply(gamma_0, self.mu**2))/ gamma_0

        for j in range(self.nstate):
            self.A[j] /= np.sum(self.A[j])



    def train(self, data, iter):
        # One iteration of hard-EM (Viterbi) training for the HMM: initialize
        # the alignments on the first iteration, re-estimate the Gaussian and
        # transition parameters, then re-align with Viterbi
        if (iter==0):
            self.initStates(data)
        self.m_step(data)
        self.viterbi(data)


    def loglike(self, data, digit=None):
        # Log likelihood of an utterance under the HMM via the forward algorithm
        # (digit is unused; kept so HMM and HMMMLP models can be scored with the
        # same call in the test loop)
        T = data.shape[0]
        log_alpha_t = forward(self.pi, self.A, data, self.mu, self.r)[T-1]
        ll = logSumExp(log_alpha_t)

        return ll
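
# Minimal usage sketch for the two classes above (hypothetical variable names;
# `utts` is assumed to be a list of (T_u, dim) feature matrices for one digit,
# mirroring what sg_train / hmm_train below do per digit; iteration 0 also
# initializes uniform state alignments):
#   sg = SingleGauss()
#   sg.train(np.vstack(utts))      # single Gaussian over all frames
#   hmm = HMM(sg, nstate=5)        # states initialized from that Gaussian
#   for it in range(10):
#       hmm.train(utts, it)        # M-step + Viterbi re-alignment
#   ll = hmm.loglike(utts[0])      # forward-algorithm log-likelihood
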
class HMMMLP():

    def __init__(self, mlp, hmm_model, S, uniq_state_dict):
        # Basic class variable initialized, feel free to add more
        self.mlp = mlp
        self.hmm = hmm_model
        self.log_prior = self.computeLogPrior(S)
        self.uniq_state_dict = uniq_state_dict


    def computeLogPrior(self, S):
        states, counts = np.unique(S, return_counts=True)
        p = np.zeros(len(states))
        for s,c in zip(states,counts):
            p[s] = c
        p /= np.sum(p)
        return elog(p)

    def mlp_predict(self, o):
        o_expanded = getExpandedData(o)
        return (self.mlp.predict_log_proba(o_expanded))


    def forward_dnn(self, pi, a, o, digit):

        T = o.shape[0]
        J = len(pi)

        log_alpha = np.zeros((T,J))
        log_alpha[0] = elog(pi)

        mlp_ll = self.mlp_predict(o)

        # hybrid scaled likelihood: DNN log-posterior minus the state log-prior
        log_alpha[0] += np.array([mlp_ll[0][self.uniq_state_dict[(digit,j)]] - self.log_prior[self.uniq_state_dict[(digit,j)]]
                                  for j in range(J)])

        for t in range(1,T):
            for j in range(J):
                mlp_ll_t = mlp_ll[t][self.uniq_state_dict[(digit,j)]] - self.log_prior[self.uniq_state_dict[(digit,j)]]
                log_alpha[t,j] = mlp_ll_t + logSumExp(elog(a[:,j].T) + log_alpha[t-1])

        return log_alpha

    def loglike(self, data, digit):
        T = data.shape[0]
        log_alpha_t = self.forward_dnn(self.hmm.pi, self.hmm.A, data, digit)[T-1]
        ll = logSumExp(log_alpha_t)

        return ll



def sg_train(digits, train_data):
    model = {}
    for digit in digits:
        model[digit] = SingleGauss()

    for digit in digits:
        data = np.vstack([train_data[id] for id in train_data.keys() if digit in id.split('_')[1]])
        logging.info("process %d data for digit %s", len(data), digit)
        model[digit].train(data)

    return model


def hmm_train(digits, train_data, sg_model, nstate, niter):
    logging.info("hidden Markov model training, %d states, %d iterations", nstate, niter)

    hmm_model = {}
    data_dict = {}
    for digit in digits:
        hmm_model[digit] = HMM(sg_model[digit], nstate=nstate)
        data = [train_data[id] for id in train_data.keys() if digit in id.split('_')[1]]
        data_dict[digit] = data


    i = 0
    while i < niter:
        logging.info("iteration: %d", i)
        total_log_like = 0.0
        total_count = 0.0
        for digit in digits:
            data = data_dict[digit]
            logging.info("process %d data for digit %s", len(data), digit)

            hmm_model[digit].train(data, i)

            for data_u in data:
                total_log_like += hmm_model[digit].loglike(data_u)

        logging.info("log likelihood: %f", total_log_like)
        i += 1

    return hmm_model


def mlp_train(digits, train_data, hmm_model, uniq_state_dict, nepoch, lr, nunits=(256, 256)):

    #TODO: Complete the function to train MLP and create HMMMLP object for each digit
    # Get unique output IDs for MLP, perform alignment to get labels and perform context expansion
    data_dict = {}

    # Get unique state sequences
    seq_dict = {}

    for digit in digits:
        uniq = lambda t: uniq_state_dict[(digit, t)]
        vfunc = np.vectorize(uniq)

        sequences = []
        data = [train_data[id] for id in train_data.keys() if digit in id.split('_')[1]]
        data_dict[digit] = data

        for data_u in data:
            seq = hmm_model[digit].getStateSeq(data_u)
            sequences.append(vfunc(seq))
        seq_dict[digit] = sequences

    # Perform context expansion and create large training matrix and labels
    O = []
    S = []
    for digit in digits:
        data = data_dict[digit]
        sequences = seq_dict[digit]
        for data_u, seq in zip(data, sequences):
            data_u_expanded = getExpandedData(data_u)
            O.append(data_u_expanded)
            S.append(seq)

    O = np.vstack(O)
    S = np.concatenate(S, axis=0)


    #TODO: A simple scikit-learn MLPClassifier call is given below, check other arguments and play with it
    #OPTIONAL: Try pytorch instead of scikit-learn MLPClassifier
    # max_iter and learning_rate_init wire up the --nepoch and --lr options
    mlp = MLPClassifier(hidden_layer_sizes=nunits, max_iter=nepoch, learning_rate_init=lr,
                        random_state=1, early_stopping=True, verbose=True,
                        validation_fraction=0.1)

    mlp.fit(O,S)

    mlp_model = {}
    for digit in digits:
        #TODO: variables to initialize HMMMLP are incomplete below, pass additional variables that are required
        mlp_model[digit] = HMMMLP(mlp, hmm_model[digit], S, uniq_state_dict)

    return mlp_model
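
# Label-space sketch for mlp_train above (using the default --nstate 5 and the
# 11 digit symbols defined in __main__ below): uniq_state_dict maps each
# (digit, state) pair to one of 11 * 5 = 55 MLP output classes, e.g.
#   ("1", 0) -> 0,  ("1", 4) -> 4,  ("2", 0) -> 5,  ...,  ("o", 4) -> 54
# so a single MLP is shared across all digits and scores every digit-state pair.
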
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('train', type=str, help='training data')
    parser.add_argument('test', type=str, help='test data')
    parser.add_argument('--niter', type=int, default=10)
    parser.add_argument('--nstate', type=int, default=5)
    parser.add_argument('--nepoch', type=int, default=10)
    parser.add_argument('--lr', type=float, default=0.01)
    parser.add_argument('--mode', type=str, default='mlp',
                        choices=['hmm', 'mlp'],
                        help='Type of models')
    parser.add_argument('--debug', action='store_true')
    args = parser.parse_args()

    # set seed
    np.random.seed(777)

    # logging info
    log_format = "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s:%(message)s"
    logging.basicConfig(level=logging.INFO, format=log_format)

    digits = ["1", "2", "3", "4", "5", "6", "7", "8", "9", "z", "o"]
    uniq_state_dict = {}
    i=0
    for digit in digits:
        for state in range(args.nstate):
            uniq_state_dict[(digit, state)] = i
            i += 1

    # read training data
    with open(args.train) as f:
        train_data = get_data_dict(f.readlines())
    # for debug
    if args.debug:
        train_data = {key:train_data[key] for key in list(train_data.keys())[:200]}

    # read test data
    with open(args.test) as f:
        test_data = get_data_dict(f.readlines())
    # for debug
    if args.debug:
        test_data = {key:test_data[key] for key in list(test_data.keys())[:200]}

    # Single Gaussian
    sg_model = sg_train(digits, train_data)

    if args.mode == 'hmm':
        try:
            model = pickle.load(open('hmm.pickle','rb'))
        except:
            model = hmm_train(digits, train_data, sg_model, args.nstate, args.niter)
            pickle.dump(model, open('hmm.pickle','wb'))
    elif args.mode == 'mlp':
        try:
            hmm_model = pickle.load(open('hmm.pickle','rb'))
        except:
            hmm_model = hmm_train(digits, train_data, sg_model, args.nstate, args.niter)
            pickle.dump(hmm_model, open('hmm.pickle','wb'))
        #TODO: Modify MLP training function call with appropriate arguments here
        model = mlp_train(digits, train_data, hmm_model, uniq_state_dict, nepoch=args.nepoch, lr=args.lr,
                          nunits=(256, 256))

    # test
    total_count = 0
    correct = 0
    for key in test_data.keys():
        lls = []
        for digit in digits:
            ll = model[digit].loglike(test_data[key], digit)
            lls.append(ll)
        predict = digits[np.argmax(np.array(lls))]
        log_like = np.max(np.array(lls))

        logging.info("predict %s for utt %s (log like = %f)", predict, key, log_like)
        if predict in key.split('_')[1]:
            correct += 1
        total_count += 1

    logging.info("accuracy: %f", float(correct)/total_count * 100)
--------------------------------------------------------------------------------