├── TODO ├── Makefile ├── README.md ├── fhmm.c ├── hmm.c └── hmm.cu /TODO: -------------------------------------------------------------------------------- 1 | TODO list 2 | --- 3 | 4 | * urgent 5 | - simplify the code: use the CPU for any procedure that runs in O(N x M) 6 | - modify the backward pass to avoid large memory moves 7 | - better matrix multiplication (maybe use the CUBLAS library) 8 | 9 | * further improvement 10 | - 2D, 3D reducing 11 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | INC = -I$(CUDA_SDK_PATH)/samples/common/inc 2 | LIB = -L$(CUDA_SDK_PATH)/lib64 3 | PG = -Xcompiler -DPROFILE_PG 4 | PGPU = -Xcompiler -DPROFILE_GPU 5 | 6 | all: cuhmm hmm fhmm 7 | 8 | cuhmm: hmm.cu 9 | nvcc $(INC) $(LIB) hmm.cu -o cuhmm 10 | 11 | hmm: hmm.c 12 | gcc hmm.c -Wall -lm -o hmm 13 | 14 | fhmm: fhmm.c 15 | gcc fhmm.c -Wall -lm -o fhmm 16 | 17 | clean: hmm 18 | rm -f hmm cuhmm fhmm 19 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Introduction 2 | This is an implementation of hidden Markov model (HMM) training and classification for the NVIDIA CUDA platform. A serial implementation in C is also included for comparison. 3 | 4 | The implementation follows the tutorial paper by Rabiner. The three problems for HMMs defined in the paper are: 5 | 6 | 1. compute the probability of the observation sequence 7 | 2. compute the most probable state sequence 8 | 3. train the hidden Markov model parameters 9 | 10 | This implementation supports all three problems. However, there is no support for continuous densities. 11 | 12 | # Usage 13 | The command line usage is as follows. 
14 | 15 | ``` 16 | $ ./hmm -h 17 | hmm [-hnt] [-c config] [-p(1|2|3)] 18 | usage: 19 | -h help 20 | -c configuration file 21 | -t output computation time 22 | -p1 compute the probability of the observation sequence 23 | -p2 compute the most probable sequence (Viterbi) 24 | -p3 train hidden Markov mode parameters (Baum-Welch) 25 | -n number of iterations 26 | ``` 27 | 28 | # Configuration 29 | The configuration format is same for all the three problems. For problem 1, the forward probabilities for all input sequences are computed from the given model parameters. For problem 2, decoding is performed for all sequences based on the given parameters. For problem 3, the given input are used as training data. 30 | 31 | The following example shows a 16 states HMM with 2 observation symbols and 32 input sequences. Empty line and line begins with # will be ignored. The order of parameters follows the example. 32 | 33 | ``` 34 | # a HMM model configuration for testing purpose 35 | 36 | # number of states 37 | 16 38 | 39 | # number of output 40 | 2 41 | 42 | # initial state probability 43 | 0.04 0.02 0.06 0.04 0.11 0.11 0.01 0.09 0.03 0.05 0.06 0.11 0.05 0.11 0.03 0.08 44 | 45 | # state transition probability 46 | 0.08 0.02 0.10 0.05 0.07 0.08 0.07 0.04 0.08 0.10 0.07 0.02 0.01 0.10 0.09 0.01 47 | 0.06 0.10 0.11 0.01 0.04 0.11 0.04 0.07 0.08 0.10 0.08 0.02 0.09 0.05 0.02 0.02 48 | 0.08 0.07 0.08 0.07 0.01 0.03 0.10 0.02 0.07 0.03 0.06 0.08 0.03 0.10 0.10 0.08 49 | 0.08 0.04 0.04 0.05 0.07 0.08 0.01 0.08 0.10 0.07 0.11 0.01 0.05 0.04 0.11 0.06 50 | 0.03 0.03 0.08 0.10 0.11 0.04 0.06 0.03 0.03 0.08 0.03 0.07 0.10 0.11 0.07 0.03 51 | 0.02 0.05 0.01 0.09 0.05 0.09 0.05 0.12 0.09 0.07 0.01 0.07 0.05 0.05 0.11 0.06 52 | 0.11 0.05 0.10 0.07 0.01 0.08 0.05 0.03 0.03 0.10 0.01 0.10 0.08 0.09 0.07 0.02 53 | 0.03 0.02 0.16 0.01 0.05 0.01 0.14 0.14 0.02 0.05 0.01 0.09 0.07 0.14 0.03 0.01 54 | 0.01 0.09 0.13 0.01 0.02 0.04 0.05 0.03 0.10 0.05 0.06 0.06 0.11 0.06 0.03 0.14 55 | 0.09 0.03 
0.04 0.05 0.04 0.03 0.12 0.04 0.07 0.02 0.07 0.10 0.11 0.03 0.06 0.09 56 | 0.09 0.04 0.06 0.06 0.05 0.07 0.05 0.01 0.05 0.10 0.04 0.08 0.05 0.08 0.08 0.10 57 | 0.07 0.06 0.01 0.07 0.06 0.09 0.01 0.06 0.07 0.07 0.08 0.06 0.01 0.11 0.09 0.05 58 | 0.03 0.04 0.06 0.06 0.06 0.05 0.02 0.10 0.11 0.07 0.09 0.05 0.05 0.05 0.11 0.08 59 | 0.04 0.03 0.04 0.09 0.10 0.09 0.08 0.06 0.04 0.07 0.09 0.02 0.05 0.08 0.04 0.09 60 | 0.05 0.07 0.02 0.08 0.06 0.08 0.05 0.05 0.07 0.06 0.10 0.07 0.03 0.05 0.06 0.10 61 | 0.11 0.03 0.02 0.11 0.11 0.01 0.02 0.08 0.05 0.08 0.11 0.03 0.02 0.10 0.01 0.11 62 | 63 | # state output probability 64 | 0.01 0.99 65 | 0.58 0.42 66 | 0.48 0.52 67 | 0.58 0.42 68 | 0.37 0.63 69 | 0.33 0.67 70 | 0.51 0.49 71 | 0.28 0.72 72 | 0.35 0.65 73 | 0.61 0.39 74 | 0.97 0.03 75 | 0.87 0.13 76 | 0.46 0.54 77 | 0.55 0.45 78 | 0.23 0.77 79 | 0.76 0.24 80 | 81 | # data size 82 | 32 10 83 | 84 | # data 85 | 0 0 0 0 0 0 1 0 1 1 86 | 1 1 0 0 1 1 1 0 0 0 87 | 1 1 0 1 0 0 0 1 0 1 88 | 1 1 1 1 1 0 1 1 1 0 89 | 0 1 0 1 1 0 1 1 1 1 90 | 1 0 1 1 0 1 0 1 1 1 91 | 1 0 1 1 1 1 0 0 1 1 92 | 0 1 0 1 1 1 0 0 0 0 93 | 0 1 1 0 0 0 1 1 1 1 94 | 0 1 1 0 0 0 0 1 1 0 95 | 1 1 1 1 1 0 1 1 0 0 96 | 0 0 0 0 1 1 0 1 1 1 97 | 1 0 1 0 1 1 1 1 1 0 98 | 1 0 0 1 1 1 0 0 0 0 99 | 0 0 1 1 1 0 0 0 0 1 100 | 1 0 1 1 0 1 0 1 0 0 101 | 1 0 1 0 1 0 0 1 0 1 102 | 0 0 0 1 0 0 0 1 0 1 103 | 1 1 1 0 0 0 0 1 0 0 104 | 0 1 0 1 1 1 1 1 1 1 105 | 0 0 0 0 1 1 1 0 1 0 106 | 0 1 1 1 0 1 0 1 0 0 107 | 1 1 0 1 1 0 0 0 0 1 108 | 0 0 0 0 1 1 0 0 1 1 109 | 0 1 0 1 1 1 1 1 0 0 110 | 0 1 1 1 0 1 1 0 1 1 111 | 1 1 1 1 0 0 1 1 0 0 112 | 1 1 0 1 1 0 0 0 0 0 113 | 0 1 0 0 0 0 0 0 0 1 114 | 1 0 0 1 0 1 0 0 1 1 115 | 0 1 0 1 0 0 1 1 0 0 116 | 0 0 1 0 1 1 1 1 0 0 117 | ``` 118 | 119 | # Further Information 120 | For more detailed information, please refer to the report at [https://liuchuan.org/pub/cuHMM.pdf](https://www.liuchuan.org/pub/cuHMM.pdf) 121 | -------------------------------------------------------------------------------- 
/fhmm.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2009, Chuan Liu 3 | * 4 | * Permission is hereby granted, free of charge, to any person 5 | * obtaining a copy of this software and associated documentation 6 | * files (the "Software"), to deal in the Software without 7 | * restriction, including without limitation the rights to use, copy, 8 | * modify, merge, publish, distribute, sublicense, and/or sell copies 9 | * of the Software, and to permit persons to whom the Software is 10 | * furnished to do so, subject to the following conditions: 11 | * 12 | * The above copyright notice and this permission notice shall be 13 | * included in all copies or substantial portions of the Software. 14 | * 15 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 16 | * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 17 | * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 18 | * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 19 | * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 20 | * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 21 | * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 22 | * SOFTWARE. 
23 | * 24 | */ 25 | 26 | #ifndef _GNU_SOURCE 27 | #define _GNU_SOURCE 28 | #endif 29 | 30 | #include 31 | #include 32 | #include 33 | #include 34 | 35 | #define handle_error(msg) \ 36 | do { perror(msg); exit(EXIT_FAILURE); } while (0) 37 | 38 | #define IDX(i,j,d) (((i)*(d))+(j)) 39 | 40 | 41 | int nstates = 0; /* number of states */ 42 | int nobvs = 0; /* number of observations */ 43 | int nseq = 0; /* number of data sequences */ 44 | int length = 0; /* data sequencel length */ 45 | float *prior = NULL; /* initial state probabilities */ 46 | float *trans = NULL; /* state transition probabilities */ 47 | float *obvs = NULL; /* output probabilities */ 48 | int *data = NULL; 49 | float *gmm = NULL; /* gamma */ 50 | float *xi = NULL; /* xi */ 51 | float *pi = NULL; /* pi */ 52 | 53 | float logadd(float, float); 54 | float sumf(float *, int); 55 | float forward_backward(int *, size_t, int); 56 | void viterbi(int *, size_t); 57 | void init_count(); 58 | void update_prob(); 59 | void usage(); 60 | void freeall(); 61 | 62 | int main(int argc, char *argv[]) 63 | { 64 | char *configfile = NULL; 65 | FILE *fin, *bin; 66 | 67 | char *linebuf = NULL; 68 | size_t buflen = 0; 69 | 70 | int iterations = 3; 71 | int mode = 3; 72 | 73 | int c; 74 | float d; 75 | float *loglik; 76 | float p; 77 | int i, j, k; 78 | opterr = 0; 79 | 80 | 81 | while ((c = getopt(argc, argv, "c:n:hp:")) != -1) { 82 | switch (c) { 83 | case 'c': 84 | configfile = optarg; 85 | break; 86 | case 'h': 87 | usage(); 88 | exit(EXIT_SUCCESS); 89 | case 'n': 90 | iterations = atoi(optarg); 91 | break; 92 | case 'p': 93 | mode = atoi(optarg); 94 | if (mode != 1 && mode != 2 && mode != 3) { 95 | fprintf(stderr, "illegal mode: %d\n", mode); 96 | exit(EXIT_FAILURE); 97 | } 98 | break; 99 | case '?': 100 | fprintf(stderr, "illegal options\n"); 101 | exit(EXIT_FAILURE); 102 | default: 103 | abort(); 104 | } 105 | } 106 | 107 | if (configfile == NULL) { 108 | fin = stdin; 109 | } else { 110 | fin = fopen(configfile, 
"r"); 111 | if (fin == NULL) { 112 | handle_error("fopen"); 113 | } 114 | } 115 | 116 | i = 0; 117 | while ((c = getline(&linebuf, &buflen, fin)) != -1) { 118 | if (c <= 1 || linebuf[0] == '#') 119 | continue; 120 | 121 | if (i == 0) { 122 | if (sscanf(linebuf, "%d", &nstates) != 1) { 123 | fprintf(stderr, "config file format error: %d\n", i); 124 | freeall(); 125 | exit(EXIT_FAILURE); 126 | } 127 | 128 | prior = (float *) malloc(sizeof(float) * nstates); 129 | if (prior == NULL) handle_error("malloc"); 130 | 131 | trans = (float *) malloc(sizeof(float) * nstates * nstates); 132 | if (trans == NULL) handle_error("malloc"); 133 | 134 | xi = (float *) malloc(sizeof(float) * nstates * nstates); 135 | if (xi == NULL) handle_error("malloc"); 136 | 137 | pi = (float *) malloc(sizeof(float) * nstates); 138 | if (pi == NULL) handle_error("malloc"); 139 | 140 | } else if (i == 1) { 141 | if (sscanf(linebuf, "%d", &nobvs) != 1) { 142 | fprintf(stderr, "config file format error: %d\n", i); 143 | freeall(); 144 | exit(EXIT_FAILURE); 145 | } 146 | 147 | obvs = (float *) malloc(sizeof(float) * nstates * nobvs); 148 | if (obvs == NULL) handle_error("malloc"); 149 | 150 | gmm = (float *) malloc(sizeof(float) * nstates * nobvs); 151 | if (gmm == NULL) handle_error("malloc"); 152 | 153 | } else if (i == 2) { 154 | /* read initial state probabilities */ 155 | bin = fmemopen(linebuf, buflen, "r"); 156 | if (bin == NULL) handle_error("fmemopen"); 157 | for (j = 0; j < nstates; j++) { 158 | if (fscanf(bin, "%f", &d) != 1) { 159 | fprintf(stderr, "config file format error: %d\n", i); 160 | freeall(); 161 | exit(EXIT_FAILURE); 162 | } 163 | prior[j] = logf(d); 164 | } 165 | fclose(bin); 166 | 167 | } else if (i <= 2 + nstates) { 168 | /* read state transition probabilities */ 169 | bin = fmemopen(linebuf, buflen, "r"); 170 | if (bin == NULL) handle_error("fmemopen"); 171 | for (j = 0; j < nstates; j++) { 172 | if (fscanf(bin, "%f", &d) != 1) { 173 | fprintf(stderr, "config file format 
error: %d\n", i); 174 | freeall(); 175 | exit(EXIT_FAILURE); 176 | } 177 | trans[IDX((i - 3),j,nstates)] = logf(d); 178 | } 179 | fclose(bin); 180 | } else if (i <= 2 + nstates * 2) { 181 | /* read output probabilities */ 182 | bin = fmemopen(linebuf, buflen, "r"); 183 | if (bin == NULL) handle_error("fmemopen"); 184 | for (j = 0; j < nobvs; j++) { 185 | if (fscanf(bin, "%f", &d) != 1) { 186 | fprintf(stderr, "config file format error: %d\n", i); 187 | freeall(); 188 | exit(EXIT_FAILURE); 189 | } 190 | obvs[IDX((i - 3 - nstates),j,nobvs)] = logf(d); 191 | } 192 | fclose(bin); 193 | } else if (i == 3 + nstates * 2) { 194 | if (sscanf(linebuf, "%d %d", &nseq, &length) != 2) { 195 | fprintf(stderr, "config file format error: %d\n", i); 196 | freeall(); 197 | exit(EXIT_FAILURE); 198 | } 199 | data = (int *) malloc (sizeof(int) * nseq * length); 200 | if (data == NULL) handle_error("malloc"); 201 | } else if (i <= 3 + nstates * 2 + nseq) { 202 | /* read data */ 203 | bin = fmemopen(linebuf, buflen, "r"); 204 | if (bin == NULL) handle_error("fmemopen"); 205 | for (j = 0; j < length; j++) { 206 | if (fscanf(bin, "%d", &k) != 1 || k < 0 || k >= nobvs) { 207 | fprintf(stderr, "config file format error: %d\n", i); 208 | freeall(); 209 | exit(EXIT_FAILURE); 210 | } 211 | data[(i - 4 - nstates * 2) * length + j] = k; 212 | } 213 | fclose(bin); 214 | } 215 | 216 | i++; 217 | } 218 | fclose(fin); 219 | if (linebuf) free(linebuf); 220 | 221 | if (i < 4 + nstates * 2 + nseq) { 222 | fprintf(stderr, "configuration incomplete.\n"); 223 | freeall(); 224 | exit(EXIT_FAILURE); 225 | } 226 | 227 | if (mode == 3) { 228 | loglik = (float *) malloc(sizeof(float) * nseq); 229 | if (loglik == NULL) handle_error("malloc"); 230 | 231 | for (i = 0; i < iterations; i++) { 232 | init_count(); 233 | for (j = 0; j < nseq; j++) { 234 | loglik[j] = forward_backward(data + length * j, length, 1); 235 | } 236 | p = sumf(loglik, nseq); 237 | 238 | update_prob(); 239 | 240 | printf("iteration %d 
log-likelihood: %.4f\n", i + 1, p); 241 | printf("updated parameters:\n"); 242 | printf("# initial state probability\n"); 243 | for (j = 0; j < nstates; j++) { 244 | printf(" %.4f", exp(prior[j])); 245 | } 246 | printf("\n"); 247 | printf("# state transition probability\n"); 248 | for (j = 0; j < nstates; j++) { 249 | for (k = 0; k < nstates; k++) { 250 | printf(" %.4f", exp(trans[IDX(j,k,nstates)])); 251 | } 252 | printf("\n"); 253 | } 254 | printf("# state output probility\n"); 255 | for (j = 0; j < nstates; j++) { 256 | for (k = 0; k < nobvs; k++) { 257 | printf(" %.4f", exp(obvs[IDX(j,k,nobvs)])); 258 | } 259 | printf("\n"); 260 | } 261 | printf("\n"); 262 | } 263 | free(loglik); 264 | } else if (mode == 2) { 265 | for (i = 0; i < nseq; i++) { 266 | viterbi(data + length * i, length); 267 | } 268 | } else if (mode == 1) { 269 | loglik = (float *) malloc(sizeof(float) * nseq); 270 | if (loglik == NULL) handle_error("malloc"); 271 | for (i = 0; i < nseq; i++) { 272 | loglik[i] = forward_backward(data + length * i, length, 0); 273 | } 274 | p = sumf(loglik, nseq); 275 | 276 | for (i = 0; i < nseq; i++) 277 | printf("%.4f\n", loglik[i]); 278 | printf("total: %.4f\n", p); 279 | free(loglik); 280 | } 281 | 282 | freeall(); 283 | return 0; 284 | } 285 | 286 | /* compute sum of the array using Kahan summation algorithm */ 287 | float sumf(float *data, int size) 288 | { 289 | float sum = data[0]; 290 | int i; 291 | float y, t; 292 | float c = 0.0; 293 | for (i = 1; i < size; i++) { 294 | y = data[i] - c; 295 | t = sum + y; 296 | c = (t - sum) - y; 297 | sum = t; 298 | } 299 | return sum; 300 | } 301 | 302 | /* initilize counts */ 303 | void init_count() { 304 | size_t i; 305 | for (i = 0; i < nstates * nobvs; i++) 306 | gmm[i] = - INFINITY; 307 | 308 | for (i = 0; i < nstates * nstates; i++) 309 | xi[i] = - INFINITY; 310 | 311 | for (i = 0; i < nstates; i++) 312 | pi[i] = - INFINITY; 313 | } 314 | 315 | void update_prob() { 316 | float pisum = - INFINITY; 317 | float 
gmmsum[nstates]; 318 | float xisum[nstates]; 319 | size_t i, j; 320 | 321 | for (i = 0; i < nstates; i++) { 322 | gmmsum[i] = - INFINITY; 323 | xisum[i] = - INFINITY; 324 | 325 | pisum = logadd(pi[i], pisum); 326 | } 327 | 328 | for (i = 0; i < nstates; i++) { 329 | prior[i] = pi[i] - pisum; 330 | } 331 | 332 | for (i = 0; i < nstates; i++) { 333 | for (j = 0; j < nstates; j++) { 334 | xisum[i] = logadd(xisum[i], xi[IDX(i,j,nstates)]); 335 | } 336 | for (j = 0; j < nobvs; j++) { 337 | gmmsum[i] = logadd(gmmsum[i], gmm[IDX(i,j,nobvs)]); 338 | } 339 | } 340 | 341 | for (i = 0; i < nstates; i++) { 342 | for (j = 0; j < nstates; j++) { 343 | trans[IDX(i,j,nstates)] = xi[IDX(i,j,nstates)] - xisum[i]; 344 | } 345 | for (j = 0; j < nobvs; j++) { 346 | obvs[IDX(i,j,nobvs)] = gmm[IDX(i,j,nobvs)] - gmmsum[i]; 347 | } 348 | } 349 | 350 | } 351 | 352 | /* forward backward algoritm: return observation likelihood */ 353 | float forward_backward(int *data, size_t len, int backward) 354 | { 355 | /* construct trellis */ 356 | float alpha[len][nstates]; 357 | float beta[len][nstates]; 358 | 359 | size_t i, j, k; 360 | float p, e; 361 | float loglik; 362 | 363 | for (i = 0; i < len; i++) { 364 | for (j = 0; j < nstates; j++) { 365 | alpha[i][j] = - INFINITY; 366 | beta[i][j] = - INFINITY; 367 | } 368 | } 369 | 370 | /* forward pass */ 371 | for (i = 0; i < nstates; i++) { 372 | alpha[0][i] = prior[i] + obvs[IDX(i,data[0],nobvs)]; 373 | } 374 | for (i = 1; i < len; i++) { 375 | for (j = 0; j < nstates; j++) { 376 | for (k = 0; k < nstates; k++) { 377 | p = alpha[i-1][k] + trans[IDX(k,j,nstates)] + obvs[IDX(j,data[i],nobvs)]; 378 | alpha[i][j] = logadd(alpha[i][j], p); 379 | } 380 | } 381 | } 382 | loglik = -INFINITY; 383 | for (i = 0; i < nstates; i++) { 384 | loglik = logadd(loglik, alpha[len-1][i]); 385 | } 386 | 387 | if (! 
backward) 388 | return loglik; 389 | 390 | /* backward pass & update counts */ 391 | for (i = 0; i < nstates; i++) { 392 | beta[len-1][i] = 0; /* 0 = log (1.0) */ 393 | } 394 | for (i = 1; i < len; i++) { 395 | for (j = 0; j < nstates; j++) { 396 | 397 | e = alpha[len-i][j] + beta[len-i][j] - loglik; 398 | gmm[IDX(j,data[len-i],nobvs)] = logadd(gmm[IDX(j,data[len-i],nobvs)], e); 399 | 400 | for (k = 0; k < nstates; k++) { 401 | p = beta[len-i][k] + trans[IDX(j,k,nstates)] + obvs[IDX(k,data[len-i],nobvs)]; 402 | beta[len-1-i][j] = logadd(beta[len-1-i][j], p); 403 | 404 | e = alpha[len-1-i][j] + beta[len-i][k] 405 | + trans[IDX(j,k,nstates)] + obvs[IDX(k,data[len-i],nobvs)] - loglik; 406 | xi[IDX(j,k,nstates)] = logadd(xi[IDX(j,k,nstates)], e); 407 | } 408 | } 409 | } 410 | p = -INFINITY; 411 | for (i = 0; i < nstates; i++) { 412 | p = logadd(p, prior[i] + beta[0][i] + obvs[IDX(i,data[0],nobvs)]); 413 | 414 | e = alpha[0][i] + beta[0][i] - loglik; 415 | gmm[IDX(i,data[0],nobvs)] = logadd(gmm[IDX(i,data[0],nobvs)], e); 416 | 417 | pi[i] = logadd(pi[i], e); 418 | } 419 | 420 | #ifdef DEBUG 421 | /* verify if forward prob == backward prob */ 422 | if (fabs(p - loglik) > 1e-3) { 423 | fprintf(stderr, "Error: forward and backward incompatible: %f, %f\n", loglik, p); 424 | } 425 | #endif 426 | 427 | return loglik; 428 | } 429 | 430 | /* find the most probable sequence */ 431 | void viterbi(int *data, size_t len) 432 | { 433 | float lambda[len][nstates]; 434 | int backtrace[len][nstates]; 435 | int stack[len]; 436 | 437 | size_t i, j, k; 438 | float p; 439 | 440 | for (i = 0; i < len; i++) { 441 | for (j = 0; j < nstates; j++) { 442 | lambda[i][j] = - INFINITY; 443 | } 444 | } 445 | 446 | for (i = 0; i < nstates; i++) { 447 | lambda[0][i] = prior[i] + obvs[IDX(i,data[0],nobvs)]; 448 | backtrace[0][i] = -1; /* -1 is starting point */ 449 | } 450 | for (i = 1; i < len; i++) { 451 | for (j = 0; j < nstates; j++) { 452 | for (k = 0; k < nstates; k++) { 453 | p = lambda[i-1][k] 
+ trans[IDX(k,j,nstates)] + obvs[IDX(j,data[i],nobvs)]; 454 | if (p > lambda[i][j]) { 455 | lambda[i][j] = p; 456 | backtrace[i][j] = k; 457 | } 458 | } 459 | } 460 | } 461 | 462 | /* backtrace */ 463 | for (i = 0; i < nstates; i++) { 464 | if (i == 0 || lambda[len-1][i] > p) { 465 | p = lambda[len-1][i]; 466 | k = i; 467 | } 468 | } 469 | stack[len - 1] = k; 470 | for (i = 1; i < len; i++) { 471 | stack[len - 1 - i] = backtrace[len - i][stack[len - i]]; 472 | } 473 | for (i = 0; i < len; i++) { 474 | printf("%d ", stack[i]); 475 | } 476 | printf("\n"); 477 | } 478 | 479 | float logadd(float x, float y) { 480 | if (y <= x) 481 | return x + log1pf(expf(y - x)); 482 | else 483 | return y + log1pf(expf(x - y)); 484 | } 485 | 486 | void usage() { 487 | fprintf(stdout, "hmm [-hnt] [-c config] [-p(1|2|3)]\n"); 488 | fprintf(stdout, "usage:\n"); 489 | fprintf(stdout, " -h help\n"); 490 | fprintf(stdout, " -c configuration file\n"); 491 | fprintf(stdout, " -t output computation time\n"); 492 | fprintf(stdout, " -p1 compute the probability of the observation sequence\n"); 493 | fprintf(stdout, " -p2 compute the most probable sequence (Viterbi)\n"); 494 | fprintf(stdout, " -p3 train hidden Markov mode parameters (Baum-Welch)\n"); 495 | fprintf(stdout, " -n number of iterations\n"); 496 | } 497 | 498 | /* free all memory */ 499 | void freeall() { 500 | if (trans) free(trans); 501 | if (obvs) free(obvs); 502 | if (prior) free(prior); 503 | if (data) free(data); 504 | if (gmm) free(gmm); 505 | if (xi) free(xi); 506 | if (pi) free(pi); 507 | } 508 | -------------------------------------------------------------------------------- /hmm.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2009, Chuan Liu 3 | * 4 | * Permission is hereby granted, free of charge, to any person 5 | * obtaining a copy of this software and associated documentation 6 | * files (the "Software"), to deal in the Software without 7 | * restriction, 
including without limitation the rights to use, copy, 8 | * modify, merge, publish, distribute, sublicense, and/or sell copies 9 | * of the Software, and to permit persons to whom the Software is 10 | * furnished to do so, subject to the following conditions: 11 | * 12 | * The above copyright notice and this permission notice shall be 13 | * included in all copies or substantial portions of the Software. 14 | * 15 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 16 | * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 17 | * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 18 | * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 19 | * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 20 | * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 21 | * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 22 | * SOFTWARE. 23 | * 24 | */ 25 | 26 | 27 | #ifndef _GNU_SOURCE 28 | #define _GNU_SOURCE 29 | #endif 30 | 31 | #include 32 | #include 33 | #include 34 | #include 35 | 36 | #define handle_error(msg) \ 37 | do { perror(msg); exit(EXIT_FAILURE); } while (0) 38 | 39 | #define IDX(i,j,d) (((i)*(d))+(j)) 40 | 41 | 42 | int nstates = 0; /* number of states */ 43 | int nobvs = 0; /* number of observations */ 44 | int nseq = 0; /* number of data sequences */ 45 | int length = 0; /* data sequencel length */ 46 | double *prior = NULL; /* initial state probabilities */ 47 | double *trans = NULL; /* state transition probabilities */ 48 | double *obvs = NULL; /* output probabilities */ 49 | int *data = NULL; 50 | double *gmm = NULL; /* gamma */ 51 | double *xi = NULL; /* xi */ 52 | double *pi = NULL; /* pi */ 53 | 54 | double logadd(double, double); 55 | double sum(double *, int); 56 | double forward_backward(int *, size_t, int); 57 | void viterbi(int *, size_t); 58 | void init_count(); 59 | void update_prob(); 60 | void usage(); 61 | void freeall(); 62 | 63 | int main(int argc, 
char *argv[]) 64 | { 65 | char *configfile = NULL; 66 | FILE *fin, *bin; 67 | 68 | char *linebuf = NULL; 69 | size_t buflen = 0; 70 | 71 | int iterations = 3; 72 | int mode = 3; 73 | 74 | int c; 75 | double d; 76 | double *loglik; 77 | double p; 78 | int i, j, k; 79 | opterr = 0; 80 | 81 | 82 | while ((c = getopt(argc, argv, "c:n:hp:")) != -1) { 83 | switch (c) { 84 | case 'c': 85 | configfile = optarg; 86 | break; 87 | case 'h': 88 | usage(); 89 | exit(EXIT_SUCCESS); 90 | case 'n': 91 | iterations = atoi(optarg); 92 | break; 93 | case 'p': 94 | mode = atoi(optarg); 95 | if (mode != 1 && mode != 2 && mode != 3) { 96 | fprintf(stderr, "illegal mode: %d\n", mode); 97 | exit(EXIT_FAILURE); 98 | } 99 | break; 100 | case '?': 101 | fprintf(stderr, "illegal options\n"); 102 | exit(EXIT_FAILURE); 103 | default: 104 | abort(); 105 | } 106 | } 107 | 108 | if (configfile == NULL) { 109 | fin = stdin; 110 | } else { 111 | fin = fopen(configfile, "r"); 112 | if (fin == NULL) { 113 | handle_error("fopen"); 114 | } 115 | } 116 | 117 | i = 0; 118 | while ((c = getline(&linebuf, &buflen, fin)) != -1) { 119 | if (c <= 1 || linebuf[0] == '#') 120 | continue; 121 | 122 | if (i == 0) { 123 | if (sscanf(linebuf, "%d", &nstates) != 1) { 124 | fprintf(stderr, "config file format error: %d\n", i); 125 | freeall(); 126 | exit(EXIT_FAILURE); 127 | } 128 | 129 | prior = (double *) malloc(sizeof(double) * nstates); 130 | if (prior == NULL) handle_error("malloc"); 131 | 132 | trans = (double *) malloc(sizeof(double) * nstates * nstates); 133 | if (trans == NULL) handle_error("malloc"); 134 | 135 | xi = (double *) malloc(sizeof(double) * nstates * nstates); 136 | if (xi == NULL) handle_error("malloc"); 137 | 138 | pi = (double *) malloc(sizeof(double) * nstates); 139 | if (pi == NULL) handle_error("malloc"); 140 | 141 | } else if (i == 1) { 142 | if (sscanf(linebuf, "%d", &nobvs) != 1) { 143 | fprintf(stderr, "config file format error: %d\n", i); 144 | freeall(); 145 | exit(EXIT_FAILURE); 146 | 
} 147 | 148 | obvs = (double *) malloc(sizeof(double) * nstates * nobvs); 149 | if (obvs == NULL) handle_error("malloc"); 150 | 151 | gmm = (double *) malloc(sizeof(double) * nstates * nobvs); 152 | if (gmm == NULL) handle_error("malloc"); 153 | 154 | } else if (i == 2) { 155 | /* read initial state probabilities */ 156 | bin = fmemopen(linebuf, buflen, "r"); 157 | if (bin == NULL) handle_error("fmemopen"); 158 | for (j = 0; j < nstates; j++) { 159 | if (fscanf(bin, "%lf", &d) != 1) { 160 | fprintf(stderr, "config file format error: %d\n", i); 161 | freeall(); 162 | exit(EXIT_FAILURE); 163 | } 164 | prior[j] = log(d); 165 | } 166 | fclose(bin); 167 | 168 | } else if (i <= 2 + nstates) { 169 | /* read state transition probabilities */ 170 | bin = fmemopen(linebuf, buflen, "r"); 171 | if (bin == NULL) handle_error("fmemopen"); 172 | for (j = 0; j < nstates; j++) { 173 | if (fscanf(bin, "%lf", &d) != 1) { 174 | fprintf(stderr, "config file format error: %d\n", i); 175 | freeall(); 176 | exit(EXIT_FAILURE); 177 | } 178 | trans[IDX((i - 3),j, nstates)] = log(d); 179 | } 180 | fclose(bin); 181 | } else if (i <= 2 + nstates * 2) { 182 | /* read output probabilities */ 183 | bin = fmemopen(linebuf, buflen, "r"); 184 | if (bin == NULL) handle_error("fmemopen"); 185 | for (j = 0; j < nobvs; j++) { 186 | if (fscanf(bin, "%lf", &d) != 1) { 187 | fprintf(stderr, "config file format error: %d\n", i); 188 | freeall(); 189 | exit(EXIT_FAILURE); 190 | } 191 | obvs[IDX((i - 3 - nstates),j,nobvs)] = log(d); 192 | } 193 | fclose(bin); 194 | } else if (i == 3 + nstates * 2) { 195 | if (sscanf(linebuf, "%d %d", &nseq, &length) != 2) { 196 | fprintf(stderr, "config file format error: %d\n", i); 197 | freeall(); 198 | exit(EXIT_FAILURE); 199 | } 200 | data = (int *) malloc (sizeof(int) * nseq * length); 201 | if (data == NULL) handle_error("malloc"); 202 | } else if (i <= 3 + nstates * 2 + nseq) { 203 | /* read data */ 204 | bin = fmemopen(linebuf, buflen, "r"); 205 | if (bin == NULL) 
handle_error("fmemopen"); 206 | for (j = 0; j < length; j++) { 207 | if (fscanf(bin, "%d", &k) != 1 || k < 0 || k >= nobvs) { 208 | fprintf(stderr, "config file format error: %d\n", i); 209 | freeall(); 210 | exit(EXIT_FAILURE); 211 | } 212 | data[(i - 4 - nstates * 2) * length + j] = k; 213 | } 214 | fclose(bin); 215 | } 216 | 217 | i++; 218 | } 219 | fclose(fin); 220 | if (linebuf) free(linebuf); 221 | 222 | if (i < 4 + nstates * 2 + nseq) { 223 | fprintf(stderr, "configuration incomplete.\n"); 224 | freeall(); 225 | exit(EXIT_FAILURE); 226 | } 227 | 228 | if (mode == 3) { 229 | loglik = (double *) malloc(sizeof(double) * nseq); 230 | if (loglik == NULL) handle_error("malloc"); 231 | for (i = 0; i < iterations; i++) { 232 | init_count(); 233 | for (j = 0; j < nseq; j++) { 234 | loglik[j] = forward_backward(data + length * j, length, 1); 235 | } 236 | p = sum(loglik, nseq); 237 | 238 | update_prob(); 239 | 240 | printf("iteration %d log-likelihood: %.4lf\n", i + 1, p); 241 | printf("updated parameters:\n"); 242 | printf("# initial state probability\n"); 243 | for (j = 0; j < nstates; j++) { 244 | printf(" %.4f", exp(prior[j])); 245 | } 246 | printf("\n"); 247 | printf("# state transition probability\n"); 248 | for (j = 0; j < nstates; j++) { 249 | for (k = 0; k < nstates; k++) { 250 | printf(" %.4f", exp(trans[IDX(j,k,nstates)])); 251 | } 252 | printf("\n"); 253 | } 254 | printf("# state output probility\n"); 255 | for (j = 0; j < nstates; j++) { 256 | for (k = 0; k < nobvs; k++) { 257 | printf(" %.4f", exp(obvs[IDX(j,k,nobvs)])); 258 | } 259 | printf("\n"); 260 | } 261 | printf("\n"); 262 | } 263 | free(loglik); 264 | } else if (mode == 2) { 265 | for (i = 0; i < nseq; i++) { 266 | viterbi(data + length * i, length); 267 | } 268 | } else if (mode == 1) { 269 | loglik = (double *) malloc(sizeof(double) * nseq); 270 | if (loglik == NULL) handle_error("malloc"); 271 | for (i = 0; i < nseq; i++) { 272 | loglik[i] = forward_backward(data + length * i, length, 0); 273 
/*
 * Compute the sum of an array using the Kahan (compensated) summation
 * algorithm.
 *
 * data - values to add
 * size - number of values; an empty or missing array sums to 0.0
 *
 * The compensation term is only updated while the running total is finite:
 * once the total is infinite (e.g. a log-likelihood of -INFINITY) the
 * correction would be inf - inf = NaN and would poison every later step.
 */
double sum(double *data, int size)
{
    double total;
    double comp = 0.0;          /* low-order bits lost by earlier additions */
    double y, t;
    int i;

    if (data == NULL || size <= 0)
        return 0.0;

    total = data[0];
    for (i = 1; i < size; i++) {
        y = data[i] - comp;
        t = total + y;
        comp = isfinite(t) ? (t - total) - y : 0.0;
        total = t;
    }
    return total;
}
alpha[len][nstates]; 356 | double beta[len][nstates]; 357 | 358 | size_t i, j, k; 359 | double p, e; 360 | double loglik; 361 | 362 | for (i = 0; i < len; i++) { 363 | for (j = 0; j < nstates; j++) { 364 | alpha[i][j] = - INFINITY; 365 | beta[i][j] = - INFINITY; 366 | } 367 | } 368 | 369 | /* forward pass */ 370 | for (i = 0; i < nstates; i++) { 371 | alpha[0][i] = prior[i] + obvs[IDX(i,data[0],nobvs)]; 372 | } 373 | for (i = 1; i < len; i++) { 374 | for (j = 0; j < nstates; j++) { 375 | for (k = 0; k < nstates; k++) { 376 | p = alpha[i-1][k] + trans[IDX(k,j,nstates)] + obvs[IDX(j,data[i],nobvs)]; 377 | alpha[i][j] = logadd(alpha[i][j], p); 378 | } 379 | } 380 | } 381 | loglik = -INFINITY; 382 | for (i = 0; i < nstates; i++) { 383 | loglik = logadd(loglik, alpha[len-1][i]); 384 | } 385 | 386 | if (! backward) 387 | return loglik; 388 | 389 | /* backward pass & update counts */ 390 | for (i = 0; i < nstates; i++) { 391 | beta[len-1][i] = 0; /* 0 = log (1.0) */ 392 | } 393 | for (i = 1; i < len; i++) { 394 | for (j = 0; j < nstates; j++) { 395 | 396 | e = alpha[len-i][j] + beta[len-i][j] - loglik; 397 | gmm[IDX(j,data[len-i],nobvs)] = logadd(gmm[IDX(j,data[len-i],nobvs)], e); 398 | 399 | for (k = 0; k < nstates; k++) { 400 | p = beta[len-i][k] + trans[IDX(j,k,nstates)] + obvs[IDX(k,data[len-i],nobvs)]; 401 | beta[len-1-i][j] = logadd(beta[len-1-i][j], p); 402 | 403 | e = alpha[len-1-i][j] + beta[len-i][k] 404 | + trans[IDX(j,k,nstates)] + obvs[IDX(k,data[len-i],nobvs)] - loglik; 405 | xi[IDX(j,k,nstates)] = logadd(xi[IDX(j,k,nstates)], e); 406 | } 407 | } 408 | } 409 | p = -INFINITY; 410 | for (i = 0; i < nstates; i++) { 411 | p = logadd(p, prior[i] + beta[0][i] + obvs[IDX(i,data[0],nobvs)]); 412 | 413 | e = alpha[0][i] + beta[0][i] - loglik; 414 | gmm[IDX(i,data[0],nobvs)] = logadd(gmm[IDX(i,data[0],nobvs)], e); 415 | 416 | pi[i] = logadd(pi[i], e); 417 | } 418 | 419 | #ifdef DEBUG 420 | /* verify if forward prob == backward prob */ 421 | if (fabs(p - loglik) > 
1e-5) { 422 | fprintf(stderr, "Error: forward and backward incompatible: %lf, %lf\n", loglik, p); 423 | } 424 | #endif 425 | 426 | return loglik; 427 | } 428 | 429 | /* find the most probable sequence */ 430 | void viterbi(int *data, size_t len) 431 | { 432 | double lambda[len][nstates]; 433 | int backtrace[len][nstates]; 434 | int stack[len]; 435 | 436 | size_t i, j, k; 437 | double p; 438 | 439 | for (i = 0; i < len; i++) { 440 | for (j = 0; j < nstates; j++) { 441 | lambda[i][j] = - INFINITY; 442 | } 443 | } 444 | 445 | for (i = 0; i < nstates; i++) { 446 | lambda[0][i] = prior[i] + obvs[IDX(i,data[0],nobvs)]; 447 | backtrace[0][i] = -1; /* -1 is starting point */ 448 | } 449 | for (i = 1; i < len; i++) { 450 | for (j = 0; j < nstates; j++) { 451 | for (k = 0; k < nstates; k++) { 452 | p = lambda[i-1][k] + trans[IDX(k,j,nstates)] + obvs[IDX(j,data[i],nobvs)]; 453 | if (p > lambda[i][j]) { 454 | lambda[i][j] = p; 455 | backtrace[i][j] = k; 456 | } 457 | } 458 | } 459 | } 460 | 461 | /* backtrace */ 462 | for (i = 0; i < nstates; i++) { 463 | if (i == 0 || lambda[len-1][i] > p) { 464 | p = lambda[len-1][i]; 465 | k = i; 466 | } 467 | } 468 | stack[len - 1] = k; 469 | for (i = 1; i < len; i++) { 470 | stack[len - 1 - i] = backtrace[len - i][stack[len - i]]; 471 | } 472 | for (i = 0; i < len; i++) { 473 | printf("%d ", stack[i]); 474 | } 475 | printf("\n"); 476 | } 477 | 478 | double logadd(double x, double y) { 479 | if (y <= x) 480 | return x + log1p(exp(y - x)); 481 | else 482 | return y + log1p(exp(x - y)); 483 | } 484 | 485 | void usage() { 486 | fprintf(stdout, "hmm [-hnt] [-c config] [-p(1|2|3)]\n"); 487 | fprintf(stdout, "usage:\n"); 488 | fprintf(stdout, " -h help\n"); 489 | fprintf(stdout, " -c configuration file\n"); 490 | fprintf(stdout, " -t output computation time\n"); 491 | fprintf(stdout, " -p1 compute the probability of the observation sequence\n"); 492 | fprintf(stdout, " -p2 compute the most probable sequence (Viterbi)\n"); 493 | fprintf(stdout, " 
-p3 train hidden Markov mode parameters (Baum-Welch)\n"); 494 | fprintf(stdout, " -n number of iterations\n"); 495 | } 496 | 497 | /* free all memory */ 498 | void freeall() { 499 | if (trans) free(trans); 500 | if (obvs) free(obvs); 501 | if (prior) free(prior); 502 | if (data) free(data); 503 | if (gmm) free(gmm); 504 | if (xi) free(xi); 505 | if (pi) free(pi); 506 | } 507 | -------------------------------------------------------------------------------- /hmm.cu: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2009, Chuan Liu 3 | * 4 | * Permission is hereby granted, free of charge, to any person 5 | * obtaining a copy of this software and associated documentation 6 | * files (the "Software"), to deal in the Software without 7 | * restriction, including without limitation the rights to use, copy, 8 | * modify, merge, publish, distribute, sublicense, and/or sell copies 9 | * of the Software, and to permit persons to whom the Software is 10 | * furnished to do so, subject to the following conditions: 11 | * 12 | * The above copyright notice and this permission notice shall be 13 | * included in all copies or substantial portions of the Software. 14 | * 15 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 16 | * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 17 | * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 18 | * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 19 | * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 20 | * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 21 | * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 22 | * SOFTWARE. 
23 | * 24 | */ 25 | 26 | #ifndef _GNU_SOURCE 27 | #define _GNU_SOURCE 28 | #endif 29 | 30 | #include 31 | #include 32 | #include 33 | #include 34 | 35 | #include 36 | #include 37 | #include 38 | 39 | 40 | #define handle_error(msg) \ 41 | do { perror(msg); exit(EXIT_FAILURE); } while (0) 42 | 43 | #define IDX(i,j,d) (((i)*(d))+(j)) 44 | 45 | enum { 46 | BLOCK_SIZE = 16, 47 | NUM_THREADS = 256, 48 | IN_DEVICE = 1, 49 | IN_HOST = 0, 50 | }; 51 | 52 | int nstates = 0; /* number of states */ 53 | int nobvs = 0; /* number of observations */ 54 | int nseq = 0; /* number of data sequences */ 55 | int length = 0; /* data sequencel length */ 56 | float *prior = NULL; /* initial state probabilities */ 57 | float *trans = NULL; /* state transition probabilities */ 58 | float *obvs = NULL; /* output probabilities */ 59 | int *data = NULL; /* observations */ 60 | float *transd = NULL; /* trans in device memory */ 61 | float *obvsd = NULL; /* obvs in device memory */ 62 | float *gmmd = NULL; /* gamma in device memory */ 63 | float *xid = NULL; /* xi in device memory */ 64 | float *pid = NULL; /* pi in device memory */ 65 | 66 | #ifdef PROFILE_GPU 67 | StopWatchInterface *gpu_timer; 68 | double gpu_flop; 69 | float gpu_time; 70 | #endif 71 | 72 | #ifdef PROFILE_PG 73 | StopWatchInterface *pg_timer; 74 | #endif 75 | 76 | /* function called in main fuction */ 77 | void usage(); 78 | void freeall(); 79 | void init_count(); 80 | float forward_backward(int backward); 81 | void viterbi(); 82 | void update_prob(); 83 | 84 | /* utility functions */ 85 | float logadd(float, float); 86 | __device__ float logaddd(float, float); 87 | float sumf(float *, int, int); 88 | float logsumf(float *, int, int); 89 | 90 | 91 | int main(int argc, char *argv[]) 92 | { 93 | char *configfile = NULL; 94 | FILE *fin, *bin; 95 | 96 | char *linebuf = NULL; 97 | size_t buflen = 0; 98 | 99 | int iterations = 1; 100 | int mode = 3; 101 | 102 | int c; 103 | float d; 104 | 105 | int i, j, k; 106 | opterr = 0; 107 | 
108 | 109 | while ((c = getopt(argc, argv, "c:n:hp:")) != -1) { 110 | switch (c) { 111 | case 'c': 112 | configfile = optarg; 113 | break; 114 | case 'h': 115 | usage(); 116 | exit(EXIT_SUCCESS); 117 | case 'n': 118 | iterations = atoi(optarg); 119 | break; 120 | case 'p': 121 | mode = atoi(optarg); 122 | if (mode != 1 && mode != 2 && mode != 3) { 123 | fprintf(stderr, "illegal mode: %d\n", mode); 124 | exit(EXIT_FAILURE); 125 | } 126 | break; 127 | case '?': 128 | fprintf(stderr, "illegal options\n"); 129 | exit(EXIT_FAILURE); 130 | default: 131 | abort(); 132 | } 133 | } 134 | 135 | if (configfile == NULL) { 136 | fin = stdin; 137 | } else { 138 | fin = fopen(configfile, "r"); 139 | if (fin == NULL) { 140 | handle_error("fopen"); 141 | } 142 | } 143 | 144 | i = 0; 145 | while ((c = getline(&linebuf, &buflen, fin)) != -1) { 146 | if (c <= 1 || linebuf[0] == '#') 147 | continue; 148 | 149 | if (i == 0) { 150 | if (sscanf(linebuf, "%d", &nstates) != 1) { 151 | fprintf(stderr, "config file format error: %d\n", i); 152 | freeall(); 153 | exit(EXIT_FAILURE); 154 | } 155 | 156 | prior = (float *) malloc(sizeof(float) * nstates); 157 | if (prior == NULL) handle_error("malloc"); 158 | 159 | trans = (float *) malloc(sizeof(float) * nstates * nstates); 160 | if (trans == NULL) handle_error("malloc"); 161 | 162 | } else if (i == 1) { 163 | if (sscanf(linebuf, "%d", &nobvs) != 1) { 164 | fprintf(stderr, "config file format error: %d\n", i); 165 | freeall(); 166 | exit(EXIT_FAILURE); 167 | } 168 | 169 | obvs = (float *) malloc(sizeof(float) * nstates * nobvs); 170 | if (obvs == NULL) handle_error("malloc"); 171 | 172 | } else if (i == 2) { 173 | /* read initial state probabilities */ 174 | bin = fmemopen(linebuf, buflen, "r"); 175 | if (bin == NULL) handle_error("fmemopen"); 176 | for (j = 0; j < nstates; j++) { 177 | if (fscanf(bin, "%f", &d) != 1) { 178 | fprintf(stderr, "config file format error: %d\n", i); 179 | freeall(); 180 | exit(EXIT_FAILURE); 181 | } 182 | prior[j] = 
logf(d); 183 | } 184 | fclose(bin); 185 | 186 | } else if (i <= 2 + nstates) { 187 | /* read state transition probabilities */ 188 | bin = fmemopen(linebuf, buflen, "r"); 189 | if (bin == NULL) handle_error("fmemopen"); 190 | for (j = 0; j < nstates; j++) { 191 | if (fscanf(bin, "%f", &d) != 1) { 192 | fprintf(stderr, "config file format error: %d\n", i); 193 | freeall(); 194 | exit(EXIT_FAILURE); 195 | } 196 | trans[IDX((i - 3), j, nstates)] = logf(d); 197 | } 198 | fclose(bin); 199 | } else if (i <= 2 + nstates * 2) { 200 | /* read output probabilities */ 201 | bin = fmemopen(linebuf, buflen, "r"); 202 | if (bin == NULL) handle_error("fmemopen"); 203 | for (j = 0; j < nobvs; j++) { 204 | if (fscanf(bin, "%f", &d) != 1) { 205 | fprintf(stderr, "config file format error: %d\n", i); 206 | freeall(); 207 | exit(EXIT_FAILURE); 208 | } 209 | obvs[IDX((i - 3 - nstates), j, nobvs)] = logf(d); 210 | } 211 | fclose(bin); 212 | } else if (i == 3 + nstates * 2) { 213 | if (sscanf(linebuf, "%d %d", &nseq, &length) != 2) { 214 | fprintf(stderr, "config file format error: %d\n", i); 215 | freeall(); 216 | exit(EXIT_FAILURE); 217 | } 218 | data = (int *) malloc (sizeof(int) * nseq * length); 219 | if (data == NULL) handle_error("malloc"); 220 | } else if (i <= 3 + nstates * 2 + nseq) { 221 | /* read data */ 222 | bin = fmemopen(linebuf, buflen, "r"); 223 | if (bin == NULL) handle_error("fmemopen"); 224 | for (j = 0; j < length; j++) { 225 | if (fscanf(bin, "%d", &k) != 1 || k < 0 || k >= nobvs) { 226 | fprintf(stderr, "config file format error: %d\n", i); 227 | freeall(); 228 | exit(EXIT_FAILURE); 229 | } 230 | data[j * nseq + (i - 4 - nstates * 2)] = k; 231 | } 232 | fclose(bin); 233 | } 234 | 235 | i++; 236 | } 237 | fclose(fin); 238 | if (linebuf) free(linebuf); 239 | 240 | if (i < 4 + nstates * 2 + nseq) { 241 | fprintf(stderr, "configuration incomplete.\n"); 242 | freeall(); 243 | exit(EXIT_FAILURE); 244 | } 245 | 246 | 247 | /* initial cuda device memory */ 248 | c = 
sizeof(float) * nstates * nstates; 249 | checkCudaErrors( cudaMalloc((void**)&transd, c) ); 250 | checkCudaErrors( cudaMemcpy(transd, trans, c, cudaMemcpyHostToDevice) ); 251 | 252 | c = sizeof(float) * nstates * nobvs; 253 | checkCudaErrors( cudaMalloc((void**)&obvsd, c) ); 254 | checkCudaErrors( cudaMemcpy(obvsd, obvs, c, cudaMemcpyHostToDevice) ); 255 | 256 | #ifdef PROFILE_GPU 257 | gpu_time = 0; 258 | gpu_flop = 0; 259 | #endif 260 | 261 | #ifdef PROFILE_PG 262 | sdkCreateTimer( &pg_timer ); 263 | sdkStartTimer(&pg_timer); 264 | #endif 265 | 266 | if (mode == 3) { 267 | /* estimating parameters using Baum-Welch algorithm */ 268 | for (i = 0; i < iterations; i++) { 269 | init_count(); 270 | d = forward_backward(1); 271 | update_prob(); 272 | 273 | #ifdef PROFILE_PG 274 | sdkStopTimer(&pg_timer); 275 | #endif 276 | 277 | printf("iteration %d log-likelihood: %.4f\n", i + 1, d); 278 | printf("updated parameters:\n"); 279 | printf("# initial state probability\n"); 280 | for (j = 0; j < nstates; j++) { 281 | printf(" %.4f", exp(prior[j])); 282 | } 283 | printf("\n"); 284 | printf("# state transition probability\n"); 285 | for (j = 0; j < nstates; j++) { 286 | for (k = 0; k < nstates; k++) { 287 | printf(" %.4f", exp(trans[IDX(j,k,nstates)])); 288 | } 289 | printf("\n"); 290 | } 291 | printf("# state output probility\n"); 292 | for (j = 0; j < nstates; j++) { 293 | for (k = 0; k < nobvs; k++) { 294 | printf(" %.4f", exp(obvs[IDX(j,k,nobvs)])); 295 | } 296 | printf("\n"); 297 | } 298 | printf("\n"); 299 | } 300 | 301 | #ifdef PROFILE_PG 302 | sdkStartTimer(&pg_timer); 303 | #endif 304 | 305 | } else if (mode == 1) { 306 | /* compute forward probabilities */ 307 | forward_backward(0); 308 | } else if (mode == 2) { 309 | /* find most likely path using Viterbi algorithm */ 310 | viterbi(); 311 | } 312 | 313 | freeall(); 314 | 315 | #ifdef PROFILE_PG 316 | sdkStopTimer(&pg_timer); 317 | printf("Programming running time (in Ms): %f\n", sdkGetTimerValue(&pg_timer)); 318 | 
/* Block-level sum reduction (adapted from the NVIDIA CUDA SDK samples).
 *
 * Each thread copies one element of g_idata into shared memory.  The
 * block then repeatedly folds the upper half of the active range onto
 * the lower half until the block total sits in slot 0, and thread 0
 * stores that total in g_odata[blockIdx.x].
 *
 * Launch layout: 1-D grid of 1-D blocks.  The shared buffer holds
 * NUM_THREADS floats, so blockDim.x must not exceed NUM_THREADS, and
 * the halving loop only visits every slot when blockDim.x is a power
 * of two.  There is no bounds check: g_idata must hold at least
 * gridDim.x * blockDim.x elements and g_odata at least gridDim.x.
 */
__global__ void
reduce2(float *g_idata, float *g_odata)
{
    __shared__ float partial[NUM_THREADS];

    const unsigned int t = threadIdx.x;
    const unsigned int src = blockIdx.x * blockDim.x + threadIdx.x;
    unsigned int half;

    /* stage this block's slice of the input in shared memory */
    partial[t] = g_idata[src];
    __syncthreads();

    /* tree reduction: the barrier is outside the branch so that every
       thread of the block reaches it */
    half = blockDim.x / 2;
    while (half > 0) {
        if (t < half) {
            partial[t] += partial[t + half];
        }
        __syncthreads();
        half >>= 1;
    }

    /* one result per block */
    if (t == 0) {
        g_odata[blockIdx.x] = partial[0];
    }
}
/* Calculate the sum of an array of n floats.
 *
 * Arrays shorter than NUM_THREADS are summed on the CPU (a device
 * array is copied to the host first).  Longer arrays are split into
 * n / NUM_THREADS full blocks that the reduce2 kernel reduces to one
 * partial sum per block; the partial sums and the n % NUM_THREADS
 * leftover elements are then summed by recursive calls.
 *
 * array     input values; never modified by this function
 * n         number of elements (n <= 0 yields 0)
 * indevice  non-zero if array points to device memory, 0 if it points
 *           to main memory
 *
 * Returns the sum of the n elements.
 */
float sumf(float *array, int n, int indevice)
{
    float sum = 0.0f;
    int i;

    if (n <= 0)
        return sum;

    /* NUM_THREADS also serves as CPU threshold */
    if (n < NUM_THREADS) {
        if (indevice == 0) {
            for (i = 0; i < n; i++) {
                sum += array[i];
            }
        } else {
            /* n < NUM_THREADS here, so a fixed-size buffer always fits */
            float staged[NUM_THREADS];
            checkCudaErrors( cudaMemcpy(staged, array, sizeof(float) * n,
                                        cudaMemcpyDeviceToHost) );
            for (i = 0; i < n; i++) {
                sum += staged[i];
            }
        }
        return sum;
    }

    int num_blocks = n / NUM_THREADS;
    int remains = n - num_blocks * NUM_THREADS;
    size_t covered = (size_t) num_blocks * NUM_THREADS;
    float *gin = NULL;      /* kernel input, in device memory */
    float *gout = NULL;     /* one partial sum per block */

    dim3 dimBlock(NUM_THREADS);
    dim3 dimGrid(num_blocks);

    if (indevice == 0) {
        checkCudaErrors( cudaMalloc((void**) &gin, sizeof(float) * covered) );
        checkCudaErrors( cudaMemcpy(gin, array, sizeof(float) * covered,
                                    cudaMemcpyHostToDevice) );
    } else {
        /* the data is already on the device: reduce it where it is */
        gin = array;
    }

    /* The partial sums go to their own buffer.  Writing them back into
       the input would let one block overwrite an element that another
       block has not loaded yet, and would destroy the caller's data. */
    checkCudaErrors( cudaMalloc((void**) &gout, sizeof(float) * num_blocks) );

    reduce2<<<dimGrid, dimBlock>>>(gin, gout);
    checkCudaErrors( cudaGetLastError() );

    sum += sumf(gout, num_blocks, IN_DEVICE);

    /* elements past the last full block, still in the caller's memory space */
    if (remains > 0)
        sum += sumf(array + covered, remains, indevice);

    checkCudaErrors( cudaFree(gout) );
    if (indevice == 0)
        checkCudaErrors( cudaFree(gin) );

    return sum;
}
/* Logarithm version of the sumf function: returns the log of the sum
 * of n probabilities that are stored as logarithms.
 *
 * Arrays shorter than NUM_THREADS are combined on the CPU with
 * logadd() (a device array is copied to the host first).  Longer
 * arrays are split into n / NUM_THREADS full blocks that the
 * logreduce2 kernel reduces to one partial result per block; the
 * partial results and the n % NUM_THREADS leftover elements are then
 * combined by recursive calls.
 *
 * array     log-domain input values; never modified by this function
 * n         number of elements (n <= 0 yields -INFINITY, i.e. log 0)
 * indevice  non-zero if array points to device memory, 0 if it points
 *           to main memory
 */
float logsumf(float *array, int n, int indevice)
{
    float sum = - INFINITY;
    int i;

    if (n <= 0)
        return sum;

    /* NUM_THREADS also serves as CPU threshold */
    if (n < NUM_THREADS) {
        if (indevice == 0) {
            for (i = 0; i < n; i++) {
                sum = logadd(sum, array[i]);
            }
        } else {
            /* n < NUM_THREADS here, so a fixed-size buffer always fits */
            float staged[NUM_THREADS];
            checkCudaErrors( cudaMemcpy(staged, array, sizeof(float) * n,
                                        cudaMemcpyDeviceToHost) );
            for (i = 0; i < n; i++) {
                sum = logadd(sum, staged[i]);
            }
        }
        return sum;
    }

    int num_blocks = n / NUM_THREADS;
    int remains = n - num_blocks * NUM_THREADS;
    size_t covered = (size_t) num_blocks * NUM_THREADS;
    float *gin = NULL;      /* kernel input, in device memory */
    float *gout = NULL;     /* one partial result per block */

    dim3 dimBlock(NUM_THREADS);
    dim3 dimGrid(num_blocks);

    if (! indevice) {
        checkCudaErrors( cudaMalloc((void**) &gin, sizeof(float) * covered) );
        checkCudaErrors( cudaMemcpy(gin, array, sizeof(float) * covered,
                                    cudaMemcpyHostToDevice) );
    } else {
        /* the data is already on the device: reduce it where it is */
        gin = array;
    }

    /* The partial results go to their own buffer.  Writing them back
       into the input would let one block overwrite an element that
       another block has not loaded yet, and would destroy the caller's
       data. */
    checkCudaErrors( cudaMalloc((void**) &gout, sizeof(float) * num_blocks) );

    logreduce2<<<dimGrid, dimBlock>>>(gin, gout);
    checkCudaErrors( cudaGetLastError() );

    sum = logadd(sum, logsumf(gout, num_blocks, IN_DEVICE));

    /* elements past the last full block, still in the caller's memory space */
    if (remains > 0)
        sum = logadd(sum, logsumf(array + covered, remains,
                                  indevice ? IN_DEVICE : IN_HOST));

    checkCudaErrors( cudaFree(gout) );
    if (! indevice)
        checkCudaErrors( cudaFree(gin) );

    return sum;
}
/* Add up two logarithms while avoiding overflow: returns
 * log(exp(x) + exp(y)).
 *
 * The larger argument is factored out so that expf() is only ever
 * taken of a non-positive number.  When both arguments are the same
 * infinity (e.g. two zero probabilities, both -INFINITY) the
 * difference y - x is NaN, so that case is answered directly instead
 * of going through the formula.
 */
float logadd(float x, float y) {
    /* -inf + -inf (and +inf + +inf): inf - inf would poison the result */
    if (x == y && isinf(x))
        return x;

    if (y <= x)
        return x + log1pf(expf(y - x));
    else
        return y + log1pf(expf(x - y));
}

/* Add up two logarithms while avoiding overflow (device version).
 * Must stay in step with logadd() so that host and device code agree,
 * including on the equal-infinities case. */
__device__ float logaddd(float x, float y) {
    /* -inf + -inf (and +inf + +inf): inf - inf would poison the result */
    if (x == y && isinf(x))
        return x;

    if (y <= x)
        return x + log1pf(expf(y - x));
    else
        return y + log1pf(expf(x - y));
}
As[ty][tx] = pre[a + nstates * ty + tx]; 575 | Bs[ty][tx] = transd[b + nstates * ty + tx]; 576 | Os[ty][tx] = obvsd[IDX(BLOCK_SIZE * bx + tx, O[BLOCK_SIZE * by + ty], nobvs)]; 577 | 578 | __syncthreads(); 579 | 580 | for (k = 0; k < BLOCK_SIZE; ++k) { 581 | sub = logaddd(sub, As[ty][k] + Bs[k][tx] + Os[ty][tx]); 582 | } 583 | 584 | __syncthreads(); 585 | } 586 | 587 | a = nstates * BLOCK_SIZE * by + BLOCK_SIZE * bx; 588 | A[a + nstates * ty + tx] = sub; 589 | } 590 | 591 | /* compute one step (n-th) forward probability. the data partition 592 | follows matrix muptiplication A x B, where the previous forward 593 | probabilities are A and state transition probabilities are B. */ 594 | void stepfwd(float *alpha, size_t n) 595 | { 596 | int size; 597 | int *Od; 598 | float *A; 599 | float *Ad; 600 | 601 | dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE); 602 | dim3 dimGrid(nstates / dimBlock.x, nseq / dimBlock.y); 603 | 604 | size = sizeof(int) * nseq; 605 | checkCudaErrors( cudaMalloc((void**)&Od, size) ); 606 | checkCudaErrors( cudaMemcpy(Od, data + n * nseq, size, cudaMemcpyHostToDevice) ); 607 | 608 | size = sizeof(float) * nstates * nseq; 609 | 610 | checkCudaErrors( cudaMalloc((void**)&A, size) ); 611 | checkCudaErrors( cudaMemcpy(A, alpha + (n - 1) * nseq * nstates, size, cudaMemcpyHostToDevice) ); 612 | 613 | checkCudaErrors( cudaMalloc((void**)&Ad, size) ); 614 | 615 | #ifdef PROFILE_GPU 616 | sdkCreateTimer( &gpu_timer ); 617 | sdkStartTimer(&gpu_timer); 618 | #endif 619 | 620 | stepfwdd<<>>(A, transd, Od, obvsd, nstates, nobvs, Ad); 621 | 622 | #ifdef PROFILE_GPU 623 | checkCudaErrors( cudaThreadSynchronize() ); 624 | sdkStopTimer(&gpu_timer); 625 | gpu_time += sdkGetTimerValue(&gpu_timer); 626 | sdkDeleteTimer( &gpu_timer); 627 | 628 | gpu_flop += 1e-6 * ((double)nstates) * ((double)nseq) * ((double)nstates) * 7; 629 | #endif 630 | 631 | checkCudaErrors( cudaMemcpy(alpha + n * nseq * nstates, Ad, size, cudaMemcpyDeviceToHost) ); 632 | 633 | checkCudaErrors( 
/* Initialise the first slice of the forward probability matrix on the
 * GPU: alpha[seq][state] = prior[state] + obvs[state][first symbol of
 * seq], all in the log domain.
 *
 * alpha  host buffer; its first nseq * nstates floats are overwritten
 *
 * The grid is nstates / BLOCK_SIZE by nseq / BLOCK_SIZE blocks with
 * truncating division, so nstates and nseq must both be multiples of
 * BLOCK_SIZE for every entry to be computed.
 */
void initfwd(float *alpha)
{
    int size;
    int *Od = NULL;         /* first observation of every sequence */
    float *Ad = NULL;       /* resulting alpha slice */
    float *priord = NULL;   /* prior in device memory */

    dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
    dim3 dimGrid(nstates / dimBlock.x, nseq / dimBlock.y);

    size = sizeof(float) * nstates;
    checkCudaErrors( cudaMalloc((void**)&priord, size) );
    checkCudaErrors( cudaMemcpy(priord, prior, size, cudaMemcpyHostToDevice) );

    size = sizeof(int) * nseq;
    checkCudaErrors( cudaMalloc((void**)&Od, size) );
    checkCudaErrors( cudaMemcpy(Od, data, size, cudaMemcpyHostToDevice) );

    size = sizeof(float) * nstates * nseq;
    checkCudaErrors( cudaMalloc((void**)&Ad, size) );

#ifdef PROFILE_GPU
    sdkCreateTimer( &gpu_timer );
    sdkStartTimer(&gpu_timer);
#endif

    initfwdd<<<dimGrid, dimBlock>>>(priord, Od, obvsd, nstates, nobvs, Ad);
    /* a bad launch configuration is only reported here, not by the launch */
    checkCudaErrors( cudaGetLastError() );

#ifdef PROFILE_GPU
    checkCudaErrors( cudaDeviceSynchronize() );
    sdkStopTimer(&gpu_timer);
    gpu_time += sdkGetTimerValue(&gpu_timer);
    sdkDeleteTimer( &gpu_timer);

    gpu_flop += 1e-6 * ((double)nstates) * ((double)nseq) * 1.0;
#endif

    checkCudaErrors( cudaMemcpy(alpha, Ad, size, cudaMemcpyDeviceToHost) );

    checkCudaErrors( cudaFree(Ad) );
    checkCudaErrors( cudaFree(Od) );     /* was leaked on every call */
    checkCudaErrors( cudaFree(priord) );
}
/* Kernel function for updating pi counts.
 *
 * Each thread owns one state.  For every sequence it forms
 * A[seq][state] + B[seq][state] - loglik[seq] and folds that value
 * into pid[state] with logaddd().
 *
 * A, B    nseq x nstates matrices, indexed with IDX(seq, state, nstates)
 * O       not used by this kernel
 * loglik  one value per sequence
 * pid     one accumulator per state, updated in place
 *
 * Launch layout: 1-D.  The state index is derived from BLOCK_SIZE, not
 * blockDim.x, so blocks must be BLOCK_SIZE threads wide.  There is no
 * bounds check: the launch must not cover more than nstates threads.
 */
__global__ void
updatepid(float *A, float *B, int *O, int nstates,
          int nseq, float *loglik, float *pid)
{
    const int state = blockIdx.x * BLOCK_SIZE + threadIdx.x;
    int seq;

    for (seq = 0; seq < nseq; ++seq) {
        const int cell = IDX(seq, state, nstates);
        const float contribution = A[cell] + B[cell] - loglik[seq];

        pid[state] = logaddd(pid[state], contribution);
    }
}
/* Compute one step of backward probabilities and update the counts.
 *
 * With `pre` holding the backward probabilities of slice n + 1, this
 * function
 *   1. updates the xi counts from alpha slice n, `pre` and the
 *      observations of slice n + 1 (updatexid),
 *   2. updates the gamma counts from alpha slice n + 1 and `pre`
 *      (updategmmd),
 *   3. computes the backward probabilities of slice n (stepbckd).
 *
 * alpha   host buffer with all forward probabilities
 * pre     host buffer, nseq * nstates floats: backward probabilities of
 *         slice n + 1
 * n       index of the slice to compute
 * loglik  host buffer, nseq floats: log-likelihood of each sequence
 * beta    host buffer, nseq * nstates floats: receives the backward
 *         probabilities of slice n
 *
 * The grids use truncating division, so nstates and nseq must be
 * multiples of BLOCK_SIZE for every entry to be computed.
 */
void stepbck(float *alpha, float *pre, size_t n, float* loglik, float *beta)
{
    size_t size;
    int *Od = NULL;             /* observations of slice n + 1 */
    float *Bd = NULL;           /* resulting beta slice */
    float *pred = NULL;         /* pre in device memory */
    float *Ad = NULL;           /* alpha slice n, then slice n + 1 */
    float *loglikd = NULL;      /* loglik in device memory */

    dim3 xiBlock(BLOCK_SIZE, BLOCK_SIZE);
    dim3 xiGrid(nstates / xiBlock.x, nstates / xiBlock.y);

    dim3 gmmBlock(BLOCK_SIZE);
    dim3 gmmGrid(nstates / gmmBlock.x);

    dim3 sBlock(BLOCK_SIZE, BLOCK_SIZE);
    dim3 sGrid(nstates / sBlock.x, nseq / sBlock.y);

    size = sizeof(int) * nseq;
    checkCudaErrors( cudaMalloc((void**)&Od, size) );
    checkCudaErrors( cudaMemcpy(Od, data + (n + 1) * nseq,
                                size, cudaMemcpyHostToDevice) );

    /* loglik holds floats: size it by float, not by the int size above */
    size = sizeof(float) * nseq;
    checkCudaErrors( cudaMalloc((void**)&loglikd, size) );
    checkCudaErrors( cudaMemcpy(loglikd, loglik, size, cudaMemcpyHostToDevice) );

    size = sizeof(float) * nstates * nseq;

    checkCudaErrors( cudaMalloc((void**)&pred, size) );
    checkCudaErrors( cudaMemcpy(pred, pre, size, cudaMemcpyHostToDevice) );

    checkCudaErrors( cudaMalloc((void**)&Ad, size) );
    checkCudaErrors( cudaMemcpy(Ad, alpha + n * nseq * nstates,
                                size, cudaMemcpyHostToDevice) );

    /* update counts */
#ifdef PROFILE_GPU
    sdkCreateTimer( &gpu_timer );
    sdkStartTimer(&gpu_timer);
#endif

    updatexid<<<xiGrid, xiBlock>>>(Ad, pred, transd, Od, obvsd,
                                   nstates, nobvs, nseq, loglikd, xid);
    checkCudaErrors( cudaGetLastError() );

#ifdef PROFILE_GPU
    checkCudaErrors( cudaDeviceSynchronize() );
    sdkStopTimer(&gpu_timer);
    gpu_time += sdkGetTimerValue(&gpu_timer);
    sdkDeleteTimer( &gpu_timer);

    gpu_flop += 1e-6 * ((double) nstates) * ((double) nstates) * ((double) nseq) * 9.0;
#endif

    /* the gamma update pairs `pre` with the alpha slice of the same step */
    checkCudaErrors( cudaMemcpy(Ad, alpha + (n + 1) * nseq * nstates,
                                size, cudaMemcpyHostToDevice) );

#ifdef PROFILE_GPU
    sdkCreateTimer( &gpu_timer );
    sdkStartTimer(&gpu_timer);
#endif

    updategmmd<<<gmmGrid, gmmBlock>>>(Ad, pred, Od, nstates, nobvs, nseq,
                                      loglikd, gmmd);
    checkCudaErrors( cudaGetLastError() );

#ifdef PROFILE_GPU
    checkCudaErrors( cudaDeviceSynchronize() );
    sdkStopTimer(&gpu_timer);
    gpu_time += sdkGetTimerValue(&gpu_timer);
    sdkDeleteTimer( &gpu_timer);

    gpu_flop += 1e-6 * ((double) nstates) * ((double) nseq) * 7.0;
#endif

    /* compute one step beta probabilities */
    checkCudaErrors( cudaMalloc((void**)&Bd, size) );

#ifdef PROFILE_GPU
    sdkCreateTimer( &gpu_timer );
    sdkStartTimer(&gpu_timer);
#endif

    stepbckd<<<sGrid, sBlock>>>(pred, transd, Od, obvsd, nstates, nobvs, Bd);
    checkCudaErrors( cudaGetLastError() );

#ifdef PROFILE_GPU
    checkCudaErrors( cudaDeviceSynchronize() );
    sdkStopTimer(&gpu_timer);
    gpu_time += sdkGetTimerValue(&gpu_timer);
    sdkDeleteTimer( &gpu_timer);

    gpu_flop += 1e-6 * ((double) nstates) * ((double) nseq) * ((double) nstates) * 6.0;
#endif

    checkCudaErrors( cudaMemcpy(beta, Bd, size, cudaMemcpyDeviceToHost) );

    checkCudaErrors( cudaFree(Ad) );
    checkCudaErrors( cudaFree(Od) );
    checkCudaErrors( cudaFree(Bd) );
    checkCudaErrors( cudaFree(pred) );
    checkCudaErrors( cudaFree(loglikd) );
}
/* Forward-backward algorithm, running on all sequences in parallel.
 *
 * The forward pass fills alpha slice by slice and derives the
 * log-likelihood of every sequence from the last slice.  When
 * `backward` is zero the per-sequence log-likelihoods and their total
 * are printed and the function returns.  Otherwise the backward pass
 * walks the slices in reverse, updating the xi/gamma counts at every
 * step and the gamma/pi counts for the first slice at the end.
 *
 * backward  non-zero to run the backward pass and update the counts
 *
 * Returns the total log-likelihood of all sequences (from the forward
 * pass).  Exits with an error if the model or the data is empty.
 */
float forward_backward(int backward)
{
    float *alpha = NULL;
    float *beta = NULL;
    float *prebeta = NULL;
    float *loglik = NULL;
    float *swap;
    size_t i;
    size_t a;
    size_t size;
    float p;
#ifdef DEBUG
    float *bckll = NULL;
    float q;
    size_t j;
#endif

    /* every index below assumes at least one state, sequence and slice */
    if (nstates <= 0 || nseq <= 0 || length <= 0) {
        fprintf(stderr, "forward_backward: empty model or data\n");
        exit(EXIT_FAILURE);
    }

    /* initial alpha probabilities for all data sequences
       (LARGEST memory allocation in the entire program) */
    alpha = (float *) malloc(sizeof(float) * length * nstates * nseq);
    if (alpha == NULL) handle_error("malloc");

    initfwd(alpha);

    /* forward pass */
    for (i = 1; i < (size_t) length; i++) {
        stepfwd(alpha, i);
    }

    loglik = (float *) malloc(sizeof(float) * nseq);
    if (loglik == NULL) handle_error("malloc");

    /* offset of the last slice, computed in size_t to avoid int overflow */
    a = (size_t) (length - 1) * nseq * nstates;
    for (i = 0; i < (size_t) nseq; i++) {
        loglik[i] = logsumf(alpha + a + i * nstates, nstates, IN_HOST);
    }
    p = sumf(loglik, nseq, IN_HOST);

    if (! backward) {
#ifdef PROFILE_PG
        sdkStopTimer(&pg_timer);
#endif
        for (i = 0; i < (size_t) nseq; i++) {
            printf("%.4f\n", loglik[i]);
        }
        printf("total: %.4f\n", p);
#ifdef PROFILE_PG
        sdkStartTimer(&pg_timer);
#endif
        free(loglik);
        free(alpha);
        return p;
    }

    /* initial backward probabilities */
    size = sizeof(float) * nstates * nseq;
    beta = (float *) malloc(size);
    if (beta == NULL) handle_error("malloc");
    prebeta = (float *) malloc(size);
    if (prebeta == NULL) handle_error("malloc");

    initbck(prebeta);

    /* backward pass & update counts at last step */
    for (i = 1; i < (size_t) length; i++) {
        stepbck(alpha, prebeta, length - 1 - i, loglik, beta);

        /* the slice just computed becomes the input of the next step;
           swapping the pointers avoids copying the whole slice */
        swap = prebeta;
        prebeta = beta;
        beta = swap;
    }

    /* update first slice of data; prebeta now holds the first beta slice */
    last_update(alpha, prebeta, loglik);

#ifdef DEBUG
    /* compute backward prob for verification purpose */
    bckll = (float *) malloc(sizeof(float) * nseq);
    if (bckll == NULL) handle_error("malloc");
    for (i = 0; i < (size_t) nseq; i++) {
        q = - INFINITY;
        for (j = 0; j < (size_t) nstates; j++) {
            q = logadd(q, prior[j] + prebeta[IDX(i,j,nstates)]
                          + obvs[IDX(j, data[i], nobvs)]);
        }
        bckll[i] = q;
    }

    for (i = 0; i < (size_t) nseq; i++)
        if (fabsf(bckll[i] - loglik[i]) > 1e-3f)
            fprintf(stderr, "Error: forward and backward incompatible: %f, %f\n",
                    loglik[i], bckll[i]);

    free(bckll);
#endif

    free(alpha);
    free(beta);
    free(prebeta);
    free(loglik);

    return p;
}
kernel function for viterbi algorithm */ 1090 | __global__ void 1091 | viterbi_fwdd(float *prelbd, float *transd, int *O, float *obvsd, 1092 | int nstates, int nobvs, float *lambda, int *backtrace) 1093 | { 1094 | int bx = blockIdx.x; 1095 | int by = blockIdx.y; 1096 | 1097 | int tx = threadIdx.x; 1098 | int ty = threadIdx.y; 1099 | 1100 | int aBegin = nstates * BLOCK_SIZE * by; 1101 | int aEnd = aBegin + nstates - 1; 1102 | int aStep = BLOCK_SIZE; 1103 | 1104 | int bBegin = BLOCK_SIZE * bx; 1105 | int bStep = BLOCK_SIZE * nstates; 1106 | 1107 | float sub = logf(0); 1108 | float p; 1109 | 1110 | int a, b, k, i; 1111 | 1112 | for (a = aBegin, b = bBegin; 1113 | a <= aEnd; 1114 | a += aStep, b += bStep) { 1115 | 1116 | __shared__ float As[BLOCK_SIZE][BLOCK_SIZE]; 1117 | __shared__ float Bs[BLOCK_SIZE][BLOCK_SIZE]; 1118 | __shared__ float Os[BLOCK_SIZE][BLOCK_SIZE]; 1119 | 1120 | As[ty][tx] = prelbd[a + nstates * ty + tx]; 1121 | Bs[ty][tx] = transd[b + nstates * ty + tx]; 1122 | Os[ty][tx] = obvsd[IDX(BLOCK_SIZE * bx + tx, O[BLOCK_SIZE * by + ty], nobvs)]; 1123 | 1124 | __syncthreads(); 1125 | 1126 | for (k = 0; k < BLOCK_SIZE; ++k) { 1127 | p = As[ty][k] + Bs[k][tx] + Os[ty][tx]; 1128 | if (p > sub) { 1129 | sub = p; 1130 | i = a + k - aBegin; 1131 | } 1132 | } 1133 | 1134 | __syncthreads(); 1135 | } 1136 | 1137 | a = nstates * BLOCK_SIZE * by + BLOCK_SIZE * bx; 1138 | lambda[a + nstates * ty + tx] = sub; 1139 | backtrace[a + nstates * ty + tx] = i; 1140 | } 1141 | 1142 | void viterbi_fwd(float *prelbd, size_t n, float *lambda, int *backtrace) 1143 | { 1144 | int size; 1145 | int *Od; 1146 | float *pred; 1147 | float *lbdd; 1148 | int *Bd; 1149 | 1150 | dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE); 1151 | dim3 dimGrid(nstates / dimBlock.x, nseq / dimBlock.y); 1152 | 1153 | size = sizeof(int) * nseq; 1154 | checkCudaErrors( cudaMalloc((void**)&Od, size) ); 1155 | checkCudaErrors( cudaMemcpy(Od, data + n * nseq, size, cudaMemcpyHostToDevice) ); 1156 | 1157 | size = 
sizeof(float) * nstates * nseq; 1158 | 1159 | checkCudaErrors( cudaMalloc((void**)&pred, size) ); 1160 | checkCudaErrors( cudaMemcpy(pred, prelbd, size, cudaMemcpyHostToDevice) ); 1161 | 1162 | checkCudaErrors( cudaMalloc((void**)&lbdd, size) ); 1163 | 1164 | size = sizeof(int) * nstates * nseq; 1165 | checkCudaErrors( cudaMalloc((void**)&Bd, size) ); 1166 | 1167 | 1168 | #ifdef PROFILE_GPU 1169 | sdkCreateTimer( &gpu_timer ); 1170 | sdkStartTimer(&gpu_timer); 1171 | #endif 1172 | 1173 | viterbi_fwdd<<>>(pred, transd, Od, obvsd, nstates, nobvs, lbdd, Bd); 1174 | 1175 | #ifdef PROFILE_GPU 1176 | checkCudaErrors( cudaThreadSynchronize() ); 1177 | sdkStopTimer(&gpu_timer); 1178 | gpu_time += sdkGetTimerValue(&gpu_timer); 1179 | sdkDeleteTimer( &gpu_timer); 1180 | 1181 | gpu_flop += 1e-6 * ((double)nstates) * ((double)nseq) * ((double)nstates) * 3; 1182 | #endif 1183 | 1184 | size = sizeof(float) * nstates * nseq; 1185 | checkCudaErrors( cudaMemcpy(lambda, lbdd, size, cudaMemcpyDeviceToHost) ); 1186 | size = sizeof(int) * nstates * nseq; 1187 | checkCudaErrors( cudaMemcpy(backtrace + n * nstates * nseq, Bd, size, cudaMemcpyDeviceToHost) ); 1188 | 1189 | checkCudaErrors( cudaFree(Od) ); 1190 | checkCudaErrors( cudaFree(pred) ); 1191 | checkCudaErrors( cudaFree(lbdd) ); 1192 | checkCudaErrors( cudaFree(Bd) ); 1193 | } 1194 | 1195 | __global__ void 1196 | fltpd(float *lbdd, size_t nstates, int *stackd) 1197 | { 1198 | int x = blockIdx.x * BLOCK_SIZE + threadIdx.x; 1199 | size_t i, besti; 1200 | float max; 1201 | 1202 | for (i = 0; i < nstates; i++) { 1203 | if (i == 0 || max < lbdd[IDX(x, i, nstates)]) { 1204 | max = lbdd[IDX(x, i, nstates)]; 1205 | besti = i; 1206 | } 1207 | } 1208 | stackd[x] = besti; 1209 | } 1210 | 1211 | void find_last_trace_points(float *lambda, int *stack) 1212 | { 1213 | float *lbdd; 1214 | int *stackd; 1215 | int size; 1216 | dim3 dimBlock(BLOCK_SIZE); 1217 | dim3 dimGrid(nseq / dimBlock.x); 1218 | 1219 | size = sizeof(float) * nstates * nseq; 
1220 | checkCudaErrors( cudaMalloc((void**)&lbdd, size) ); 1221 | checkCudaErrors( cudaMemcpy(lbdd, lambda, size, cudaMemcpyHostToDevice) ); 1222 | 1223 | size = sizeof(int) * nseq; 1224 | checkCudaErrors( cudaMalloc((void**)&stackd, size) ); 1225 | 1226 | fltpd<<>>(lbdd, nstates, stackd); 1227 | 1228 | checkCudaErrors( cudaMemcpy(stack, stackd, size, cudaMemcpyDeviceToHost) ); 1229 | 1230 | checkCudaErrors( cudaFree(lbdd) ); 1231 | checkCudaErrors( cudaFree(stackd) ); 1232 | } 1233 | 1234 | __global__ void 1235 | backtraced(int *pre, int *bckpd, int nstates, int *stackd) 1236 | { 1237 | int x = blockIdx.x * BLOCK_SIZE + threadIdx.x; 1238 | stackd[x] = bckpd[IDX(x, pre[x], nstates)]; 1239 | } 1240 | 1241 | void backtrace(int *backtracep, int *stack, int n) 1242 | { 1243 | int *bckpd; 1244 | int *stackd; 1245 | int *pred; 1246 | int size; 1247 | 1248 | dim3 dimBlock(BLOCK_SIZE); 1249 | dim3 dimGrid(nseq / dimBlock.x); 1250 | 1251 | size = sizeof(int) * nseq * nstates; 1252 | checkCudaErrors( cudaMalloc((void**)&bckpd, size) ); 1253 | checkCudaErrors( cudaMemcpy(bckpd, backtracep + (n + 1) * nseq * nstates, size, cudaMemcpyHostToDevice) ); 1254 | 1255 | size = sizeof(int) * nseq; 1256 | checkCudaErrors( cudaMalloc((void**)&pred, size) ); 1257 | checkCudaErrors( cudaMemcpy(pred, stack + (n + 1) * nseq, size, cudaMemcpyHostToDevice) ); 1258 | 1259 | checkCudaErrors( cudaMalloc((void**)&stackd, size) ); 1260 | 1261 | backtraced<<>>(pred, bckpd, nstates, stackd); 1262 | 1263 | checkCudaErrors( cudaMemcpy(stack + n * nseq, stackd, size, cudaMemcpyDeviceToHost) ); 1264 | 1265 | checkCudaErrors( cudaFree(bckpd) ); 1266 | checkCudaErrors( cudaFree(stackd) ); 1267 | checkCudaErrors( cudaFree(pred) ); 1268 | } 1269 | 1270 | void print_path(int *stack) 1271 | { 1272 | size_t i, j; 1273 | for (i = 0; i < nseq; i++) { 1274 | for (j = 0; j < length; j++) { 1275 | printf("%d ", stack[IDX(j, i, nseq)]); 1276 | } 1277 | printf("\n"); 1278 | } 1279 | } 1280 | 1281 | void viterbi() 
1282 | { 1283 | float *lambda = NULL; 1284 | float *prelbd = NULL; 1285 | int *backtracep = NULL; 1286 | int *stack = NULL; 1287 | int size; 1288 | size_t i; 1289 | 1290 | backtracep = (int *) malloc(sizeof(float) * length * nstates * nseq); 1291 | if (backtracep == NULL) handle_error("malloc"); 1292 | 1293 | size = sizeof(float) * nstates * nseq; 1294 | 1295 | lambda = (float *) malloc(size); 1296 | if (lambda == NULL) handle_error("malloc"); 1297 | 1298 | prelbd = (float *) malloc(size); 1299 | if (prelbd == NULL) handle_error("malloc"); 1300 | 1301 | initfwd(prelbd); 1302 | 1303 | for (i = 1; i < length; i++) { 1304 | viterbi_fwd(prelbd, i, lambda, backtracep); 1305 | memmove(prelbd, lambda, size); 1306 | } 1307 | 1308 | stack = (int*) malloc(sizeof(int) * nseq * length); 1309 | if (stack == NULL) handle_error("malloc"); 1310 | 1311 | find_last_trace_points(lambda, stack + (length - 1) * nseq); 1312 | for (i = 1; i < length; i++) { 1313 | backtrace(backtracep, stack, length - 1 - i); 1314 | } 1315 | 1316 | print_path(stack); 1317 | 1318 | free(lambda); 1319 | free(prelbd); 1320 | free(backtracep); 1321 | free(stack); 1322 | } 1323 | 1324 | /* update model parameters using estimated counts */ 1325 | void update_prob() 1326 | { 1327 | float pisum; 1328 | float gmmsum[nstates]; 1329 | float xisum[nstates]; 1330 | float pi[nstates]; 1331 | 1332 | float gmm[nstates * nobvs]; 1333 | float xi[nstates * nstates]; 1334 | 1335 | size_t i, j; 1336 | 1337 | checkCudaErrors( cudaMemcpy(xi, xid, nstates * nstates * sizeof(float), cudaMemcpyDeviceToHost) ); 1338 | checkCudaErrors( cudaMemcpy(gmm, gmmd, nobvs * nstates * sizeof(float), cudaMemcpyDeviceToHost) ); 1339 | checkCudaErrors( cudaMemcpy(pi, pid, nstates * sizeof(float), cudaMemcpyDeviceToHost) ); 1340 | 1341 | if (gmmd) checkCudaErrors( cudaFree(gmmd) ); 1342 | if (xid) checkCudaErrors( cudaFree(xid) ); 1343 | if (pid) checkCudaErrors( cudaFree(pid) ); 1344 | 1345 | pisum = logsumf(pi, nstates, IN_HOST); 1346 | for (i 
= 0; i < nstates; i++) { 1347 | gmmsum[i] = logsumf(gmm + i * nobvs, nobvs, IN_HOST); 1348 | xisum[i] = logsumf(xi + i * nstates, nstates, IN_HOST); 1349 | } 1350 | 1351 | for (i = 0; i < nstates; i++) { 1352 | prior[i] = pi[i] - pisum; 1353 | } 1354 | 1355 | for (i = 0; i < nstates; i++) { 1356 | for (j = 0; j < nstates; j++) { 1357 | trans[IDX(i,j,nstates)] = xi[IDX(i,j,nstates)] - xisum[i]; 1358 | } 1359 | for (j = 0; j < nobvs; j++) { 1360 | obvs[IDX(i,j,nobvs)] = gmm[IDX(i,j,nobvs)] - gmmsum[i]; 1361 | } 1362 | } 1363 | 1364 | /* update indevice parameters */ 1365 | checkCudaErrors( cudaMemcpy(transd, trans, nstates * nstates * sizeof(float), cudaMemcpyHostToDevice) ); 1366 | checkCudaErrors( cudaMemcpy(obvsd, obvs, nobvs * nstates * sizeof(float), cudaMemcpyHostToDevice) ); 1367 | } 1368 | 1369 | void usage() { 1370 | fprintf(stdout, "chmm [-hnt] [-c config] [-p(1|2|3)]\n"); 1371 | fprintf(stdout, "usage:\n"); 1372 | fprintf(stdout, " -h help\n"); 1373 | fprintf(stdout, " -c configuration file\n"); 1374 | fprintf(stdout, " -t output computation time\n"); 1375 | fprintf(stdout, " -p1 compute the probability of the observation sequence\n"); 1376 | fprintf(stdout, " -p2 compute the most probable sequence (Viterbi)\n"); 1377 | fprintf(stdout, " -p3 train hidden Markov mode parameters (Baum-Welch)\n"); 1378 | fprintf(stdout, " -n number of iterations\n"); 1379 | } 1380 | 1381 | /* free all memory */ 1382 | void freeall() { 1383 | 1384 | if (transd) checkCudaErrors( cudaFree(transd) ); 1385 | if (obvsd) checkCudaErrors( cudaFree(obvsd) ); 1386 | 1387 | if (trans) free(trans); 1388 | if (obvs) free(obvs); 1389 | if (prior) free(prior); 1390 | if (data) free(data); 1391 | } 1392 | --------------------------------------------------------------------------------