├── TODO
├── Makefile
├── README.md
├── fhmm.c
├── hmm.c
└── hmm.cu


/TODO:
--------------------------------------------------------------------------------
 1 | TODO list
 2 | ---
 3 | 
 4 | * urgent
 5 |   - simplify the code: use the CPU for any procedure that runs in O(N x M)
 6 |   - modify the backward pass to avoid large memory moves
 7 |   - better matrix multiplication (maybe use the CUBLAS library)
 8 | 
 9 | * further improvement
10 |   - 2D, 3D reducing
11 | 


--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
 1 | INC = -I$(CUDA_SDK_PATH)/samples/common/inc
 2 | LIB = -L$(CUDA_SDK_PATH)/lib64
 3 | PG = -Xcompiler -DPROFILE_PG
 4 | PGPU = -Xcompiler -DPROFILE_GPU
 5 | 
 6 | all: cuhmm hmm fhmm
 7 | 
 8 | cuhmm: hmm.cu
 9 | 	nvcc $(INC) $(LIB) hmm.cu -o cuhmm
10 | 
11 | hmm: hmm.c
12 | 	gcc hmm.c -Wall -lm -o hmm
13 | 
14 | fhmm: fhmm.c
15 | 	gcc fhmm.c -Wall -lm -o fhmm
16 | 
17 | clean: hmm
18 | 	rm -f hmm cuhmm fhmm
19 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # Introduction
  2 | This is an implementation of hidden Markov model (HMM) training and classification for the NVIDIA CUDA platform. A serial implementation in C is also included for comparison.
  3 | 
  4 | The implementation of HMM follows the tutorial paper by Rabiner. The three problems for HMMs defined in the paper are:
  5 | 
  6 | 1. compute the probability of the observation sequence
  7 | 2. compute the most probable sequence
  8 | 3. train hidden Markov model parameters
  9 | 
 10 | This implementation supports all the three problems. However there is no support for continuous densities.
 11 | 
 12 | # Usage
 13 | The command line usage is as follows.
 14 | 
 15 | ```
 16 | $ ./hmm -h
 17 | hmm [-hnt] [-c config] [-p(1|2|3)]
 18 | usage:
 19 |   -h   help
 20 |   -c   configuration file
 21 |   -t   output computation time
 22 |   -p1  compute the probability of the observation sequence
 23 |   -p2  compute the most probable sequence (Viterbi)
 24 |   -p3  train hidden Markov mode parameters (Baum-Welch)
 25 |   -n   number of iterations
 26 | ```
 27 | 
 28 | # Configuration
 29 | The configuration format is the same for all three problems. For problem 1, the forward probabilities of all input sequences are computed from the given model parameters. For problem 2, decoding is performed for all sequences based on the given parameters. For problem 3, the given input sequences are used as training data.
 30 | 
 31 | The following example shows a 16-state HMM with 2 observation symbols and 32 input sequences of length 10. Empty lines and lines beginning with # are ignored. Parameters must appear in the order shown in the example.
 32 | 
 33 | ```
 34 | # a HMM model configuration for testing purpose
 35 | 
 36 | # number of states
 37 | 16
 38 | 
 39 | # number of output
 40 | 2
 41 | 
 42 | # initial state probability
 43 | 0.04 0.02 0.06 0.04 0.11 0.11 0.01 0.09 0.03 0.05 0.06 0.11 0.05 0.11 0.03 0.08 
 44 | 
 45 | # state transition probability
 46 | 0.08 0.02 0.10 0.05 0.07 0.08 0.07 0.04 0.08 0.10 0.07 0.02 0.01 0.10 0.09 0.01 
 47 | 0.06 0.10 0.11 0.01 0.04 0.11 0.04 0.07 0.08 0.10 0.08 0.02 0.09 0.05 0.02 0.02 
 48 | 0.08 0.07 0.08 0.07 0.01 0.03 0.10 0.02 0.07 0.03 0.06 0.08 0.03 0.10 0.10 0.08 
 49 | 0.08 0.04 0.04 0.05 0.07 0.08 0.01 0.08 0.10 0.07 0.11 0.01 0.05 0.04 0.11 0.06 
 50 | 0.03 0.03 0.08 0.10 0.11 0.04 0.06 0.03 0.03 0.08 0.03 0.07 0.10 0.11 0.07 0.03 
 51 | 0.02 0.05 0.01 0.09 0.05 0.09 0.05 0.12 0.09 0.07 0.01 0.07 0.05 0.05 0.11 0.06 
 52 | 0.11 0.05 0.10 0.07 0.01 0.08 0.05 0.03 0.03 0.10 0.01 0.10 0.08 0.09 0.07 0.02 
 53 | 0.03 0.02 0.16 0.01 0.05 0.01 0.14 0.14 0.02 0.05 0.01 0.09 0.07 0.14 0.03 0.01 
 54 | 0.01 0.09 0.13 0.01 0.02 0.04 0.05 0.03 0.10 0.05 0.06 0.06 0.11 0.06 0.03 0.14 
 55 | 0.09 0.03 0.04 0.05 0.04 0.03 0.12 0.04 0.07 0.02 0.07 0.10 0.11 0.03 0.06 0.09 
 56 | 0.09 0.04 0.06 0.06 0.05 0.07 0.05 0.01 0.05 0.10 0.04 0.08 0.05 0.08 0.08 0.10 
 57 | 0.07 0.06 0.01 0.07 0.06 0.09 0.01 0.06 0.07 0.07 0.08 0.06 0.01 0.11 0.09 0.05 
 58 | 0.03 0.04 0.06 0.06 0.06 0.05 0.02 0.10 0.11 0.07 0.09 0.05 0.05 0.05 0.11 0.08 
 59 | 0.04 0.03 0.04 0.09 0.10 0.09 0.08 0.06 0.04 0.07 0.09 0.02 0.05 0.08 0.04 0.09 
 60 | 0.05 0.07 0.02 0.08 0.06 0.08 0.05 0.05 0.07 0.06 0.10 0.07 0.03 0.05 0.06 0.10 
 61 | 0.11 0.03 0.02 0.11 0.11 0.01 0.02 0.08 0.05 0.08 0.11 0.03 0.02 0.10 0.01 0.11 
 62 | 
 63 | # state output probability
 64 | 0.01 0.99 
 65 | 0.58 0.42 
 66 | 0.48 0.52 
 67 | 0.58 0.42 
 68 | 0.37 0.63 
 69 | 0.33 0.67 
 70 | 0.51 0.49 
 71 | 0.28 0.72 
 72 | 0.35 0.65 
 73 | 0.61 0.39 
 74 | 0.97 0.03 
 75 | 0.87 0.13 
 76 | 0.46 0.54 
 77 | 0.55 0.45 
 78 | 0.23 0.77 
 79 | 0.76 0.24 
 80 | 
 81 | # data size
 82 | 32 10
 83 | 
 84 | # data
 85 | 0 0 0 0 0 0 1 0 1 1 
 86 | 1 1 0 0 1 1 1 0 0 0 
 87 | 1 1 0 1 0 0 0 1 0 1 
 88 | 1 1 1 1 1 0 1 1 1 0 
 89 | 0 1 0 1 1 0 1 1 1 1 
 90 | 1 0 1 1 0 1 0 1 1 1 
 91 | 1 0 1 1 1 1 0 0 1 1 
 92 | 0 1 0 1 1 1 0 0 0 0 
 93 | 0 1 1 0 0 0 1 1 1 1 
 94 | 0 1 1 0 0 0 0 1 1 0 
 95 | 1 1 1 1 1 0 1 1 0 0 
 96 | 0 0 0 0 1 1 0 1 1 1 
 97 | 1 0 1 0 1 1 1 1 1 0 
 98 | 1 0 0 1 1 1 0 0 0 0 
 99 | 0 0 1 1 1 0 0 0 0 1 
100 | 1 0 1 1 0 1 0 1 0 0 
101 | 1 0 1 0 1 0 0 1 0 1 
102 | 0 0 0 1 0 0 0 1 0 1 
103 | 1 1 1 0 0 0 0 1 0 0 
104 | 0 1 0 1 1 1 1 1 1 1 
105 | 0 0 0 0 1 1 1 0 1 0 
106 | 0 1 1 1 0 1 0 1 0 0 
107 | 1 1 0 1 1 0 0 0 0 1 
108 | 0 0 0 0 1 1 0 0 1 1 
109 | 0 1 0 1 1 1 1 1 0 0 
110 | 0 1 1 1 0 1 1 0 1 1 
111 | 1 1 1 1 0 0 1 1 0 0 
112 | 1 1 0 1 1 0 0 0 0 0 
113 | 0 1 0 0 0 0 0 0 0 1 
114 | 1 0 0 1 0 1 0 0 1 1 
115 | 0 1 0 1 0 0 1 1 0 0 
116 | 0 0 1 0 1 1 1 1 0 0 
117 | ```
118 | 
119 | # Further Information
120 | For more detailed information, please refer to the report at [https://liuchuan.org/pub/cuHMM.pdf](https://www.liuchuan.org/pub/cuHMM.pdf)
121 | 


--------------------------------------------------------------------------------
/fhmm.c:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Copyright (c) 2009, Chuan Liu <chuan@cs.jhu.edu>
  3 |  *
  4 |  * Permission is hereby granted, free of charge, to any person
  5 |  * obtaining a copy of this software and associated documentation
  6 |  * files (the "Software"), to deal in the Software without
  7 |  * restriction, including without limitation the rights to use, copy,
  8 |  * modify, merge, publish, distribute, sublicense, and/or sell copies
  9 |  * of the Software, and to permit persons to whom the Software is
 10 |  * furnished to do so, subject to the following conditions:
 11 |  *
 12 |  * The above copyright notice and this permission notice shall be
 13 |  * included in all copies or substantial portions of the Software.
 14 |  *
 15 |  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 16 |  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 17 |  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 18 |  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 19 |  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 20 |  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 21 |  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 22 |  * SOFTWARE.
 23 |  *
 24 |  */
 25 | 
 26 | #ifndef _GNU_SOURCE
 27 | #define _GNU_SOURCE
 28 | #endif
 29 | 
 30 | #include <math.h>
 31 | #include <stdio.h>
 32 | #include <stdlib.h>
 33 | #include <unistd.h>
 34 | 
 35 | #define handle_error(msg) \
 36 |   do { perror(msg); exit(EXIT_FAILURE); } while (0)
 37 | 
 38 | #define IDX(i,j,d) (((i)*(d))+(j))
 39 | 
 40 | 
/* Model dimensions and data set size, filled in by main() from the config. */
int nstates = 0;                /* number of states */
int nobvs = 0;                  /* number of observation symbols */
int nseq = 0;                   /* number of data sequences */
int length = 0;                 /* length of every data sequence */
/* Model parameters; main() stores the logarithm of each configured value. */
float *prior = NULL;            /* initial state probabilities, nstates entries */
float *trans = NULL;            /* state transition probabilities, nstates x nstates */
float *obvs = NULL;             /* output probabilities, nstates x nobvs */
int *data = NULL;               /* observation symbols, nseq rows of length entries */
/* Accumulators reset to -INFINITY by init_count() and filled by forward_backward(). */
float *gmm = NULL;              /* gamma, nstates x nobvs */
float *xi = NULL;               /* xi, nstates x nstates */
float *pi = NULL;               /* pi, nstates entries */

float logadd(float, float);                 /* log(exp(x) + exp(y)) */
float sumf(float *, int);                   /* Kahan sum of an array */
float forward_backward(int *, size_t, int); /* log-likelihood; optional count update */
void viterbi(int *, size_t);                /* print the most probable state sequence */
void init_count();                          /* reset gmm, xi and pi */
void update_prob();                         /* re-estimate prior, trans and obvs */
void usage();                               /* print command line help */
void freeall();                             /* release all global arrays */
 61 | 
 62 | int main(int argc, char *argv[])
 63 | {
 64 |   char *configfile = NULL;
 65 |   FILE *fin, *bin;
 66 | 
 67 |   char *linebuf = NULL;
 68 |   size_t buflen = 0;
 69 | 
 70 |   int iterations = 3;
 71 |   int mode = 3;
 72 | 
 73 |   int c;
 74 |   float d;
 75 |   float *loglik;
 76 |   float p;
 77 |   int i, j, k;
 78 |   opterr = 0;
 79 | 
 80 | 
 81 |   while ((c = getopt(argc, argv, "c:n:hp:")) != -1) {
 82 |     switch (c) {
 83 |     case 'c':
 84 |       configfile = optarg;
 85 |       break;
 86 |     case 'h':
 87 |       usage();
 88 |       exit(EXIT_SUCCESS);
 89 |     case 'n':
 90 |       iterations = atoi(optarg);
 91 |       break;
 92 |     case 'p':
 93 |       mode = atoi(optarg);
 94 |       if (mode != 1 && mode != 2 && mode != 3) {
 95 |         fprintf(stderr, "illegal mode: %d\n", mode);
 96 |         exit(EXIT_FAILURE);
 97 |       }
 98 |       break;
 99 |     case '?':
100 |       fprintf(stderr, "illegal options\n");
101 |       exit(EXIT_FAILURE);
102 |     default:
103 |       abort();
104 |     }
105 |   }
106 | 
107 |   if (configfile == NULL) {
108 |     fin = stdin;
109 |   } else {
110 |     fin = fopen(configfile, "r");
111 |     if (fin == NULL) {
112 |       handle_error("fopen");
113 |     }
114 |   }
115 |   
116 |   i = 0;
117 |   while ((c = getline(&linebuf, &buflen, fin)) != -1) {
118 |     if (c <= 1 || linebuf[0] == '#')
119 |       continue;
120 |     
121 |     if (i == 0) {
122 |       if (sscanf(linebuf, "%d", &nstates) != 1) {
123 |         fprintf(stderr, "config file format error: %d\n", i);
124 |         freeall();
125 |         exit(EXIT_FAILURE);
126 |       }
127 | 
128 |       prior = (float *) malloc(sizeof(float) * nstates);
129 |       if (prior == NULL) handle_error("malloc");
130 | 
131 |       trans = (float *) malloc(sizeof(float) * nstates * nstates);
132 |       if (trans == NULL) handle_error("malloc");
133 | 
134 |       xi = (float *) malloc(sizeof(float) * nstates * nstates);
135 |       if (xi == NULL) handle_error("malloc");
136 | 
137 |       pi = (float *) malloc(sizeof(float) * nstates);
138 |       if (pi == NULL) handle_error("malloc");
139 | 
140 |     } else if (i == 1) {
141 |       if (sscanf(linebuf, "%d", &nobvs) != 1) {
142 |         fprintf(stderr, "config file format error: %d\n", i);
143 |         freeall();
144 |         exit(EXIT_FAILURE);
145 |       }
146 | 
147 |       obvs = (float *) malloc(sizeof(float) * nstates * nobvs);
148 |       if (obvs == NULL) handle_error("malloc");
149 | 
150 |       gmm = (float *) malloc(sizeof(float) * nstates * nobvs);
151 |       if (gmm == NULL) handle_error("malloc");
152 | 
153 |     } else if (i == 2) {
154 |       /* read initial state probabilities */ 
155 |       bin = fmemopen(linebuf, buflen, "r");
156 |       if (bin == NULL) handle_error("fmemopen");
157 |       for (j = 0; j < nstates; j++) {
158 |         if (fscanf(bin, "%f", &d) != 1) {
159 |           fprintf(stderr, "config file format error: %d\n", i);
160 |           freeall();
161 |           exit(EXIT_FAILURE);
162 |         }
163 |         prior[j] = logf(d);
164 |       }
165 |       fclose(bin);
166 | 
167 |     } else if (i <= 2 + nstates) {
168 |       /* read state transition  probabilities */ 
169 |       bin = fmemopen(linebuf, buflen, "r");
170 |       if (bin == NULL) handle_error("fmemopen");
171 |       for (j = 0; j < nstates; j++) {
172 |         if (fscanf(bin, "%f", &d) != 1) {
173 |           fprintf(stderr, "config file format error: %d\n", i);
174 |           freeall();
175 |           exit(EXIT_FAILURE);
176 |         }
177 |         trans[IDX((i - 3),j,nstates)] = logf(d);
178 |       }
179 |       fclose(bin);
180 |     } else if (i <= 2 + nstates * 2) {
181 |       /* read output probabilities */
182 |       bin = fmemopen(linebuf, buflen, "r");
183 |       if (bin == NULL) handle_error("fmemopen");
184 |       for (j = 0; j < nobvs; j++) {
185 |         if (fscanf(bin, "%f", &d) != 1) {
186 |           fprintf(stderr, "config file format error: %d\n", i);
187 |           freeall();
188 |           exit(EXIT_FAILURE);
189 |         }
190 |         obvs[IDX((i - 3 - nstates),j,nobvs)] = logf(d);
191 |       }
192 |       fclose(bin);
193 |     } else if (i == 3 + nstates * 2) {
194 |       if (sscanf(linebuf, "%d %d", &nseq, &length) != 2) {
195 |         fprintf(stderr, "config file format error: %d\n", i);
196 |         freeall();
197 |         exit(EXIT_FAILURE);
198 |       }
199 |       data = (int *) malloc (sizeof(int) * nseq * length);
200 |       if (data == NULL) handle_error("malloc");
201 |     } else if (i <= 3 + nstates * 2 + nseq) {
202 |       /* read data */
203 |       bin = fmemopen(linebuf, buflen, "r");
204 |       if (bin == NULL) handle_error("fmemopen");
205 |       for (j = 0; j < length; j++) {
206 |         if (fscanf(bin, "%d", &k) != 1 || k < 0 || k >= nobvs) {
207 |           fprintf(stderr, "config file format error: %d\n", i);
208 |           freeall();
209 |           exit(EXIT_FAILURE);
210 |         }
211 |         data[(i - 4 - nstates * 2) * length + j] = k;
212 |       }
213 |       fclose(bin);
214 |     }
215 | 
216 |     i++;
217 |   }
218 |   fclose(fin);
219 |   if (linebuf) free(linebuf);
220 | 
221 |   if (i < 4 + nstates * 2 + nseq) {
222 |     fprintf(stderr, "configuration incomplete.\n");
223 |     freeall();
224 |     exit(EXIT_FAILURE);
225 |   }
226 | 
227 |   if (mode == 3) {
228 |     loglik = (float *) malloc(sizeof(float) * nseq);
229 |     if (loglik == NULL) handle_error("malloc");
230 | 
231 |     for (i = 0; i < iterations; i++) {
232 |       init_count();
233 |       for (j = 0; j < nseq; j++) {
234 |         loglik[j] = forward_backward(data + length * j, length, 1);
235 |       }
236 |       p = sumf(loglik, nseq);
237 | 
238 |       update_prob();
239 | 
240 |       printf("iteration %d log-likelihood: %.4f\n", i + 1, p);
241 |       printf("updated parameters:\n");
242 |       printf("# initial state probability\n");
243 |       for (j = 0; j < nstates; j++) {
244 |         printf(" %.4f", exp(prior[j]));
245 |       }
246 |       printf("\n");
247 |       printf("# state transition probability\n");
248 |       for (j = 0; j < nstates; j++) {
249 |         for (k = 0; k < nstates; k++) {
250 |           printf(" %.4f", exp(trans[IDX(j,k,nstates)]));
251 |         }
252 |         printf("\n");
253 |       }
254 |       printf("# state output probility\n");
255 |       for (j = 0; j < nstates; j++) {
256 |         for (k = 0; k < nobvs; k++) {
257 |           printf(" %.4f", exp(obvs[IDX(j,k,nobvs)]));
258 |         }
259 |         printf("\n");
260 |       }
261 |       printf("\n");
262 |     }
263 |     free(loglik);
264 |   } else if (mode == 2) {
265 |     for (i = 0; i < nseq; i++) {
266 |       viterbi(data + length * i, length);
267 |     }
268 |   } else if (mode == 1) {
269 |     loglik = (float *) malloc(sizeof(float) * nseq);
270 |     if (loglik == NULL) handle_error("malloc");
271 |     for (i = 0; i < nseq; i++) {
272 |       loglik[i] = forward_backward(data + length * i, length, 0);
273 |     }
274 |     p = sumf(loglik, nseq);
275 | 
276 |     for (i = 0; i < nseq; i++)
277 |       printf("%.4f\n", loglik[i]);
278 |     printf("total: %.4f\n", p);
279 |     free(loglik);
280 |   }
281 | 
282 |   freeall();
283 |   return 0;
284 | }
285 | 
/*
 * Compute the sum of data[0..size-1] using the Kahan (compensated)
 * summation algorithm.
 *
 * Returns 0 when size <= 0; in that case data is not dereferenced.
 */
float sumf(float *data, int size)
{
  float total;
  float y, t;
  float c = 0.0f;               /* running compensation for lost low-order bits */
  int i;

  if (size <= 0)
    return 0.0f;

  total = data[0];
  for (i = 1; i < size; i++) {
    y = data[i] - c;
    t = total + y;
    c = (t - total) - y;        /* what was rounded away when forming t */
    total = t;
  }
  return total;
}
301 | 
302 | /* initilize counts */
303 | void init_count() {
304 |   size_t i;
305 |   for (i = 0; i < nstates * nobvs; i++)
306 |     gmm[i] = - INFINITY;
307 | 
308 |   for (i = 0; i < nstates * nstates; i++)
309 |     xi[i] = - INFINITY;
310 | 
311 |   for (i = 0; i < nstates; i++)
312 |     pi[i] = - INFINITY;
313 | }
314 | 
315 | void update_prob() {
316 |   float pisum = - INFINITY;
317 |   float gmmsum[nstates];
318 |   float xisum[nstates];
319 |   size_t i, j;
320 | 
321 |   for (i = 0; i < nstates; i++) {
322 |     gmmsum[i] = - INFINITY;
323 |     xisum[i] = - INFINITY;
324 | 
325 |     pisum = logadd(pi[i], pisum);
326 |   }
327 | 
328 |   for (i = 0; i < nstates; i++) {
329 |     prior[i] = pi[i] - pisum;
330 |   }
331 | 
332 |   for (i = 0; i < nstates; i++) {
333 |     for (j = 0; j < nstates; j++) {
334 |       xisum[i] = logadd(xisum[i], xi[IDX(i,j,nstates)]);
335 |     }
336 |     for (j = 0; j < nobvs; j++) {
337 |       gmmsum[i] = logadd(gmmsum[i], gmm[IDX(i,j,nobvs)]);
338 |     }
339 |   }
340 | 
341 |   for (i = 0; i < nstates; i++) {
342 |     for (j = 0; j < nstates; j++) {
343 |       trans[IDX(i,j,nstates)] = xi[IDX(i,j,nstates)] - xisum[i];
344 |     }
345 |     for (j = 0; j < nobvs; j++) {
346 |       obvs[IDX(i,j,nobvs)] = gmm[IDX(i,j,nobvs)] - gmmsum[i];
347 |     }
348 |   }
349 | 
350 | }
351 | 
352 | /* forward backward algoritm: return observation likelihood */
353 | float forward_backward(int *data, size_t len, int backward)
354 | {
355 |   /* construct trellis */
356 |   float alpha[len][nstates];
357 |   float beta[len][nstates];
358 | 
359 |   size_t i, j, k;
360 |   float p, e;
361 |   float loglik;
362 | 
363 |   for (i = 0; i < len; i++) {
364 |     for (j = 0; j < nstates; j++) {
365 |       alpha[i][j] = - INFINITY;
366 |       beta[i][j] = - INFINITY;
367 |     }
368 |   }
369 | 
370 |   /* forward pass */
371 |   for (i = 0; i < nstates; i++) {
372 |     alpha[0][i] = prior[i] + obvs[IDX(i,data[0],nobvs)];
373 |   }
374 |   for (i = 1; i < len; i++) {
375 |     for (j = 0; j < nstates; j++) {
376 |       for (k = 0; k < nstates; k++) {
377 |         p = alpha[i-1][k] + trans[IDX(k,j,nstates)] + obvs[IDX(j,data[i],nobvs)];
378 |         alpha[i][j] = logadd(alpha[i][j], p);
379 |       }
380 |     }
381 |   }
382 |   loglik = -INFINITY;
383 |   for (i = 0; i < nstates; i++) {
384 |     loglik = logadd(loglik, alpha[len-1][i]);
385 |   }
386 | 
387 |   if (! backward)
388 |     return loglik;
389 | 
390 |   /* backward pass & update counts */
391 |   for (i = 0; i < nstates; i++) {
392 |     beta[len-1][i] = 0;         /* 0 = log (1.0) */
393 |   }
394 |   for (i = 1; i < len; i++) {
395 |     for (j = 0; j < nstates; j++) {
396 | 
397 |       e = alpha[len-i][j] + beta[len-i][j] - loglik;
398 |       gmm[IDX(j,data[len-i],nobvs)] = logadd(gmm[IDX(j,data[len-i],nobvs)], e);
399 | 
400 |       for (k = 0; k < nstates; k++) {
401 |         p = beta[len-i][k] + trans[IDX(j,k,nstates)] + obvs[IDX(k,data[len-i],nobvs)];
402 |         beta[len-1-i][j] = logadd(beta[len-1-i][j], p);
403 | 
404 |         e = alpha[len-1-i][j] + beta[len-i][k]
405 |           + trans[IDX(j,k,nstates)] + obvs[IDX(k,data[len-i],nobvs)] - loglik;
406 |         xi[IDX(j,k,nstates)] = logadd(xi[IDX(j,k,nstates)], e);
407 |       }
408 |     }
409 |   }
410 |   p = -INFINITY;
411 |   for (i = 0; i < nstates; i++) {
412 |     p = logadd(p, prior[i] + beta[0][i] + obvs[IDX(i,data[0],nobvs)]);
413 | 
414 |     e = alpha[0][i] + beta[0][i] - loglik;
415 |     gmm[IDX(i,data[0],nobvs)] = logadd(gmm[IDX(i,data[0],nobvs)], e);
416 | 
417 |     pi[i] = logadd(pi[i], e);
418 |   }
419 | 
420 | #ifdef DEBUG
421 |   /* verify if forward prob == backward prob */
422 |   if (fabs(p - loglik) > 1e-3) {
423 |     fprintf(stderr, "Error: forward and backward incompatible: %f, %f\n", loglik, p);
424 |   }
425 | #endif
426 | 
427 |   return loglik;
428 | }
429 | 
430 | /* find the most probable sequence */
431 | void viterbi(int *data, size_t len)
432 | {
433 |   float lambda[len][nstates];
434 |   int backtrace[len][nstates];
435 |   int stack[len];
436 | 
437 |   size_t i, j, k;
438 |   float p;
439 | 
440 |   for (i = 0; i < len; i++) {
441 |     for (j = 0; j < nstates; j++) {
442 |       lambda[i][j] = - INFINITY;
443 |     }
444 |   }
445 | 
446 |   for (i = 0; i < nstates; i++) {
447 |     lambda[0][i] = prior[i] + obvs[IDX(i,data[0],nobvs)];
448 |     backtrace[0][i] = -1;       /* -1 is starting point */
449 |   }
450 |   for (i = 1; i < len; i++) {
451 |     for (j = 0; j < nstates; j++) {
452 |       for (k = 0; k < nstates; k++) {
453 |         p = lambda[i-1][k] + trans[IDX(k,j,nstates)] + obvs[IDX(j,data[i],nobvs)];
454 |         if (p > lambda[i][j]) {
455 |           lambda[i][j] = p;
456 |           backtrace[i][j] = k;
457 |         }
458 |       }
459 |     }
460 |   }
461 | 
462 |   /* backtrace */
463 |   for (i = 0; i < nstates; i++) {
464 |     if (i == 0 || lambda[len-1][i] > p) {
465 |       p = lambda[len-1][i];
466 |       k = i;
467 |     }
468 |   }
469 |   stack[len - 1] = k;
470 |   for (i = 1; i < len; i++) {
471 |     stack[len - 1 - i] = backtrace[len - i][stack[len - i]];
472 |   }
473 |   for (i = 0; i < len; i++) {
474 |     printf("%d ", stack[i]);
475 |   }
476 |   printf("\n");
477 | }
478 | 
479 | float logadd(float x, float y) {
480 |   if (y <= x)
481 |     return x + log1pf(expf(y - x));
482 |   else
483 |     return y + log1pf(expf(x - y));
484 | }
485 | 
/*
 * Print the command line synopsis to stdout.
 * NOTE(review): the text lists -t, but the getopt string used in main() is
 * "c:n:hp:" -- confirm whether -t should be accepted or dropped here.
 */
void usage() {
  fputs("hmm [-hnt] [-c config] [-p(1|2|3)]\n"
        "usage:\n"
        "  -h   help\n"
        "  -c   configuration file\n"
        "  -t   output computation time\n"
        "  -p1  compute the probability of the observation sequence\n"
        "  -p2  compute the most probable sequence (Viterbi)\n"
        "  -p3  train hidden Markov mode parameters (Baum-Welch)\n"
        "  -n   number of iterations\n",
        stdout);
}
497 | 
498 | /* free all memory */
499 | void freeall() {
500 |   if (trans) free(trans);
501 |   if (obvs) free(obvs);
502 |   if (prior) free(prior);
503 |   if (data) free(data);
504 |   if (gmm) free(gmm);
505 |   if (xi) free(xi);
506 |   if (pi) free(pi);
507 | }
508 | 


--------------------------------------------------------------------------------
/hmm.c:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Copyright (c) 2009, Chuan Liu <chuan@cs.jhu.edu>
  3 |  *
  4 |  * Permission is hereby granted, free of charge, to any person
  5 |  * obtaining a copy of this software and associated documentation
  6 |  * files (the "Software"), to deal in the Software without
  7 |  * restriction, including without limitation the rights to use, copy,
  8 |  * modify, merge, publish, distribute, sublicense, and/or sell copies
  9 |  * of the Software, and to permit persons to whom the Software is
 10 |  * furnished to do so, subject to the following conditions:
 11 |  *
 12 |  * The above copyright notice and this permission notice shall be
 13 |  * included in all copies or substantial portions of the Software.
 14 |  *
 15 |  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 16 |  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 17 |  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 18 |  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 19 |  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 20 |  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 21 |  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 22 |  * SOFTWARE.
 23 |  *
 24 |  */
 25 | 
 26 | 
 27 | #ifndef _GNU_SOURCE
 28 | #define _GNU_SOURCE
 29 | #endif
 30 | 
 31 | #include <math.h>
 32 | #include <stdio.h>
 33 | #include <stdlib.h>
 34 | #include <unistd.h>
 35 | 
 36 | #define handle_error(msg) \
 37 |   do { perror(msg); exit(EXIT_FAILURE); } while (0)
 38 | 
 39 | #define IDX(i,j,d) (((i)*(d))+(j))
 40 | 
 41 | 
/* Model dimensions and data set size, filled in by main() from the config. */
int nstates = 0;                /* number of states */
int nobvs = 0;                  /* number of observation symbols */
int nseq = 0;                   /* number of data sequences */
int length = 0;                 /* length of every data sequence */
/* Model parameters; main() stores the logarithm of each configured value. */
double *prior = NULL;           /* initial state probabilities, nstates entries */
double *trans = NULL;           /* state transition probabilities, nstates x nstates */
double *obvs = NULL;            /* output probabilities, nstates x nobvs */
int *data = NULL;               /* observation symbols, nseq rows of length entries */
/* Accumulators reset to -INFINITY by init_count(). */
double *gmm = NULL;             /* gamma, nstates x nobvs */
double *xi = NULL;              /* xi, nstates x nstates */
double *pi = NULL;              /* pi, nstates entries */

double logadd(double, double);               /* NOTE(review): body not visible here; presumably log(exp(x) + exp(y)) */
double sum(double *, int);                   /* Kahan sum of an array */
double forward_backward(int *, size_t, int); /* called by main() for the per-sequence log-likelihood */
void viterbi(int *, size_t);                 /* called by main() in mode 2 (decoding) */
void init_count();                           /* reset gmm, xi and pi */
void update_prob();                          /* called by main() after each training pass */
void usage();                                /* called by main() for -h */
void freeall();                              /* called by main() before exit */
 62 | 
 63 | int main(int argc, char *argv[])
 64 | {
 65 |   char *configfile = NULL;
 66 |   FILE *fin, *bin;
 67 | 
 68 |   char *linebuf = NULL;
 69 |   size_t buflen = 0;
 70 | 
 71 |   int iterations = 3;
 72 |   int mode = 3;
 73 | 
 74 |   int c;
 75 |   double d;
 76 |   double *loglik;
 77 |   double p;
 78 |   int i, j, k;
 79 |   opterr = 0;
 80 | 
 81 | 
 82 |   while ((c = getopt(argc, argv, "c:n:hp:")) != -1) {
 83 |     switch (c) {
 84 |     case 'c':
 85 |       configfile = optarg;
 86 |       break;
 87 |     case 'h':
 88 |       usage();
 89 |       exit(EXIT_SUCCESS);
 90 |     case 'n':
 91 |       iterations = atoi(optarg);
 92 |       break;
 93 |     case 'p':
 94 |       mode = atoi(optarg);
 95 |       if (mode != 1 && mode != 2 && mode != 3) {
 96 |         fprintf(stderr, "illegal mode: %d\n", mode);
 97 |         exit(EXIT_FAILURE);
 98 |       }
 99 |       break;
100 |     case '?':
101 |       fprintf(stderr, "illegal options\n");
102 |       exit(EXIT_FAILURE);
103 |     default:
104 |       abort();
105 |     }
106 |   }
107 | 
108 |   if (configfile == NULL) {
109 |     fin = stdin;
110 |   } else {
111 |     fin = fopen(configfile, "r");
112 |     if (fin == NULL) {
113 |       handle_error("fopen");
114 |     }
115 |   }
116 |   
117 |   i = 0;
118 |   while ((c = getline(&linebuf, &buflen, fin)) != -1) {
119 |     if (c <= 1 || linebuf[0] == '#')
120 |       continue;
121 |     
122 |     if (i == 0) {
123 |       if (sscanf(linebuf, "%d", &nstates) != 1) {
124 |         fprintf(stderr, "config file format error: %d\n", i);
125 |         freeall();
126 |         exit(EXIT_FAILURE);
127 |       }
128 | 
129 |       prior = (double *) malloc(sizeof(double) * nstates);
130 |       if (prior == NULL) handle_error("malloc");
131 | 
132 |       trans = (double *) malloc(sizeof(double) * nstates * nstates);
133 |       if (trans == NULL) handle_error("malloc");
134 | 
135 |       xi = (double *) malloc(sizeof(double) * nstates * nstates);
136 |       if (xi == NULL) handle_error("malloc");
137 | 
138 |       pi = (double *) malloc(sizeof(double) * nstates);
139 |       if (pi == NULL) handle_error("malloc");
140 | 
141 |     } else if (i == 1) {
142 |       if (sscanf(linebuf, "%d", &nobvs) != 1) {
143 |         fprintf(stderr, "config file format error: %d\n", i);
144 |         freeall();
145 |         exit(EXIT_FAILURE);
146 |       }
147 | 
148 |       obvs = (double *) malloc(sizeof(double) * nstates * nobvs);
149 |       if (obvs == NULL) handle_error("malloc");
150 | 
151 |       gmm = (double *) malloc(sizeof(double) * nstates * nobvs);
152 |       if (gmm == NULL) handle_error("malloc");
153 | 
154 |     } else if (i == 2) {
155 |       /* read initial state probabilities */ 
156 |       bin = fmemopen(linebuf, buflen, "r");
157 |       if (bin == NULL) handle_error("fmemopen");
158 |       for (j = 0; j < nstates; j++) {
159 |         if (fscanf(bin, "%lf", &d) != 1) {
160 |           fprintf(stderr, "config file format error: %d\n", i);
161 |           freeall();
162 |           exit(EXIT_FAILURE);
163 |         }
164 |         prior[j] = log(d);
165 |       }
166 |       fclose(bin);
167 | 
168 |     } else if (i <= 2 + nstates) {
169 |       /* read state transition  probabilities */ 
170 |       bin = fmemopen(linebuf, buflen, "r");
171 |       if (bin == NULL) handle_error("fmemopen");
172 |       for (j = 0; j < nstates; j++) {
173 |         if (fscanf(bin, "%lf", &d) != 1) {
174 |           fprintf(stderr, "config file format error: %d\n", i);
175 |           freeall();
176 |           exit(EXIT_FAILURE);
177 |         }
178 |         trans[IDX((i - 3),j, nstates)] = log(d);
179 |       }
180 |       fclose(bin);
181 |     } else if (i <= 2 + nstates * 2) {
182 |       /* read output probabilities */
183 |       bin = fmemopen(linebuf, buflen, "r");
184 |       if (bin == NULL) handle_error("fmemopen");
185 |       for (j = 0; j < nobvs; j++) {
186 |         if (fscanf(bin, "%lf", &d) != 1) {
187 |           fprintf(stderr, "config file format error: %d\n", i);
188 |           freeall();
189 |           exit(EXIT_FAILURE);
190 |         }
191 |         obvs[IDX((i - 3 - nstates),j,nobvs)] = log(d);
192 |       }
193 |       fclose(bin);
194 |     } else if (i == 3 + nstates * 2) {
195 |       if (sscanf(linebuf, "%d %d", &nseq, &length) != 2) {
196 |         fprintf(stderr, "config file format error: %d\n", i);
197 |         freeall();
198 |         exit(EXIT_FAILURE);
199 |       }
200 |       data = (int *) malloc (sizeof(int) * nseq * length);
201 |       if (data == NULL) handle_error("malloc");
202 |     } else if (i <= 3 + nstates * 2 + nseq) {
203 |       /* read data */
204 |       bin = fmemopen(linebuf, buflen, "r");
205 |       if (bin == NULL) handle_error("fmemopen");
206 |       for (j = 0; j < length; j++) {
207 |         if (fscanf(bin, "%d", &k) != 1 || k < 0 || k >= nobvs) {
208 |           fprintf(stderr, "config file format error: %d\n", i);
209 |           freeall();
210 |           exit(EXIT_FAILURE);
211 |         }
212 |         data[(i - 4 - nstates * 2) * length + j] = k;
213 |       }
214 |       fclose(bin);
215 |     }
216 | 
217 |     i++;
218 |   }
219 |   fclose(fin);
220 |   if (linebuf) free(linebuf);
221 | 
222 |   if (i < 4 + nstates * 2 + nseq) {
223 |     fprintf(stderr, "configuration incomplete.\n");
224 |     freeall();
225 |     exit(EXIT_FAILURE);
226 |   }
227 | 
228 |   if (mode == 3) {
229 |     loglik = (double *) malloc(sizeof(double) * nseq);
230 |     if (loglik == NULL) handle_error("malloc");
231 |     for (i = 0; i < iterations; i++) {
232 |       init_count();
233 |       for (j = 0; j < nseq; j++) {
234 |         loglik[j] = forward_backward(data + length * j, length, 1);
235 |       }
236 |       p = sum(loglik, nseq);
237 | 
238 |       update_prob();
239 | 
240 |       printf("iteration %d log-likelihood: %.4lf\n", i + 1, p);
241 |       printf("updated parameters:\n");
242 |       printf("# initial state probability\n");
243 |       for (j = 0; j < nstates; j++) {
244 |         printf(" %.4f", exp(prior[j]));
245 |       }
246 |       printf("\n");
247 |       printf("# state transition probability\n");
248 |       for (j = 0; j < nstates; j++) {
249 |         for (k = 0; k < nstates; k++) {
250 |           printf(" %.4f", exp(trans[IDX(j,k,nstates)]));
251 |         }
252 |         printf("\n");
253 |       }
254 |       printf("# state output probility\n");
255 |       for (j = 0; j < nstates; j++) {
256 |         for (k = 0; k < nobvs; k++) {
257 |           printf(" %.4f", exp(obvs[IDX(j,k,nobvs)]));
258 |         }
259 |         printf("\n");
260 |       }
261 |       printf("\n");
262 |     }
263 |     free(loglik);
264 |   } else if (mode == 2) {
265 |     for (i = 0; i < nseq; i++) {
266 |       viterbi(data + length * i, length);
267 |     }
268 |   } else if (mode == 1) {
269 |     loglik = (double *) malloc(sizeof(double) * nseq);
270 |     if (loglik == NULL) handle_error("malloc");
271 |     for (i = 0; i < nseq; i++) {
272 |       loglik[i] = forward_backward(data + length * i, length, 0);
273 |     }
274 |     p = sum(loglik, nseq);
275 |     for (i = 0; i < nseq; i++)
276 |       printf("%.4lf\n", loglik[i]);
277 |     printf("total: %.4lf\n", p);
278 |     free(loglik);
279 |   }
280 | 
281 |   freeall();
282 |   return 0;
283 | }
284 | 
/*
 * Sum the first `size` elements of `data` with the Kahan (compensated)
 * summation algorithm: the rounding error of every addition is kept in
 * `comp` and subtracted from the next term.
 *
 * data: array of at least `size` doubles (may be NULL when size <= 0)
 * size: number of elements to add
 *
 * Returns the sum, or 0.0 for an empty input.  The empty-input guard is
 * required because the accumulator is seeded from data[0].
 */
double sum(double *data, int size)
{
  double total;
  double comp = 0.0;            /* running compensation term */
  double y, t;
  int i;

  if (data == NULL || size <= 0)
    return 0.0;

  total = data[0];
  for (i = 1; i < size; i++) {
    y = data[i] - comp;
    t = total + y;
    comp = (t - total) - y;     /* what was lost when forming t */
    total = t;
  }
  return total;
}
300 | 
301 | /* initilize counts */
302 | void init_count() {
303 |   size_t i;
304 |   for (i = 0; i < nstates * nobvs; i++)
305 |     gmm[i] = - INFINITY;
306 | 
307 |   for (i = 0; i < nstates * nstates; i++)
308 |     xi[i] = - INFINITY;
309 | 
310 |   for (i = 0; i < nstates; i++)
311 |     pi[i] = - INFINITY;
312 | }
313 | 
314 | void update_prob() {
315 |   double pisum = - INFINITY;
316 |   double gmmsum[nstates];
317 |   double xisum[nstates];
318 |   size_t i, j;
319 | 
320 |   for (i = 0; i < nstates; i++) {
321 |     gmmsum[i] = - INFINITY;
322 |     xisum[i] = - INFINITY;
323 | 
324 |     pisum = logadd(pi[i], pisum);
325 |   }
326 | 
327 |   for (i = 0; i < nstates; i++) {
328 |     prior[i] = pi[i] - pisum;
329 |   }
330 | 
331 |   for (i = 0; i < nstates; i++) {
332 |     for (j = 0; j < nstates; j++) {
333 |       xisum[i] = logadd(xisum[i], xi[IDX(i,j,nstates)]);
334 |     }
335 |     for (j = 0; j < nobvs; j++) {
336 |       gmmsum[i] = logadd(gmmsum[i], gmm[IDX(i,j,nobvs)]);
337 |     }
338 |   }
339 | 
340 |   for (i = 0; i < nstates; i++) {
341 |     for (j = 0; j < nstates; j++) {
342 |       trans[IDX(i,j,nstates)] = xi[IDX(i,j,nstates)] - xisum[i];
343 |     }
344 |     for (j = 0; j < nobvs; j++) {
345 |       obvs[IDX(i,j,nobvs)] = gmm[IDX(i,j,nobvs)] - gmmsum[i];
346 |     }
347 |   }
348 | 
349 | }
350 | 
351 | /* forward backward algoritm: return observation likelihood */
352 | double forward_backward(int *data, size_t len, int backward)
353 | {
354 |   /* construct trellis */
355 |   double alpha[len][nstates];
356 |   double beta[len][nstates];
357 | 
358 |   size_t i, j, k;
359 |   double p, e;
360 |   double loglik;
361 | 
362 |   for (i = 0; i < len; i++) {
363 |     for (j = 0; j < nstates; j++) {
364 |       alpha[i][j] = - INFINITY;
365 |       beta[i][j] = - INFINITY;
366 |     }
367 |   }
368 | 
369 |   /* forward pass */
370 |   for (i = 0; i < nstates; i++) {
371 |     alpha[0][i] = prior[i] + obvs[IDX(i,data[0],nobvs)];
372 |   }
373 |   for (i = 1; i < len; i++) {
374 |     for (j = 0; j < nstates; j++) {
375 |       for (k = 0; k < nstates; k++) {
376 |         p = alpha[i-1][k] + trans[IDX(k,j,nstates)] + obvs[IDX(j,data[i],nobvs)];
377 |         alpha[i][j] = logadd(alpha[i][j], p);
378 |       }
379 |     }
380 |   }
381 |   loglik = -INFINITY;
382 |   for (i = 0; i < nstates; i++) {
383 |     loglik = logadd(loglik, alpha[len-1][i]);
384 |   }
385 | 
386 |   if (! backward)
387 |     return loglik;
388 | 
389 |   /* backward pass & update counts */
390 |   for (i = 0; i < nstates; i++) {
391 |     beta[len-1][i] = 0;         /* 0 = log (1.0) */
392 |   }
393 |   for (i = 1; i < len; i++) {
394 |     for (j = 0; j < nstates; j++) {
395 | 
396 |       e = alpha[len-i][j] + beta[len-i][j] - loglik;
397 |       gmm[IDX(j,data[len-i],nobvs)] = logadd(gmm[IDX(j,data[len-i],nobvs)], e);
398 | 
399 |       for (k = 0; k < nstates; k++) {
400 |         p = beta[len-i][k] + trans[IDX(j,k,nstates)] + obvs[IDX(k,data[len-i],nobvs)];
401 |         beta[len-1-i][j] = logadd(beta[len-1-i][j], p);
402 | 
403 |         e = alpha[len-1-i][j] + beta[len-i][k]
404 |           + trans[IDX(j,k,nstates)] + obvs[IDX(k,data[len-i],nobvs)] - loglik;
405 |         xi[IDX(j,k,nstates)] = logadd(xi[IDX(j,k,nstates)], e);
406 |       }
407 |     }
408 |   }
409 |   p = -INFINITY;
410 |   for (i = 0; i < nstates; i++) {
411 |     p = logadd(p, prior[i] + beta[0][i] + obvs[IDX(i,data[0],nobvs)]);
412 | 
413 |     e = alpha[0][i] + beta[0][i] - loglik;
414 |     gmm[IDX(i,data[0],nobvs)] = logadd(gmm[IDX(i,data[0],nobvs)], e);
415 | 
416 |     pi[i] = logadd(pi[i], e);
417 |   }
418 | 
419 | #ifdef DEBUG
420 |   /* verify if forward prob == backward prob */
421 |   if (fabs(p - loglik) > 1e-5) {
422 |     fprintf(stderr, "Error: forward and backward incompatible: %lf, %lf\n", loglik, p);
423 |   }
424 | #endif
425 | 
426 |   return loglik;
427 | }
428 | 
429 | /* find the most probable sequence */
430 | void viterbi(int *data, size_t len)
431 | {
432 |   double lambda[len][nstates];
433 |   int backtrace[len][nstates];
434 |   int stack[len];
435 | 
436 |   size_t i, j, k;
437 |   double p;
438 | 
439 |   for (i = 0; i < len; i++) {
440 |     for (j = 0; j < nstates; j++) {
441 |       lambda[i][j] = - INFINITY;
442 |     }
443 |   }
444 | 
445 |   for (i = 0; i < nstates; i++) {
446 |     lambda[0][i] = prior[i] + obvs[IDX(i,data[0],nobvs)];
447 |     backtrace[0][i] = -1;       /* -1 is starting point */
448 |   }
449 |   for (i = 1; i < len; i++) {
450 |     for (j = 0; j < nstates; j++) {
451 |       for (k = 0; k < nstates; k++) {
452 |         p = lambda[i-1][k] + trans[IDX(k,j,nstates)] + obvs[IDX(j,data[i],nobvs)];
453 |         if (p > lambda[i][j]) {
454 |           lambda[i][j] = p;
455 |           backtrace[i][j] = k;
456 |         }
457 |       }
458 |     }
459 |   }
460 | 
461 |   /* backtrace */
462 |   for (i = 0; i < nstates; i++) {
463 |     if (i == 0 || lambda[len-1][i] > p) {
464 |       p = lambda[len-1][i];
465 |       k = i;
466 |     }
467 |   }
468 |   stack[len - 1] = k;
469 |   for (i = 1; i < len; i++) {
470 |     stack[len - 1 - i] = backtrace[len - i][stack[len - i]];
471 |   }
472 |   for (i = 0; i < len; i++) {
473 |     printf("%d ", stack[i]);
474 |   }
475 |   printf("\n");
476 | }
477 | 
478 | double logadd(double x, double y) {
479 |   if (y <= x)
480 |     return x + log1p(exp(y - x));
481 |   else
482 |     return y + log1p(exp(x - y));
483 | }
484 | 
/* Print the command line help text to stdout. */
void usage() {
  static const char *const help[] = {
    "hmm [-hnt] [-c config] [-p(1|2|3)]\n",
    "usage:\n",
    "  -h   help\n",
    "  -c   configuration file\n",
    "  -t   output computation time\n",
    "  -p1  compute the probability of the observation sequence\n",
    "  -p2  compute the most probable sequence (Viterbi)\n",
    "  -p3  train hidden Markov mode parameters (Baum-Welch)\n",
    "  -n   number of iterations\n",
  };
  size_t line;

  for (line = 0; line < sizeof(help) / sizeof(help[0]); line++)
    fputs(help[line], stdout);
}
496 | 
497 | /* free all memory */
498 | void freeall() {
499 |   if (trans) free(trans);
500 |   if (obvs) free(obvs);
501 |   if (prior) free(prior);
502 |   if (data) free(data);
503 |   if (gmm) free(gmm);
504 |   if (xi) free(xi);
505 |   if (pi) free(pi);
506 | }
507 | 


--------------------------------------------------------------------------------
/hmm.cu:
--------------------------------------------------------------------------------
   1 | /*
   2 |  * Copyright (c) 2009, Chuan Liu <chuan@cs.jhu.edu>
   3 |  *
   4 |  * Permission is hereby granted, free of charge, to any person
   5 |  * obtaining a copy of this software and associated documentation
   6 |  * files (the "Software"), to deal in the Software without
   7 |  * restriction, including without limitation the rights to use, copy,
   8 |  * modify, merge, publish, distribute, sublicense, and/or sell copies
   9 |  * of the Software, and to permit persons to whom the Software is
  10 |  * furnished to do so, subject to the following conditions:
  11 |  *
  12 |  * The above copyright notice and this permission notice shall be
  13 |  * included in all copies or substantial portions of the Software.
  14 |  *
  15 |  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  16 |  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  17 |  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
  18 |  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
  19 |  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
  20 |  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  21 |  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  22 |  * SOFTWARE.
  23 |  *
  24 |  */
  25 | 
  26 | #ifndef _GNU_SOURCE
  27 | #define _GNU_SOURCE
  28 | #endif
  29 | 
  30 | #include <math.h>
  31 | #include <stdio.h>
  32 | #include <stdlib.h>
  33 | #include <unistd.h>
  34 | 
  35 | #include <cuda.h>
  36 | #include <helper_cuda.h>
  37 | #include <helper_timer.h>
  38 | 
  39 | 
  40 | #define handle_error(msg) \
  41 |   do { perror(msg); exit(EXIT_FAILURE); } while (0)
  42 | 
  43 | #define IDX(i,j,d) (((i)*(d))+(j))
  44 | 
  45 | enum {
  46 |   BLOCK_SIZE = 16,
  47 |   NUM_THREADS = 256,
  48 |   IN_DEVICE = 1,
  49 |   IN_HOST = 0,
  50 | };
  51 | 
  52 | int nstates = 0;                /* number of states */
  53 | int nobvs = 0;                  /* number of observations */
  54 | int nseq = 0;                   /* number of data sequences  */
  55 | int length = 0;                 /* data sequencel length */
  56 | float *prior = NULL;            /* initial state probabilities */
  57 | float *trans = NULL;            /* state transition probabilities */
  58 | float *obvs = NULL;             /* output probabilities */
  59 | int *data = NULL;               /* observations */
  60 | float *transd = NULL;           /* trans in device memory */
  61 | float *obvsd = NULL;            /* obvs in device memory */
  62 | float *gmmd = NULL;             /* gamma in device memory */
  63 | float *xid = NULL;              /* xi in device memory */
  64 | float *pid = NULL;              /* pi in device memory */
  65 | 
  66 | #ifdef PROFILE_GPU
  67 | StopWatchInterface *gpu_timer;
  68 | double gpu_flop;
  69 | float gpu_time;
  70 | #endif
  71 | 
  72 | #ifdef PROFILE_PG
  73 | StopWatchInterface *pg_timer;
  74 | #endif
  75 | 
  76 | /* function called in main fuction */
  77 | void usage();
  78 | void freeall();
  79 | void init_count();
  80 | float forward_backward(int backward);
  81 | void viterbi();
  82 | void update_prob();
  83 | 
  84 | /* utility functions */
  85 | float logadd(float, float);
  86 | __device__ float logaddd(float, float);
  87 | float sumf(float *, int, int);
  88 | float logsumf(float *, int, int);
  89 | 
  90 | 
  91 | int main(int argc, char *argv[])
  92 | {
  93 |   char *configfile = NULL;
  94 |   FILE *fin, *bin;
  95 | 
  96 |   char *linebuf = NULL;
  97 |   size_t buflen = 0;
  98 | 
  99 |   int iterations = 1;
 100 |   int mode = 3;
 101 | 
 102 |   int c;
 103 |   float d;
 104 | 
 105 |   int i, j, k;
 106 |   opterr = 0;
 107 | 
 108 | 
 109 |   while ((c = getopt(argc, argv, "c:n:hp:")) != -1) {
 110 |     switch (c) {
 111 |     case 'c':
 112 |       configfile = optarg;
 113 |       break;
 114 |     case 'h':
 115 |       usage();
 116 |       exit(EXIT_SUCCESS);
 117 |     case 'n':
 118 |       iterations = atoi(optarg);
 119 |       break;
 120 |     case 'p':
 121 |       mode = atoi(optarg);
 122 |       if (mode != 1 && mode != 2 && mode != 3) {
 123 |         fprintf(stderr, "illegal mode: %d\n", mode);
 124 |         exit(EXIT_FAILURE);
 125 |       }
 126 |       break;
 127 |     case '?':
 128 |       fprintf(stderr, "illegal options\n");
 129 |       exit(EXIT_FAILURE);
 130 |     default:
 131 |       abort();
 132 |     }
 133 |   }
 134 | 
 135 |   if (configfile == NULL) {
 136 |     fin = stdin;
 137 |   } else {
 138 |     fin = fopen(configfile, "r");
 139 |     if (fin == NULL) {
 140 |       handle_error("fopen");
 141 |     }
 142 |   }
 143 |   
 144 |   i = 0;
 145 |   while ((c = getline(&linebuf, &buflen, fin)) != -1) {
 146 |     if (c <= 1 || linebuf[0] == '#')
 147 |       continue;
 148 |     
 149 |     if (i == 0) {
 150 |       if (sscanf(linebuf, "%d", &nstates) != 1) {
 151 |         fprintf(stderr, "config file format error: %d\n", i);
 152 |         freeall();
 153 |         exit(EXIT_FAILURE);
 154 |       }
 155 | 
 156 |       prior = (float *) malloc(sizeof(float) * nstates);
 157 |       if (prior == NULL) handle_error("malloc");
 158 | 
 159 |       trans = (float *) malloc(sizeof(float) * nstates * nstates);
 160 |       if (trans == NULL) handle_error("malloc");
 161 | 
 162 |     } else if (i == 1) {
 163 |       if (sscanf(linebuf, "%d", &nobvs) != 1) {
 164 |         fprintf(stderr, "config file format error: %d\n", i);
 165 |         freeall();
 166 |         exit(EXIT_FAILURE);
 167 |       }
 168 | 
 169 |       obvs = (float *) malloc(sizeof(float) * nstates * nobvs);
 170 |       if (obvs == NULL) handle_error("malloc");
 171 | 
 172 |     } else if (i == 2) {
 173 |       /* read initial state probabilities */ 
 174 |       bin = fmemopen(linebuf, buflen, "r");
 175 |       if (bin == NULL) handle_error("fmemopen");
 176 |       for (j = 0; j < nstates; j++) {
 177 |         if (fscanf(bin, "%f", &d) != 1) {
 178 |           fprintf(stderr, "config file format error: %d\n", i);
 179 |           freeall();
 180 |           exit(EXIT_FAILURE);
 181 |         }
 182 |         prior[j] = logf(d);
 183 |       }
 184 |       fclose(bin);
 185 | 
 186 |     } else if (i <= 2 + nstates) {
 187 |       /* read state transition  probabilities */ 
 188 |       bin = fmemopen(linebuf, buflen, "r");
 189 |       if (bin == NULL) handle_error("fmemopen");
 190 |       for (j = 0; j < nstates; j++) {
 191 |         if (fscanf(bin, "%f", &d) != 1) {
 192 |           fprintf(stderr, "config file format error: %d\n", i);
 193 |           freeall();
 194 |           exit(EXIT_FAILURE);
 195 |         }
 196 |         trans[IDX((i - 3), j, nstates)] = logf(d);
 197 |       }
 198 |       fclose(bin);
 199 |     } else if (i <= 2 + nstates * 2) {
 200 |       /* read output probabilities */
 201 |       bin = fmemopen(linebuf, buflen, "r");
 202 |       if (bin == NULL) handle_error("fmemopen");
 203 |       for (j = 0; j < nobvs; j++) {
 204 |         if (fscanf(bin, "%f", &d) != 1) {
 205 |           fprintf(stderr, "config file format error: %d\n", i);
 206 |           freeall();
 207 |           exit(EXIT_FAILURE);
 208 |         }
 209 |         obvs[IDX((i - 3 - nstates), j, nobvs)] = logf(d);
 210 |       }
 211 |       fclose(bin);
 212 |     } else if (i == 3 + nstates * 2) {
 213 |       if (sscanf(linebuf, "%d %d", &nseq, &length) != 2) {
 214 |         fprintf(stderr, "config file format error: %d\n", i);
 215 |         freeall();
 216 |         exit(EXIT_FAILURE);
 217 |       }
 218 |       data = (int *) malloc (sizeof(int) * nseq * length);
 219 |       if (data == NULL) handle_error("malloc");
 220 |     } else if (i <= 3 + nstates * 2 + nseq) {
 221 |       /* read data */
 222 |       bin = fmemopen(linebuf, buflen, "r");
 223 |       if (bin == NULL) handle_error("fmemopen");
 224 |       for (j = 0; j < length; j++) {
 225 |         if (fscanf(bin, "%d", &k) != 1 || k < 0 || k >= nobvs) {
 226 |           fprintf(stderr, "config file format error: %d\n", i);
 227 |           freeall();
 228 |           exit(EXIT_FAILURE);
 229 |         }
 230 |         data[j * nseq + (i - 4 - nstates * 2)] = k;
 231 |       }
 232 |       fclose(bin);
 233 |     }
 234 | 
 235 |     i++;
 236 |   }
 237 |   fclose(fin);
 238 |   if (linebuf) free(linebuf);
 239 | 
 240 |   if (i < 4 + nstates * 2 + nseq) {
 241 |     fprintf(stderr, "configuration incomplete.\n");
 242 |     freeall();
 243 |     exit(EXIT_FAILURE);
 244 |   }
 245 | 
 246 |   
 247 |   /* initial cuda device memory */
 248 |   c = sizeof(float) * nstates * nstates;
 249 |   checkCudaErrors( cudaMalloc((void**)&transd, c) );
 250 |   checkCudaErrors( cudaMemcpy(transd, trans, c, cudaMemcpyHostToDevice) );
 251 |   
 252 |   c = sizeof(float) * nstates * nobvs;
 253 |   checkCudaErrors( cudaMalloc((void**)&obvsd, c) );
 254 |   checkCudaErrors( cudaMemcpy(obvsd, obvs, c, cudaMemcpyHostToDevice) );
 255 | 
 256 | #ifdef PROFILE_GPU
 257 |   gpu_time = 0;
 258 |   gpu_flop = 0;
 259 | #endif
 260 | 
 261 | #ifdef PROFILE_PG
 262 |   sdkCreateTimer( &pg_timer );
 263 |   sdkStartTimer(&pg_timer);
 264 | #endif
 265 | 
 266 |   if (mode == 3) {
 267 |     /* estimating parameters using Baum-Welch algorithm */
 268 |     for (i = 0; i < iterations; i++) {
 269 |       init_count();
 270 |       d = forward_backward(1);
 271 |       update_prob();
 272 | 
 273 | #ifdef PROFILE_PG
 274 |       sdkStopTimer(&pg_timer);
 275 | #endif
 276 | 
 277 |       printf("iteration %d log-likelihood: %.4f\n", i + 1, d);
 278 |       printf("updated parameters:\n");
 279 |       printf("# initial state probability\n");
 280 |       for (j = 0; j < nstates; j++) {
 281 |         printf(" %.4f", exp(prior[j]));
 282 |       }
 283 |       printf("\n");
 284 |       printf("# state transition probability\n");
 285 |       for (j = 0; j < nstates; j++) {
 286 |         for (k = 0; k < nstates; k++) {
 287 |           printf(" %.4f", exp(trans[IDX(j,k,nstates)]));
 288 |         }
 289 |         printf("\n");
 290 |       }
 291 |       printf("# state output probility\n");
 292 |       for (j = 0; j < nstates; j++) {
 293 |         for (k = 0; k < nobvs; k++) {
 294 |           printf(" %.4f", exp(obvs[IDX(j,k,nobvs)]));
 295 |         }
 296 |         printf("\n");
 297 |       }
 298 |       printf("\n");
 299 |     }
 300 | 
 301 | #ifdef PROFILE_PG
 302 |       sdkStartTimer(&pg_timer);
 303 | #endif
 304 | 
 305 |   } else if (mode == 1) {
 306 |     /* compute forward probabilities */
 307 |     forward_backward(0);
 308 |   } else if (mode == 2) {
 309 |     /* find most likely path using Viterbi algorithm */
 310 |     viterbi();
 311 |   }
 312 |  
 313 |   freeall();
 314 | 
 315 | #ifdef PROFILE_PG
 316 |    sdkStopTimer(&pg_timer);
 317 |   printf("Programming running time (in Ms): %f\n", sdkGetTimerValue(&pg_timer));
 318 |   sdkDeleteTimer( &pg_timer);
 319 | #endif
 320 | 
 321 | #ifdef PROFILE_GPU
 322 |   printf("GPU time (in Ms): %f\n", gpu_time);
 323 |   printf("GFLOPS: %lf\n", gpu_flop / gpu_time);
 324 | #endif
 325 |   return 0;
 326 | }
 327 | 
 328 | 
 329 | /* kernel function copied from NVIDIA CUDA SDK */
 330 | __global__ void
 331 | reduce2(float *g_idata, float *g_odata)
 332 | {
 333 |   __shared__ float sdata[NUM_THREADS];
 334 | 
 335 |   /* load shared mem */
 336 |   unsigned int tid = threadIdx.x;
 337 |   unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
 338 |   sdata[tid] = g_idata[i];
 339 |   __syncthreads();
 340 | 
 341 |   /* do reduction in shared mem */
 342 |   for(unsigned int s = blockDim.x / 2; s > 0; s >>= 1) 
 343 |     {
 344 |       if (tid < s) 
 345 |         {
 346 |           sdata[tid] += sdata[tid + s];
 347 |         }
 348 |       __syncthreads();
 349 |     }
 350 | 
 351 |   /* write result for this block to global mem */
 352 |   if (tid == 0) g_odata[blockIdx.x] = sdata[0];
 353 | }
 354 | 
 355 | /* calculate the sum of the array of n floats by calling a reduce
 356 |    kernel recursively. indevce indicates whether the array points to
 357 |    memroy in device or not, i.e. indevice = 0 means the array is in
 358 |    main memory. */
 359 | float sumf(float *array, int n, int indevice)
 360 | {
 361 |   int i;
 362 |   int num_blocks;
 363 |   int remains;
 364 |   int size;
 365 |   float sum = 0.0;
 366 |   float *gin;
 367 | 
 368 |   /* NUM_THREADS also serves as CPU threshold */
 369 |   if (n < NUM_THREADS) {
 370 |     if (indevice == 0) {
 371 |       for (i = 0; i < n; i++) {
 372 |         sum += array[i];
 373 |       }
 374 |     } else {
 375 |       float gout[n];
 376 |       size = sizeof(float) * n;
 377 |       checkCudaErrors( cudaMemcpy(gout, array, size, cudaMemcpyDeviceToHost) );
 378 |       for (i = 0; i < n; i++) {
 379 |         sum += gout[i];
 380 |       }
 381 |     }
 382 |   } else {
 383 | 
 384 |     num_blocks = n / NUM_THREADS;
 385 |     remains = n - num_blocks * NUM_THREADS;
 386 | 
 387 |     dim3 dimBlock(NUM_THREADS);
 388 |     dim3 dimGrid(num_blocks);
 389 | 
 390 |     if (indevice == 0) {
 391 | 
 392 |       size = sizeof(float) * num_blocks * NUM_THREADS;
 393 |       checkCudaErrors( cudaMalloc((void**) &gin, size) );
 394 |       checkCudaErrors( cudaMemcpy(gin, array, size, cudaMemcpyHostToDevice) );
 395 |     
 396 |       reduce2<<<dimGrid, dimBlock>>>(gin, gin);
 397 | 
 398 |       sum += sumf(gin, num_blocks, 1);
 399 | 
 400 |       if (remains > 0)
 401 |         sum += sumf(array + num_blocks * NUM_THREADS, remains, 0);
 402 | 
 403 |       checkCudaErrors( cudaFree(gin) );
 404 | 
 405 |     } else {
 406 |       reduce2<<<dimGrid, dimBlock>>>(gin, gin);
 407 |       sum += sumf(gin, num_blocks, 1);
 408 |       
 409 |       if (remains > 0)
 410 |         sum += sumf(gin + num_blocks * NUM_THREADS, remains, 1);
 411 |     }
 412 |   }
 413 |   return sum;
 414 | }
 415 | 
 416 | /* logarithm version of the reduce kernel function */
 417 | __global__ void
 418 | logreduce2(float *g_idata, float *g_odata)
 419 | {
 420 |   __shared__ float sdata[NUM_THREADS];
 421 | 
 422 |   /* load shared mem */
 423 |   unsigned int tid = threadIdx.x;
 424 |   unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
 425 |   sdata[tid] = g_idata[i];
 426 |   __syncthreads();
 427 | 
 428 |   /* do reduction in shared mem */
 429 |   for(unsigned int s = blockDim.x / 2; s > 0; s >>= 1) {
 430 |     if (tid < s) {
 431 |       sdata[tid] = logaddd(sdata[tid], sdata[tid + s]);
 432 |     }
 433 |     __syncthreads();
 434 |   }
 435 | 
 436 |   /* write result for this block to global mem */
 437 |   if (tid == 0) g_odata[blockIdx.x] = sdata[0];
 438 | }
 439 | 
 440 | /* logarithm version of the sumf function */
 441 | float logsumf(float *array, int n, int indevice)
 442 | {
 443 |   int i;
 444 |   int num_blocks;
 445 |   int remains;
 446 |   int size;
 447 |   float sum = - INFINITY;
 448 |   float *gin;
 449 | 
 450 |   /* NUM_THREADS also serves as CPU threshold */
 451 |   if (n < NUM_THREADS) {
 452 |     if (indevice == 0) {
 453 |       for (i = 0; i < n; i++) {
 454 |         sum = logadd(sum, array[i]);
 455 |       }
 456 |     } else {
 457 |       float gout[n];
 458 |       size = sizeof(float) * n;
 459 |       checkCudaErrors( cudaMemcpy(gout, array, size, cudaMemcpyDeviceToHost) );
 460 |       for (i = 0; i < n; i++) {
 461 |         sum = logadd(sum, gout[i]);
 462 |       }
 463 |     }
 464 |   } else {
 465 | 
 466 |     num_blocks = n / NUM_THREADS;
 467 |     remains = n - num_blocks * NUM_THREADS;
 468 | 
 469 |     dim3 dimBlock(NUM_THREADS);
 470 |     dim3 dimGrid(num_blocks);
 471 | 
 472 |     if (! indevice) {
 473 | 
 474 |       size = sizeof(float) * num_blocks * NUM_THREADS;
 475 |       checkCudaErrors( cudaMalloc((void**) &gin, size) );
 476 |       checkCudaErrors( cudaMemcpy(gin, array, size, cudaMemcpyHostToDevice) );
 477 |     
 478 |       logreduce2<<<dimGrid, dimBlock>>>(gin, gin);
 479 | 
 480 |       sum = logadd(sum, logsumf(gin, num_blocks, IN_DEVICE));
 481 | 
 482 |       if (remains > 0)
 483 |         sum = logadd(sum, logsumf(array + num_blocks * NUM_THREADS, remains, IN_HOST));
 484 | 
 485 |       checkCudaErrors( cudaFree(gin) );
 486 | 
 487 |     } else {
 488 |       logreduce2<<<dimGrid, dimBlock>>>(gin, gin);
 489 |       sum = logadd(sum, logsumf(gin, num_blocks, IN_DEVICE));
 490 | 
 491 |       if (remains > 0)
 492 |         sum = logadd(sum, logsumf(gin + num_blocks * NUM_THREADS, remains, IN_DEVICE));
 493 |     }
 494 |   }
 495 |   return sum;
 496 | }
 497 | 
 498 | /* initilize counts */
 499 | void init_count() {
 500 |   int size;
 501 |   size_t i;
 502 |   float pi[nstates];
 503 |   float gmm[nstates * nobvs];
 504 |   float xi[nstates * nstates];
 505 | 
 506 |   for (i = 0; i < nstates * nobvs; i++)
 507 |     gmm[i] = - INFINITY;
 508 | 
 509 |   for (i = 0; i < nstates * nstates; i++)
 510 |     xi[i] = - INFINITY;
 511 | 
 512 |   for (i = 0; i < nstates; i++)
 513 |     pi[i] = - INFINITY;
 514 | 
 515 |   size = sizeof(float) * nstates * nstates;
 516 |   checkCudaErrors( cudaMalloc((void**)&xid, size) );
 517 |   checkCudaErrors( cudaMemcpy(xid, xi, size, cudaMemcpyHostToDevice) );
 518 |   
 519 |   size = sizeof(float) * nstates * nobvs;
 520 |   checkCudaErrors( cudaMalloc((void**)&gmmd, size) );
 521 |   checkCudaErrors( cudaMemcpy(gmmd, gmm, size, cudaMemcpyHostToDevice) );
 522 | 
 523 |   size = sizeof(float) * nstates;
 524 |   checkCudaErrors( cudaMalloc((void**)&pid, size) );
 525 |   checkCudaErrors( cudaMemcpy(pid, pi, size, cudaMemcpyHostToDevice) );
 526 | }
 527 | 
 528 | /* add up two logarithm while avoiding overflow */
 529 | float logadd(float x, float y) {
 530 |   if (y <= x)
 531 |     return x + log1pf(expf(y - x));
 532 |   else
 533 |     return y + log1pf(expf(x - y));
 534 | }
 535 | 
 536 | /* add up two logarithm while avoiding overflow (device version) */
 537 | __device__ float logaddd(float x, float y) {
 538 |   if (y <= x)
 539 |     return x + log1pf(expf(y - x));
 540 |   else
 541 |     return y + log1pf(expf(x - y));
 542 | }
 543 | 
 544 | /* the kernel function for stepfwd */
 545 | __global__ void
 546 | stepfwdd(float *pre, float *transd, int *O, float *obvsd,
 547 |          int nstates, int nobvs, float *A)
 548 | {
 549 |   int bx = blockIdx.x;
 550 |   int by = blockIdx.y;
 551 | 
 552 |   int tx = threadIdx.x;
 553 |   int ty = threadIdx.y;
 554 | 
 555 |   int aBegin = nstates * BLOCK_SIZE * by;
 556 |   int aEnd = aBegin + nstates - 1;
 557 |   int aStep = BLOCK_SIZE;
 558 | 
 559 |   int bBegin = BLOCK_SIZE * bx;
 560 |   int bStep = BLOCK_SIZE * nstates;
 561 | 
 562 |   float sub = logf(0);
 563 | 
 564 |   int a, b, k;
 565 | 
 566 |   for (a = aBegin, b = bBegin;
 567 |        a <= aEnd;
 568 |        a += aStep, b += bStep) {
 569 | 
 570 |     __shared__ float As[BLOCK_SIZE][BLOCK_SIZE];
 571 |     __shared__ float Bs[BLOCK_SIZE][BLOCK_SIZE];
 572 |     __shared__ float Os[BLOCK_SIZE][BLOCK_SIZE];
 573 |     
 574 |     As[ty][tx] = pre[a + nstates * ty + tx];
 575 |     Bs[ty][tx] = transd[b + nstates * ty + tx];
 576 |     Os[ty][tx] = obvsd[IDX(BLOCK_SIZE * bx + tx, O[BLOCK_SIZE * by + ty], nobvs)];
 577 | 
 578 |     __syncthreads();
 579 | 
 580 |     for (k = 0; k < BLOCK_SIZE; ++k) {
 581 |       sub = logaddd(sub, As[ty][k] + Bs[k][tx] + Os[ty][tx]);
 582 |     }
 583 | 
 584 |     __syncthreads();
 585 |   }
 586 | 
 587 |   a = nstates * BLOCK_SIZE * by + BLOCK_SIZE * bx;
 588 |   A[a + nstates * ty + tx] = sub;
 589 | }
 590 | 
 591 | /* compute one step (n-th) forward probability. the data partition
 592 |    follows matrix muptiplication A x B, where the previous forward
 593 |    probabilities are A and state transition probabilities are B. */
 594 | void stepfwd(float *alpha, size_t n)
 595 | {
 596 |   int size;
 597 |   int *Od;
 598 |   float *A;
 599 |   float *Ad;
 600 | 
 601 |   dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
 602 |   dim3 dimGrid(nstates / dimBlock.x, nseq / dimBlock.y);
 603 | 
 604 |   size = sizeof(int) * nseq;
 605 |   checkCudaErrors( cudaMalloc((void**)&Od, size) );
 606 |   checkCudaErrors( cudaMemcpy(Od, data + n * nseq, size, cudaMemcpyHostToDevice) );
 607 | 
 608 |   size = sizeof(float) * nstates * nseq;
 609 | 
 610 |   checkCudaErrors( cudaMalloc((void**)&A, size) );
 611 |   checkCudaErrors( cudaMemcpy(A, alpha + (n - 1) * nseq * nstates, size, cudaMemcpyHostToDevice) );
 612 | 
 613 |   checkCudaErrors( cudaMalloc((void**)&Ad, size) );
 614 | 
 615 | #ifdef PROFILE_GPU
 616 |   sdkCreateTimer( &gpu_timer );
 617 |   sdkStartTimer(&gpu_timer);
 618 | #endif
 619 | 
 620 |   stepfwdd<<<dimGrid, dimBlock>>>(A, transd, Od, obvsd, nstates, nobvs, Ad);
 621 | 
 622 | #ifdef PROFILE_GPU
 623 |   checkCudaErrors( cudaThreadSynchronize() );
 624 |   sdkStopTimer(&gpu_timer);
 625 |   gpu_time += sdkGetTimerValue(&gpu_timer);
 626 |   sdkDeleteTimer( &gpu_timer);
 627 | 
 628 |   gpu_flop += 1e-6 * ((double)nstates) * ((double)nseq) * ((double)nstates) * 7;
 629 | #endif
 630 | 
 631 |   checkCudaErrors( cudaMemcpy(alpha + n * nseq * nstates, Ad, size, cudaMemcpyDeviceToHost) );
 632 | 
 633 |   checkCudaErrors( cudaFree(Od) );
 634 |   checkCudaErrors( cudaFree(A) );
 635 |   checkCudaErrors( cudaFree(Ad) );
 636 | }
 637 | 
 638 | /* init first slice of forwad probability matrix. */
 639 | void initfwd0(float *alpha)
 640 | {
 641 |   size_t i, j;
 642 |   for (i = 0; i < nseq; i++) {
 643 |     for (j = 0; j < nstates; j++) {
 644 |       alpha[IDX(i, j, nstates)] = prior[j] + obvs[IDX(j, data[i], nobvs)];
 645 |     }
 646 |   }
 647 | }
 648 | 
 649 | __global__ void
 650 | initfwdd(float *prior, int *Od, float *obvs,
 651 |          int nstates, int nobvs, float *Ad)
 652 | {
 653 |   int y = blockIdx.y * BLOCK_SIZE + threadIdx.y;
 654 |   int x = blockIdx.x * BLOCK_SIZE + threadIdx.x;
 655 | 
 656 |   Ad[IDX(y, x, nstates)] = prior[x] + obvs[IDX(x, Od[y], nobvs)];
 657 | }
 658 | 
 659 | void initfwd(float *alpha)
 660 | {
 661 |   int size;
 662 |   int *Od;
 663 |   float *Ad;
 664 |   float *priord;
 665 | 
 666 |   dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
 667 |   dim3 dimGrid(nstates / dimBlock.x, nseq / dimBlock.y);
 668 | 
 669 |   size = sizeof(float) * nstates;
 670 |   checkCudaErrors( cudaMalloc((void**)&priord, size) );
 671 |   checkCudaErrors( cudaMemcpy(priord, prior, size, cudaMemcpyHostToDevice) );
 672 | 
 673 |   size = sizeof(int) * nseq;
 674 |   checkCudaErrors( cudaMalloc((void**)&Od, size) );
 675 |   checkCudaErrors( cudaMemcpy(Od, data, size, cudaMemcpyHostToDevice) );
 676 | 
 677 |   size = sizeof(float) * nstates * nseq;
 678 |   checkCudaErrors( cudaMalloc((void**)&Ad, size) );
 679 | 
 680 | 
 681 | #ifdef PROFILE_GPU
 682 |   sdkCreateTimer( &gpu_timer );
 683 |   sdkStartTimer(&gpu_timer);
 684 | #endif
 685 | 
 686 |   initfwdd<<<dimGrid, dimBlock>>>(priord, Od, obvsd, nstates, nobvs, Ad);
 687 | 
 688 | #ifdef PROFILE_GPU
 689 |   checkCudaErrors( cudaThreadSynchronize() );
 690 |   sdkStopTimer(&gpu_timer);
 691 |   gpu_time += sdkGetTimerValue(&gpu_timer);
 692 |   sdkDeleteTimer( &gpu_timer);
 693 | 
 694 |   gpu_flop += 1e-6 * ((double)nstates) * ((double)nseq) * 1.0;
 695 | #endif
 696 | 
 697 |   checkCudaErrors( cudaMemcpy(alpha, Ad, size, cudaMemcpyDeviceToHost) );
 698 | 
 699 |   checkCudaErrors( cudaFree(Ad) );
 700 |   checkCudaErrors( cudaFree(priord) );
 701 | }
 702 | 
 703 | /* kernel function for initbck() */
 704 | __global__ void
 705 | initbckd(float *B, int nstates)
 706 | {
 707 |   int bx = blockIdx.x;
 708 |   int by = blockIdx.y;
 709 |   int tx = threadIdx.x;
 710 |   int ty = threadIdx.y;
 711 | 
 712 |   size_t i = bx * BLOCK_SIZE + tx;
 713 |   size_t j = by * BLOCK_SIZE + ty;
 714 | 
 715 |   B[IDX(j, i, nstates)] = 0;
 716 | }
 717 | 
 718 | /* initial backward probabilities for the last slice. data partition
 719 |    is the same as initfwd(). the values are all initilized to be 0. */
 720 | void initbck(float *beta)
 721 | {
 722 |   int size;
 723 |   float *Bd;
 724 | 
 725 |   dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
 726 |   dim3 dimGrid(nstates / dimBlock.x, nseq / dimBlock.y);
 727 | 
 728 |   size = sizeof(float) * nstates * nseq;
 729 |   checkCudaErrors( cudaMalloc((void**)&Bd, size) );
 730 | 
 731 |   initbckd<<<dimGrid, dimBlock>>>(Bd, nstates);
 732 | 
 733 |   checkCudaErrors( cudaMemcpy(beta, Bd, size, cudaMemcpyDeviceToHost) );
 734 | 
 735 |   checkCudaErrors( cudaFree(Bd) );
 736 | }
 737 | 
 738 | /* kernel function for updating xi counts */
 739 | __global__ void
 740 | updatexid(float *A, float *B, float *transd, int *O, float *obvsd,
 741 |           int nstates, int nobvs, int nseq, float* loglik, float *xid)
 742 | {
 743 |   int x = blockIdx.x * BLOCK_SIZE + threadIdx.x;
 744 |   int y = blockIdx.y * BLOCK_SIZE + threadIdx.y;
 745 | 
 746 |   float e;
 747 |   int i;
 748 | 
 749 |   for (i = 0; i < nseq; i++) {
 750 |     e = A[IDX(i, y, nstates)] + transd[IDX(y, x, nstates)]
 751 |       + B[IDX(i, x, nstates)] + obvsd[IDX(x, O[i], nobvs)] - loglik[i];
 752 |     xid[IDX(y, x, nstates)] = logaddd(xid[IDX(y, x, nstates)], e);
 753 |   }
 754 | }
 755 | 
 756 | /* kernel function for updating gamma counts */
 757 | __global__ void
 758 | updategmmd(float *A, float *B, int *O, int nstates,
 759 |            int nobvs, int nseq, float *loglik, float *gmmd)
 760 | {
 761 |   int x = blockIdx.x * BLOCK_SIZE + threadIdx.x;
 762 |   float e;
 763 |   int i = 0;
 764 | 
 765 |   for (i = 0; i < nseq; i++) {
 766 |     e = A[IDX(i, x, nstates)] + B[IDX(i, x, nstates)] - loglik[i];
 767 |     gmmd[IDX(x, O[i], nobvs)] = logaddd(gmmd[IDX(x, O[i], nobvs)], e);
 768 |   }
 769 | }
 770 | 
 771 | /* kernel function for updateing pi counts */
 772 | __global__ void
 773 | updatepid(float *A, float *B, int *O, int nstates,
 774 |           int nseq, float *loglik, float *pid)
 775 | {
 776 |   int x = blockIdx.x * BLOCK_SIZE + threadIdx.x;
 777 |   float e;
 778 |   int i = 0;
 779 | 
 780 |   for (i = 0; i < nseq; i++) {
 781 |     e = A[IDX(i, x, nstates)] + B[IDX(i, x, nstates)] - loglik[i];
 782 |     pid[x] = logaddd(pid[x], e);
 783 |   }
 784 | }
 785 | 
 786 | /* kernel function for stepbck() */
 787 | __global__ void
 788 | stepbckd(float *pre, float *transd, int *O, float *obvsd,
 789 |          int nstates, int nobvs, float *B)
 790 | {
 791 |   int bx = blockIdx.x;
 792 |   int by = blockIdx.y;
 793 | 
 794 |   int tx = threadIdx.x;
 795 |   int ty = threadIdx.y;
 796 | 
 797 |   int aBegin = nstates * BLOCK_SIZE * by;
 798 |   int aEnd = aBegin + nstates - 1;
 799 |   int aStep = BLOCK_SIZE;
 800 | 
 801 |   int bBegin = nstates * BLOCK_SIZE * bx;
 802 |   int bStep = BLOCK_SIZE;
 803 | 
 804 |   float sub = logf(0);
 805 | 
 806 |   int a, b, k;
 807 | 
 808 |   for (a = aBegin, b = bBegin;
 809 |        a <= aEnd;
 810 |        a += aStep, b+= bStep) {
 811 | 
 812 |     __shared__ float As[BLOCK_SIZE][BLOCK_SIZE];
 813 |     __shared__ float Bs[BLOCK_SIZE][BLOCK_SIZE];
 814 | 
 815 |     As[ty][tx] = pre[a + IDX(ty, tx, nstates)] +
 816 |       obvsd[IDX(a - aBegin + tx, O[by * BLOCK_SIZE + ty], nobvs)];
 817 | 
 818 |     Bs[ty][tx] = transd[b + IDX(tx, ty, nstates)];
 819 | 
 820 |     __syncthreads();
 821 | 
 822 |     for (k = 0; k < BLOCK_SIZE; ++k) {
 823 |       sub = logaddd(sub, As[ty][k] + Bs[k][tx]);
 824 |     }
 825 | 
 826 |     __syncthreads();
 827 |   }
 828 | 
 829 |   b = nstates * BLOCK_SIZE * by + BLOCK_SIZE * bx;
 830 |   B[b + nstates * ty + tx] = sub;
 831 | }
 832 | 
 833 | /* compute one step backward probability and update counts.
 834 | 
 835 |    data partition for computing backward probilities follows forward
 836 |    pass. the result of the backward probabities are stored in the
 837 |    memory pointed by *beta. */
 838 | void stepbck(float *alpha, float *pre, size_t n, float* loglik, float *beta)
 839 | {
 840 |   int size;
 841 |   int *Od;
 842 | 
 843 |   float *Bd;
 844 |   float *pred;
 845 | 
 846 |   float *Ad;
 847 | 
 848 |   float *loglikd;
 849 | 
 850 |   dim3 xiBlock(BLOCK_SIZE, BLOCK_SIZE);
 851 |   dim3 xiGrid(nstates / xiBlock.x, nstates / xiBlock.y);
 852 | 
 853 |   dim3 gmmBlock(BLOCK_SIZE);
 854 |   dim3 gmmGrid(nstates / gmmBlock.x);
 855 | 
 856 |   dim3 sBlock(BLOCK_SIZE, BLOCK_SIZE);
 857 |   dim3 sGrid(nstates / sBlock.x, nseq / sBlock.y);
 858 | 
 859 |   size = sizeof(int) * nseq;
 860 |   checkCudaErrors( cudaMalloc((void**)&Od, size) );
 861 |   checkCudaErrors( cudaMemcpy(Od, data + (n + 1) * nseq,
 862 |                              size, cudaMemcpyHostToDevice) );
 863 | 
 864 |   checkCudaErrors( cudaMalloc((void**)&loglikd, size) );
 865 |   checkCudaErrors( cudaMemcpy(loglikd, loglik, size, cudaMemcpyHostToDevice) );
 866 | 
 867 |   size = sizeof(float) * nstates * nseq;
 868 | 
 869 |   checkCudaErrors( cudaMalloc((void**)&pred, size) );
 870 |   checkCudaErrors( cudaMemcpy(pred, pre, size, cudaMemcpyHostToDevice) );
 871 | 
 872 |   checkCudaErrors( cudaMalloc((void**)&Ad, size) );
 873 |   checkCudaErrors( cudaMemcpy(Ad, alpha + n * nseq * nstates,
 874 |                              size, cudaMemcpyHostToDevice) );
 875 | 
 876 |   /* update counts */
 877 | #ifdef PROFILE_GPU
 878 |   sdkCreateTimer( &gpu_timer );
 879 |   sdkStartTimer(&gpu_timer);
 880 | #endif
 881 |   updatexid<<<xiGrid, xiBlock>>>(Ad, pred, transd, Od, obvsd,
 882 |                                  nstates, nobvs, nseq, loglikd, xid);
 883 | 
 884 | #ifdef PROFILE_GPU
 885 |   checkCudaErrors( cudaThreadSynchronize() );
 886 |   sdkStopTimer(&gpu_timer);
 887 |   gpu_time += sdkGetTimerValue(&gpu_timer);
 888 |   sdkDeleteTimer( &gpu_timer);
 889 | 
 890 |   gpu_flop += 1e-6 * ((double) nstates) * ((double) nstates) * ((double) nseq) * 9.0;
 891 | #endif
 892 | 
 893 | 
 894 |   checkCudaErrors( cudaMemcpy(Ad, alpha + (n + 1) * nseq * nstates,
 895 |                              size, cudaMemcpyHostToDevice) );
 896 | 
 897 | #ifdef PROFILE_GPU
 898 |   sdkCreateTimer( &gpu_timer );
 899 |   sdkStartTimer(&gpu_timer);
 900 | #endif
 901 | 
 902 |   updategmmd<<<gmmGrid, gmmBlock>>>(Ad, pred, Od, nstates, nobvs, nseq, loglikd, gmmd);
 903 | 
 904 | #ifdef PROFILE_GPU
 905 |   checkCudaErrors( cudaThreadSynchronize() );
 906 |   sdkStopTimer(&gpu_timer);
 907 |   gpu_time += sdkGetTimerValue(&gpu_timer);
 908 |   sdkDeleteTimer( &gpu_timer);
 909 | 
 910 |   gpu_flop += 1e-6 * ((double) nstates) * ((double) nseq) * 7.0;
 911 | #endif
 912 | 
 913 | 
 914 |   /* compute one step beta probabilities */
 915 |   checkCudaErrors( cudaMalloc((void**)&Bd, size) );
 916 | 
 917 | #ifdef PROFILE_GPU
 918 |   sdkCreateTimer( &gpu_timer );
 919 |   sdkStartTimer(&gpu_timer);
 920 | #endif
 921 | 
 922 |   stepbckd<<<sGrid, sBlock>>>(pred, transd, Od, obvsd, nstates, nobvs, Bd);
 923 | 
 924 | #ifdef PROFILE_GPU
 925 |   checkCudaErrors( cudaThreadSynchronize() );
 926 |   sdkStopTimer(&gpu_timer);
 927 |   gpu_time += sdkGetTimerValue(&gpu_timer);
 928 |   sdkDeleteTimer( &gpu_timer);
 929 | 
 930 |   gpu_flop += 1e-6 * ((double) nstates) * ((double) nseq) * ((double) nstates) * 6.0;
 931 | #endif
 932 | 
 933 | 
 934 |   checkCudaErrors( cudaMemcpy(beta, Bd, size, cudaMemcpyDeviceToHost) );
 935 | 
 936 |   checkCudaErrors( cudaFree(Ad) );
 937 |   checkCudaErrors( cudaFree(Od) );
 938 |   checkCudaErrors( cudaFree(Bd) );
 939 |   checkCudaErrors( cudaFree(pred) );
 940 |   checkCudaErrors( cudaFree(loglikd) );
 941 | }
 942 | 
 943 | void last_update(float *alpha, float *beta, float *loglik)
 944 | {
 945 |   int size;
 946 |   int *Od;
 947 |   float *Bd, *Ad, *loglikd;
 948 | 
 949 |   dim3 gmmBlock(BLOCK_SIZE);
 950 |   dim3 gmmGrid(nstates / gmmBlock.x);
 951 | 
 952 |   dim3 piBlock(BLOCK_SIZE);
 953 |   dim3 piGrid(nstates / piBlock.x);
 954 | 
 955 |   size = sizeof(int) * nseq;
 956 |   checkCudaErrors( cudaMalloc((void**)&Od, size) );
 957 |   checkCudaErrors( cudaMemcpy(Od, data, size, cudaMemcpyHostToDevice) );
 958 | 
 959 |   checkCudaErrors( cudaMalloc((void**)&loglikd, size) );
 960 |   checkCudaErrors( cudaMemcpy(loglikd, loglik, size, cudaMemcpyHostToDevice) );
 961 | 
 962 |   size = sizeof(float) * nstates * nseq;
 963 |   checkCudaErrors( cudaMalloc((void**)&Bd, size) );
 964 |   checkCudaErrors( cudaMalloc((void**)&Ad, size) );
 965 |   checkCudaErrors( cudaMemcpy(Bd, beta, size, cudaMemcpyHostToDevice) );
 966 |   checkCudaErrors( cudaMemcpy(Ad, alpha, size, cudaMemcpyHostToDevice) );
 967 | 
 968 | 
 969 | #ifdef PROFILE_GPU
 970 |   sdkCreateTimer( &gpu_timer );
 971 |   sdkStartTimer(&gpu_timer);
 972 | #endif
 973 | 
 974 |   updategmmd<<<gmmGrid, gmmBlock>>>(Ad, Bd, Od, nstates, nobvs, nseq, loglikd, gmmd);
 975 |   updatepid<<<piGrid, piBlock>>>(Ad, Bd, Od, nstates, nseq, loglikd, pid);
 976 | 
 977 | #ifdef PROFILE_GPU
 978 |   checkCudaErrors( cudaThreadSynchronize() );
 979 |   sdkStopTimer(&gpu_timer);
 980 |   gpu_time += sdkGetTimerValue(&gpu_timer);
 981 |   sdkDeleteTimer( &gpu_timer);
 982 | 
 983 |   gpu_flop += 1e-6 * ((double)nstates) * ((double)nseq) * 7.0;
 984 |   gpu_flop += 1e-6 * ((double)nstates) * ((double)nseq) * 7.0;
 985 | #endif
 986 | 
 987 |   checkCudaErrors( cudaFree(Ad) );
 988 |   checkCudaErrors( cudaFree(Bd) );
 989 |   checkCudaErrors( cudaFree(Od) );
 990 |   checkCudaErrors( cudaFree(loglikd) );
 991 | }
 992 | 
 993 | /* forwad backward algorithm: running on all sequences in parallel */
 994 | float forward_backward(int backward)
 995 | {
 996 |   float *alpha = NULL;
 997 |   float *beta = NULL;
 998 |   float *prebeta = NULL;
 999 |   size_t i;
1000 |   size_t a;
1001 |   float *loglik = NULL;
1002 |   float p;
1003 |   int size;
1004 |   float *bckll = NULL;
1005 | 
1006 |   /* initial alpha probabilities for all data sequences
1007 |      (LARGEST memory allocaltion in the entire program) */
1008 |   alpha = (float *) malloc(sizeof(float) * length * nstates * nseq);
1009 |   if (alpha == NULL) handle_error("malloc");
1010 | 
1011 |   initfwd(alpha);
1012 | 
1013 |   /* forward pass */
1014 |   for (i = 1; i < length; i++) {
1015 |     stepfwd(alpha, i);
1016 |   }
1017 | 
1018 |   loglik = (float *) malloc(sizeof(float) * nseq);
1019 |   if (loglik == NULL) handle_error("malloc");
1020 |   for (i = 0, a = (length - 1) * nseq * nstates;
1021 |        i < nseq; i++) {
1022 |     loglik[i] = logsumf(alpha + a + i * nstates, nstates, IN_HOST);
1023 |   }
1024 |   p = sumf(loglik, nseq, IN_HOST);
1025 | 
1026 |   if (! backward) {
1027 | #ifdef PROFILE_PG
1028 |     sdkStopTimer(&pg_timer);
1029 | #endif
1030 |     for (i = 0; i < nseq; i++) {
1031 |       printf("%.4f\n", loglik[i]);
1032 |     }
1033 |     printf("total: %.4f\n", p);
1034 | #ifdef PROFILE_PG
1035 |     sdkStartTimer(&pg_timer);
1036 | #endif
1037 |     if (loglik) free(loglik);
1038 |     if (alpha) free(alpha);
1039 |     return p;
1040 |   }
1041 | 
1042 |   /* initial backward probabilities */
1043 |   size = sizeof(float) * nstates * nseq;
1044 |   beta = (float *) malloc(size);
1045 |   if (beta == NULL) handle_error("malloc");
1046 |   prebeta = (float *) malloc(size);
1047 |   if (prebeta == NULL) handle_error("malloc");
1048 | 
1049 |   initbck(prebeta);
1050 | 
1051 |   /* backward pass & update counts at last step */
1052 |   for (i = 1; i < length; i++) {
1053 |     stepbck(alpha, prebeta, length - 1 - i, loglik, beta);
1054 |     memmove(prebeta, beta, size);
1055 |   }
1056 | 
1057 |   /* update first slice of data */
1058 |   last_update(alpha, prebeta, loglik);
1059 | 
1060 | #ifdef DEBUG
1061 |   /* compute backward prob for verification purpose */
1062 |   bckll = (float *) malloc(sizeof(float) * nseq);
1063 |   if (bckll == NULL) handle_error("malloc");
1064 |   for (i = 0; i < nseq; i++) {
1065 |     p = - INFINITY;
1066 |     for (j = 0; j < nstates; j++) {
1067 |       p = logadd(p, prior[j] + beta[IDX(i,j,nstates)] + obvs[IDX(j, data[i], nobvs)]);
1068 |     }
1069 |     bckll[i] = p;
1070 |   }
1071 |   p = sumf(bckll, nseq, IN_HOST);
1072 | 
1073 |   for (i = 0; i < nseq; i++)
1074 |     if (fabs(bckll[i] - loglik[i]) > 1e-3)
1075 |       fprintf(stderr, "Error: forward and backward incompatible: %f, %f\n",
1076 |               loglik[i], bckll[i]);
1077 | #endif
1078 | 
1079 |   if (alpha) free(alpha);
1080 |   if (beta) free(beta);
1081 |   if (prebeta) free(prebeta);
1082 |   if (loglik) free(loglik);
1083 |   if (bckll) free(bckll);
1084 | 
1085 |   return p;
1086 | }
1087 | 
1088 | 
/* the kernel function for viterbi algorithm.

   tiled max-product in the log domain.  block (bx, by) produces a
   BLOCK_SIZE x BLOCK_SIZE tile of lambda/backtrace: rows come from block
   row `by` of prelbd, columns from block column `bx` of transd.  every
   thread keeps the best value of
       prelbd[row][k] + transd[k][col] + emission
   over all k, together with the k that achieved it.

   must be launched with BLOCK_SIZE x BLOCK_SIZE thread blocks; there is
   no bounds check, so the grid must cover the data exactly. */
__global__ void
viterbi_fwdd(float *prelbd, float *transd, int *O, float *obvsd,
             int nstates, int nobvs, float *lambda, int *backtrace)
{
  int bx = blockIdx.x;
  int by = blockIdx.y;

  int tx = threadIdx.x;
  int ty = threadIdx.y;

  int aBegin = nstates * BLOCK_SIZE * by;
  int aEnd = aBegin + nstates - 1;
  int aStep = BLOCK_SIZE;

  int bBegin = BLOCK_SIZE * bx;
  int bStep = BLOCK_SIZE * nstates;

  /* the emission term depends only on this thread's (row, column), not
     on the tile, so it is loaded once into a register instead of being
     re-staged through shared memory on every iteration */
  const float emit =
    obvsd[IDX(BLOCK_SIZE * bx + tx, O[BLOCK_SIZE * by + ty], nobvs)];

  float sub = logf(0);
  float p;

  /* i defaults to 0 so that a defined backpointer is stored even when no
     candidate compares greater than the initial logf(0) */
  int a, b, k, i = 0;

  for (a = aBegin, b = bBegin;
       a <= aEnd;
       a += aStep, b += bStep) {

    __shared__ float As[BLOCK_SIZE][BLOCK_SIZE];
    __shared__ float Bs[BLOCK_SIZE][BLOCK_SIZE];

    As[ty][tx] = prelbd[a + nstates * ty + tx];
    Bs[ty][tx] = transd[b + nstates * ty + tx];

    /* tiles must be fully written before they are read */
    __syncthreads();

    for (k = 0; k < BLOCK_SIZE; ++k) {
      p =  As[ty][k] + Bs[k][tx] + emit;
      if (p > sub) {
        sub = p;
        i = a + k - aBegin;   /* index of the best predecessor so far */
      }
    }

    /* tiles must not be overwritten while still being read */
    __syncthreads();
  }

  a = nstates * BLOCK_SIZE * by + BLOCK_SIZE * bx;
  lambda[a + nstates * ty + tx] = sub;
  backtrace[a + nstates * ty + tx] = i;
}
1141 | 
/* one forward step of the viterbi algorithm for time slice n.

   uploads the observations of slice n and the previous values *prelbd
   (nstates * nseq floats), runs viterbi_fwdd, then copies the new values
   into *lambda and the backpointers into slice n of *backtrace, i.e. at
   offset n * nstates * nseq.

   grid sizes use integer division, so nstates and nseq are presumably
   expected to be multiples of BLOCK_SIZE (TODO confirm). */
void viterbi_fwd(float *prelbd, size_t n, float *lambda, int *backtrace)
{
  int size;
  int *Od;
  float *pred;
  float *lbdd;
  int *Bd;

  dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
  dim3 dimGrid(nstates / dimBlock.x, nseq / dimBlock.y);

  size = sizeof(int) * nseq;
  checkCudaErrors( cudaMalloc((void**)&Od, size) );
  checkCudaErrors( cudaMemcpy(Od, data + n * nseq, size, cudaMemcpyHostToDevice) );

  size = sizeof(float) * nstates * nseq;

  checkCudaErrors( cudaMalloc((void**)&pred, size) );
  checkCudaErrors( cudaMemcpy(pred, prelbd, size, cudaMemcpyHostToDevice) );

  checkCudaErrors( cudaMalloc((void**)&lbdd, size) );

  size = sizeof(int) * nstates * nseq;
  checkCudaErrors( cudaMalloc((void**)&Bd, size) );


#ifdef PROFILE_GPU
  sdkCreateTimer( &gpu_timer );
  sdkStartTimer(&gpu_timer);
#endif

  viterbi_fwdd<<<dimGrid, dimBlock>>>(pred, transd, Od, obvsd, nstates, nobvs, lbdd, Bd);
  /* launch-configuration errors are only reported via the last error */
  checkCudaErrors( cudaGetLastError() );

#ifdef PROFILE_GPU
  checkCudaErrors( cudaDeviceSynchronize() );
  sdkStopTimer(&gpu_timer);
  gpu_time += sdkGetTimerValue(&gpu_timer);
  sdkDeleteTimer( &gpu_timer);

  gpu_flop += 1e-6 * ((double)nstates) * ((double)nseq) * ((double)nstates) * 3;
#endif

  size = sizeof(float) * nstates * nseq;
  checkCudaErrors( cudaMemcpy(lambda, lbdd, size, cudaMemcpyDeviceToHost) );
  size = sizeof(int) * nstates * nseq;
  checkCudaErrors( cudaMemcpy(backtrace + n * nstates * nseq, Bd, size, cudaMemcpyDeviceToHost) );

  checkCudaErrors( cudaFree(Od) );
  checkCudaErrors( cudaFree(pred) );
  checkCudaErrors( cudaFree(lbdd) );
  checkCudaErrors( cudaFree(Bd) );
}
1194 | 
/* kernel function for find_last_trace_points().
   one thread per row of lbdd (1D launch): scans the nstates entries of
   its row and stores the index of the largest one in stackd.  on ties the
   earliest index wins because the comparison is strict. */
__global__ void
fltpd(float *lbdd, size_t nstates, int *stackd)
{
  const int row = blockIdx.x * BLOCK_SIZE + threadIdx.x;
  size_t s, argmax;
  float top;

  for (s = 0; s < nstates; ++s) {
    const float v = lbdd[IDX(row, s, nstates)];

    /* keep the current best unless this is the first entry or v beats it */
    if (s != 0 && !(top < v))
      continue;

    top = v;
    argmax = s;
  }
  stackd[row] = argmax;
}
1210 | 
/* find the end point of the best path of every sequence.

   uploads *lambda (nstates * nseq floats), runs fltpd with one thread
   per sequence and copies the nseq selected indices into *stack.

   the grid size uses integer division, so nseq is presumably expected
   to be a multiple of BLOCK_SIZE (TODO confirm). */
void find_last_trace_points(float *lambda, int *stack)
{
  float *lbdd;
  int *stackd;
  int size;
  dim3 dimBlock(BLOCK_SIZE);
  dim3 dimGrid(nseq / dimBlock.x);

  size = sizeof(float) * nstates * nseq;
  checkCudaErrors( cudaMalloc((void**)&lbdd, size) );
  checkCudaErrors( cudaMemcpy(lbdd, lambda, size, cudaMemcpyHostToDevice) );

  size = sizeof(int) * nseq;
  checkCudaErrors( cudaMalloc((void**)&stackd, size) );

  fltpd<<<dimGrid, dimBlock>>>(lbdd, nstates, stackd);
  /* a rejected launch would otherwise leave stackd uninitialised and the
     copy below would silently return garbage */
  checkCudaErrors( cudaGetLastError() );

  checkCudaErrors( cudaMemcpy(stack, stackd, size, cudaMemcpyDeviceToHost) );

  checkCudaErrors( cudaFree(lbdd) );
  checkCudaErrors( cudaFree(stackd) );
}
1233 | 
/* kernel function for backtrace().
   one thread per sequence (1D launch): looks up the entry of bckpd
   selected by the sequence and by its already known state pre[seq], and
   stores it in stackd[seq]. */
__global__ void
backtraced(int *pre, int *bckpd, int nstates, int *stackd)
{
  const int seq = blockIdx.x * BLOCK_SIZE + threadIdx.x;
  const int known = pre[seq];

  stackd[seq] = bckpd[IDX(seq, known, nstates)];
}
1240 | 
/* follow the backpointers one step back, from slice n + 1 to slice n.

   uploads slice n + 1 of *backtracep (nseq * nstates ints) and the nseq
   states already stored for slice n + 1 in *stack, runs backtraced with
   one thread per sequence and writes the resulting nseq states into
   slice n of *stack.

   the grid size uses integer division, so nseq is presumably expected
   to be a multiple of BLOCK_SIZE (TODO confirm). */
void backtrace(int *backtracep, int *stack, int n)
{
  int *bckpd;
  int *stackd;
  int *pred;
  int size;

  dim3 dimBlock(BLOCK_SIZE);
  dim3 dimGrid(nseq / dimBlock.x);

  size = sizeof(int) * nseq * nstates;
  checkCudaErrors( cudaMalloc((void**)&bckpd, size) );
  checkCudaErrors( cudaMemcpy(bckpd, backtracep + (n + 1) * nseq * nstates, size, cudaMemcpyHostToDevice) );

  size = sizeof(int) * nseq;
  checkCudaErrors( cudaMalloc((void**)&pred, size) );
  checkCudaErrors( cudaMemcpy(pred, stack + (n + 1) * nseq, size, cudaMemcpyHostToDevice) );

  checkCudaErrors( cudaMalloc((void**)&stackd, size) );

  backtraced<<<dimGrid, dimBlock>>>(pred, bckpd, nstates, stackd);
  /* a rejected launch would otherwise leave stackd uninitialised and the
     copy below would silently return garbage */
  checkCudaErrors( cudaGetLastError() );

  checkCudaErrors( cudaMemcpy(stack + n * nseq, stackd, size, cudaMemcpyDeviceToHost) );

  checkCudaErrors( cudaFree(bckpd) );
  checkCudaErrors( cudaFree(stackd) );
  checkCudaErrors( cudaFree(pred) );
}
1269 | 
/* print the decoded paths: one output line per sequence, holding the
   `length` entries of that sequence, each followed by a space.  *stack
   is read slice by slice, i.e. entry (t, seq) with row length nseq. */
void print_path(int *stack)
{
  size_t seq, t;

  for (seq = 0; seq < nseq; ++seq) {
    t = 0;
    while (t < length) {
      printf("%d ", stack[IDX(t, seq, nseq)]);
      ++t;
    }
    printf("\n");
  }
}
1280 | 
/* viterbi decoding for all sequences: runs the forward max pass slice by
   slice, selects the best end point of every sequence, follows the
   backpointers to the first slice and prints the resulting paths with
   print_path(). */
void viterbi()
{
  float *lambda = NULL;
  float *prelbd = NULL;
  float *swap = NULL;
  int *backtracep = NULL;
  int *stack = NULL;
  int size;
  size_t i;

  /* backpointers are ints: one slice of nstates * nseq per time step */
  backtracep = (int *) malloc(sizeof(int) * length * nstates * nseq);
  if (backtracep == NULL) handle_error("malloc");

  size = sizeof(float) * nstates * nseq;

  lambda = (float *) malloc(size);
  if (lambda == NULL) handle_error("malloc");

  prelbd = (float *) malloc(size);
  if (prelbd == NULL) handle_error("malloc");

  initfwd(prelbd);

  /* the two slice buffers have the same size, so they are exchanged by
     swapping pointers instead of copying; after every iteration prelbd
     points at the most recently computed slice */
  for (i = 1; i < length; i++) {
    viterbi_fwd(prelbd, i, lambda, backtracep);
    swap = prelbd;
    prelbd = lambda;
    lambda = swap;
  }

  stack = (int*) malloc(sizeof(int) * nseq * length);
  if (stack == NULL) handle_error("malloc");

  /* prelbd always holds the last slice, including the length == 1 case
     in which the loop above never runs and lambda was never written */
  find_last_trace_points(prelbd, stack + (length - 1) * nseq);
  for (i = 1; i < length; i++) {
    backtrace(backtracep, stack, length - 1 - i);
  }

  print_path(stack);

  free(lambda);
  free(prelbd);
  free(backtracep);
  free(stack);
}
1323 | 
/* update model parameters using estimated counts.

   copies the xid, gmmd and pid count buffers from the device, frees
   them, normalises every row in the log domain (value minus the
   logsumf() of its row) into prior, trans and obvs, and finally uploads
   the new trans and obvs to transd and obvsd.

   host scratch space lives on the heap: the count matrices hold
   nstates * nstates and nstates * nobvs floats, which is too large for
   stack arrays once the model grows. */
void update_prob()
{
  const size_t nxi = (size_t) nstates * nstates;
  const size_t ngmm = (size_t) nstates * nobvs;

  float pisum;
  float *scratch;
  float *pi;
  float *xi;
  float *gmm;
  float *gmmsum;
  float *xisum;

  size_t i, j;

  /* one allocation carved into pi | xi | gmm | gmmsum | xisum */
  scratch = (float *) malloc(sizeof(float) * (nxi + ngmm + 3 * (size_t) nstates));
  if (scratch == NULL) handle_error("malloc");
  pi = scratch;
  xi = pi + nstates;
  gmm = xi + nxi;
  gmmsum = gmm + ngmm;
  xisum = gmmsum + nstates;

  checkCudaErrors( cudaMemcpy(xi, xid, nstates * nstates * sizeof(float), cudaMemcpyDeviceToHost) );
  checkCudaErrors( cudaMemcpy(gmm, gmmd, nobvs * nstates * sizeof(float), cudaMemcpyDeviceToHost) );
  checkCudaErrors( cudaMemcpy(pi, pid, nstates * sizeof(float), cudaMemcpyDeviceToHost) );

  /* the device-side counts are no longer needed once copied */
  if (gmmd) checkCudaErrors( cudaFree(gmmd) );
  if (xid) checkCudaErrors( cudaFree(xid) );
  if (pid) checkCudaErrors( cudaFree(pid) );

  /* normalisers: one for pi, one per row of gmm and xi */
  pisum = logsumf(pi, nstates, IN_HOST);
  for (i = 0; i < nstates; i++) {
    gmmsum[i] = logsumf(gmm + i * nobvs, nobvs, IN_HOST);
    xisum[i] = logsumf(xi + i * nstates, nstates, IN_HOST);
  }

  for (i = 0; i < nstates; i++) {
    prior[i] = pi[i] - pisum;
  }

  for (i = 0; i < nstates; i++) {
    for (j = 0; j < nstates; j++) {
      trans[IDX(i,j,nstates)] = xi[IDX(i,j,nstates)] - xisum[i];
    }
    for (j = 0; j < nobvs; j++) {
      obvs[IDX(i,j,nobvs)] = gmm[IDX(i,j,nobvs)] - gmmsum[i];
    }
  }

  free(scratch);

  /* update indevice parameters */
  checkCudaErrors( cudaMemcpy(transd, trans, nstates * nstates * sizeof(float), cudaMemcpyHostToDevice) );
  checkCudaErrors( cudaMemcpy(obvsd, obvs, nobvs * nstates * sizeof(float), cudaMemcpyHostToDevice) );
}
1368 | 
/* print the command line help to stdout */
void usage() {
  static const char *const help[] = {
    "chmm [-hnt] [-c config] [-p(1|2|3)]\n",
    "usage:\n",
    "  -h   help\n",
    "  -c   configuration file\n",
    "  -t   output computation time\n",
    "  -p1  compute the probability of the observation sequence\n",
    "  -p2  compute the most probable sequence (Viterbi)\n",
    "  -p3  train hidden Markov mode parameters (Baum-Welch)\n",
    "  -n   number of iterations\n"
  };
  size_t k;

  for (k = 0; k < sizeof(help) / sizeof(help[0]); ++k) {
    fputs(help[k], stdout);
  }
}
1380 | 
/* free all memory: the device copies of the transition and observation
   tables first, then the host-side model tables and the data. pointers
   that are NULL are skipped. */
void freeall() {

  /* device buffers */
  if (transd != NULL) {
    checkCudaErrors( cudaFree(transd) );
  }
  if (obvsd != NULL) {
    checkCudaErrors( cudaFree(obvsd) );
  }

  /* host buffers */
  if (trans != NULL) {
    free(trans);
  }
  if (obvs != NULL) {
    free(obvs);
  }
  if (prior != NULL) {
    free(prior);
  }
  if (data != NULL) {
    free(data);
  }
}
1392 | 


--------------------------------------------------------------------------------