├── Makefile
├── README.md
├── kernel.cl
├── main.c
├── timer.c
└── timer.h

/Makefile:
--------------------------------------------------------------------------------
CPPFLAGS=-I/usr/local/cuda/include
CFLAGS=-std=gnu99 -g -Wall
LDLIBS=-lOpenCL

# all and clean are commands, not files; keep make from being fooled by a
# stray file with one of these names.
.PHONY: all clean

all: main

main: timer.o

clean:
	rm -rf *.o main

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Winograd-OpenCL

--------------------------------------------------------------------------------
/kernel.cl:
--------------------------------------------------------------------------------
/*
 * Tile transforms shared by the winograd_2x2_3x3_* kernels below.
 * All three work on small per-work-item arrays and use single-precision
 * constants only, so the program does not depend on double support.
 */

/*
 * Input-tile transform of a 4x4 patch v into V.
 * Rows are combined first (v0 - v2, v1 + v2, v2 - v1, v1 - v3), then the
 * same pattern is applied to the columns of the intermediate result.
 */
void wino_f2x2_3x3_input_tile(float v[4][4], float V[4][4]) {
    float TV[4][4];
    for (int j = 0; j < 4; ++j) {
        TV[0][j] = v[0][j] - v[2][j];
        TV[1][j] = v[1][j] + v[2][j];
        TV[2][j] = v[2][j] - v[1][j];
        TV[3][j] = v[1][j] - v[3][j];
    }
    for (int i = 0; i < 4; ++i) {
        V[i][0] = TV[i][0] - TV[i][2];
        V[i][1] = TV[i][1] + TV[i][2];
        V[i][2] = TV[i][2] - TV[i][1];
        V[i][3] = TV[i][1] - TV[i][3];
    }
}

/*
 * Filter transform of a 3x3 filter u into the 4x4 tile U.
 * Each pass maps three values (a, b, c) to
 * (a, (a + c) / 2 + b / 2, (a + c) / 2 - b / 2, c): first down the columns
 * of u, then along the rows of the intermediate 4x3 result.
 */
void wino_f2x2_3x3_filter_tile(float u[3][3], float U[4][4]) {
    float TU[4][3];
    for (int j = 0; j < 3; ++j) {
        const float mid = (u[0][j] + u[2][j]) * 0.5f;
        TU[0][j] = u[0][j];
        TU[1][j] = mid + u[1][j] * 0.5f;
        TU[2][j] = mid - u[1][j] * 0.5f;
        TU[3][j] = u[2][j];
    }
    for (int i = 0; i < 4; ++i) {
        const float mid = (TU[i][0] + TU[i][2]) * 0.5f;
        U[i][0] = TU[i][0];
        U[i][1] = mid + TU[i][1] * 0.5f;
        U[i][2] = mid - TU[i][1] * 0.5f;
        U[i][3] = TU[i][2];
    }
}

/*
 * Output transform of a 4x4 tile m into the 2x2 result M.
 * Each pass maps four values (a, b, c, d) to (a + b + c, b - c - d): first
 * along the rows of m, then down the columns of the intermediate 4x2 result.
 */
void wino_f2x2_3x3_output_tile(float m[4][4], float M[2][2]) {
    float TM[4][2];
    for (int i = 0; i < 4; ++i) {
        TM[i][0] = m[i][0] + m[i][1] + m[i][2];
        TM[i][1] = m[i][1] - m[i][2] - m[i][3];
    }
    for (int j = 0; j < 2; ++j) {
        M[0][j] = TM[0][j] + TM[1][j] + TM[2][j];
        M[1][j] = TM[1][j] - TM[2][j] - TM[3][j];
    }
}

/*
 * inputs dim = (N, C, H, W)
 * outputs dim = (C, H, W, N)
 * global_work_size = (ceil(N * C * H * W, 256))
 * local_work_size = (256)
 *
 * One work-item copies one element; items past the last image do nothing.
 */
__kernel void NCHW2CHWN(
    __global float *inputs,
    __global float *outputs,
    int N, int C, int H, int W
) {
    const int idx = get_global_id(0);
    const int plane = C * H * W;
    const int n = idx / plane;
    if (n >= N) return;
    const int chw = idx - n * plane;
    outputs[chw * N + n] = inputs[idx];
}

/*
 * inputs dim = (C, H, W, N)
 * outputs dim = (N, C, H, W)
 * global_work_size = (ceil(N * C * H * W, 256))
 * local_work_size = (256)
 *
 * Inverse of NCHW2CHWN; the work-item index addresses the NCHW side.
 */
__kernel void CHWN2NCHW(
    __global float *inputs,
    __global float *outputs,
    int N, int C, int H, int W
) {
    const int idx = get_global_id(0);
    const int plane = C * H * W;
    const int n = idx / plane;
    if (n >= N) return;
    const int chw = idx - n * plane;
    outputs[idx] = inputs[chw * N + n];
}

/*
 * inputs dim = (C, H, W, N)
 * outputs dim = (K, P, Q, N)
 * filters dim = (C, 3, 3, K)
 * bias dim = (K)
 * global_work_size = (TP * TQ * BN * BK * 256, TK / BK, TN / BN)
 * local_work_size = (256)
 *
 * Work-items 0..127 transform image tiles, 128..255 transform filters; both
 * halves write into SM and then all 256 accumulate products from it.
 * Channels are consumed 8 per iteration. ci starts below zero when C is not
 * a multiple of 8; those slots are filled with zeros.
 *
 * Both barriers sit outside the role branch: every work-item of the group
 * must reach the same barrier call, so the role-specific work is limited to
 * loading and transforming the tile.
 */
__kernel void winograd_2x2_3x3_16x16(
    __global float *inputs,
    __global float *outputs,
    __global float *filters,
    __global float *bias,
    int N,
    int C, int H, int W,
    int K, int P, int Q,
    int pad,
    int TP, int TQ, int BN, int BK,
    int TPmask, int TPwidth, int TPshift,
    int TQmask, int TQwidth, int TQshift,
    int Nmask, int Nwidth
) {
    /* Unpack group id 0 into (tp, tq, bn, bk), bk varying fastest. */
    int rest = get_group_id(0);
    int tp = rest / (TQ * BN * BK);
    rest -= tp * (TQ * BN * BK);
    int tq = rest / (BN * BK);
    rest -= tq * (BN * BK);
    const int bn = rest / BK;
    const int bk = rest - bn * BK;

    const int tid = get_local_id(0);
    const int lane = tid & 15;
    const int c = (tid & 0x70) >> 4;
    int ci = c - ((C & 7) ? 8 - (C & 7) : 0);
    tp = (tp << TPwidth) + ((tid & TPmask) >> TPshift);
    tq = (tq << TQwidth) + ((tid & TQmask) >> TQshift);
    const int h = (tp << 1) - pad;
    const int w = (tq << 1) - pad;
    const int n = ((get_group_id(2) * BN + bn) << Nwidth) + (tid & Nmask);
    const int k = ((get_group_id(1) * BK + bk) << 4) + lane;

    /* First half of SM holds transformed image data, second half filters. */
    __local float SM[2 * 8 * 16 * 16];
    __local float *pRSV = SM + (tid & 0xf0) + (tid & 0x3);
    __local float *pRSU = SM + 8 * 16 * 16 + (tid & 0xf0) + ((tid & 0xc) >> 2);

    float r[4][4], rA[4], rB[4];
    for (int i = 0; i < 4; ++i) {
        for (int j = 0; j < 4; ++j) {
            r[i][j] = 0.0f;
        }
    }

    const bool is_image = tid < 128;
    const bool k_ok = k < K;
    bool tap_ok[4][4];
    if (is_image) {
        for (int i = 0; i < 4; ++i) {
            for (int j = 0; j < 4; ++j) {
                tap_ok[i][j] = n < N && 0 <= h + i && h + i < H && 0 <= w + j && w + j < W;
            }
        }
    }

    /* Role-specific source pointer, per-iteration stride and SM slot. */
    __global float *src = is_image
        ? inputs + ((ci * H + h) * W + w) * N + n
        : filters + ci * 3 * 3 * K + k;
    const int src_step = is_image ? 8 * H * W * N : 8 * 3 * 3 * K;
    __local float *pWS = SM + (is_image ? c : c + 8) * 16 * 16 + lane;

    for (;;) {
        float T[4][4];
        if (is_image) {
            float v[4][4];
            for (int i = 0; i < 4; ++i) {
                for (int j = 0; j < 4; ++j) {
                    v[i][j] = (ci >= 0 && tap_ok[i][j]) ? src[(i * W + j) * N] : 0.0f;
                }
            }
            wino_f2x2_3x3_input_tile(v, T);
        } else {
            float u[3][3];
            for (int i = 0; i < 3; ++i) {
                for (int j = 0; j < 3; ++j) {
                    u[i][j] = (ci >= 0 && k_ok) ? src[(i * 3 + j) * K] : 0.0f;
                }
            }
            wino_f2x2_3x3_filter_tile(u, T);
        }
        for (int i = 0; i < 4; ++i) {
            for (int j = 0; j < 4; ++j) {
                pWS[(i * 4 + j) * 16] = T[i][j];
            }
        }

        barrier(CLK_LOCAL_MEM_FENCE);

        for (int l = 0; l < 8; ++l) {
            for (int i = 0; i < 4; ++i) {
                rA[i] = pRSU[l * 16 * 16 + i * 4];
                rB[i] = pRSV[l * 16 * 16 + i * 4];
            }
            for (int i = 0; i < 4; ++i) {
                for (int j = 0; j < 4; ++j) {
                    r[i][j] += rA[i] * rB[j];
                }
            }
        }

        /* SM is overwritten at the top of the next iteration. */
        barrier(CLK_LOCAL_MEM_FENCE);

        ci += 8;
        if (ci >= C) break;
        src += src_step;
    }

    /*
     * Inverse transform: the accumulators go back through SM and are re-read
     * with a (i * 4 + j) * 16 stride, giving each work-item the 4x4 array m
     * that the output transform consumes.
     */
    {
        /* log(16 * 16) - 2, log(16) - 4 */
        __local float *pWSM = SM + ((tid & 0x0c) << 6) + ((tid & 0xf0) << 0) + (tid & 0x03);
        __local float *pRSM = SM + ((tid & 0xf0) << 4) + lane;
        const int oh = h + pad, ow = w + pad, on = n;
        const int ok = k - lane + ((tid & 0xf0) >> 4);
        __global float *pO = outputs + ((ok * P + oh) * Q + ow) * N + on;

        bool out_ok[2][2];
        for (int i = 0; i < 2; ++i) {
            for (int j = 0; j < 2; ++j) {
                out_ok[i][j] = on < N && 0 <= oh + i && oh + i < P && 0 <= ow + j && ow + j < Q;
            }
        }

        for (int i = 0; i < 4; ++i) {
            for (int j = 0; j < 4; ++j) {
                /* log(4 * 16 * 16) */
                pWSM[(i << 10) + (j << 2)] = r[i][j];
            }
        }

        barrier(CLK_LOCAL_MEM_FENCE);

        float m[4][4], M[2][2];
        for (int i = 0; i < 4; ++i) {
            for (int j = 0; j < 4; ++j) {
                m[i][j] = pRSM[(i * 4 + j) * 16];
            }
        }

        barrier(CLK_LOCAL_MEM_FENCE);

        wino_f2x2_3x3_output_tile(m, M);

        for (int i = 0; i < 2; ++i) {
            for (int j = 0; j < 2; ++j) {
                if (ok < K && out_ok[i][j]) {
                    pO[(i * Q + j) * N] = M[i][j] + bias[ok];
                }
            }
        }
    }
}

/*
 * inputs dim = (C, H, W, N)
 * outputs dim = (K, P, Q, N)
 * filters dim = (C, 3, 3, K)
 * bias dim = (K)
 * global_work_size = (TP * TQ * BN * BK * 256, TK / BK, TN / BN)
 * local_work_size = (256)
 *
 * Same scheme as winograd_2x2_3x3_16x16 with 32-wide rows in SM: channels
 * are consumed 4 per iteration, each work-item keeps an 8x8 accumulator,
 * and the inverse transform runs in 4 passes of 2 accumulator rows each.
 * As there, both barriers sit outside the image/filter role branch so that
 * every work-item of the group reaches the same barrier call.
 */
__kernel void winograd_2x2_3x3_32x32(
    __global float *inputs,
    __global float *outputs,
    __global float *filters,
    __global float *bias,
    int N,
    int C, int H, int W,
    int K, int P, int Q,
    int pad,
    int TP, int TQ, int BN, int BK,
    int TPmask, int TPwidth, int TPshift,
    int TQmask, int TQwidth, int TQshift,
    int Nmask, int Nwidth
) {
    /* Unpack group id 0 into (tp, tq, bn, bk), bk varying fastest. */
    int rest = get_group_id(0);
    int tp = rest / (TQ * BN * BK);
    rest -= tp * (TQ * BN * BK);
    int tq = rest / (BN * BK);
    rest -= tq * (BN * BK);
    const int bn = rest / BK;
    const int bk = rest - bn * BK;

    const int tid = get_local_id(0);
    const int lane = tid & 31;
    const int c = (tid & 0x60) >> 5; // 01100000
    int ci = c - ((C & 3) ? 4 - (C & 3) : 0);
    tp = (tp << TPwidth) + ((tid & TPmask) >> TPshift);
    tq = (tq << TQwidth) + ((tid & TQmask) >> TQshift);
    const int h = (tp << 1) - pad;
    const int w = (tq << 1) - pad;
    const int n = ((get_group_id(2) * BN + bn) << Nwidth) + (tid & Nmask);
    const int k = ((get_group_id(1) * BK + bk) << 5) + lane;

    /* First half of SM holds transformed image data, second half filters. */
    __local float SM[8 * 16 * 32];
    __local float *pRSV = SM + ((tid & 0xf0) << 1) + (tid & 0x3);
    __local float *pRSU = SM + 4 * 16 * 32 + ((tid & 0xf0) << 1) + ((tid & 0xc) >> 2);

    float r[8][8], rA[8], rB[8];
    for (int i = 0; i < 8; ++i) {
        for (int j = 0; j < 8; ++j) {
            r[i][j] = 0.0f;
        }
    }

    const bool is_image = tid < 128;
    const bool k_ok = k < K;
    bool tap_ok[4][4];
    if (is_image) {
        for (int i = 0; i < 4; ++i) {
            for (int j = 0; j < 4; ++j) {
                tap_ok[i][j] = n < N && 0 <= h + i && h + i < H && 0 <= w + j && w + j < W;
            }
        }
    }

    /* Role-specific source pointer, per-iteration stride and SM slot. */
    __global float *src = is_image
        ? inputs + ((ci * H + h) * W + w) * N + n
        : filters + ci * 3 * 3 * K + k;
    const int src_step = is_image ? 4 * H * W * N : 4 * 3 * 3 * K;
    __local float *pWS = SM + (is_image ? c : c + 4) * 16 * 32 + lane;

    for (;;) {
        float T[4][4];
        if (is_image) {
            float v[4][4];
            for (int i = 0; i < 4; ++i) {
                for (int j = 0; j < 4; ++j) {
                    v[i][j] = (ci >= 0 && tap_ok[i][j]) ? src[(i * W + j) * N] : 0.0f;
                }
            }
            wino_f2x2_3x3_input_tile(v, T);
        } else {
            float u[3][3];
            for (int i = 0; i < 3; ++i) {
                for (int j = 0; j < 3; ++j) {
                    u[i][j] = (ci >= 0 && k_ok) ? src[(i * 3 + j) * K] : 0.0f;
                }
            }
            wino_f2x2_3x3_filter_tile(u, T);
        }
        for (int i = 0; i < 4; ++i) {
            for (int j = 0; j < 4; ++j) {
                pWS[(i * 4 + j) * 32] = T[i][j];
            }
        }

        barrier(CLK_LOCAL_MEM_FENCE);

        for (int l = 0; l < 4; ++l) {
            for (int i = 0; i < 8; ++i) {
                rA[i] = pRSU[l * 512 + i * 4];
                rB[i] = pRSV[l * 512 + i * 4];
            }
            for (int i = 0; i < 8; ++i) {
                for (int j = 0; j < 8; ++j) {
                    r[i][j] += rA[i] * rB[j];
                }
            }
        }

        /* SM is overwritten at the top of the next iteration. */
        barrier(CLK_LOCAL_MEM_FENCE);

        ci += 4;
        if (ci >= C) break;
        src += src_step;
    }

    /* Inverse transform, two accumulator rows per pass. */
    {
        __local float *pWSM = SM + ((tid & 0x0c) << 7) + ((tid & 0xf0) << 1) + (tid & 0x03);
        __local float *pRSM = SM + ((tid & 0xe0) << 4) + lane;
        const int oh = h + pad, ow = w + pad, on = n;
        int ok = k - lane + ((tid & 0xe0) >> 5);
        __global float *pO = outputs + ((ok * P + oh) * Q + ow) * N + on;

        bool out_ok[2][2];
        for (int i = 0; i < 2; ++i) {
            for (int j = 0; j < 2; ++j) {
                out_ok[i][j] = on < N && 0 <= oh + i && oh + i < P && 0 <= ow + j && ow + j < Q;
            }
        }

        for (int l = 0; l < 4; ++l) {
            for (int i = 0; i < 2; ++i) {
                for (int j = 0; j < 8; ++j) {
                    pWSM[(i << 11) + (j << 2)] = r[l * 2 + i][j];
                }
            }

            barrier(CLK_LOCAL_MEM_FENCE);

            float m[4][4], M[2][2];
            for (int i = 0; i < 4; ++i) {
                for (int j = 0; j < 4; ++j) {
                    m[i][j] = pRSM[(i * 4 + j) * 32];
                }
            }

            barrier(CLK_LOCAL_MEM_FENCE);

            wino_f2x2_3x3_output_tile(m, M);

            for (int i = 0; i < 2; ++i) {
                for (int j = 0; j < 2; ++j) {
                    if (ok < K && out_ok[i][j]) {
                        pO[(i * Q + j) * N] = M[i][j] + bias[ok];
                    }
                }
            }
            /* Each pass handles output channels 8 apart. */
            ok += 8;
            pO += 8 * P * Q * N;
        }
    }
}

typedef float NNType;

/*
 * Direct convolution, one work-item per output element.
 * global id 0 = output pixel (row * width + column)
 * global id 1 = output channel
 * global id 2 = batch index
 * Input taps that fall outside the previous layer are read as zero.
 * There is no range check on the ids themselves.
 */
__kernel void CCF(
    __global NNType * inputs,
    __global NNType * outputs,
    __global NNType * filters,
    __global NNType * bias,
    uint width,
    uint height,
    uint depth,
    uint prev_width,
    uint prev_height,
    uint prev_depth,
    uint filter_width,
    uint filter_height,
    uint padding_size,
    uint stride)
{
    uint j = get_global_id(0) % width;
    uint i = get_global_id(0) / width;
    uint a = get_global_id(1);
    uint batch_id = get_global_id(2);
    __global NNType * input = inputs + batch_id * prev_depth * prev_width * prev_height;
    __global NNType * output = outputs + batch_id * depth * width * height;
    uint from_i = i * stride;
    uint from_j = j * stride;

    NNType sum = 0;
    for (uint b = 0; b < prev_depth; b++)
    {
        for (uint fi = 0; fi < filter_height; fi++)
        {
            /* The unsigned sum wraps and is reinterpreted as a signed row. */
            int iin = -padding_size + from_i + fi;
            for (uint fj = 0; fj < filter_width; fj++)
            {
                int jin = -padding_size + from_j + fj;
                NNType x = 0;
                if (iin >= 0 && iin < prev_height && jin >= 0 && jin < prev_width)
                {
                    x = input[b * prev_width * prev_height + iin * prev_width + jin];
                }
                NNType f = filters[ (a * prev_depth + b) * (filter_height * filter_width) + fi * filter_width + fj ];
                sum += x * f;
            }
        }
    }
    sum += bias[a];
    output[a * width * height + i * width + j] = sum;
}

/* Tiling constants used by conv_preA and conv. */
#define CONV_BK 16
#define CONV_BKH 8
#define CONV_BIJ 64
#define CONV_LXY 16
#define CONV_RXY 4

// if x <= 4608 = 512 * 9
#define DIV3(x) (((x)*5462)>>14)

/*
 * Accumulates rows [u, v) of the smA/smB tiles into the 4x4 accum block of
 * the calling work-item. Expands to bare statements and relies on rA, rB,
 * accum, smA, smB, lx and ly being in scope at the call site.
 * Row ki + 1 is fetched while row ki is still being accumulated, which is
 * why the tiles are declared with one extra row.
 */
#define GEMM44(u, v) \
    rA[0] = smA[u][lx * CONV_RXY]; \
    rA[1] = smA[u][lx * CONV_RXY + 1]; \
    rB[0] = smB[u][ly * CONV_RXY]; \
    rB[1] = smB[u][ly * CONV_RXY + 1]; \
    for (int ki = u; ki < v; ++ki) { \
        rB[2] = smB[ki][ly * CONV_RXY + 2]; \
        rB[3] = smB[ki][ly * CONV_RXY + 3]; \
        accum[0] += rA[0] * rB[0]; \
        accum[1] += rA[0] * rB[1]; \
        accum[4] += rA[1] * rB[0]; \
        accum[5] += rA[1] * rB[1]; \
        rA[2] = smA[ki][lx * CONV_RXY + 2]; \
        rA[3] = smA[ki][lx * CONV_RXY + 3]; \
        accum[2] += rA[0] * rB[2]; \
        accum[3] += rA[0] * rB[3]; \
        accum[6] += rA[1] * rB[2]; \
        accum[7] += rA[1] * rB[3]; \
        rA[0] = smA[ki + 1][lx * CONV_RXY]; \
        rA[1] = smA[ki + 1][lx * CONV_RXY + 1]; \
        accum[8] += rA[2] * rB[0]; \
        accum[9] += rA[2] * rB[1]; \
        accum[12] += rA[3] * rB[0]; \
        accum[13] += rA[3] * rB[1]; \
        rB[0] = smB[ki + 1][ly * CONV_RXY]; \
        rB[1] = smB[ki + 1][ly * CONV_RXY + 1]; \
        accum[10] += rA[2] * rB[2]; \
        accum[11] += rA[2] * rB[3]; \
        accum[14] += rA[3] * rB[2]; \
        accum[15] += rA[3] * rB[3]; \
    }

// global size IA * KA
// local size 256
/*
 * Repacks AU (row i, column k, row length KU) into A in blocks of CONV_BIJ
 * rows, storing each block k-major; columns k >= KU are written as zero.
 * IA is not used inside the kernel.
 */
__kernel void conv_preA(__global float *AU, __global float *A, int KU, int IA, int KA) {
    const int gid = get_global_id(0);
    const int bn = gid / (KA * CONV_BIJ);
    const int bo = gid - bn * (KA * CONV_BIJ);
    const int k = bo / CONV_BIJ;
    const int i = bo % CONV_BIJ + bn * CONV_BIJ;
    A[gid] = k < KU ? AU[i * KU + k] : 0.0f;
}

// global size JA, IA, batch
// local size 256, 1, 1
/*
 * Copies the first JU columns of every row of C (row length JA, IA rows per
 * batch entry) into CU (row length JU, IU rows per batch entry).
 */
__kernel void conv_postC(__global float *C, __global float *CU, int IA, int JA, int IU, int JU) {
    const int c = get_global_id(2);
    const int i = get_global_id(1);
    const int j = get_global_id(0);
    if (j < JU) {
        CU[(c * IU + i) * JU + j] = C[(c * IA + i) * JA + j];
    }
}

/*
 * Fetches one input value for conv.
 * kk is split as (channel kk9, filter row, filter column) with the column
 * varying fastest; hb/wb are the output pixel's row/column minus one.
 * Returns 0 when jf is false, when the channel is >= CH, or when the tap
 * lies outside the N x N image.
 */
float conv_im2col_tap(__global float *B, int kk, int hb, int wb, int jf, int CH, int N) {
    const int kk3 = DIV3(kk);
    const int kk9 = DIV3(kk3);
    const int w = kk - kk3 * 3 + wb;
    const int h = kk3 - kk9 * 3 + hb;
    return jf && kk9 < CH && 0 <= h && h < N && 0 <= w && w < N ? B[(kk9 * N + h) * N + w] : 0.0f;
}

// global size (JA / CONV_RXY), (IA / CONV_RXY), batch
// local size 16, 16, 1
/*
 * Tiled product of the packed matrix A (see conv_preA) with values gathered
 * on the fly from the image B. Each work-item accumulates a 4x4 block, adds
 * D[row] and clamps at zero before storing into C.
 * K counts 16-row steps; the loop always runs at least once and stops when
 * K reaches 1. Loads for later rows are interleaved with GEMM44 calls on
 * earlier rows, so the statement order below is significant.
 */
__kernel void conv(__global float *A, __global float *B, __global float *C, __global float *D, int K, int IA, int JA, int KA, int CH, int N) {
    // +1 prevent overflow in innermost loop
    __local float smA[CONV_BK + 1][CONV_BIJ];
    __local float smB[CONV_BK + 1][CONV_BIJ];
    float rA[CONV_RXY], rB[CONV_RXY], accum[CONV_RXY * CONV_RXY] = {0};
    const int gb = get_group_id(2);
    const int gi = get_group_id(1), gj = get_group_id(0);
    const int lx = get_local_id(1), ly = get_local_id(0);
    const int lid = lx * CONV_LXY + ly;
    // CONV_BIJ
    const int smx = lid >> 6, smy = lid & 63;
    const int jj = gj * CONV_BIJ + smy, jf = jj < N * N;
    const int hb = jj / N - 1, wb = jj % N - 1;
    int kk = smx;
    A += (gi * KA + smx) * CONV_BIJ + smy;
    B += gb * CH * N * N;

    /* Rows 0..7 are filled before the loop starts. */
    smA[smx][smy] = A[0];
    smB[smx][smy] = conv_im2col_tap(B, kk, hb, wb, jf, CH, N);
    kk += 4;
    smA[smx + 4][smy] = A[CONV_BIJ * 4];
    smB[smx + 4][smy] = conv_im2col_tap(B, kk, hb, wb, jf, CH, N);
    kk += 4;

    for (; ; --K) {
        barrier(CLK_LOCAL_MEM_FENCE);

        smA[smx + 8][smy] = A[CONV_BIJ * 8];
        smB[smx + 8][smy] = conv_im2col_tap(B, kk, hb, wb, jf, CH, N);
        kk += 4;

        GEMM44(0, 4);

        smA[smx + 12][smy] = A[CONV_BIJ * 12];
        smB[smx + 12][smy] = conv_im2col_tap(B, kk, hb, wb, jf, CH, N);
        kk += 4;
        A += CONV_BIJ * 16;

        GEMM44(4, 8);

        barrier(CLK_LOCAL_MEM_FENCE);

        /* Refill rows 0..7 for the next step while rows 8..15 are used. */
        if (K > 1) {
            smA[smx][smy] = A[0];
            smB[smx][smy] = conv_im2col_tap(B, kk, hb, wb, jf, CH, N);
            kk += 4;
        }

        GEMM44(8, 12);

        if (K > 1) {
            smA[smx + 4][smy] = A[CONV_BIJ * 4];
            smB[smx + 4][smy] = conv_im2col_tap(B, kk, hb, wb, jf, CH, N);
            kk += 4;
        }

        GEMM44(12, 16);

        if (K == 1) break;
    }

    C += (gb * IA + gi * CONV_BIJ + lx * CONV_RXY) * JA + gj * CONV_BIJ + ly * CONV_RXY;
    D += gi * CONV_BIJ + lx * CONV_RXY;
    for (int row = 0; row < CONV_RXY; ++row) {
        for (int col = 0; col < CONV_RXY; ++col) {
            C[JA * row + col] = max(accum[row * CONV_RXY + col] + D[row], 0.0f);
        }
    }
}

/*
 * inputs dim = (N, C, H, W)
 * outputs dim = (16, C, N, TP, TQ)
 * global_work_size = {_ceil(N * C * TP * TQ, 256)}
 * local_work_size = {256}
 *
 * One work-item transforms one 4x4 input patch; patch elements outside the
 * image are taken as zero.
 */
__kernel void winograd_2x2_3x3_data_transform(
    __global float *inputs,
    __global float *outputs,
    int N, int C, int H, int W,
    int pad,
    int TP, int TQ
) {
    const int gid = get_global_id(0);
    const int n = gid / (C * TP * TQ);
    if (n >= N) return;
    const int ctptq = gid - n * (C * TP * TQ);
    const int c = ctptq / (TP * TQ);
    const int tptq = ctptq - c * (TP * TQ);
    const int tp = tptq / TQ;
    const int tq = tptq - tp * TQ;
    const int h = tp * 2 - pad, w = tq * 2 - pad;
    float v[4][4], V[4][4];

    inputs += ((n * C + c) * H + h) * W + w;
    for (int i = 0; i < 4; ++i) {
        for (int j = 0; j < 4; ++j) {
            v[i][j] = 0 <= h + i && h + i < H && 0 <= w + j && w + j < W ? inputs[i * W + j] : 0.0f;
        }
    }

    wino_f2x2_3x3_input_tile(v, V);

    /* The 16 tile elements go to 16 consecutive output planes. */
    outputs += ((c * N + n) * TP + tp) * TQ + tq;
    for (int i = 0; i < 4; ++i) {
        for (int j = 0; j < 4; ++j) {
            outputs[0] = V[i][j];
            outputs += C * N * TP * TQ;
        }
    }
}

/*
 * inputs dim = (K, C, 3, 3)
 * outputs dim = (16, K, C)
 * global_work_size = {_ceil(K * C, 256)}
 * local_work_size = {256}
 *
 * One work-item transforms one 3x3 filter.
 */
__kernel void winograd_2x2_3x3_filter_transform(
    __global float *inputs,
    __global float *outputs,
    int K, int C
) {
    const int kc = get_global_id(0);
    const int k = kc / C;
    if (k >= K) return;
    const int c = kc - k * C;
    float u[3][3], U[4][4];

    inputs += (k * C + c) * 3 * 3;
    for (int i = 0; i < 3; ++i) {
        for (int j = 0; j < 3; ++j) {
            u[i][j] = inputs[i * 3 + j];
        }
    }

    wino_f2x2_3x3_filter_tile(u, U);

    /* The 16 tile elements go to 16 consecutive output planes. */
    outputs += k * C + c;
    for (int i = 0; i < 4; ++i) {
        for (int j = 0; j < 4; ++j) {
            outputs[0] = U[i][j];
            outputs += K * C;
        }
    }
}

/*
 * inputs dim = (16, K, N, TP, TQ)
 * outputs dim = (N, K, P, Q)
 * global_work_size = {_ceil(K * N * TP * TQ, 256)}
 * local_work_size = {256}
 *
 * One work-item produces one 2x2 output tile; elements that would fall
 * beyond row P or column Q are not written. No bias is added here.
 */
__kernel void winograd_2x2_3x3_inverse_transform(
    __global float *inputs,
    __global float *outputs,
    int N, int K, int P, int Q,
    int TP, int TQ
) {
    const int gid = get_global_id(0);
    const int k = gid / (N * TP * TQ);
    if (k >= K) return;
    const int ntptq = gid - k * (N * TP * TQ);
    const int n = ntptq / (TP * TQ);
    const int tptq = ntptq - n * (TP * TQ);
    const int tp = tptq / TQ;
    const int tq = tptq - tp * TQ;
    const int p = tp * 2, q = tq * 2;
    float m[4][4], M[2][2];

    /* Gather the 16 tile elements from 16 consecutive input planes. */
    inputs += ((k * N + n) * TP + tp) * TQ + tq;
    for (int i = 0; i < 4; ++i) {
        for (int j = 0; j < 4; ++j) {
            m[i][j] = inputs[0];
            inputs += K * N * TP * TQ;
        }
    }

    wino_f2x2_3x3_output_tile(m, M);

    outputs += ((n * K + k) * P + p) * Q + q;
    for (int i = 0; i < 2; ++i) {
        for (int j = 0; j < 2; ++j) {
            if (p + i < P && q + j < Q) {
                outputs[i * Q + j] = M[i][j];
            }
        }
    }
}

/*
 * inputs dim = (K, C, R, S)
 * outputs dim = (C, K, R, S) (reverse in R and S)
 * global_work_size = {_ceil(K * C * R * S, 256)}
 * local_work_size = {256}
 *
 * One work-item moves one filter element: K and C swap places and the
 * element lands at the mirrored (R - 1 - r, S - 1 - s) position.
 */
__kernel void flip_filter(
    __global float *inputs,
    __global float *outputs,
    int K, int C, int R, int S
) {
    const int kcrs = get_global_id(0);
    const int k = kcrs / (C * R * S);
    if (k >= K) return;
    const int crs = kcrs - k * (C * R * S);
    const int c = crs / (R * S);
    const int rs = crs - c * (R * S);
    const int r = rs / S;
    const int s = rs - r * S;

    outputs[((c * K + k) * R + (R - r - 1)) * S + (S - s - 1)] = inputs[kcrs];
}

/*
 * inputs dim = (N, C, H, W)
 * outputs dim = (16, C, N, TP, TQ)
 * global_work_size = {_ceil(N * C * TP * TQ, 256)}
 * local_work_size = {256}
 */
__kernel void winograd_3x3_2x2_data_transform(
    __global float *inputs,
    __global float *outputs,
    int N, int C, int H, int W,
    int pad,
    int TP, int TQ
) {
    int nctptq = get_global_id(0);
    int n = nctptq / (C * TP * TQ);
    if (n >= N) return;
    int ctptq = nctptq - n * (C * TP * TQ);
    int c = ctptq / (TP * TQ);
    int tptq = ctptq - c * (TP * TQ);
    int tp = tptq / (TQ);
    int tq = tptq - tp * (TQ);
    int h = tp * 2 - pad, w = tq * 2 - pad;
    float v[4][4], TV[4][4], V[4][4];

    inputs += ((n * C + c) * H + h) * W + w;
    for (int i = 0; i < 4; ++i) {
        for (int j = 0; j < 4; ++j) {
            v[i][j] = 0 <= h + i && h + i < H && 0 <= w + j && w + j < W ? inputs[i * W + j] : 0;
        }
    }

    TV[0][0] = v[0][0] - v[2][0];
    TV[0][1] = v[0][1] - v[2][1];
    TV[0][2] = v[0][2] - v[2][2];
    TV[0][3] = v[0][3] - v[2][3];
    TV[1][0] = v[1][0] + v[2][0];
    TV[1][1] = v[1][1] + v[2][1];
    TV[1][2] = v[1][2] + v[2][2];
    TV[1][3] = v[1][3] + v[2][3];
    TV[2][0] = v[2][0] - v[1][0];
    TV[2][1] = v[2][1] - v[1][1];
    TV[2][2] = v[2][2] - v[1][2];
    TV[2][3] = v[2][3] - v[1][3];
    TV[3][0] = v[3][0] - v[1][0];
    TV[3][1] = v[3][1] - v[1][1];
    TV[3][2] = v[3][2] - v[1][2];
    TV[3][3] = v[3][3] - v[1][3];

    V[0][0] = TV[0][0] - TV[0][2];
    V[1][0] = TV[1][0] - TV[1][2];
    V[2][0] = TV[2][0] - TV[2][2];
    V[3][0] = TV[3][0] - TV[3][2];
    V[0][1] = TV[0][1] + TV[0][2];
    V[1][1] = TV[1][1] + TV[1][2];
    V[2][1] = TV[2][1] + TV[2][2];
    V[3][1] = TV[3][1] + TV[3][2];
    V[0][2] = TV[0][2] - TV[0][1];
    V[1][2] = TV[1][2] - TV[1][1];
    V[2][2] = TV[2][2] - TV[2][1];
    V[3][2] = TV[3][2] - TV[3][1];
1081 | V[0][3] = TV[0][3] - TV[0][1]; 1082 | V[1][3] = TV[1][3] - TV[1][1]; 1083 | V[2][3] = TV[2][3] - TV[2][1]; 1084 | V[3][3] = TV[3][3] - TV[3][1]; 1085 | 1086 | outputs += ((c * N + n) * TP + tp) * TQ + tq; 1087 | for (int i = 0; i < 4; ++i) { 1088 | for (int j = 0; j < 4; ++j) { 1089 | outputs[0] = V[i][j]; 1090 | outputs += C * N * TP * TQ; 1091 | } 1092 | } 1093 | } 1094 | 1095 | /* 1096 | * inputs dim = (N, K, P, Q) 1097 | * outputs dim = (16, K, N, TP, TQ) 1098 | * global_work_size = {_ceil(N * K * TP * TQ, 256)} 1099 | * local_work_size = {256} 1100 | */ 1101 | __kernel void winograd_3x3_2x2_filter_transform( 1102 | __global float *inputs, 1103 | __global float *outputs, 1104 | int N, int K, int P, int Q, 1105 | int TP, int TQ 1106 | ) { 1107 | int nktptq = get_global_id(0); 1108 | int n = nktptq / (K * TP * TQ); 1109 | if (n >= N) return; 1110 | int ktptq = nktptq - n * (K * TP * TQ); 1111 | int k = ktptq / (TP * TQ); 1112 | int tptq = ktptq - k * (TP * TQ); 1113 | int tp = tptq / (TQ); 1114 | int tq = tptq - tp * (TQ); 1115 | int p = tp * 2, q = tq * 2; 1116 | float u0[2][2], u1[4][2], u2[4][4]; 1117 | 1118 | inputs += ((n * K + k) * P + p) * Q + q; 1119 | for (int i = 0; i < 2; ++i) { 1120 | for (int j = 0; j < 2; ++j) { 1121 | u0[i][j] = 0 <= p + i && p + i < P && 0 <= q + j && q + j < Q ? 
inputs[i * Q + j] : 0; 1122 | } 1123 | } 1124 | 1125 | u1[0][0] = u0[0][0]; 1126 | u1[0][1] = u0[0][1]; 1127 | u1[1][0] = (u0[0][0] + u0[1][0]) * 0.5; 1128 | u1[1][1] = (u0[0][1] + u0[1][1]) * 0.5; 1129 | u1[2][0] = (u0[0][0] - u0[1][0]) * 0.5; 1130 | u1[2][1] = (u0[0][1] - u0[1][1]) * 0.5; 1131 | u1[3][0] = u0[1][0]; 1132 | u1[3][1] = u0[1][1]; 1133 | 1134 | u2[0][0] = u1[0][0]; 1135 | u2[1][0] = u1[1][0]; 1136 | u2[2][0] = u1[2][0]; 1137 | u2[3][0] = u1[3][0]; 1138 | u2[0][1] = (u1[0][0] + u1[0][1]) * 0.5; 1139 | u2[1][1] = (u1[1][0] + u1[1][1]) * 0.5; 1140 | u2[2][1] = (u1[2][0] + u1[2][1]) * 0.5; 1141 | u2[3][1] = (u1[3][0] + u1[3][1]) * 0.5; 1142 | u2[0][2] = (u1[0][0] - u1[0][1]) * 0.5; 1143 | u2[1][2] = (u1[1][0] - u1[1][1]) * 0.5; 1144 | u2[2][2] = (u1[2][0] - u1[2][1]) * 0.5; 1145 | u2[3][2] = (u1[3][0] - u1[3][1]) * 0.5; 1146 | u2[0][3] = u1[0][1]; 1147 | u2[1][3] = u1[1][1]; 1148 | u2[2][3] = u1[2][1]; 1149 | u2[3][3] = u1[3][1]; 1150 | 1151 | outputs += ((k * N + n) * TP + tp) * TQ + tq; 1152 | for (int i = 0; i < 4; ++i) { 1153 | for (int j = 0; j < 4; ++j) { 1154 | outputs[0] = u2[i][j]; 1155 | outputs += K * N * TP * TQ; 1156 | } 1157 | } 1158 | } 1159 | 1160 | /* 1161 | * inputs dim = (16, K, C) 1162 | * outputs dim = (K, C, 3, 3) 1163 | * global_work_size = {_ceil(K * C, 256)} 1164 | * local_work_size = {256} 1165 | */ 1166 | __kernel void winograd_3x3_2x2_inverse_transform( 1167 | __global float *inputs, 1168 | __global float *outputs, 1169 | int K, int C 1170 | ) { 1171 | int kc = get_global_id(0); 1172 | int k = kc / (C); 1173 | if (k >= K) return; 1174 | int c = kc - k * (C); 1175 | float m0[4][4], m1[3][4], m2[3][3], mt[4]; 1176 | 1177 | inputs += k * C + c; 1178 | for (int i = 0; i < 4; ++i) { 1179 | for (int j = 0; j < 4; ++j) { 1180 | m0[i][j] = inputs[0]; 1181 | inputs += K * C; 1182 | } 1183 | } 1184 | 1185 | mt[0] = m0[1][0] + m0[2][0]; 1186 | mt[1] = m0[1][1] + m0[2][1]; 1187 | mt[2] = m0[1][2] + m0[2][2]; 1188 | mt[3] = m0[1][3] + 
m0[2][3]; 1189 | m1[0][0] = mt[0] + m0[0][0]; 1190 | m1[0][1] = mt[1] + m0[0][1]; 1191 | m1[0][2] = mt[2] + m0[0][2]; 1192 | m1[0][3] = mt[3] + m0[0][3]; 1193 | m1[1][0] = m0[1][0] - m0[2][0]; 1194 | m1[1][1] = m0[1][1] - m0[2][1]; 1195 | m1[1][2] = m0[1][2] - m0[2][2]; 1196 | m1[1][3] = m0[1][3] - m0[2][3]; 1197 | m1[2][0] = mt[0] + m0[3][0]; 1198 | m1[2][1] = mt[1] + m0[3][1]; 1199 | m1[2][2] = mt[2] + m0[3][2]; 1200 | m1[2][3] = mt[3] + m0[3][3]; 1201 | 1202 | mt[0] = m1[0][1] + m1[0][2]; 1203 | mt[1] = m1[1][1] + m1[1][2]; 1204 | mt[2] = m1[2][1] + m1[2][2]; 1205 | m2[0][0] = mt[0] + m1[0][0]; 1206 | m2[1][0] = mt[1] + m1[1][0]; 1207 | m2[2][0] = mt[2] + m1[2][0]; 1208 | m2[0][1] = m1[0][1] - m1[0][2]; 1209 | m2[1][1] = m1[1][1] - m1[1][2]; 1210 | m2[2][1] = m1[2][1] - m1[2][2]; 1211 | m2[0][2] = mt[0] + m1[0][3]; 1212 | m2[1][2] = mt[1] + m1[1][3]; 1213 | m2[2][2] = mt[2] + m1[2][3]; 1214 | 1215 | outputs += (k * C + c) * 3 * 3; 1216 | for (int i = 0; i < 3; ++i) { 1217 | for (int j = 0; j < 3; ++j) { 1218 | outputs[i * 3 + j] = m2[i][j]; 1219 | } 1220 | } 1221 | } 1222 | -------------------------------------------------------------------------------- /main.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include "timer.h" 7 | #include 8 | 9 | #define CHECK_ERROR(err) \ 10 | if (err != CL_SUCCESS) { \ 11 | printf("[%s:%d] OpenCL error %d\n", __FILE__, __LINE__, err); \ 12 | exit(EXIT_FAILURE); \ 13 | } 14 | 15 | cl_int err; 16 | 17 | cl_program create_and_build_program(cl_context context, cl_device_id device, const char *file_name) { 18 | FILE *file = fopen(file_name, "rb"); 19 | if (file == NULL) { 20 | printf("Failed to open %s\n", file_name); 21 | exit(EXIT_FAILURE); 22 | } 23 | fseek(file, 0, SEEK_END); 24 | size_t source_size = ftell(file); 25 | rewind(file); 26 | char *source_code = (char*)malloc(source_size + 1); 27 | fread(source_code, sizeof(char), 
/* Rounds x up to the next multiple of y (x >= 0, y > 0). */
int _ceil(int x, int y) {
    return (x + y - 1) / y * y;
}

/* Integer division of x by y, rounded up (x >= 0, y > 0). */
int _ceil_div(int x, int y) {
    return (x + y - 1) / y;
}

/* Fills d[0..n) with pseudo-random values that are each 0.0 or 1.0. */
void fillData(float *d, int n) {
    for (int i = 0; i < n; ++i) {
        d[i] = (float)(rand() % 2);
    }
}

/* Prints a dense (N, C, H, W) tensor, one H x W plane per (n, c) pair. */
void printData(float *d, int N, int C, int H, int W, const char *name) {
    printf("%s.shape = (%d, %d, %d, %d)\n", name, N, C, H, W);
    for (int n = 0; n < N; ++n) {
        for (int c = 0; c < C; ++c) {
            printf("(%d, %d, :, :) =\n", n, c);
            const float *plane = d + (n * C + c) * H * W;
            for (int h = 0; h < H; ++h) {
                for (int w = 0; w < W; ++w) {
                    printf("%f ", plane[h * W + w]);
                }
                printf("\n");
            }
        }
    }
}

/*
 * Compares two dense (N, C, H, W) tensors element by element.
 *
 * Two elements match when they are exactly equal, or when
 * |d0 - d1| / |d0 + d1| <= 1e-4. Elements that cancel (d0 == -d1, non-zero)
 * never match. A NaN on either side is a mismatch: the tolerance test is
 * written as "<=" so that a NaN ratio fails it instead of slipping through
 * a ">" comparison.
 *
 * Returns 1 if every element matches, otherwise prints the first
 * mismatching pair and returns 0.
 */
int equalData(float *d0, float *d1, int N, int C, int H, int W) {
    const int count = N * C * H * W;
    for (int x = 0; x < count; ++x) {
        const float a = d0[x];
        const float b = d1[x];
        if (a == b) {
            continue;  /* also covers 0 vs 0 and equal infinities */
        }
        const float sum = a + b;
        if (sum != 0 && fabs((a - b) / sum) <= 1e-4) {
            continue;
        }
        printf("d0 = %f, d1 = %f\n", a, b);
        return 0;
    }
    return 1;
}
/*
 * Reference CPU implementation of a stride-1 convolution and its two
 * gradients, each timed separately with timer slot 0.
 *
 *   inputs  (N, C, H, W)   read
 *   filters (K, C, R, S)   read
 *   outputs (N, K, P, Q)   written by the forward pass, then read back as
 *                          the upstream gradient by both backward passes
 *   dx      (N, C, H, W)   written: gradient w.r.t. inputs
 *   dw      (K, C, R, S)   written: gradient w.r.t. filters
 *
 * Positions that fall outside the image (after applying pad) contribute
 * nothing. Accumulation is in single precision.
 */
void convolution_cpu(float *inputs, float *outputs, float *filters, float *dx, float *dw, int N, int C, int H, int W, int K, int P, int Q, int R, int S, int pad) {
    /* Forward: outputs[n][k][p][q] = sum over (c, r, s). */
    timer_start(0);
    for (int n = 0; n < N; ++n) {
        for (int k = 0; k < K; ++k) {
            for (int p = 0; p < P; ++p) {
                for (int q = 0; q < Q; ++q) {
                    float acc = 0;
                    for (int c = 0; c < C; ++c) {
                        for (int r = 0; r < R; ++r) {
                            const int h = p + r - pad;
                            if (h < 0 || h >= H) {
                                continue;
                            }
                            for (int s = 0; s < S; ++s) {
                                const int w = q + s - pad;
                                if (w < 0 || w >= W) {
                                    continue;
                                }
                                acc += inputs[((n * C + c) * H + h) * W + w] * filters[((k * C + c) * R + r) * S + s];
                            }
                        }
                    }
                    outputs[((n * K + k) * P + p) * Q + q] = acc;
                }
            }
        }
    }
    timer_end(0, "cpu fwd");

    /* Backward data: dx[n][c][h][w] = sum over (k, r, s). */
    timer_start(0);
    for (int n = 0; n < N; ++n) {
        for (int c = 0; c < C; ++c) {
            for (int h = 0; h < H; ++h) {
                for (int w = 0; w < W; ++w) {
                    float acc = 0;
                    for (int k = 0; k < K; ++k) {
                        for (int r = 0; r < R; ++r) {
                            const int p = h - r + pad;
                            if (p < 0 || p >= P) {
                                continue;
                            }
                            for (int s = 0; s < S; ++s) {
                                const int q = w - s + pad;
                                if (q < 0 || q >= Q) {
                                    continue;
                                }
                                acc += outputs[((n * K + k) * P + p) * Q + q] * filters[((k * C + c) * R + r) * S + s];
                            }
                        }
                    }
                    dx[((n * C + c) * H + h) * W + w] = acc;
                }
            }
        }
    }
    timer_end(0, "cpu bwd data");

    /* Backward filter: dw[k][c][r][s] = sum over (n, p, q). */
    timer_start(0);
    for (int k = 0; k < K; ++k) {
        for (int c = 0; c < C; ++c) {
            for (int r = 0; r < R; ++r) {
                for (int s = 0; s < S; ++s) {
                    float acc = 0;
                    for (int n = 0; n < N; ++n) {
                        for (int p = 0; p < P; ++p) {
                            const int h = p + r - pad;
                            if (h < 0 || h >= H) {
                                continue;
                            }
                            for (int q = 0; q < Q; ++q) {
                                const int w = q + s - pad;
                                if (w < 0 || w >= W) {
                                    continue;
                                }
                                acc += outputs[((n * K + k) * P + p) * Q + q] * inputs[((n * C + c) * H + h) * W + w];
                            }
                        }
                    }
                    dw[((k * C + c) * R + r) * S + s] = acc;
                }
            }
        }
    }
    timer_end(0, "cpu bwd filter");
}

/* Tile and work-group constants used by the host code that launches the
 * conv_preA / conv / conv_postC kernels. */
#define CONV_BK 16
#define CONV_BKH 8
#define CONV_BIJ 64
#define CONV_LXY 16
#define CONV_RXY 4
CHECK_ERROR(clSetKernelArg(kernel0, 0, sizeof(cl_mem), &filters_pre_dev)); 206 | CHECK_ERROR(clSetKernelArg(kernel0, 1, sizeof(cl_mem), &filters_dev)); 207 | CHECK_ERROR(clSetKernelArg(kernel0, 2, sizeof(int), &KU)); 208 | CHECK_ERROR(clSetKernelArg(kernel0, 3, sizeof(int), &IA)); 209 | CHECK_ERROR(clSetKernelArg(kernel0, 4, sizeof(int), &KA)); 210 | size_t gws[] = {IA * KA}; 211 | size_t lws[] = {256}; 212 | CHECK_ERROR(clEnqueueNDRangeKernel(queue, kernel0, 1, NULL, gws, lws, 0, NULL, NULL)); 213 | clFinish(queue); 214 | } 215 | 216 | { 217 | timer_start(0); 218 | CHECK_ERROR(clSetKernelArg(kernel2, 0, sizeof(cl_mem), &filters_dev)); 219 | CHECK_ERROR(clSetKernelArg(kernel2, 1, sizeof(cl_mem), &inputs_dev)); 220 | CHECK_ERROR(clSetKernelArg(kernel2, 2, sizeof(cl_mem), &outputs_dev)); 221 | CHECK_ERROR(clSetKernelArg(kernel2, 3, sizeof(cl_mem), &bias_dev)); 222 | CHECK_ERROR(clSetKernelArg(kernel2, 4, sizeof(int), &KK)); 223 | CHECK_ERROR(clSetKernelArg(kernel2, 5, sizeof(int), &IA)); 224 | CHECK_ERROR(clSetKernelArg(kernel2, 6, sizeof(int), &JA)); 225 | CHECK_ERROR(clSetKernelArg(kernel2, 7, sizeof(int), &KA)); 226 | CHECK_ERROR(clSetKernelArg(kernel2, 8, sizeof(int), &C)); 227 | CHECK_ERROR(clSetKernelArg(kernel2, 9, sizeof(int), &H)); // assume H == W 228 | size_t gws[] = {JA / CONV_RXY, IA / CONV_RXY, N}; 229 | size_t lws[] = {CONV_LXY, CONV_LXY, 1}; 230 | CHECK_ERROR(clEnqueueNDRangeKernel(queue, kernel2, 3, NULL, gws, lws, 0, NULL, NULL)); 231 | clFinish(queue); 232 | timer_end(0, "mc"); 233 | } 234 | 235 | { 236 | CHECK_ERROR(clSetKernelArg(kernel1, 0, sizeof(cl_mem), &outputs_dev)); 237 | CHECK_ERROR(clSetKernelArg(kernel1, 1, sizeof(cl_mem), &outputs_post_dev)); 238 | CHECK_ERROR(clSetKernelArg(kernel1, 2, sizeof(int), &IA)); 239 | CHECK_ERROR(clSetKernelArg(kernel1, 3, sizeof(int), &JA)); 240 | CHECK_ERROR(clSetKernelArg(kernel1, 4, sizeof(int), &IU)); 241 | CHECK_ERROR(clSetKernelArg(kernel1, 5, sizeof(int), &JU)); 242 | size_t gws[] = {JA, IA, N}; 243 
| size_t lws[] = {CONV_BIJ, 256 / CONV_BIJ, 1}; 244 | CHECK_ERROR(clEnqueueNDRangeKernel(queue, kernel1, 3, NULL, gws, lws, 0, NULL, NULL)); 245 | clFinish(queue); 246 | } 247 | 248 | CHECK_ERROR(clEnqueueReadBuffer(queue, outputs_post_dev, CL_TRUE, 0, sizeof(float) * (N * K * P * Q), outputs, 0, NULL, NULL)); 249 | 250 | clReleaseMemObject(inputs_dev); 251 | clReleaseMemObject(outputs_dev); 252 | clReleaseMemObject(outputs_post_dev); 253 | clReleaseMemObject(filters_dev); 254 | clReleaseMemObject(filters_pre_dev); 255 | clReleaseMemObject(bias_dev); 256 | 257 | clReleaseKernel(kernel0); 258 | clReleaseKernel(kernel1); 259 | clReleaseKernel(kernel2); 260 | } 261 | 262 | void convolution_wino16(float *inputs, float *outputs, float *filters, float *bias, int N, int C, int H, int W, int K, int P, int Q, int R, int S, int pad, cl_context context, cl_command_queue queue, cl_program program) { 263 | cl_kernel kernel0 = clCreateKernel(program, "NCHW2CHWN", &err); 264 | CHECK_ERROR(err); 265 | cl_kernel kernel1 = clCreateKernel(program, "winograd_2x2_3x3_16x16", &err); 266 | CHECK_ERROR(err); 267 | cl_kernel kernel2 = clCreateKernel(program, "CHWN2NCHW", &err); 268 | CHECK_ERROR(err); 269 | 270 | cl_mem inputs_dev = clCreateBuffer(context, 0, sizeof(float) * (N * C * H * W), NULL, &err); 271 | CHECK_ERROR(err); 272 | cl_mem inputs_CHWN_dev = clCreateBuffer(context, 0, sizeof(float) * (N * C * H * W), NULL, &err); 273 | CHECK_ERROR(err); 274 | cl_mem outputs_dev = clCreateBuffer(context, 0, sizeof(float) * (N * K * P * Q), NULL, &err); 275 | CHECK_ERROR(err); 276 | cl_mem outputs_CHWN_dev = clCreateBuffer(context, 0, sizeof(float) * (N * K * P * Q), NULL, &err); 277 | CHECK_ERROR(err); 278 | cl_mem filters_dev = clCreateBuffer(context, 0, sizeof(float) * (K * C * R * S), NULL, &err); 279 | CHECK_ERROR(err); 280 | cl_mem filters_CHWN_dev = clCreateBuffer(context, 0, sizeof(float) * (K * C * R * S), NULL, &err); 281 | CHECK_ERROR(err); 282 | cl_mem bias_dev = 
clCreateBuffer(context, 0, sizeof(float) * (K), NULL, &err); 283 | CHECK_ERROR(err); 284 | 285 | timer_start(0); 286 | CHECK_ERROR(clEnqueueWriteBuffer(queue, inputs_dev, CL_TRUE, 0, sizeof(float) * (N * C * H * W), inputs, 0, NULL, NULL)); 287 | CHECK_ERROR(clEnqueueWriteBuffer(queue, filters_dev, CL_TRUE, 0, sizeof(float) * (K * C * R * S), filters, 0, NULL, NULL)); 288 | CHECK_ERROR(clEnqueueWriteBuffer(queue, bias_dev, CL_TRUE, 0, sizeof(float) * (K), bias, 0, NULL, NULL)); 289 | //timer_end(0, "wino WriteBuffer"); 290 | 291 | { 292 | timer_start(0); 293 | CHECK_ERROR(clSetKernelArg(kernel0, 0, sizeof(cl_mem), &inputs_dev)); 294 | CHECK_ERROR(clSetKernelArg(kernel0, 1, sizeof(cl_mem), &inputs_CHWN_dev)); 295 | CHECK_ERROR(clSetKernelArg(kernel0, 2, sizeof(int), &N)); 296 | CHECK_ERROR(clSetKernelArg(kernel0, 3, sizeof(int), &C)); 297 | CHECK_ERROR(clSetKernelArg(kernel0, 4, sizeof(int), &H)); 298 | CHECK_ERROR(clSetKernelArg(kernel0, 5, sizeof(int), &W)); 299 | size_t gws[1] = {_ceil(N * C * H * W, 256)}; 300 | size_t lws[1] = {256}; 301 | CHECK_ERROR(clEnqueueNDRangeKernel(queue, kernel0, 1, NULL, gws, lws, 0, NULL, NULL)); 302 | clFinish(queue); 303 | //timer_end(0, "wino input transform"); 304 | } 305 | 306 | { 307 | timer_start(0); 308 | CHECK_ERROR(clSetKernelArg(kernel0, 0, sizeof(cl_mem), &filters_dev)); 309 | CHECK_ERROR(clSetKernelArg(kernel0, 1, sizeof(cl_mem), &filters_CHWN_dev)); 310 | CHECK_ERROR(clSetKernelArg(kernel0, 2, sizeof(int), &K)); 311 | CHECK_ERROR(clSetKernelArg(kernel0, 3, sizeof(int), &C)); 312 | CHECK_ERROR(clSetKernelArg(kernel0, 4, sizeof(int), &R)); 313 | CHECK_ERROR(clSetKernelArg(kernel0, 5, sizeof(int), &S)); 314 | size_t gws[1] = {_ceil(K * C * R * S, 256)}; 315 | size_t lws[1] = {256}; 316 | CHECK_ERROR(clEnqueueNDRangeKernel(queue, kernel0, 1, NULL, gws, lws, 0, NULL, NULL)); 317 | clFinish(queue); 318 | //timer_end(0, "wino filter transform"); 319 | } 320 | 321 | { 322 | timer_start(0); 323 | int BN = 1, BK = 1; 324 | int 
TPmask, TPwidth, TPshift, TQmask, TQwidth, TQshift, Nmask, Nwidth; 325 | if (N < 2) 326 | TPmask = 0x0c, TPwidth = 2, TPshift = 2, TQmask = 0x03, TQwidth = 2, TQshift = 0, Nmask = 0x00, Nwidth = 0; 327 | else if (N < 4) 328 | TPmask = 0x08, TPwidth = 1, TPshift = 3, TQmask = 0x06, TQwidth = 2, TQshift = 1, Nmask = 0x01, Nwidth = 1; 329 | else if (N < 8) 330 | TPmask = 0x08, TPwidth = 1, TPshift = 3, TQmask = 0x04, TQwidth = 1, TQshift = 2, Nmask = 0x03, Nwidth = 2; 331 | else if (N < 16) 332 | TPmask = 0x00, TPwidth = 0, TPshift = 0, TQmask = 0x08, TQwidth = 1, TQshift = 3, Nmask = 0x07, Nwidth = 3; 333 | else 334 | TPmask = 0x00, TPwidth = 0, TPshift = 0, TQmask = 0x00, TQwidth = 0, TQshift = 0, Nmask = 0x0f, Nwidth = 4; 335 | int TP = _ceil_div(P, 2 << TPwidth), TQ = _ceil_div(Q, 2 << TQwidth); 336 | int TK = _ceil_div(K, 16), TN = _ceil_div(N, 1 << Nwidth); 337 | 338 | CHECK_ERROR(clSetKernelArg(kernel1, 0, sizeof(cl_mem), &inputs_CHWN_dev)); 339 | CHECK_ERROR(clSetKernelArg(kernel1, 1, sizeof(cl_mem), &outputs_CHWN_dev)); 340 | CHECK_ERROR(clSetKernelArg(kernel1, 2, sizeof(cl_mem), &filters_CHWN_dev)); 341 | CHECK_ERROR(clSetKernelArg(kernel1, 3, sizeof(cl_mem), &bias_dev)); 342 | CHECK_ERROR(clSetKernelArg(kernel1, 4, sizeof(int), &N)); 343 | CHECK_ERROR(clSetKernelArg(kernel1, 5, sizeof(int), &C)); 344 | CHECK_ERROR(clSetKernelArg(kernel1, 6, sizeof(int), &H)); 345 | CHECK_ERROR(clSetKernelArg(kernel1, 7, sizeof(int), &W)); 346 | CHECK_ERROR(clSetKernelArg(kernel1, 8, sizeof(int), &K)); 347 | CHECK_ERROR(clSetKernelArg(kernel1, 9, sizeof(int), &P)); 348 | CHECK_ERROR(clSetKernelArg(kernel1, 10, sizeof(int), &Q)); 349 | CHECK_ERROR(clSetKernelArg(kernel1, 11, sizeof(int), &pad)); 350 | CHECK_ERROR(clSetKernelArg(kernel1, 12, sizeof(int), &TP)); 351 | CHECK_ERROR(clSetKernelArg(kernel1, 13, sizeof(int), &TQ)); 352 | CHECK_ERROR(clSetKernelArg(kernel1, 14, sizeof(int), &BN)); 353 | CHECK_ERROR(clSetKernelArg(kernel1, 15, sizeof(int), &BK)); 354 | 
CHECK_ERROR(clSetKernelArg(kernel1, 16, sizeof(int), &TPmask)); 355 | CHECK_ERROR(clSetKernelArg(kernel1, 17, sizeof(int), &TPwidth)); 356 | CHECK_ERROR(clSetKernelArg(kernel1, 18, sizeof(int), &TPshift)); 357 | CHECK_ERROR(clSetKernelArg(kernel1, 19, sizeof(int), &TQmask)); 358 | CHECK_ERROR(clSetKernelArg(kernel1, 20, sizeof(int), &TQwidth)); 359 | CHECK_ERROR(clSetKernelArg(kernel1, 21, sizeof(int), &TQshift)); 360 | CHECK_ERROR(clSetKernelArg(kernel1, 22, sizeof(int), &Nmask)); 361 | CHECK_ERROR(clSetKernelArg(kernel1, 23, sizeof(int), &Nwidth)); 362 | size_t gws[3] = {TP * TQ * BN * BK * 256, TK / BK, TN / BN}; 363 | size_t lws[3] = {256, 1, 1}; 364 | CHECK_ERROR(clEnqueueNDRangeKernel(queue, kernel1, 3, NULL, gws, lws, 0, NULL, NULL)); 365 | clFinish(queue); 366 | timer_end(0, "wino16"); 367 | } 368 | 369 | { 370 | timer_start(0); 371 | CHECK_ERROR(clSetKernelArg(kernel2, 0, sizeof(cl_mem), &outputs_CHWN_dev)); 372 | CHECK_ERROR(clSetKernelArg(kernel2, 1, sizeof(cl_mem), &outputs_dev)); 373 | CHECK_ERROR(clSetKernelArg(kernel2, 2, sizeof(int), &N)); 374 | CHECK_ERROR(clSetKernelArg(kernel2, 3, sizeof(int), &K)); 375 | CHECK_ERROR(clSetKernelArg(kernel2, 4, sizeof(int), &P)); 376 | CHECK_ERROR(clSetKernelArg(kernel2, 5, sizeof(int), &Q)); 377 | size_t gws[1] = {_ceil(N * K * P * Q, 256)}; 378 | size_t lws[1] = {256}; 379 | CHECK_ERROR(clEnqueueNDRangeKernel(queue, kernel2, 1, NULL, gws, lws, 0, NULL, NULL)); 380 | clFinish(queue); 381 | //timer_end(0, "wino output transform"); 382 | } 383 | 384 | timer_start(0); 385 | CHECK_ERROR(clEnqueueReadBuffer(queue, outputs_dev, CL_TRUE, 0, sizeof(float) * (N * K * P * Q), outputs, 0, NULL, NULL)); 386 | //timer_end(0, "wino ReadBuffer"); 387 | 388 | clReleaseMemObject(inputs_dev); 389 | clReleaseMemObject(inputs_CHWN_dev); 390 | clReleaseMemObject(outputs_dev); 391 | clReleaseMemObject(outputs_CHWN_dev); 392 | clReleaseMemObject(filters_dev); 393 | clReleaseMemObject(filters_CHWN_dev); 394 | 
clReleaseMemObject(bias_dev); 395 | 396 | clReleaseKernel(kernel0); 397 | clReleaseKernel(kernel1); 398 | clReleaseKernel(kernel2); 399 | } 400 | 401 | void convolution_wino32(float *inputs, float *outputs, float *filters, float *bias, int N, int C, int H, int W, int K, int P, int Q, int R, int S, int pad, cl_context context, cl_command_queue queue, cl_program program) { 402 | cl_kernel kernel0 = clCreateKernel(program, "NCHW2CHWN", &err); 403 | CHECK_ERROR(err); 404 | cl_kernel kernel1 = clCreateKernel(program, "winograd_2x2_3x3_32x32", &err); 405 | CHECK_ERROR(err); 406 | cl_kernel kernel2 = clCreateKernel(program, "CHWN2NCHW", &err); 407 | CHECK_ERROR(err); 408 | 409 | cl_mem inputs_dev = clCreateBuffer(context, 0, sizeof(float) * (N * C * H * W), NULL, &err); 410 | CHECK_ERROR(err); 411 | cl_mem inputs_CHWN_dev = clCreateBuffer(context, 0, sizeof(float) * (N * C * H * W), NULL, &err); 412 | CHECK_ERROR(err); 413 | cl_mem outputs_dev = clCreateBuffer(context, 0, sizeof(float) * (N * K * P * Q), NULL, &err); 414 | CHECK_ERROR(err); 415 | cl_mem outputs_CHWN_dev = clCreateBuffer(context, 0, sizeof(float) * (N * K * P * Q), NULL, &err); 416 | CHECK_ERROR(err); 417 | cl_mem filters_dev = clCreateBuffer(context, 0, sizeof(float) * (K * C * R * S), NULL, &err); 418 | CHECK_ERROR(err); 419 | cl_mem filters_CHWN_dev = clCreateBuffer(context, 0, sizeof(float) * (K * C * R * S), NULL, &err); 420 | CHECK_ERROR(err); 421 | cl_mem bias_dev = clCreateBuffer(context, 0, sizeof(float) * (K), NULL, &err); 422 | CHECK_ERROR(err); 423 | 424 | timer_start(0); 425 | CHECK_ERROR(clEnqueueWriteBuffer(queue, inputs_dev, CL_TRUE, 0, sizeof(float) * (N * C * H * W), inputs, 0, NULL, NULL)); 426 | CHECK_ERROR(clEnqueueWriteBuffer(queue, filters_dev, CL_TRUE, 0, sizeof(float) * (K * C * R * S), filters, 0, NULL, NULL)); 427 | CHECK_ERROR(clEnqueueWriteBuffer(queue, bias_dev, CL_TRUE, 0, sizeof(float) * (K), bias, 0, NULL, NULL)); 428 | //timer_end(0, "wino WriteBuffer"); 429 | 430 | { 431 
| timer_start(0); 432 | CHECK_ERROR(clSetKernelArg(kernel0, 0, sizeof(cl_mem), &inputs_dev)); 433 | CHECK_ERROR(clSetKernelArg(kernel0, 1, sizeof(cl_mem), &inputs_CHWN_dev)); 434 | CHECK_ERROR(clSetKernelArg(kernel0, 2, sizeof(int), &N)); 435 | CHECK_ERROR(clSetKernelArg(kernel0, 3, sizeof(int), &C)); 436 | CHECK_ERROR(clSetKernelArg(kernel0, 4, sizeof(int), &H)); 437 | CHECK_ERROR(clSetKernelArg(kernel0, 5, sizeof(int), &W)); 438 | size_t gws[1] = {_ceil(N * C * H * W, 256)}; 439 | size_t lws[1] = {256}; 440 | CHECK_ERROR(clEnqueueNDRangeKernel(queue, kernel0, 1, NULL, gws, lws, 0, NULL, NULL)); 441 | clFinish(queue); 442 | //timer_end(0, "wino input transform"); 443 | } 444 | 445 | { 446 | timer_start(0); 447 | CHECK_ERROR(clSetKernelArg(kernel0, 0, sizeof(cl_mem), &filters_dev)); 448 | CHECK_ERROR(clSetKernelArg(kernel0, 1, sizeof(cl_mem), &filters_CHWN_dev)); 449 | CHECK_ERROR(clSetKernelArg(kernel0, 2, sizeof(int), &K)); 450 | CHECK_ERROR(clSetKernelArg(kernel0, 3, sizeof(int), &C)); 451 | CHECK_ERROR(clSetKernelArg(kernel0, 4, sizeof(int), &R)); 452 | CHECK_ERROR(clSetKernelArg(kernel0, 5, sizeof(int), &S)); 453 | size_t gws[1] = {_ceil(K * C * R * S, 256)}; 454 | size_t lws[1] = {256}; 455 | CHECK_ERROR(clEnqueueNDRangeKernel(queue, kernel0, 1, NULL, gws, lws, 0, NULL, NULL)); 456 | clFinish(queue); 457 | //timer_end(0, "wino filter transform"); 458 | } 459 | 460 | { 461 | timer_start(0); 462 | int BN = 1, BK = 1; 463 | int TPmask, TPwidth, TPshift, TQmask, TQwidth, TQshift, Nmask, Nwidth; 464 | if (N < 2) 465 | TPmask = 0x18, TPwidth = 2, TPshift = 3, TQmask = 0x07, TQwidth = 3, TQshift = 0, Nmask = 0x00, Nwidth = 0; 466 | else if (N < 4) 467 | TPmask = 0x18, TPwidth = 2, TPshift = 3, TQmask = 0x06, TQwidth = 2, TQshift = 1, Nmask = 0x01, Nwidth = 1; 468 | else if (N < 8) 469 | TPmask = 0x10, TPwidth = 1, TPshift = 4, TQmask = 0x0c, TQwidth = 2, TQshift = 2, Nmask = 0x03, Nwidth = 2; 470 | else if (N < 16) 471 | TPmask = 0x10, TPwidth = 1, TPshift = 4, 
TQmask = 0x08, TQwidth = 1, TQshift = 3, Nmask = 0x07, Nwidth = 3; 472 | else if (N < 32) 473 | TPmask = 0x00, TPwidth = 0, TPshift = 0, TQmask = 0x10, TQwidth = 1, TQshift = 4, Nmask = 0x0f, Nwidth = 4; 474 | else 475 | TPmask = 0x00, TPwidth = 0, TPshift = 0, TQmask = 0x00, TQwidth = 0, TQshift = 0, Nmask = 0x1f, Nwidth = 5; 476 | int TP = _ceil_div(P, 2 << TPwidth), TQ = _ceil_div(Q, 2 << TQwidth); 477 | int TK = _ceil_div(K, 32), TN = _ceil_div(N, 1 << Nwidth); 478 | 479 | CHECK_ERROR(clSetKernelArg(kernel1, 0, sizeof(cl_mem), &inputs_CHWN_dev)); 480 | CHECK_ERROR(clSetKernelArg(kernel1, 1, sizeof(cl_mem), &outputs_CHWN_dev)); 481 | CHECK_ERROR(clSetKernelArg(kernel1, 2, sizeof(cl_mem), &filters_CHWN_dev)); 482 | CHECK_ERROR(clSetKernelArg(kernel1, 3, sizeof(cl_mem), &bias_dev)); 483 | CHECK_ERROR(clSetKernelArg(kernel1, 4, sizeof(int), &N)); 484 | CHECK_ERROR(clSetKernelArg(kernel1, 5, sizeof(int), &C)); 485 | CHECK_ERROR(clSetKernelArg(kernel1, 6, sizeof(int), &H)); 486 | CHECK_ERROR(clSetKernelArg(kernel1, 7, sizeof(int), &W)); 487 | CHECK_ERROR(clSetKernelArg(kernel1, 8, sizeof(int), &K)); 488 | CHECK_ERROR(clSetKernelArg(kernel1, 9, sizeof(int), &P)); 489 | CHECK_ERROR(clSetKernelArg(kernel1, 10, sizeof(int), &Q)); 490 | CHECK_ERROR(clSetKernelArg(kernel1, 11, sizeof(int), &pad)); 491 | CHECK_ERROR(clSetKernelArg(kernel1, 12, sizeof(int), &TP)); 492 | CHECK_ERROR(clSetKernelArg(kernel1, 13, sizeof(int), &TQ)); 493 | CHECK_ERROR(clSetKernelArg(kernel1, 14, sizeof(int), &BN)); 494 | CHECK_ERROR(clSetKernelArg(kernel1, 15, sizeof(int), &BK)); 495 | CHECK_ERROR(clSetKernelArg(kernel1, 16, sizeof(int), &TPmask)); 496 | CHECK_ERROR(clSetKernelArg(kernel1, 17, sizeof(int), &TPwidth)); 497 | CHECK_ERROR(clSetKernelArg(kernel1, 18, sizeof(int), &TPshift)); 498 | CHECK_ERROR(clSetKernelArg(kernel1, 19, sizeof(int), &TQmask)); 499 | CHECK_ERROR(clSetKernelArg(kernel1, 20, sizeof(int), &TQwidth)); 500 | CHECK_ERROR(clSetKernelArg(kernel1, 21, sizeof(int), &TQshift)); 
501 | CHECK_ERROR(clSetKernelArg(kernel1, 22, sizeof(int), &Nmask)); 502 | CHECK_ERROR(clSetKernelArg(kernel1, 23, sizeof(int), &Nwidth)); 503 | size_t gws[3] = {TP * TQ * BN * BK * 256, TK / BK, TN / BN}; 504 | size_t lws[3] = {256, 1, 1}; 505 | CHECK_ERROR(clEnqueueNDRangeKernel(queue, kernel1, 3, NULL, gws, lws, 0, NULL, NULL)); 506 | clFinish(queue); 507 | timer_end(0, "wino"); 508 | } 509 | 510 | { 511 | timer_start(0); 512 | CHECK_ERROR(clSetKernelArg(kernel2, 0, sizeof(cl_mem), &outputs_CHWN_dev)); 513 | CHECK_ERROR(clSetKernelArg(kernel2, 1, sizeof(cl_mem), &outputs_dev)); 514 | CHECK_ERROR(clSetKernelArg(kernel2, 2, sizeof(int), &N)); 515 | CHECK_ERROR(clSetKernelArg(kernel2, 3, sizeof(int), &K)); 516 | CHECK_ERROR(clSetKernelArg(kernel2, 4, sizeof(int), &P)); 517 | CHECK_ERROR(clSetKernelArg(kernel2, 5, sizeof(int), &Q)); 518 | size_t gws[1] = {_ceil(N * K * P * Q, 256)}; 519 | size_t lws[1] = {256}; 520 | CHECK_ERROR(clEnqueueNDRangeKernel(queue, kernel2, 1, NULL, gws, lws, 0, NULL, NULL)); 521 | clFinish(queue); 522 | //timer_end(0, "wino output transform"); 523 | } 524 | 525 | timer_start(0); 526 | CHECK_ERROR(clEnqueueReadBuffer(queue, outputs_dev, CL_TRUE, 0, sizeof(float) * (N * K * P * Q), outputs, 0, NULL, NULL)); 527 | //timer_end(0, "wino ReadBuffer"); 528 | 529 | clReleaseMemObject(inputs_dev); 530 | clReleaseMemObject(inputs_CHWN_dev); 531 | clReleaseMemObject(outputs_dev); 532 | clReleaseMemObject(outputs_CHWN_dev); 533 | clReleaseMemObject(filters_dev); 534 | clReleaseMemObject(filters_CHWN_dev); 535 | clReleaseMemObject(bias_dev); 536 | 537 | clReleaseKernel(kernel0); 538 | clReleaseKernel(kernel1); 539 | clReleaseKernel(kernel2); 540 | } 541 | 542 | void convolution_current(float *inputs, float *outputs, float *filters, float *bias, int N, int C, int H, int W, int K, int P, int Q, int R, int S, int pad, cl_context context, cl_command_queue queue, cl_program program) { 543 | cl_kernel kernel = clCreateKernel(program, "CCF", &err); 544 | 
CHECK_ERROR(err); 545 | 546 | cl_mem inputs_dev = clCreateBuffer(context, 0, sizeof(float) * (N * C * H * W), NULL, &err); 547 | CHECK_ERROR(err); 548 | cl_mem outputs_dev = clCreateBuffer(context, 0, sizeof(float) * (N * K * P * Q), NULL, &err); 549 | CHECK_ERROR(err); 550 | cl_mem filters_dev = clCreateBuffer(context, 0, sizeof(float) * (K * C * R * S), NULL, &err); 551 | CHECK_ERROR(err); 552 | cl_mem bias_dev = clCreateBuffer(context, 0, sizeof(float) * (K), NULL, &err); 553 | CHECK_ERROR(err); 554 | 555 | timer_start(0); 556 | CHECK_ERROR(clEnqueueWriteBuffer(queue, inputs_dev, CL_TRUE, 0, sizeof(float) * (N * C * H * W), inputs, 0, NULL, NULL)); 557 | CHECK_ERROR(clEnqueueWriteBuffer(queue, filters_dev, CL_TRUE, 0, sizeof(float) * (K * C * R * S), filters, 0, NULL, NULL)); 558 | CHECK_ERROR(clEnqueueWriteBuffer(queue, bias_dev, CL_TRUE, 0, sizeof(float) * (K), bias, 0, NULL, NULL)); 559 | //timer_end(0, "current WriteBuffer"); 560 | 561 | { 562 | timer_start(0); 563 | CHECK_ERROR(clSetKernelArg(kernel, 0, sizeof(cl_mem), &inputs_dev)); 564 | CHECK_ERROR(clSetKernelArg(kernel, 1, sizeof(cl_mem), &outputs_dev)); 565 | CHECK_ERROR(clSetKernelArg(kernel, 2, sizeof(cl_mem), &filters_dev)); 566 | CHECK_ERROR(clSetKernelArg(kernel, 3, sizeof(cl_mem), &bias_dev)); 567 | CHECK_ERROR(clSetKernelArg(kernel, 4, sizeof(int), &Q)); 568 | CHECK_ERROR(clSetKernelArg(kernel, 5, sizeof(int), &P)); 569 | CHECK_ERROR(clSetKernelArg(kernel, 6, sizeof(int), &K)); 570 | CHECK_ERROR(clSetKernelArg(kernel, 7, sizeof(int), &W)); 571 | CHECK_ERROR(clSetKernelArg(kernel, 8, sizeof(int), &H)); 572 | CHECK_ERROR(clSetKernelArg(kernel, 9, sizeof(int), &C)); 573 | CHECK_ERROR(clSetKernelArg(kernel, 10, sizeof(int), &S)); 574 | CHECK_ERROR(clSetKernelArg(kernel, 11, sizeof(int), &R)); 575 | CHECK_ERROR(clSetKernelArg(kernel, 12, sizeof(int), &pad)); 576 | int stride = 1; 577 | CHECK_ERROR(clSetKernelArg(kernel, 13, sizeof(int), &stride)); 578 | size_t gws[3] = {P * Q, K, N}; 579 | 
CHECK_ERROR(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, gws, NULL, 0, NULL, NULL)); 580 | clFinish(queue); 581 | timer_end(0, "current"); 582 | } 583 | 584 | timer_start(0); 585 | CHECK_ERROR(clEnqueueReadBuffer(queue, outputs_dev, CL_TRUE, 0, sizeof(float) * (N * K * P * Q), outputs, 0, NULL, NULL)); 586 | //timer_end(0, "current ReadBuffer"); 587 | 588 | clReleaseMemObject(inputs_dev); 589 | clReleaseMemObject(outputs_dev); 590 | clReleaseMemObject(filters_dev); 591 | clReleaseMemObject(bias_dev); 592 | 593 | clReleaseKernel(kernel); 594 | } 595 | 596 | void convolution_wino_nonfused_fwd(float *inputs, float *outputs, float *filters, int N, int C, int H, int W, int K, int P, int Q, int R, int S, int pad, cl_context context, cl_command_queue queue, cl_program program) { 597 | cl_kernel kernel0 = clCreateKernel(program, "winograd_2x2_3x3_data_transform", &err); 598 | CHECK_ERROR(err); 599 | cl_kernel kernel1 = clCreateKernel(program, "winograd_2x2_3x3_filter_transform", &err); 600 | CHECK_ERROR(err); 601 | cl_kernel kernel2 = clCreateKernel(program, "winograd_2x2_3x3_inverse_transform", &err); 602 | CHECK_ERROR(err); 603 | 604 | int TP = _ceil_div(P, 2), TQ = _ceil_div(Q, 2); 605 | 606 | cl_mem inputs_dev = clCreateBuffer(context, 0, sizeof(float) * (N * C * H * W), NULL, &err); 607 | CHECK_ERROR(err); 608 | cl_mem inputs_T_dev = clCreateBuffer(context, 0, sizeof(float) * (16 * C * N * TP * TQ), NULL, &err); 609 | CHECK_ERROR(err); 610 | cl_mem filters_dev = clCreateBuffer(context, 0, sizeof(float) * (K * C * R * S), NULL, &err); 611 | CHECK_ERROR(err); 612 | cl_mem filters_T_dev = clCreateBuffer(context, 0, sizeof(float) * (16 * K * C), NULL, &err); 613 | CHECK_ERROR(err); 614 | cl_mem outputs_dev = clCreateBuffer(context, 0, sizeof(float) * (N * K * P * Q), NULL, &err); 615 | CHECK_ERROR(err); 616 | cl_mem outputs_T_dev = clCreateBuffer(context, 0, sizeof(float) * (16 * K * N * TP * TQ), NULL, &err); 617 | CHECK_ERROR(err); 618 | 619 | 
CHECK_ERROR(clEnqueueWriteBuffer(queue, inputs_dev, CL_TRUE, 0, sizeof(float) * (N * C * H * W), inputs, 0, NULL, NULL)); 620 | CHECK_ERROR(clEnqueueWriteBuffer(queue, filters_dev, CL_TRUE, 0, sizeof(float) * (K * C * R * S), filters, 0, NULL, NULL)); 621 | 622 | timer_start(1); 623 | 624 | { 625 | timer_start(0); 626 | CHECK_ERROR(clSetKernelArg(kernel0, 0, sizeof(cl_mem), &inputs_dev)); 627 | CHECK_ERROR(clSetKernelArg(kernel0, 1, sizeof(cl_mem), &inputs_T_dev)); 628 | CHECK_ERROR(clSetKernelArg(kernel0, 2, sizeof(int), &N)); 629 | CHECK_ERROR(clSetKernelArg(kernel0, 3, sizeof(int), &C)); 630 | CHECK_ERROR(clSetKernelArg(kernel0, 4, sizeof(int), &H)); 631 | CHECK_ERROR(clSetKernelArg(kernel0, 5, sizeof(int), &W)); 632 | CHECK_ERROR(clSetKernelArg(kernel0, 6, sizeof(int), &pad)); 633 | CHECK_ERROR(clSetKernelArg(kernel0, 7, sizeof(int), &TP)); 634 | CHECK_ERROR(clSetKernelArg(kernel0, 8, sizeof(int), &TQ)); 635 | size_t gws[1] = {_ceil(N * C * TP * TQ, 256)}; 636 | size_t lws[1] = {256}; 637 | CHECK_ERROR(clEnqueueNDRangeKernel(queue, kernel0, 1, NULL, gws, lws, 0, NULL, NULL)); 638 | clFinish(queue); 639 | timer_end(0, "wino_nonfused data_transform"); 640 | } 641 | 642 | { 643 | timer_start(0); 644 | CHECK_ERROR(clSetKernelArg(kernel1, 0, sizeof(cl_mem), &filters_dev)); 645 | CHECK_ERROR(clSetKernelArg(kernel1, 1, sizeof(cl_mem), &filters_T_dev)); 646 | CHECK_ERROR(clSetKernelArg(kernel1, 2, sizeof(int), &K)); 647 | CHECK_ERROR(clSetKernelArg(kernel1, 3, sizeof(int), &C)); 648 | size_t gws[1] = {_ceil(K * C, 256)}; 649 | size_t lws[1] = {256}; 650 | CHECK_ERROR(clEnqueueNDRangeKernel(queue, kernel1, 1, NULL, gws, lws, 0, NULL, NULL)); 651 | clFinish(queue); 652 | timer_end(0, "wino_nonfused filter_transform"); 653 | } 654 | 655 | { 656 | timer_start(0); 657 | for (int i = 0; i < 16; ++i) { 658 | cl_event event; 659 | err = clblasSgemm(clblasRowMajor, clblasNoTrans, clblasNoTrans, 660 | K, N * TP * TQ, C, 1, 661 | filters_T_dev, i * K * C, C, inputs_T_dev, i * C * 
N * TP * TQ, N * TP * TQ, 662 | 0, outputs_T_dev, i * K * N * TP * TQ, N * TP * TQ, 663 | 1, &queue, 0, NULL, &event); 664 | } 665 | clFinish(queue); 666 | timer_end(0, "wino_nonfused GEMM"); 667 | } 668 | 669 | { 670 | timer_start(0); 671 | CHECK_ERROR(clSetKernelArg(kernel2, 0, sizeof(cl_mem), &outputs_T_dev)); 672 | CHECK_ERROR(clSetKernelArg(kernel2, 1, sizeof(cl_mem), &outputs_dev)); 673 | CHECK_ERROR(clSetKernelArg(kernel2, 2, sizeof(int), &N)); 674 | CHECK_ERROR(clSetKernelArg(kernel2, 3, sizeof(int), &K)); 675 | CHECK_ERROR(clSetKernelArg(kernel2, 4, sizeof(int), &P)); 676 | CHECK_ERROR(clSetKernelArg(kernel2, 5, sizeof(int), &Q)); 677 | CHECK_ERROR(clSetKernelArg(kernel2, 6, sizeof(int), &TP)); 678 | CHECK_ERROR(clSetKernelArg(kernel2, 7, sizeof(int), &TQ)); 679 | size_t gws[1] = {_ceil(K * N * TP * TQ, 256)}; 680 | size_t lws[1] = {256}; 681 | CHECK_ERROR(clEnqueueNDRangeKernel(queue, kernel2, 1, NULL, gws, lws, 0, NULL, NULL)); 682 | clFinish(queue); 683 | timer_end(0, "wino_nonfused inverse_transform"); 684 | } 685 | 686 | clFinish(queue); 687 | timer_end(1, "wino_nonfused"); 688 | 689 | CHECK_ERROR(clEnqueueReadBuffer(queue, outputs_dev, CL_TRUE, 0, sizeof(float) * (N * K * P * Q), outputs, 0, NULL, NULL)); 690 | 691 | clReleaseMemObject(inputs_dev); 692 | clReleaseMemObject(inputs_T_dev); 693 | clReleaseMemObject(outputs_dev); 694 | clReleaseMemObject(outputs_T_dev); 695 | clReleaseMemObject(filters_dev); 696 | clReleaseMemObject(filters_T_dev); 697 | 698 | clReleaseKernel(kernel0); 699 | clReleaseKernel(kernel1); 700 | clReleaseKernel(kernel2); 701 | } 702 | 703 | void convolution_wino_nonfused_bwd_data(float *inputs, float *outputs, float *filters, int N, int C, int H, int W, int K, int P, int Q, int R, int S, int pad, cl_context context, cl_command_queue queue, cl_program program) { 704 | cl_kernel kernel0 = clCreateKernel(program, "winograd_2x2_3x3_data_transform", &err); 705 | CHECK_ERROR(err); 706 | cl_kernel kernel1 = clCreateKernel(program, 
"winograd_2x2_3x3_filter_transform", &err); 707 | CHECK_ERROR(err); 708 | cl_kernel kernel2 = clCreateKernel(program, "winograd_2x2_3x3_inverse_transform", &err); 709 | CHECK_ERROR(err); 710 | cl_kernel kernel3 = clCreateKernel(program, "flip_filter", &err); 711 | CHECK_ERROR(err); 712 | 713 | int TP = _ceil_div(H, 2), TQ = _ceil_div(W, 2); 714 | pad = R - 1 - pad; 715 | 716 | cl_mem inputs_dev = clCreateBuffer(context, 0, sizeof(float) * (N * C * H * W), NULL, &err); 717 | CHECK_ERROR(err); 718 | cl_mem inputs_T_dev = clCreateBuffer(context, 0, sizeof(float) * (16 * C * N * TP * TQ), NULL, &err); 719 | CHECK_ERROR(err); 720 | cl_mem filters_dev = clCreateBuffer(context, 0, sizeof(float) * (K * C * R * S), NULL, &err); 721 | CHECK_ERROR(err); 722 | cl_mem filters_F_dev = clCreateBuffer(context, 0, sizeof(float) * (K * C * R * S), NULL, &err); 723 | CHECK_ERROR(err); 724 | cl_mem filters_T_dev = clCreateBuffer(context, 0, sizeof(float) * (16 * K * C), NULL, &err); 725 | CHECK_ERROR(err); 726 | cl_mem outputs_dev = clCreateBuffer(context, 0, sizeof(float) * (N * K * P * Q), NULL, &err); 727 | CHECK_ERROR(err); 728 | cl_mem outputs_T_dev = clCreateBuffer(context, 0, sizeof(float) * (16 * K * N * TP * TQ), NULL, &err); 729 | CHECK_ERROR(err); 730 | 731 | CHECK_ERROR(clEnqueueWriteBuffer(queue, outputs_dev, CL_TRUE, 0, sizeof(float) * (N * K * P * Q), outputs, 0, NULL, NULL)); 732 | CHECK_ERROR(clEnqueueWriteBuffer(queue, filters_dev, CL_TRUE, 0, sizeof(float) * (K * C * R * S), filters, 0, NULL, NULL)); 733 | 734 | timer_start(1); 735 | 736 | { 737 | timer_start(0); 738 | CHECK_ERROR(clSetKernelArg(kernel0, 0, sizeof(cl_mem), &outputs_dev)); 739 | CHECK_ERROR(clSetKernelArg(kernel0, 1, sizeof(cl_mem), &outputs_T_dev)); 740 | CHECK_ERROR(clSetKernelArg(kernel0, 2, sizeof(int), &N)); 741 | CHECK_ERROR(clSetKernelArg(kernel0, 3, sizeof(int), &K)); 742 | CHECK_ERROR(clSetKernelArg(kernel0, 4, sizeof(int), &P)); 743 | CHECK_ERROR(clSetKernelArg(kernel0, 5, sizeof(int), 
&Q)); 744 | CHECK_ERROR(clSetKernelArg(kernel0, 6, sizeof(int), &pad)); 745 | CHECK_ERROR(clSetKernelArg(kernel0, 7, sizeof(int), &TP)); 746 | CHECK_ERROR(clSetKernelArg(kernel0, 8, sizeof(int), &TQ)); 747 | size_t gws[1] = {_ceil(N * K * TP * TQ, 256)}; 748 | size_t lws[1] = {256}; 749 | CHECK_ERROR(clEnqueueNDRangeKernel(queue, kernel0, 1, NULL, gws, lws, 0, NULL, NULL)); 750 | clFinish(queue); 751 | timer_end(0, "wino_nonfused data_transform"); 752 | } 753 | 754 | { 755 | timer_start(0); 756 | CHECK_ERROR(clSetKernelArg(kernel3, 0, sizeof(cl_mem), &filters_dev)); 757 | CHECK_ERROR(clSetKernelArg(kernel3, 1, sizeof(cl_mem), &filters_F_dev)); 758 | CHECK_ERROR(clSetKernelArg(kernel3, 2, sizeof(int), &K)); 759 | CHECK_ERROR(clSetKernelArg(kernel3, 3, sizeof(int), &C)); 760 | CHECK_ERROR(clSetKernelArg(kernel3, 4, sizeof(int), &R)); 761 | CHECK_ERROR(clSetKernelArg(kernel3, 5, sizeof(int), &S)); 762 | size_t gws[1] = {_ceil(K * C * R * S, 256)}; 763 | size_t lws[1] = {256}; 764 | CHECK_ERROR(clEnqueueNDRangeKernel(queue, kernel3, 1, NULL, gws, lws, 0, NULL, NULL)); 765 | clFinish(queue); 766 | timer_end(0, "wino_nonfused flip_filter"); 767 | } 768 | 769 | { 770 | timer_start(0); 771 | CHECK_ERROR(clSetKernelArg(kernel1, 0, sizeof(cl_mem), &filters_F_dev)); 772 | CHECK_ERROR(clSetKernelArg(kernel1, 1, sizeof(cl_mem), &filters_T_dev)); 773 | CHECK_ERROR(clSetKernelArg(kernel1, 2, sizeof(int), &C)); 774 | CHECK_ERROR(clSetKernelArg(kernel1, 3, sizeof(int), &K)); 775 | size_t gws[1] = {_ceil(C * K, 256)}; 776 | size_t lws[1] = {256}; 777 | CHECK_ERROR(clEnqueueNDRangeKernel(queue, kernel1, 1, NULL, gws, lws, 0, NULL, NULL)); 778 | clFinish(queue); 779 | timer_end(0, "wino_nonfused filter_transform"); 780 | } 781 | 782 | { 783 | timer_start(0); 784 | for (int i = 0; i < 16; ++i) { 785 | cl_event event; 786 | err = clblasSgemm(clblasRowMajor, clblasNoTrans, clblasNoTrans, 787 | C, N * TP * TQ, K, 1, 788 | filters_T_dev, i * C * K, K, outputs_T_dev, i * K * N * TP * TQ, N 
* TP * TQ, 789 | 0, inputs_T_dev, i * C * N * TP * TQ, N * TP * TQ, 790 | 1, &queue, 0, NULL, &event); 791 | } 792 | clFinish(queue); 793 | timer_end(0, "wino_nonfused GEMM"); 794 | } 795 | 796 | { 797 | timer_start(0); 798 | CHECK_ERROR(clSetKernelArg(kernel2, 0, sizeof(cl_mem), &inputs_T_dev)); 799 | CHECK_ERROR(clSetKernelArg(kernel2, 1, sizeof(cl_mem), &inputs_dev)); 800 | CHECK_ERROR(clSetKernelArg(kernel2, 2, sizeof(int), &N)); 801 | CHECK_ERROR(clSetKernelArg(kernel2, 3, sizeof(int), &C)); 802 | CHECK_ERROR(clSetKernelArg(kernel2, 4, sizeof(int), &H)); 803 | CHECK_ERROR(clSetKernelArg(kernel2, 5, sizeof(int), &W)); 804 | CHECK_ERROR(clSetKernelArg(kernel2, 6, sizeof(int), &TP)); 805 | CHECK_ERROR(clSetKernelArg(kernel2, 7, sizeof(int), &TQ)); 806 | size_t gws[1] = {_ceil(C * N * TP * TQ, 256)}; 807 | size_t lws[1] = {256}; 808 | CHECK_ERROR(clEnqueueNDRangeKernel(queue, kernel2, 1, NULL, gws, lws, 0, NULL, NULL)); 809 | clFinish(queue); 810 | timer_end(0, "wino_nonfused inverse_transform"); 811 | } 812 | 813 | clFinish(queue); 814 | timer_end(1, "wino_nonfused"); 815 | 816 | CHECK_ERROR(clEnqueueReadBuffer(queue, inputs_dev, CL_TRUE, 0, sizeof(float) * (N * C * H * W), inputs, 0, NULL, NULL)); 817 | 818 | clReleaseMemObject(inputs_dev); 819 | clReleaseMemObject(inputs_T_dev); 820 | clReleaseMemObject(outputs_dev); 821 | clReleaseMemObject(outputs_T_dev); 822 | clReleaseMemObject(filters_dev); 823 | clReleaseMemObject(filters_F_dev); 824 | clReleaseMemObject(filters_T_dev); 825 | 826 | clReleaseKernel(kernel0); 827 | clReleaseKernel(kernel1); 828 | clReleaseKernel(kernel2); 829 | clReleaseKernel(kernel3); 830 | } 831 | 832 | void convolution_wino_nonfused_bwd_filter(float *inputs, float *outputs, float *filters, int N, int C, int H, int W, int K, int P, int Q, int R, int S, int pad, cl_context context, cl_command_queue queue, cl_program program) { 833 | cl_kernel kernel0 = clCreateKernel(program, "winograd_3x3_2x2_data_transform", &err); 834 | 
CHECK_ERROR(err); 835 | cl_kernel kernel1 = clCreateKernel(program, "winograd_3x3_2x2_filter_transform", &err); 836 | CHECK_ERROR(err); 837 | cl_kernel kernel2 = clCreateKernel(program, "winograd_3x3_2x2_inverse_transform", &err); 838 | CHECK_ERROR(err); 839 | 840 | int TP = _ceil_div(P, 2), TQ = _ceil_div(Q, 2); 841 | 842 | cl_mem inputs_dev = clCreateBuffer(context, 0, sizeof(float) * (N * C * H * W), NULL, &err); 843 | CHECK_ERROR(err); 844 | cl_mem inputs_T_dev = clCreateBuffer(context, 0, sizeof(float) * (16 * C * N * TP * TQ), NULL, &err); 845 | CHECK_ERROR(err); 846 | cl_mem filters_dev = clCreateBuffer(context, 0, sizeof(float) * (K * C * R * S), NULL, &err); 847 | CHECK_ERROR(err); 848 | cl_mem filters_T_dev = clCreateBuffer(context, 0, sizeof(float) * (16 * K * C), NULL, &err); 849 | CHECK_ERROR(err); 850 | cl_mem outputs_dev = clCreateBuffer(context, 0, sizeof(float) * (N * K * P * Q), NULL, &err); 851 | CHECK_ERROR(err); 852 | cl_mem outputs_T_dev = clCreateBuffer(context, 0, sizeof(float) * (16 * K * N * TP * TQ), NULL, &err); 853 | CHECK_ERROR(err); 854 | 855 | CHECK_ERROR(clEnqueueWriteBuffer(queue, inputs_dev, CL_TRUE, 0, sizeof(float) * (N * C * H * W), inputs, 0, NULL, NULL)); 856 | CHECK_ERROR(clEnqueueWriteBuffer(queue, outputs_dev, CL_TRUE, 0, sizeof(float) * (N * K * P * Q), outputs, 0, NULL, NULL)); 857 | 858 | timer_start(1); 859 | 860 | { 861 | timer_start(0); 862 | CHECK_ERROR(clSetKernelArg(kernel0, 0, sizeof(cl_mem), &inputs_dev)); 863 | CHECK_ERROR(clSetKernelArg(kernel0, 1, sizeof(cl_mem), &inputs_T_dev)); 864 | CHECK_ERROR(clSetKernelArg(kernel0, 2, sizeof(int), &N)); 865 | CHECK_ERROR(clSetKernelArg(kernel0, 3, sizeof(int), &C)); 866 | CHECK_ERROR(clSetKernelArg(kernel0, 4, sizeof(int), &H)); 867 | CHECK_ERROR(clSetKernelArg(kernel0, 5, sizeof(int), &W)); 868 | CHECK_ERROR(clSetKernelArg(kernel0, 6, sizeof(int), &pad)); 869 | CHECK_ERROR(clSetKernelArg(kernel0, 7, sizeof(int), &TP)); 870 | CHECK_ERROR(clSetKernelArg(kernel0, 8, 
sizeof(int), &TQ)); 871 | size_t gws[1] = {_ceil(N * C * TP * TQ, 256)}; 872 | size_t lws[1] = {256}; 873 | CHECK_ERROR(clEnqueueNDRangeKernel(queue, kernel0, 1, NULL, gws, lws, 0, NULL, NULL)); 874 | clFinish(queue); 875 | timer_end(0, "wino_nonfused data_transform"); 876 | } 877 | 878 | { 879 | timer_start(0); 880 | CHECK_ERROR(clSetKernelArg(kernel1, 0, sizeof(cl_mem), &outputs_dev)); 881 | CHECK_ERROR(clSetKernelArg(kernel1, 1, sizeof(cl_mem), &outputs_T_dev)); 882 | CHECK_ERROR(clSetKernelArg(kernel1, 2, sizeof(int), &N)); 883 | CHECK_ERROR(clSetKernelArg(kernel1, 3, sizeof(int), &K)); 884 | CHECK_ERROR(clSetKernelArg(kernel1, 4, sizeof(int), &P)); 885 | CHECK_ERROR(clSetKernelArg(kernel1, 5, sizeof(int), &Q)); 886 | CHECK_ERROR(clSetKernelArg(kernel1, 6, sizeof(int), &TP)); 887 | CHECK_ERROR(clSetKernelArg(kernel1, 7, sizeof(int), &TQ)); 888 | size_t gws[1] = {_ceil(N * K * TP * TQ, 256)}; 889 | size_t lws[1] = {256}; 890 | CHECK_ERROR(clEnqueueNDRangeKernel(queue, kernel1, 1, NULL, gws, lws, 0, NULL, NULL)); 891 | clFinish(queue); 892 | timer_end(0, "wino_nonfused filter_transform"); 893 | } 894 | 895 | { 896 | timer_start(0); 897 | for (int i = 0; i < 16; ++i) { 898 | cl_event event; 899 | CHECK_ERROR(clblasSgemm(clblasRowMajor, clblasNoTrans, clblasTrans, 900 | K, C, N * TP * TQ, 1, 901 | outputs_T_dev, i * K * N * TP * TQ, N * TP * TQ, inputs_T_dev, i * C * N * TP * TQ, N * TP * TQ, 902 | 0, filters_T_dev, i * K * C, C, 903 | 1, &queue, 0, NULL, &event)); 904 | } 905 | clFinish(queue); 906 | timer_end(0, "wino_nonfused GEMM"); 907 | } 908 | 909 | { 910 | timer_start(0); 911 | CHECK_ERROR(clSetKernelArg(kernel2, 0, sizeof(cl_mem), &filters_T_dev)); 912 | CHECK_ERROR(clSetKernelArg(kernel2, 1, sizeof(cl_mem), &filters_dev)); 913 | CHECK_ERROR(clSetKernelArg(kernel2, 2, sizeof(int), &K)); 914 | CHECK_ERROR(clSetKernelArg(kernel2, 3, sizeof(int), &C)); 915 | size_t gws[1] = {_ceil(K * C, 256)}; 916 | size_t lws[1] = {256}; 917 | 
CHECK_ERROR(clEnqueueNDRangeKernel(queue, kernel2, 1, NULL, gws, lws, 0, NULL, NULL)); 918 | clFinish(queue); 919 | timer_end(0, "wino_nonfused inverse_transform"); 920 | } 921 | 922 | clFinish(queue); 923 | timer_end(1, "wino_nonfused"); 924 | 925 | CHECK_ERROR(clEnqueueReadBuffer(queue, filters_dev, CL_TRUE, 0, sizeof(float) * (K * C * R * S), filters, 0, NULL, NULL)); 926 | 927 | clReleaseMemObject(inputs_dev); 928 | clReleaseMemObject(inputs_T_dev); 929 | clReleaseMemObject(outputs_dev); 930 | clReleaseMemObject(outputs_T_dev); 931 | clReleaseMemObject(filters_dev); 932 | clReleaseMemObject(filters_T_dev); 933 | 934 | clReleaseKernel(kernel0); 935 | clReleaseKernel(kernel1); 936 | clReleaseKernel(kernel2); 937 | } 938 | 939 | void validate(int N, int C, int H, int W, int K, int P, int Q, int R, int S, int pad, cl_context context, cl_command_queue queue, cl_program program) { 940 | float *inputs = (float*)malloc(sizeof(float) * (N * C * H * W)); 941 | float *filters = (float*)malloc(sizeof(float) * (K * C * R * S)); 942 | float *bias = (float*)malloc(sizeof(float) * (K)); 943 | fillData(inputs, N * C * H * W); 944 | fillData(filters, K * C * R * S); 945 | fillData(bias, K); 946 | //printData(inputs, N, C, H, W, "inputs"); 947 | //printData(filters, K, C, R, S, "filters"); 948 | //printData(bias, 1, 1, 1, K, "bias"); 949 | 950 | float *outputs_cpu = (float*)malloc(sizeof(float) * (N * K * P * Q)); 951 | float *outputs_wino32 = (float*)malloc(sizeof(float) * (N * K * P * Q)); 952 | float *outputs_wino16 = (float*)malloc(sizeof(float) * (N * K * P * Q)); 953 | float *outputs_current = (float*)malloc(sizeof(float) * (N * K * P * Q)); 954 | float *outputs_mc = (float*)malloc(sizeof(float) * (N * K * P * Q)); 955 | float *outputs_wino_nonfused = (float*)malloc(sizeof(float) * (N * K * P * Q)); 956 | 957 | float *dx_cpu = (float*)malloc(sizeof(float) * (N * C * H * W)); 958 | float *dx_wino_nonfused = (float*)malloc(sizeof(float) * (N * C * H * W)); 959 | 960 | float 
*dw_cpu = (float*)malloc(sizeof(float) * (K * C * R * S)); 961 | float *dw_wino_nonfused = (float*)malloc(sizeof(float) * (K * C * R * S)); 962 | 963 | //convolution_cpu(inputs, outputs_cpu, filters, dx_cpu, dw_cpu, N, C, H, W, K, P, Q, R, S, pad); 964 | for (int i = 0; i < 4; ++i) { 965 | //convolution_current(inputs, outputs_current, filters, bias, N, C, H, W, K, P, Q, R, S, pad, context, queue, program); 966 | //convolution_wino32(inputs, outputs_wino32, filters, bias, N, C, H, W, K, P, Q, R, S, pad, context, queue, program); 967 | //convolution_wino16(inputs, outputs_wino16, filters, bias, N, C, H, W, K, P, Q, R, S, pad, context, queue, program); 968 | //convolution_mc(inputs, outputs_mc, filters, bias, N, C, H, W, K, P, Q, R, S, pad, context, queue, program); 969 | convolution_wino_nonfused_fwd(inputs, outputs_wino_nonfused, filters, N, C, H, W, K, P, Q, R, S, pad, context, queue, program); 970 | convolution_wino_nonfused_bwd_data(dx_wino_nonfused, outputs_wino_nonfused, filters, N, C, H, W, K, P, Q, R, S, pad, context, queue, program); 971 | convolution_wino_nonfused_bwd_filter(inputs, outputs_wino_nonfused, dw_wino_nonfused, N, C, H, W, K, P, Q, R, S, pad, context, queue, program); 972 | } 973 | //printData(outputs_current, N, K, P, Q, "outputs_current"); 974 | //printf("!!!!! WINO VALIDATION %s !!!!!\n", equalData(outputs_cpu, outputs_wino, N, K, P, Q) ? "SUCCESS" : "FAIL"); 975 | //printf("!!!!! CURRENT VALIDATION %s !!!!!\n", equalData(outputs_cpu, outputs_current, N, K, P, Q) ? "SUCCESS" : "FAIL"); 976 | //printf("!!!!! WINO32 == CURRENT VALIDATION %s !!!!!\n", equalData(outputs_wino32, outputs_current, N, K, P, Q) ? "SUCCESS" : "FAIL"); 977 | //printf("!!!!! WINO16 == CURRENT VALIDATION %s !!!!!\n", equalData(outputs_wino16, outputs_current, N, K, P, Q) ? "SUCCESS" : "FAIL"); 978 | //printf("!!!!! MC == CURRENT VALIDATION %s !!!!!\n", equalData(outputs_mc, outputs_current, N, K, P, Q) ? "SUCCESS" : "FAIL"); 979 | //printf("!!!!! 
WINO_NONFUSED == CURRENT VALIDATION %s !!!!!\n", equalData(outputs_wino_nonfused, outputs_current, N, K, P, Q) ? "SUCCESS" : "FAIL"); 980 | printf("!!!!! cpu == wino_nonfused VALIDATION FWD %s !!!!!\n", equalData(outputs_cpu, outputs_wino_nonfused, N, K, P, Q) ? "SUCCESS" : "FAIL"); 981 | printf("!!!!! cpu == wino_nonfused VALIDATION BWD DATA %s !!!!!\n", equalData(dx_cpu, dx_wino_nonfused, N, C, H, W) ? "SUCCESS" : "FAIL"); 982 | printf("!!!!! cpu == wino_nonfused VALIDATION BWD FILTER %s !!!!!\n", equalData(dw_cpu, dw_wino_nonfused, K, C, R, S) ? "SUCCESS" : "FAIL"); 983 | 984 | free(inputs); 985 | free(filters); 986 | free(bias); 987 | free(outputs_cpu); 988 | free(outputs_wino32); 989 | free(outputs_wino16); 990 | free(outputs_current); 991 | free(outputs_mc); 992 | free(outputs_wino_nonfused); 993 | free(dx_cpu); 994 | free(dx_wino_nonfused); 995 | free(dw_cpu); 996 | free(dw_wino_nonfused); 997 | } 998 | 999 | int main() { 1000 | //srand(time(NULL)); 1001 | 1002 | cl_platform_id platform; 1003 | err = clGetPlatformIDs(1, &platform, NULL); 1004 | CHECK_ERROR(err); 1005 | 1006 | cl_device_id device; 1007 | err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL); 1008 | CHECK_ERROR(err); 1009 | 1010 | cl_context context; 1011 | context = clCreateContext(NULL, 1, &device, NULL, NULL, &err); 1012 | CHECK_ERROR(err); 1013 | 1014 | cl_command_queue queue; 1015 | queue = clCreateCommandQueue(context, device, 0, &err); 1016 | CHECK_ERROR(err); 1017 | 1018 | CHECK_ERROR(clblasSetup()); 1019 | 1020 | cl_program program = create_and_build_program(context, device, "kernel.cl"); 1021 | 1022 | //validate(1, 1, 4, 4, 1, 2, 2, 3, 3, 0, context, queue, program); 1023 | //validate(1, 1, 8, 8, 1, 6, 6, 3, 3, 0, context, queue, program); 1024 | //validate(1, 1, 3, 3, 1, 1, 1, 3, 3, 0, context, queue, program); 1025 | //validate(1, 1, 15, 15, 1, 13, 13, 3, 3, 0, context, queue, program); 1026 | //validate(33, 63, 17, 17, 63, 17, 17, 3, 3, 1, context, queue, program); 
1027 | //validate(1, 3, 224, 224, 64, 224, 224, 3, 3, 1, context, queue, program); 1028 | //validate(32, 256, 56, 56, 256, 56, 56, 3, 3, 1, context, queue, program); 1029 | //validate(32, 512, 28, 28, 512, 28, 28, 3, 3, 1, context, queue, program); 1030 | //validate(22, 22, 22, 22, 22, 22, 22, 3, 3, 1, context, queue, program); // crazy gemm time 1031 | 1032 | clblasTeardown(); 1033 | 1034 | clReleaseProgram(program); 1035 | clReleaseCommandQueue(queue); 1036 | clReleaseContext(context); 1037 | return 0; 1038 | } 1039 | -------------------------------------------------------------------------------- /timer.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include "timer.h" 3 | 4 | static struct timespec t[8]; 5 | 6 | static int timespec_subtract(struct timespec* result, struct timespec *x, struct timespec *y) { 7 | if (x->tv_nsec < y->tv_nsec) { 8 | int nsec = (y->tv_nsec - x->tv_nsec) / 1000000000 + 1; 9 | y->tv_nsec -= 1000000000 * nsec; 10 | y->tv_sec += nsec; 11 | } 12 | if (x->tv_nsec - y->tv_nsec > 1000000000) { 13 | int nsec = (x->tv_nsec - y->tv_nsec) / 1000000000; 14 | y->tv_nsec += 1000000000 * nsec; 15 | y->tv_sec -= nsec; 16 | } 17 | result->tv_sec = x->tv_sec - y->tv_sec; 18 | result->tv_nsec = x->tv_nsec - y->tv_nsec; 19 | return x->tv_sec < y->tv_sec; 20 | } 21 | 22 | void timer_start(int id) { 23 | clock_gettime(CLOCK_MONOTONIC, &t[id]); 24 | } 25 | 26 | double timer_end(int id, const char *s) { 27 | struct timespec x, y; 28 | clock_gettime(CLOCK_MONOTONIC, &x); 29 | timespec_subtract(&y, &x, &t[id]); 30 | double elapsed = y.tv_sec * 1e3 + y.tv_nsec / 1e6; 31 | printf("[%s] %f ms\n", s, elapsed); 32 | return elapsed; 33 | } 34 | -------------------------------------------------------------------------------- /timer.h: -------------------------------------------------------------------------------- 1 | #ifndef TIMER_H 2 | #define TIMER_H 3 | 4 | #include 5 | 6 | void timer_start(int id); 7 | double 
timer_end(int id, const char *s); 8 | 9 | #endif 10 | --------------------------------------------------------------------------------