├── readme.md
├── vanilla_conv_cpu
│   └── vanila_convolution.cpp
├── xnor_cpu
│   └── xnor_cpu_final.cpp
└── xnor_gpu
    └── xnor_gpu_v2.cu

/readme.md:
--------------------------------------------------------------------------------
1 | This repository contains companion code for the following preprint:
2 | 
3 | M.C. Kaya, A. İnci, A. Temizel, "Optimization of XNOR Convolution for Binary
4 | Convolutional Neural Networks on GPU", arXiv:2007.14178, July 2020.
5 | 
6 | If you use this code, please cite the paper using the BibTeX entry below:
7 | 
8 | [[Paper]](https://arxiv.org/abs/2007.14178)
9 | 
10 | Authors:
11 | Mete Can KAYA,
12 | Alperen İnci and
13 | [Alptekin Temizel](https://scholar.google.com.tr/citations?user=3grTeasAAAAJ&hl=en)
14 | Affiliation: METU
15 | 
16 | 
17 | ## Versions
18 | 
19 | CUDA 10.1, C++11, Python 3.6.7 and TensorFlow v1.13.1.
20 | 
21 | ## Citation
22 | @misc{kaya2020optimization,
23 |   title={Optimization of XNOR Convolution for Binary Convolutional Neural
24 |   Networks on GPU},
25 |   author={Mete Can Kaya and Alperen İnci and Alptekin Temizel},
26 |   year={2020},
27 |   eprint={2007.14178},
28 |   archivePrefix={arXiv},
29 |   primaryClass={cs.CV}
30 | }
31 | 
--------------------------------------------------------------------------------
/vanilla_conv_cpu/vanila_convolution.cpp:
--------------------------------------------------------------------------------
1 | #include <vector>
2 | #include <chrono>
3 | #include <iostream>
4 | #include <omp.h>
5 | 
6 | 
7 | template <typename T>
8 | using weight_matrices = std::vector<std::vector<std::vector<std::vector<std::vector<T>>>>>;
9 | 
10 | template <typename T>
11 | using matrix_4d = std::vector<std::vector<std::vector<std::vector<T>>>>;
12 | 
13 | template <typename T>
14 | using matrix_3d = std::vector<std::vector<std::vector<T>>>;
15 | 
16 | template <typename T>
17 | using matrix_2d = std::vector<std::vector<T>>;
18 | 
19 | template <typename T>
20 | using matrix_1d = std::vector<T>;
21 | 
22 | template <typename T>
23 | std::pair<int, int> get_matrix_shape(matrix_2d<T> matrix){
24 | 
25 |     int height = matrix.size();
26 |     int width = matrix[0].size();
27 |     return std::make_pair(height, width);
28 | }
29 | 
30 | static double total_time = 0;
31 | 
32 | template <typename T>
33 | void zero_padding_2D(matrix_2d<T> &input_mat, matrix_2d<T> &output_mat, std::pair<int, int> input_size, std::pair<int, int> kernel_size)
34 | {
35 | 
36 |     for(int i=0; i < input_size.first; i++)
37 |     {
38 |         for(int j=0; j < input_size.second; j++)
39 |         {
40 |             output_mat[i + (kernel_size.first - 1)/2][j + (kernel_size.second - 1)/2] = input_mat[i][j];
41 |         }
42 |     }
43 | }
44 | 
45 | 
46 | template <typename T>
47 | void zero_initialize_2D(matrix_2d<T> &output_mat, std::pair<int, int> output_size)
48 | {
49 |     matrix_1d<T> dummy;
50 |     for(int i=0; i < output_size.first; i++)
51 |     {
52 | 
53 |         for(int j=0; j < output_size.second; j++)
54 |         {
55 |             output_mat[i][j] = 0;
56 |         }
57 | 
58 |     }
59 | }
60 | 
61 | 
62 | template <typename T>
63 | void sum(matrix_2d<T> &out, matrix_2d<T> in)
64 | {
65 |     std::pair<int, int> out_size = get_matrix_shape(out);
66 |     //std::cout << in.size() << std::endl;
67 |     //std::cout << in[0].size() << std::endl;
68 |     for (int i = 0; i < out_size.first; i++)
69 |     {
70 |         // in channel
71 |         for (int j = 0; j < out_size.second; j++)
72 |         {
73 |             out[i][j] += in[i][j];
74 |         }
75 |     }
76 | 
77 | }
78 | 
79 | 
80 | template <typename T>
81 | matrix_2d<T> conv_op(matrix_2d<T> input_matrix, matrix_2d<T> kernel_matrix)
82 | {
83 | 
84 |     std::pair<int, int> kernel_size = get_matrix_shape(kernel_matrix);
85 |     std::pair<int, int> input_size = get_matrix_shape(input_matrix);
86 | 
87 |     matrix_2d<T> output_matrix(input_size.first, matrix_1d<T>(input_size.second, 0));
88 |     zero_initialize_2D(output_matrix, input_size);
89 | 
90 |     //matrix_2d<T> padded_matrix(input_size.first + kernel_size.first - 1, input_size.second + kernel_size.second - 1);
91 | 
92 |     matrix_2d<T> padded_matrix(input_size.first + kernel_size.first - 1, matrix_1d<T>(input_size.second + kernel_size.second - 1, 0));
93 |     zero_padding_2D(input_matrix, padded_matrix, input_size, kernel_size);
94 |     std::pair<int, int> padded_size = get_matrix_shape(padded_matrix);
95 | 
96 | 
97 |     int i;
98 |     auto start = std::chrono::high_resolution_clock::now();
99 |     #pragma omp parallel private(i) shared(padded_matrix, kernel_matrix, output_matrix)
100 |     {
101 |         #pragma omp for schedule(dynamic,50) collapse(1)
102 | 
103 |         for (i = 0; i < input_size.first; i++)
104 |         {
105 |             for (int j = 0; j < input_size.second; j++)
106 |             {
107 |                 for (int k = 0; k < kernel_size.first; k++)
108 |                 {
109 |                     for (int z = 0; z < kernel_size.second; z++)
110 |                     {
111 |                         output_matrix[i][j] += padded_matrix[i+k][j+z] * kernel_matrix[k][z];
112 |                     }
113 |                 }
114 |             }
115 | 
116 |         }
117 |     }
118 |     auto stop = std::chrono::high_resolution_clock::now();
119 |     std::chrono::duration<double> multi_core(stop - start);
120 |     //std::cout<<"Time spend for Convolution :"<< multi_core.count()<< std::endl;
121 |     total_time += static_cast<double>(multi_core.count());
122 |     return output_matrix;
123 | 
124 | }
125 | 
126 | 
127 | template <typename T>
128 | matrix_3d<T> conv2D(matrix_3d<T> input_matrix, matrix_4d<T> weight_matrix, int row, int col){
129 | 
130 |     std::pair<int, int> channel_dims = get_matrix_shape(weight_matrix);
131 |     std::pair<int, int> kernel_dims = get_matrix_shape(weight_matrix[0][0]);
132 |     matrix_3d<T> output_matrix;
133 | 
134 |     // out channel
135 |     for (int i = 0; i < channel_dims.first; i++)
136 |     {
137 |         // in channel
138 | 
139 |         matrix_2d<T> out(row, matrix_1d<T>(col, 0));
140 |         zero_initialize_2D(out, kernel_dims);
141 |         for (int j = 0; j < channel_dims.second; j++)
142 |         {
143 |             sum(out, conv_op(input_matrix[j], weight_matrix[i][j]));
144 |         }
145 | 
146 |         output_matrix.push_back(out);
147 |         out.clear();
148 |     }
149 |     return output_matrix;
150 | }
151 | 
152 | 
153 | int main()
154 | {
155 |     int row[] = {128, 256, 512, 1024, 2048};
156 |     int col[] = {128, 256, 512, 1024, 2048};
157 |     int kernel = 3;
158 |     int channel_in = 1;
159 |     int channel_out = 1;
160 |     int total_test_count = 100;
161 |     for(int i=0; i<5; ++i)
162 |     {
163 |         for (int j=0; j<total_test_count; ++j)
164 |         {
165 | 
166 |             matrix_3d<float> input_matrix(channel_in, matrix_2d<float>(row[i], std::vector<float>(col[i], 0)));
167 |             matrix_4d<float> weight_matrix(channel_out, matrix_3d<float>(channel_in, std::vector<std::vector<float>>(kernel, std::vector<float>(kernel, 0))));
168 |             auto output = conv2D(input_matrix, weight_matrix, row[i], col[i]);
169 |         }
170 |         total_time = total_time / static_cast<double>(total_test_count);
171 |         std::cout << "Averaged Time " << total_time << " For Image Size " << row[i] << std::endl;
172 |         total_time = 0;
173 |     }
174 |     return 0;
175 | }
--------------------------------------------------------------------------------
/xnor_cpu/xnor_cpu_final.cpp:
--------------------------------------------------------------------------------
1 | #include 
2 | #include 
3 | #include 
4 | #include 
5 | #include 
6 | #include 
7 | #include 
8 | #include 
9 | #include 
10 | #include 
11 | 
12 | // Indices are defined as (x, y) coordinate pairs
13 | constexpr std::pair register_size(8, 8);
14 | 
15 | static double total_time = 0;
16 | 
17 | 
18 | // Custom Matrix class using standard vector.
19 | // #TODO Add padding 20 | // #TODO add type conversion function for matrix and tensor 21 | template 22 | class Matrix { 23 | 24 | typedef std::vector Row; 25 | 26 | 27 | public: 28 | std::vector data; 29 | 30 | int col, row; 31 | Matrix(int c, int r): row(r), col(c), data(c, std::vector(r, 0)){} 32 | Matrix(int c, int r, std::vector data): row(r), col(c), data(data){} 33 | 34 | 35 | Row & operator[](int i) 36 | { 37 | return data[i]; 38 | } 39 | Matrix operator*(T scalar) 40 | { 41 | Matrix mat(col, row); 42 | for (int j=0; col>j; j++) 43 | { 44 | for (int i=0; row>i; i++) 45 | { 46 | mat[j][i] = this->data[j][i] * scalar; 47 | 48 | } 49 | } 50 | return mat; 51 | 52 | } 53 | Matrix &operator*=(T scalar) 54 | { 55 | for (int j=0; col>j; j++) 56 | { 57 | for (int i=0; row>i; i++) 58 | { 59 | this->data[j][i] = this->data[j][i] * scalar; 60 | 61 | } 62 | } 63 | } 64 | Matrix operator*(Matrix matrix) 65 | { 66 | Matrix mat(col, row); 67 | for (int j=0; col>j; j++) 68 | { 69 | for (int i=0; row>i; i++) 70 | { 71 | mat[j][i] = this->data[j][i] * matrix[j][i]; 72 | 73 | } 74 | } 75 | return mat; 76 | 77 | } 78 | Matrix &operator*=(Matrix matrix) 79 | { 80 | for (int j=0; col>j; j++) 81 | { 82 | for (int i=0; row>i; i++) 83 | { 84 | this->data[j][i] = this->data[j][i] * matrix[j][i]; 85 | 86 | } 87 | } 88 | } 89 | size_t size() 90 | { 91 | return this->data.size(); 92 | } 93 | }; 94 | 95 | 96 | 97 | 98 | // Custom 3D tensor class using custom Matrix class. 99 | // Scalar multiplication and element-vise multiplication using matrix class 100 | // Hence using Openmp for multi threading with independent matrix objects 101 | template 102 | class Tensor{ 103 | typedef Matrix Mat; 104 | 105 | 106 | public: 107 | std::vector tensor; 108 | int col, row, channel; 109 | Tensor(int col, int row, int channel): col(col), row(row), channel(channel), tensor(channel, Matrix(col, row)) {} 110 | 111 | Tensor(int col, int row, int channel, std::vector tensor): col(col), row(row), channel(channel), tensor(tensor){} 112 | 113 | 114 | Mat & operator[](int c) 115 | { 116 | return tensor[c]; 117 | } 118 | Tensor operator*(T scalar) 119 | { 120 | Tensor tensor(col, row, channel); 121 | int k; 122 | #pragma omp parallel private(k) shared(scalar, tensor) 123 | { 124 | #pragma omp for schedule(dynamic, 50) 125 | for (k=0; channel>k; k++) 126 | { 127 | tensor[k] = this->tensor[k] * scalar; 128 | } 129 | } 130 | return tensor; 131 | 132 | 133 | 134 | } 135 | Tensor &operator*=(T scalar) 136 | { 137 | int k; 138 | #pragma omp parallel private(k) shared(scalar) 139 | { 140 | #pragma omp for schedule(dynamic,50) 141 | for (int k=0; channel>k; k++) 142 | { 143 | this->tensor[k] = this->tensor[k] * scalar; 144 | } 145 | } 146 | } 147 | 148 | Tensor &operator*=(std::vector row) 149 | { 150 | int k; 151 | #pragma omp parallel private(k) shared(row) 152 | { 153 | #pragma omp for schedule(dynamic,50) 154 | for (int k=0; channel>k; k++) 155 | { 156 | this->tensor[k] = this->tensor[k] * row[k]; 157 | } 158 | } 159 | } 160 | 161 | Tensor &operator*=(Matrix mat) 162 | { 163 | int k; 164 | #pragma omp parallel private(k) shared(mat) 165 | { 166 | #pragma omp for schedule(dynamic,50) 167 | for (int k=0; channel>k; k++) 168 | { 169 | this->tensor[k] = this->tensor[k] * mat; 170 | } 171 | } 172 | } 173 | 174 | Tensor &operator*=(Tensor tensor) 175 | { 176 | 177 | int k; 178 | #pragma omp parallel private(k) shared(tensor) 179 | { 180 | #pragma omp for schedule(dynamic,50) 181 | for (int k=0; channel>k; k++) 182 | { 183 | this->tensor[k] = 
this->tensor[k] * tensor[k]; 184 | } 185 | 186 | } 187 | } 188 | Tensor operator*(Tensor tensor) 189 | { 190 | Tensor ten(col, row, channel); 191 | int k; 192 | #pragma omp parallel private(k) shared(tensor) 193 | { 194 | #pragma omp for schedule(dynamic,50) 195 | for (int k=0; channel>k; k++) 196 | { 197 | ten[k] = this->tensor[k] * tensor[k]; 198 | } 199 | 200 | } 201 | return ten; 202 | } 203 | 204 | size_t size() 205 | { 206 | return this->tensor.size(); 207 | } 208 | }; 209 | 210 | template 211 | class Weight{ 212 | typedef Tensor tensor; 213 | 214 | public: 215 | std::vector weight; 216 | int col, row, channel_in, channel_out; 217 | 218 | Weight(int col, int row, int channel_in, int channel_out): col(col), row(row), channel_in(channel_in), 219 | channel_out(channel_out), weight(channel_out, Tensor(col, row, channel_in)) {} 220 | 221 | Weight(int col, int row, int channel_in, int channel_out, std::vector weight): col(col), row(row), channel_in(channel_in), 222 | channel_out(channel_out), weight( weight) {} 223 | 224 | tensor & operator[](int c) 225 | { 226 | return weight[c]; 227 | } 228 | size_t size() 229 | { 230 | return this->weight.size(); 231 | } 232 | 233 | }; 234 | 235 | void countSetBits(int x, int &y) 236 | { 237 | 238 | while (x) { 239 | y += x & 1; 240 | x >>= 1; 241 | } 242 | } 243 | 244 | 245 | 246 | void recursive_hash_map(std::vector &key_vector, std::unordered_map &hash_map, 247 | int &iteration_count, int count_index, const std::pairkernel_size) 248 | { 249 | for (int i=0; i < iteration_count; i++) 250 | { 251 | key_vector[count_index] = i; 252 | if (count_index == key_vector.size() - 1) 253 | { 254 | int key_value = 0; 255 | int bit_count = 0; 256 | for(int j=0; j generate_hash_map(std::pair &kernel_size) 273 | { 274 | std::unordered_map hash_map; 275 | std::vector key_vector(kernel_size.second); 276 | int iteration_count = std::pow(2, kernel_size.first); 277 | int count_index = 0; 278 | recursive_hash_map(key_vector, hash_map, iteration_count, count_index, kernel_size); 279 | return hash_map; 280 | } 281 | 282 | template 283 | Matrix BinaryMatMemoryAllocation( std::pair input_size, std::pair kernel_size) 284 | { 285 | int size_x = ceil((input_size.first - register_size.first) 286 | /static_cast(register_size.first + 1 - kernel_size.first) + 1); 287 | int size_y = ceil((input_size.second - register_size.second ) 288 | /static_cast(register_size.second + 1 - kernel_size.second) + 1); 289 | if (size_x < 0) 290 | size_x = 1; 291 | if (size_y < 0) 292 | size_y = 1; 293 | 294 | Matrix binary_mat(size_y, size_x); 295 | return binary_mat; 296 | } 297 | 298 | 299 | template 300 | void int2binary(const std::vector input_x, const std::pair input_index, 301 | std::pair output_location, unsigned long int &output_y, const std::pairregister_size) 302 | { 303 | int sign = 0; 304 | long int pozitive = 1; 305 | long int negative = 0; 306 | int count = output_location.second * register_size.second + output_location.first; 307 | 308 | assert(count < register_size.second * register_size.first); 309 | 310 | for(int i=0; i 0) - (input_x[input_index.first + i] < 0); 313 | if (sign == 1) 314 | { 315 | output_y = pozitive<= input_x.size()) 326 | { 327 | break; 328 | } 329 | count++; 330 | 331 | } 332 | 333 | } 334 | 335 | 336 | template 337 | void intMat2BinaryMat(Matrix &input_mat, Matrix &binary_mat, std::pair &kernel_size) 338 | { 339 | 340 | int index_x = 0; 341 | int index_y = 0; 342 | std::pair input_index(0, 0); 343 | std::pair output_location(0, 0); 344 | 345 | // Test 346 | 
while(input_mat.size() >= input_index.second) 347 | { 348 | std::vector input_row = input_mat[input_index.second]; 349 | int i = 0; 350 | input_index.first = 0; 351 | index_x = 0; 352 | 353 | while(input_row.size() > i) 354 | { 355 | i = input_index.first + register_size.first; 356 | int2binary(input_row, input_index, output_location, binary_mat[index_y][index_x], register_size); 357 | input_index.first = input_index.first + register_size.first + 1 - kernel_size.first; 358 | index_x++; 359 | 360 | } 361 | output_location.second++; 362 | input_index.second++; 363 | if(input_index.second >= input_mat.size()) 364 | { 365 | break; 366 | } 367 | if (output_location.second % register_size.second == 0) 368 | { 369 | output_location.second = 0; 370 | input_index.second = input_index.second + 1 - kernel_size.second; 371 | index_y++; 372 | } 373 | } 374 | } 375 | template 376 | Matrix tensorChannelSum(Tensor &input_tensor) 377 | { 378 | Matrix output_mat(input_tensor.col, input_tensor.row); 379 | for(int k=0; input_tensor.row > k; k++) 380 | { 381 | for(int j=0; input_tensor.col>j; j++) 382 | { 383 | int sum = 0; 384 | for (int i=0; input_tensor.channel>i; i++) 385 | { 386 | sum += input_tensor[i][k][j]; 387 | } 388 | output_mat[k][j] = static_cast(sum) / input_tensor.channel; 389 | } 390 | } 391 | return output_mat; 392 | } 393 | 394 | template 395 | void tensorChannelSum(Tensor &input_tensor, Matrix &output_matrix) 396 | { 397 | for(int k=0; input_tensor.row > k; k++) 398 | { 399 | for(int j=0; input_tensor.col>j; j++) 400 | { 401 | int sum = 0; 402 | for (int i=0; input_tensor.channel>i; i++) 403 | { 404 | sum += input_tensor[i][k][j]; 405 | } 406 | output_matrix[k][j] = static_cast(sum) / input_tensor.channel; 407 | } 408 | } 409 | } 410 | 411 | void cellConv2D(unsigned long input_mat, unsigned long conv_kernel, const unsigned long mask, 412 | std::pair conv_iter, std::pair output_index, std::pair image_size, 413 | Matrix &output_mat) 414 | { 415 | const std::pair input_index(0, 0); 416 | long int shifter = 0; 417 | 418 | int sign = 0; 419 | int index_x = 0; 420 | 421 | // iteration parameters for convolution kernel 422 | // X axis calculation 423 | if ( output_index.first + register_size.first < image_size.first) 424 | { 425 | conv_iter.first = conv_iter.first; 426 | } 427 | else 428 | { 429 | conv_iter.first = conv_iter.first - (register_size.first + output_index.first - image_size.first); 430 | } 431 | // Y axis calculation 432 | if ((output_index.second + register_size.second) < image_size.second) 433 | { 434 | conv_iter.second = conv_iter.second; 435 | } 436 | else 437 | { 438 | conv_iter.second = conv_iter.second - (register_size.second + output_index.second - image_size.second); 439 | } 440 | unsigned long int shift = 0; 441 | for (int j=0; conv_iter.second > j; j++) 442 | { 443 | for (int i=0; conv_iter.first > i; i++) 444 | { 445 | // Convolution operation here 446 | shifter = i + j * register_size.first; 447 | output_mat[output_index.second + j][output_index.first + i] = (input_mat | (conv_kernel>>shifter))&mask; // This is wrong, input matrix must be shifted. 
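// A possible correction for the line above, following the remark that the input matrix must
// be shifted and mirroring binaryConv2d in xnor_gpu_v2.cu (which shifts the packed input
// register before the XNOR instead of OR-ing in the shifted kernel):
//   output_mat[output_index.second + j][output_index.first + i] =
//       (~((input_mat >> shifter) ^ conv_kernel)) & mask;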
448 | } 449 | } 450 | 451 | } 452 | 453 | 454 | 455 | void binaryConv2D(Matrix input_mat, Matrix &output_mat, 456 | unsigned long conv_kernel, std::pair conv_size, 457 | std::pair image_size) 458 | { 459 | 460 | unsigned long int mask = 0; 461 | mask = std::pow(2, conv_size.first) - 1; 462 | for (int j=1; conv_size.second > j ; j++) 463 | { 464 | mask = (mask<(std::pow(2, conv_size.first) - 1); 465 | } 466 | // mask = 1110000011100000111 = 2^3 -1 - 2^8 + 2^11 - 2^ 467 | 468 | const int conv_per_row = register_size.first - (conv_size.first - 1); 469 | const int conv_per_column = register_size.second - (conv_size.second - 1); 470 | std::pair conv_iter = std::make_pair(conv_per_row, conv_per_column); 471 | std::pair output_index(0, 0); 472 | for(int j=0; input_mat.size()>j; j++) 473 | { 474 | output_index.first = 0; 475 | for(int i=0; input_mat[0].size()>i; i++) 476 | { 477 | cellConv2D(input_mat[j][i], conv_kernel, mask, 478 | conv_iter, output_index, image_size, 479 | output_mat); 480 | output_index.first += conv_per_row; 481 | } 482 | output_index.second += conv_per_column; 483 | 484 | } 485 | } 486 | template 487 | void conv2D(Matrix &input_x, Matrix &output_y, std::pairkernel_size) 488 | { 489 | std::vector> scaled_x(input_x.size(), std::vector(input_x[0].size(), 0)); 490 | 491 | float k = 1.0 / (input_x.size() * input_x[0].size()); 492 | 493 | for (int j=0; input_x.size()>j; j++) 494 | { 495 | for (int i=0; input_x[0].size()>i; i++) 496 | { 497 | scaled_x[j][i] = input_x[j][i] * k; 498 | for(int y=(1 - kernel_size.second)/2 ; (kernel_size.second + 1)/2 > y; y++) 499 | { 500 | if ( ((j+ y)<0) || ((j + y )>= static_cast(output_y.size())) ) 501 | { 502 | break; 503 | } 504 | for(int x=(1- kernel_size.first)/2 ; (kernel_size.first + 1)/2 > x; x++) 505 | { 506 | 507 | if (((i+ x)<0) || ((i + x)>=static_cast(output_y[0].size()) ) ) 508 | { 509 | break; 510 | } 511 | output_y[j+y][i+x] += scaled_x[j][i]; 512 | } 513 | 514 | } 515 | 516 | } 517 | } 518 | } 519 | 520 | void binaryMat2IntMat(Matrix &input_x) 521 | { 522 | for(int j=0; input_x.size()>j; j++) 523 | { 524 | for (int i=0; input_x[j].size()>i; i++) 525 | { 526 | input_x[j][i] = __builtin_popcountll(input_x[j][i]); 527 | } 528 | } 529 | } 530 | // output matrix must be have zero as defualt value 531 | template 532 | void ZeroPadding2D(Matrix &input_mat, Matrix &output_mat, std::pair kernel_size) 533 | { 534 | for(int j=0; input_mat.row>j; j++) 535 | { 536 | for(int i=0; input_mat.col>i; i++) 537 | { 538 | output_mat[j + (kernel_size.second - 1)/2][i + (kernel_size.first - 1)/2] = input_mat[j][i]; 539 | } 540 | } 541 | } 542 | // #TODO# learn std::sharedptr, Rvalue refence and std::move 543 | 544 | // A Xnor Convolution layer is made of: 545 | // 1- int input matrix and weight matrix to binary input matrix 546 | // 2- Calculate K matrix ([1,3], and 2 can be concurrent execution) 547 | // 3- Binary Convolution 548 | // 4- Hash table conversion 549 | // 5- Output of Hash table * K matrix 550 | // 6- scalar * output of [5] 551 | // 7- Result Matrix 552 | 553 | // A xnor convolution function needs input and weight matrix, alpha, hash table(Can calculate inside but performance) 554 | // A xnor convolution outputs result matrix 555 | // xnor_convolution does not include pooling if needed ended padded input image. 556 | // A padding can be added to xnor_convolution to increase performance 557 | // however it may cause some unstabilities and need testing and more time. 
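// Illustrative sketch of the arithmetic behind steps 3 and 4 above (not a drop-in part of
// this layer): with +1/-1 values packed one bit per position (1 for +1, 0 for -1), an XNOR
// followed by a popcount over the masked positions recovers the dot product, matching what
// binary2int computes in xnor_gpu_v2.cu:
//   dot(a, b) = 2 * popcount(~(a ^ b) & mask) - n
// The helper and its names (xnor_dot, packed_a, packed_b, n) are local to this sketch.
static inline int xnor_dot(unsigned long packed_a, unsigned long packed_b,
                           unsigned long mask, int n)
{
    unsigned long matches = (~(packed_a ^ packed_b)) & mask; // bit set where the signs agree
    return 2 * __builtin_popcountll(matches) - n;            // (#agreements) - (#disagreements)
}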
558 | 559 | 560 | template 561 | void MatTypeCasting(Matrix& input_mat, Matrix& output_mat) 562 | { 563 | 564 | for (int j=0; input_mat.col>j; j++) 565 | { 566 | for(int i=0; input_mat.row>i; i++) 567 | { 568 | output_mat[j][i] = static_cast(input_mat[j][i]); 569 | } 570 | } 571 | } 572 | 573 | template 574 | void xnor_convoltion_op(Matrix input_matrix, Matrix &output_matrix, Matrix &weight_matrix, 575 | bool padding = true) 576 | { 577 | auto kernel_size = std::make_pair(weight_matrix.col, weight_matrix.row); 578 | if (padding == true) 579 | { 580 | Matrix padded_matrix(input_matrix.row + kernel_size.first -1, input_matrix.col + kernel_size.second -1); 581 | ZeroPadding2D(input_matrix, padded_matrix, kernel_size); 582 | input_matrix = padded_matrix; 583 | } 584 | 585 | 586 | auto binary_input_matrix = BinaryMatMemoryAllocation(std::make_pair(input_matrix.row, input_matrix.col), kernel_size); 587 | auto binary_weight_matrix = BinaryMatMemoryAllocation(std::make_pair(weight_matrix.row, weight_matrix.col), kernel_size); 588 | { 589 | auto start = std::chrono::high_resolution_clock::now(); 590 | intMat2BinaryMat(input_matrix, binary_input_matrix, kernel_size); 591 | auto stop = std::chrono::high_resolution_clock::now(); 592 | std::chrono::duration multi_core(stop - start); 593 | 594 | total_time += multi_core.count(); 595 | //std::cout<<"int2binary Performance : "<(weight_matrix, binary_weight_matrix, kernel_size); 599 | Matrix output_mat(input_matrix.col - weight_matrix.col + 1, input_matrix.row - weight_matrix.row + 1); 600 | { 601 | auto start = std::chrono::high_resolution_clock::now(); 602 | binaryConv2D(binary_input_matrix, output_mat, binary_weight_matrix[0][0], 603 | kernel_size, std::make_pair(input_matrix.col, input_matrix.row)); 604 | auto stop = std::chrono::high_resolution_clock::now(); 605 | 606 | std::chrono::duration multi_core(stop - start); 607 | 608 | total_time += multi_core.count(); 609 | 610 | //std::cout<<"Xnor Convolution Kernel Performance : "< multi_core(stop - start); 617 | //std::cout<<"Binary2int Performance : "<(output_mat, output_matrix); 622 | } 623 | 624 | template 625 | Tensor xnor_convoltion(Tensor &input_tensor, Weight &weight, std::vector &alpha, 626 | std::unordered_map hash_map, bool padding = true) 627 | { 628 | int output_row; 629 | int output_col; 630 | int output_channel; 631 | 632 | if (padding == true) 633 | { 634 | output_row = input_tensor.row; 635 | output_col = input_tensor.col; 636 | output_channel = weight.channel_out; 637 | } 638 | 639 | else 640 | { 641 | output_row = input_tensor.row + 1 - weight.row; 642 | output_col = input_tensor.col + 1 - weight.col; 643 | output_channel = weight.channel_out; 644 | } 645 | Tensor output_tensor(output_row, output_col, output_channel); 646 | 647 | auto A = tensorChannelSum(input_tensor); 648 | Matrix padded_matrix(output_tensor.row + weight.row -1, output_tensor.col + weight.col -1); 649 | ZeroPadding2D(A, padded_matrix, std::make_pair(weight.row, weight.col)); 650 | Matrix K(padded_matrix.col - weight.col + 1, padded_matrix.row - weight.row + 1); 651 | auto start = std::chrono::high_resolution_clock::now(); 652 | conv2D(padded_matrix, K, std::make_pair(weight.row, weight.col)); 653 | auto stop = std::chrono::high_resolution_clock::now(); 654 | std::chrono::duration multi_core(stop - start); 655 | total_time += multi_core.count(); 656 | { 657 | int in, out; 658 | Weight output_tensor_buffer(output_tensor.row, output_tensor.col, weight.channel_in, weight.channel_out); 659 | auto start = 
std::chrono::high_resolution_clock::now(); 660 | 661 | #pragma omp parallel private(in, out) shared(input_tensor, output_tensor_buffer) 662 | { 663 | #pragma omp for schedule(dynamic,50) collapse(2) 664 | for(int out=0; weight.channel_out>out; out++) 665 | { 666 | 667 | for (int in=0; weight.channel_in>in; in++) 668 | { 669 | xnor_convoltion_op(input_tensor[in], output_tensor_buffer[out][in], weight[out][in]); 670 | 671 | } 672 | // auto A = tensorChannelSum(input_tensor); 673 | // output_tensor[out] *= K; 674 | // output_tensor[out] *= alpha[out]; 675 | } 676 | } 677 | auto stop = std::chrono::high_resolution_clock::now(); 678 | std::chrono::duration multi_core(stop - start); 679 | //std::cout<<"Multicore xnor Kernel Performance : "<out; out++) 684 | { 685 | tensorChannelSum(output_tensor_buffer[out], output_tensor[out]); 686 | output_tensor[out] *= K; 687 | output_tensor[out] *= alpha[out]; 688 | 689 | } 690 | } 691 | } 692 | return output_tensor; 693 | } 694 | int main() 695 | // add variable test range like 128, 256, 512, 1024 696 | // add const variable size 512 with kernel size 1, 3 697 | { 698 | int row[] = {128, 256, 512, 1024, 2048}; 699 | int col[] = {128, 256, 512, 1024, 2048}; 700 | int kernel = 3; 701 | int channel_in = 1; 702 | int channel_out = 1; 703 | int total_test_count = 100; 704 | for(int index = 0; index<5; ++index) 705 | { 706 | for(int iter = 0; iter input_tensor(width, height, 1); 711 | Weight weight(3, 3, input_tensor.channel, 1); 712 | std::vector scalar(weight.channel_out); 713 | std::pair kernel_size(weight.row, weight.col); 714 | // Random initilizate the values 715 | for(int k=0; input_tensor.channel>k; k++) 716 | { 717 | for(int j=0; input_tensor.col>j; j++) 718 | { 719 | for(int i=0; input_tensor.row>i; i++) 720 | { 721 | input_tensor[k][j][i] = (std::rand()%1000 - 500); 722 | if (input_tensor[k][j][i] >= 0) 723 | { 724 | input_tensor[k][j][i] = 1; 725 | } 726 | else 727 | { 728 | input_tensor[k][j][i] = -1; 729 | } 730 | 731 | } 732 | } 733 | } 734 | for (int m=0; weight.channel_out>m; m++) 735 | { 736 | scalar[m] = static_cast(rand()) / static_cast (RAND_MAX); 737 | for(int k=0; weight.channel_in>k; k++) 738 | { 739 | 740 | for(int j=0; weight.col>j; j++) 741 | { 742 | for(int i=0; weight.row>i; i++) 743 | { 744 | weight[m][k][j][i] = (std::rand()%1000 - 500); 745 | if (weight[m][k][j][i] >= 0) 746 | { 747 | weight[m][k][j][i] = 1; 748 | } 749 | else 750 | { 751 | weight[m][k][j][i] = -1; 752 | } 753 | 754 | } 755 | } 756 | } 757 | } 758 | // Calculate hash map 759 | auto hash_map = generate_hash_map(kernel_size); 760 | auto start = std::chrono::high_resolution_clock::now(); 761 | auto output_tensor = xnor_convoltion(input_tensor, weight, scalar, hash_map); 762 | auto stop = std::chrono::high_resolution_clock::now(); 763 | std::chrono::duration multi_core(stop - start); 764 | //std::cout<<"Multi core Xnor Performance : "<(total_test_count) << " For Image Size "<< row[index] << std::endl; 768 | total_time = 0; 769 | } 770 | return 0; 771 | } 772 | 773 | -------------------------------------------------------------------------------- /xnor_gpu/xnor_gpu_v2.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | 15 | #define NUM_STREAMS 2 16 | 17 | struct GPUTimer 18 | { 19 | GPUTimer() 20 | { 21 | cudaEventCreate(&start_); 22 | cudaEventCreate(&stop_); 23 
| cudaEventRecord(start_, 0); 24 | } 25 | 26 | ~GPUTimer() 27 | { 28 | cudaEventDestroy(start_); 29 | cudaEventDestroy(stop_); 30 | } 31 | 32 | void start() 33 | { 34 | cudaEventRecord(start_, 0); 35 | } 36 | 37 | float seconds() 38 | { 39 | cudaEventRecord(stop_, 0); 40 | cudaEventSynchronize(stop_); 41 | float time; 42 | cudaEventElapsedTime(&time, start_, stop_); 43 | return time * 1e-3; 44 | } 45 | private: 46 | cudaEvent_t start_, stop_; 47 | }; 48 | 49 | // This is second version of the gpu implementation 50 | // This version a general benchmarking to compare with CPU, 51 | // Binary operations will be handled single convolution kernel to utilize register memory usage 52 | constexpr std::pair register_size(8, 4); 53 | constexpr int nTPB=256; 54 | 55 | template 56 | struct matrix1d { 57 | int lenght; 58 | T *arr; 59 | }; 60 | 61 | template 62 | struct matrix2d { 63 | int row; 64 | int col; 65 | T *arr; 66 | }; 67 | 68 | template 69 | struct matrix3d { 70 | int row; 71 | int col; 72 | int channel; 73 | T *arr; 74 | }; 75 | 76 | template 77 | struct matrix4d{ 78 | int row; 79 | int col; 80 | int channel_in; 81 | int channel_out; 82 | T *arr; 83 | }; 84 | 85 | 86 | #define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); } 87 | inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true) 88 | { 89 | if (code != cudaSuccess) 90 | { 91 | fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line); 92 | if (abort) exit(code); 93 | } 94 | } 95 | 96 | 97 | std::pair find_binary_size(std::pairinput_size, std::pairkernel_size){ 98 | int size_x = ceil((input_size.first - register_size.first) 99 | /static_cast(register_size.first + 1 - kernel_size.first) + 1); 100 | int size_y = ceil((input_size.second - register_size.second ) 101 | /static_cast(register_size.second + 1 - kernel_size.second) + 1); 102 | if (size_x < 0) 103 | size_x = 1; 104 | if (size_y < 0) 105 | size_y = 1; 106 | return std::make_pair(size_x, size_y); 107 | } 108 | 109 | size_t choose_block_size(size_t val){ 110 | if (val >= nTPB) return nTPB; 111 | if (val <= 32) return 32; 112 | val = (val >> 1) | val; 113 | val = (val >> 2) | val; 114 | val = (val >> 4) | val; 115 | val = (val >> 8) | val; 116 | val = (val >> 16) | val; 117 | val++; 118 | return val; 119 | } 120 | 121 | void int2binary(float* input_x, const std::pair input_index, 122 | std::pair output_location, unsigned int &output_y, const std::pairregister_size, int input_col) 123 | { 124 | int sign = 0; 125 | long int pozitive = 1; 126 | long int negative = 0; 127 | int count = output_location.second * register_size.second + output_location.first; 128 | 129 | assert(count < register_size.second * register_size.first); 130 | 131 | for (int j=0; j 0) - (input_x[(input_index.second) * input_col+ input_index.first + i] < 0); 136 | if (sign == 1) 137 | { 138 | output_y = pozitive<= input_col) 149 | { 150 | break; 151 | } 152 | count++; 153 | } 154 | } 155 | 156 | } 157 | 158 | void intMat2BinaryMat(float *const& input_mat, unsigned int *const& binary_mat, std::pair kernel_size, int input_row, int input_col, int binary_col, int binary_row) 159 | { 160 | //float * input_mat = input_tensor.arr[i * input_tensor.channel_in + j]; 161 | //unsigned int * binary_mat = binary_tensor.arr[i * input_tensor.channel_in + j]; 162 | int index_x = 0; 163 | int index_y = 0; 164 | std::pair input_index(0, 0); 165 | std::pair output_location(0, 0); 166 | 167 | // Test 168 | while(input_row >= input_index.second) 169 | { 170 | int i = 0; 171 | 
input_index.first = 0; 172 | index_x = 0; 173 | 174 | while(input_col > i) 175 | { 176 | i = input_index.first + register_size.first; 177 | int2binary(input_mat, input_index, output_location, binary_mat[index_y *binary_col + index_x], register_size, input_col); 178 | input_index.first = input_index.first + register_size.first + 1 - kernel_size.first; 179 | index_x++; 180 | 181 | } 182 | output_location.second++; 183 | input_index.second++; 184 | if(input_index.second >= input_row) 185 | { 186 | break; 187 | } 188 | if (output_location.second % register_size.second == 0) 189 | { 190 | output_location.second = 0; 191 | input_index.second = input_index.second + 1 - kernel_size.second; 192 | index_y++; 193 | } 194 | } 195 | } 196 | std::pair BinaryMatMemoryAllocation( std::pair input_size, std::pair kernel_size) 197 | { 198 | int size_x = ceil((input_size.first - register_size.first) 199 | /static_cast(register_size.first + 1 - kernel_size.first) + 1); 200 | int size_y = ceil((input_size.second - register_size.second ) 201 | /static_cast(register_size.second + 1 - kernel_size.second) + 1); 202 | if (size_x < 0) 203 | size_x = 1; 204 | if (size_y < 0) 205 | size_y = 1; 206 | 207 | return std::make_pair(size_x, size_y); 208 | } 209 | template 210 | __global__ void compK_matrix(T* input_data, T kernel_value, 211 | T* output_data, int channel_in, int width, int height) { 212 | 213 | float accum; 214 | int col = threadIdx.x + blockIdx.x * blockDim.x; //col index 215 | int row = threadIdx.y + blockIdx.y * blockDim.y; //row index 216 | int mask_row_radius = mask_rows / 2; 217 | int mask_col_radius = mask_cols / 2; 218 | 219 | 220 | for (int k = 0; k < channel_in; k++) { 221 | if (row < height && col < width) { 222 | accum = 0; 223 | int start_row = row - mask_row_radius; 224 | int start_col = col - mask_col_radius; 225 | 226 | for (int i = 0; i < mask_rows; i++) { 227 | 228 | for (int j = 0; j < mask_cols; j++) { 229 | 230 | int row_index = start_row + i; 231 | int col_index = start_col + j; 232 | 233 | if (row_index >= 0 && row_index < height && col_index >= 0 && col_index < width) { 234 | 235 | accum += input_data[(row_index * width + col_index) * channel_in + k] * 236 | kernel_value; 237 | } 238 | else accum += 0; 239 | } 240 | 241 | } 242 | output_data[(row * width + col) * channel_in + k] = accum; 243 | } 244 | 245 | } 246 | } 247 | 248 | void __global__ zeroPadding(float* input_tensor, float* output_tensor, int kernel_row, int kernel_col, int input_col, int input_row, int output_col, int output_row, int output_channel) 249 | { 250 | int idx = threadIdx.x + blockDim.x * blockIdx.x; 251 | int op_buffer = idx / output_col; // simple buffer for same operation 252 | int index_x = (idx % output_col) - (kernel_col - 1)/ 2; 253 | int index_y = op_buffer%output_row - (kernel_row - 1)/ 2; 254 | int index_z = op_buffer / output_row; 255 | if (idx< output_row * output_col * output_channel) 256 | { 257 | if(index_x >= 0 && index_y >= 0 ) 258 | { 259 | if( index_x < input_col && index_y < input_row ) 260 | { 261 | output_tensor[idx] = input_tensor[(index_z * input_col * input_row ) + ( index_y * input_col ) + index_x]; 262 | } 263 | } 264 | else { 265 | output_tensor[idx] = 0; 266 | } 267 | } 268 | } 269 | 270 | void __global__ kernel_sum( 271 | const unsigned int * d_idata, 272 | float * d_odata, 273 | const int col, 274 | const int row, 275 | const int channel_in, 276 | const int channel_out) 277 | { 278 | int idx = threadIdx.x+blockDim.x*blockIdx.x; 279 | if (idx < (col * row * channel_out)) 280 | { 281 | 
282 | int tidx = idx%(col*row) + ((idx/(col*row) ) *(col * row * channel_in) ); // indexing for 4 dim , since kernel must sum values with same channel out 283 | int tsum = 0; 284 | #pragma unroll 285 | for (int i = 0; i < channel_in; i++) 286 | { 287 | tsum += d_idata[tidx]; 288 | tidx += row * col; 289 | } 290 | d_odata[idx] = static_cast(tsum);// / static_cast(channel_in); 291 | } 292 | } 293 | 294 | template 295 | __device__ void to_binary_register( 296 | const T &idata, 297 | unsigned int &odata, 298 | int *output_location) 299 | { 300 | int sign = (idata > 0) - (idata < 0); 301 | const unsigned int pozitive = 1; 302 | const unsigned int negative = 0; 303 | //int count = output_location[1] * register_size.second + output_location[0]; 304 | //assert(count < register_size.second * register_size.first); 305 | if (sign > -1) 306 | { 307 | odata = pozitive<<(output_location[1] * register_size.first + output_location[0]) | odata; 308 | } 309 | else 310 | { 311 | odata = negative<<(output_location[1] * register_size.first + output_location[0]) | odata; 312 | } 313 | } 314 | 315 | template 316 | void __global__ convert2binary( 317 | const T * d_idata, 318 | unsigned int * d_odata, 319 | const int row, const int b_row, 320 | const int col, const int b_col, 321 | const int channel, 322 | const int kernel_row = 3, const int kernel_col = 3) 323 | { 324 | // Each thread will store a size = 32 array inside their single register 325 | int idx = threadIdx.x+blockDim.x*blockIdx.x; //register IDX 326 | // n*(regsiter_size - kernel_size) 327 | if (idx < (b_row * b_col * channel)) 328 | { 329 | 330 | int input_index[] = {(idx%b_col) * (register_size.first - kernel_col), ((idx/b_col) % b_row)* (register_size.second - kernel_row), (idx/(b_col * b_row) )}; // x, y ,z 331 | int data_idx = input_index[0] + (input_index[1] * col) + (input_index[2] * row * col); 332 | //int input_index[] = {data_idx%row, data_idx/col, data_idx/(row*col)}; // from start of array , (x, y, z) 333 | int register_location[] = {0, 0}; 334 | unsigned int local_register = 0; 335 | for (int j=0; register_size.second>j; j++) 336 | { 337 | for (int i=0; register_size.first>i; i++) 338 | { 339 | to_binary_register(d_idata[data_idx], local_register, register_location); 340 | ++data_idx; 341 | input_index[0] += 1; 342 | register_location[0] += 1; 343 | if (input_index[0] == col) break; 344 | } 345 | data_idx = data_idx + col - register_location[0]; 346 | input_index[1] += 1; 347 | input_index[0] = (idx%b_col) * (register_size.first - kernel_col); 348 | register_location[0] = 0; 349 | register_location[1] += 1; 350 | if (input_index[1] == row) break; 351 | } 352 | d_odata[idx] = local_register; 353 | } 354 | } 355 | template 356 | void __global__ scalar_multiplication(T* __restrict__ d_idata, const T __restrict__ scalar, const int height, const int width) 357 | { 358 | int idx = threadIdx.x+blockDim.x*blockIdx.x; 359 | if (idxj; ++j) 381 | { 382 | for(int i=0; kernel_col>i; ++i) 383 | { 384 | buffer += (idata >> shifter) & mask; 385 | ++shifter; 386 | } 387 | shifter += register_size.first - kernel_col; 388 | } 389 | odata = 2 * buffer - (kernel_row * kernel_col); 390 | } 391 | 392 | 393 | void __global__ binaryConv2d( 394 | const unsigned int * input_tensor, 395 | unsigned int * output_tensor, 396 | const unsigned int * weight_tensor, 397 | int input_row, int input_col, 398 | int kernel_row, int kernel_col, 399 | int output_row, int output_col, 400 | int channel_in, int channel_out 401 | ) 402 | { 403 | 404 | int idx = threadIdx.x 
+blockDim.x*blockIdx.x; 405 | int conv_per_row = register_size.second - (kernel_row - 1); 406 | int conv_per_column = register_size.first - (kernel_col - 1); 407 | int output_index_x = (idx % input_col) * conv_per_column; 408 | int output_index_y = ((idx / input_col) % input_row) * conv_per_row; 409 | 410 | if (idx < input_row * input_col * channel_in * channel_out) 411 | { 412 | unsigned int register_buffer = input_tensor[idx % (input_row * input_col * channel_in)]; 413 | if ( (output_index_x + conv_per_column) > output_col) 414 | { 415 | conv_per_column = output_col - output_index_x; 416 | } 417 | if ( (output_index_y + conv_per_row) > output_row) 418 | { 419 | conv_per_row = output_row - output_index_y; 420 | } 421 | 422 | unsigned int mask = std::pow(2, kernel_col) - 1; 423 | for (int j=1; kernel_row > j; j++) 424 | { 425 | mask = (mask<(std::pow(2, kernel_col) - 1); 426 | } 427 | int default_index = (idx / (input_row * input_col) ) * (output_col * output_row); 428 | auto weight_index = idx / (input_row * input_col); 429 | unsigned int shifter = 0; 430 | for (int j=0; conv_per_row>j; ++j) 431 | { 432 | for (int i=0; conv_per_column>i; ++i) 433 | { 434 | unsigned int buffer = (~(register_buffer>>shifter) ^ (weight_tensor[weight_index]) ) & mask; 435 | binary2int(buffer, output_tensor[default_index + (output_index_y+j)*output_col + output_index_x + i], kernel_row, kernel_col); 436 | ++shifter; 437 | } 438 | // Check if register is not fully filled, 439 | // if not add shifter the missing shift amount 440 | shifter += register_size.second - conv_per_column; 441 | } 442 | } 443 | 444 | } 445 | 446 | 447 | 448 | 449 | 450 | // This part must be updated to concurrent execution 451 | void xnor_convolution(matrix3d &h_input_tensor, matrix4d &h_weight_tensor, matrix3d &h_output_tensor, const float alpha, int kernel_row, int kernel_col, bool padding=true) 452 | { 453 | 454 | cudaEvent_t start, stop; 455 | cudaEvent_t start1, stop1; 456 | cudaEvent_t start2, stop2; 457 | cudaEventCreate(&start2); 458 | cudaEventCreate(&stop2); 459 | cudaEventCreate(&start); 460 | cudaEventCreate(&stop); 461 | cudaEventCreate(&start1); 462 | cudaEventCreate(&stop1); 463 | 464 | 465 | matrix3d d_input_tensor; 466 | d_input_tensor.col = h_input_tensor.col; 467 | d_input_tensor.row = h_input_tensor.row; 468 | d_input_tensor.channel = h_input_tensor.channel; 469 | auto copy_size = sizeof(float) * d_input_tensor.col* d_input_tensor.row * d_input_tensor.channel; 470 | cudaMalloc((void **)&d_input_tensor.arr, copy_size); 471 | cudaMemcpy(d_input_tensor.arr, h_input_tensor.arr, copy_size, cudaMemcpyHostToDevice); 472 | // 473 | // Calculate K matrix 474 | // Use async steam2 475 | cudaStream_t stream1; 476 | cudaStreamCreate(&stream1); 477 | matrix2d d_K_matrix; 478 | d_K_matrix.col = h_input_tensor.col; 479 | d_K_matrix.row = h_input_tensor.row; 480 | copy_size = sizeof(float) * d_K_matrix.col* d_K_matrix.row; 481 | cudaMalloc((void **)&d_K_matrix.arr, copy_size); 482 | const float kernel_value = 1.0 / static_cast(h_weight_tensor.row * h_weight_tensor.col); 483 | auto block_size = choose_block_size(h_input_tensor.row * h_input_tensor.col); 484 | auto grid_size = (h_input_tensor.row * h_input_tensor.col+ block_size - 1)/block_size; 485 | compK_matrix<<>>(d_input_tensor.arr, kernel_value, 486 | d_K_matrix.arr, d_input_tensor.channel, d_input_tensor.width, d_input_tensor.height); 487 | // 488 | scalar_multiplication<<>>(d_K_matrix.arr, alpha, height, width); 489 | matrix3d d_padded_input_tensor; 490 | 
d_padded_input_tensor.row = h_input_tensor.row + kernel_row - 1; 491 | d_padded_input_tensor.col = h_input_tensor.col + kernel_col - 1; 492 | d_padded_input_tensor.channel = h_input_tensor.channel; 493 | copy_size = sizeof(float) * d_padded_input_tensor.row * d_padded_input_tensor.col * d_padded_input_tensor.channel; 494 | gpuErrchk(cudaMalloc((void **)&d_padded_input_tensor.arr, copy_size)); 495 | 496 | block_size = choose_block_size(d_padded_input_tensor.row * d_padded_input_tensor.col * d_padded_input_tensor.channel); 497 | grid_size = (d_padded_input_tensor.row * d_padded_input_tensor.col * d_padded_input_tensor.channel + block_size - 1)/block_size; 498 | zeroPadding<<>>(d_input_tensor.arr, d_padded_input_tensor.arr, kernel_row, kernel_col, d_input_tensor.col, d_input_tensor.row, d_padded_input_tensor.row, d_padded_input_tensor.col, d_padded_input_tensor.channel); 499 | //cudaFree(d_input_tensor.arr); 500 | auto binary_size = find_binary_size(std::make_pair(h_input_tensor.col, h_input_tensor.row), std::make_pair(kernel_col, kernel_row)); 501 | 502 | matrix3d d_binary_input_tensor; 503 | d_binary_input_tensor.row = binary_size.second; 504 | d_binary_input_tensor.col = binary_size.first; 505 | d_binary_input_tensor.channel = d_padded_input_tensor.channel; 506 | copy_size = sizeof(unsigned int) * d_binary_input_tensor.row * d_binary_input_tensor.col * d_binary_input_tensor.channel; 507 | 508 | gpuErrchk(cudaMalloc((void **)&d_binary_input_tensor.arr, copy_size)); 509 | cudaEventRecord(start, 0); 510 | convert2binary<<>>(d_padded_input_tensor.arr, d_binary_input_tensor.arr, 511 | d_padded_input_tensor.row, d_binary_input_tensor.row, 512 | d_padded_input_tensor.col, d_binary_input_tensor.col, 513 | d_binary_input_tensor.channel, 514 | kernel_row, kernel_col); 515 | cudaEventRecord(stop, 0); 516 | cudaEventSynchronize(stop); 517 | float milliseconds = 0; 518 | cudaEventElapsedTime(&milliseconds, start, stop); 519 | std::cout<<"Int2Binary Time= "<< milliseconds< d_convolution_buffer; 522 | d_convolution_buffer.col = h_input_tensor.col; 523 | d_convolution_buffer.row = h_input_tensor.row; 524 | d_convolution_buffer.channel_in = h_input_tensor.channel; 525 | d_convolution_buffer.channel_out = h_weight_tensor.channel_out; 526 | copy_size = sizeof(unsigned int) * d_convolution_buffer.col * d_convolution_buffer.row * d_convolution_buffer.channel_in * d_convolution_buffer.channel_out; 527 | gpuErrchk(cudaMalloc((void **)& d_convolution_buffer.arr, copy_size)); 528 | matrix4d d_weight_tensor; 529 | d_weight_tensor.row = h_weight_tensor.row; 530 | d_weight_tensor.col = h_weight_tensor.col; 531 | d_weight_tensor.channel_in = h_weight_tensor.channel_in; 532 | d_weight_tensor.channel_out = h_weight_tensor.channel_out; 533 | copy_size = sizeof(unsigned int) * d_weight_tensor.row *d_weight_tensor.col * d_weight_tensor.channel_in * d_weight_tensor.channel_out; 534 | gpuErrchk(cudaMalloc((void**)&d_weight_tensor.arr, copy_size)); // pinned memory can be tested 535 | cudaMemcpy(d_weight_tensor.arr, h_weight_tensor.arr, copy_size, cudaMemcpyHostToDevice); 536 | block_size = choose_block_size(d_convolution_buffer.col * d_convolution_buffer.row * d_convolution_buffer.channel_in * d_convolution_buffer.channel_out); 537 | grid_size = (d_convolution_buffer.col* d_convolution_buffer.row * d_convolution_buffer.channel_in * d_convolution_buffer.channel_out + block_size - 1)/ block_size; 538 | cudaEventRecord(start1, 0); 539 | binaryConv2d<<>>(d_binary_input_tensor.arr, d_convolution_buffer.arr, d_weight_tensor.arr 
540 | ,d_binary_input_tensor.row, d_binary_input_tensor.col 541 | , kernel_row, kernel_col 542 | ,d_convolution_buffer.row, d_convolution_buffer.col 543 | ,d_convolution_buffer.channel_in, d_convolution_buffer.channel_out 544 | ); 545 | cudaEventRecord(stop1, 0); 546 | cudaEventSynchronize(stop1); 547 | cudaEventElapsedTime(&milliseconds, start1, stop1); 548 | std::cout<<"Convolution Time= "<< milliseconds< d_output_tensor; 551 | d_output_tensor.col = h_output_tensor.col; 552 | d_output_tensor.row = h_output_tensor.row; 553 | d_output_tensor.channel = h_output_tensor.channel; 554 | copy_size = sizeof(float) * d_output_tensor.row * d_output_tensor.col * d_output_tensor.channel; 555 | cudaMalloc((void**)&d_output_tensor.arr, copy_size); 556 | block_size = choose_block_size(d_output_tensor.row * d_output_tensor.col * d_output_tensor.channel); 557 | grid_size = (d_output_tensor.row * d_output_tensor.col * d_output_tensor.channel + block_size - 1) / block_size; 558 | cudaEventRecord(start2, 0); 559 | kernel_sum<<>>(d_convolution_buffer.arr, d_output_tensor.arr, d_output_tensor.col, d_output_tensor.row, d_convolution_buffer.channel_in, d_convolution_buffer.channel_out); 560 | cudaEventRecord(stop2, 0); 561 | cudaEventSynchronize(stop2); 562 | cudaEventElapsedTime(&milliseconds, start2, stop2); 563 | std::cout<<"Summation Time= "<< milliseconds<>>(); 568 | //cudaFree(d_convolution_buffer.arr); 569 | cudaMemcpy(h_output_tensor.arr, d_output_tensor.arr, copy_size, cudaMemcpyDeviceToHost); 570 | //cudaFree(d_output_tensor.arr); 571 | cudaEventDestroy(start); 572 | cudaEventDestroy(stop); 573 | cudaEventDestroy(start1); 574 | cudaEventDestroy(stop1); 575 | cudaEventDestroy(start2); 576 | cudaEventDestroy(stop2); 577 | 578 | return; 579 | 580 | } 581 | 582 | 583 | 584 | int main() 585 | { 586 | int row = 512; 587 | int col = 512; 588 | int kernel_row = 3; 589 | int kernel_col = 3; 590 | 591 | int channel_in = 1; 592 | int channel_out = 1; 593 | matrix3d input_tensor; 594 | matrix4d weight_tensor; 595 | input_tensor.row = row; 596 | input_tensor.col = col; 597 | input_tensor.channel = channel_in; 598 | // Init Matrices 599 | input_tensor.arr = new float [input_tensor.channel * input_tensor.row * input_tensor.col]; 600 | weight_tensor.row = kernel_row; 601 | weight_tensor.col = kernel_col; 602 | weight_tensor.channel_in = channel_in; 603 | weight_tensor.channel_out = channel_out; 604 | weight_tensor.arr = new float [weight_tensor.channel_in * weight_tensor.channel_out * weight_tensor.row * weight_tensor.col]; 605 | 606 | bool padding = true; 607 | // Default Values 608 | for(int i=0; input_tensor.channel > i; ++i) 609 | { 610 | for (int j=0; input_tensor.col * input_tensor.row> j; ++j) 611 | { 612 | input_tensor.arr[i * input_tensor.col * input_tensor.row + j] = (rand() % 50) - 25; 613 | } 614 | } 615 | for(int i=0; weight_tensor.channel_in * weight_tensor.channel_out > i; ++i) 616 | { 617 | for (int j=0; weight_tensor.col * weight_tensor.row> j; ++j) 618 | { 619 | weight_tensor.arr[i * weight_tensor.col * weight_tensor.row + j] = (rand() % 50) -25; 620 | } 621 | } 622 | // Make Weights binary as preProcessing 623 | auto weight_size = BinaryMatMemoryAllocation(std::make_pair(weight_tensor.row, weight_tensor.col), std::make_pair(weight_tensor.col, weight_tensor.row)); 624 | matrix4d binary_weight_tensor; 625 | binary_weight_tensor.col = weight_size.first; 626 | binary_weight_tensor.row = weight_size.second; 627 | binary_weight_tensor.channel_in = weight_tensor.channel_in; 628 | 
binary_weight_tensor.channel_out = weight_tensor.channel_out; 629 | binary_weight_tensor.arr = new unsigned int [binary_weight_tensor.channel_in * binary_weight_tensor.channel_out *binary_weight_tensor.row * binary_weight_tensor.col]; 630 | for (int i= 0; weight_tensor.channel_out > i; ++i) 631 | { 632 | for(int j=0; weight_tensor.channel_in > j; ++j) 633 | { 634 | intMat2BinaryMat(&weight_tensor.arr[(i * weight_tensor.channel_in + j) * weight_tensor.row * weight_tensor.col], &binary_weight_tensor.arr[i * weight_tensor.channel_in + j], 635 | std::make_pair(weight_tensor.col, weight_tensor.row), weight_tensor.row, weight_tensor.col, binary_weight_tensor.col, binary_weight_tensor.row); 636 | } 637 | } 638 | delete weight_tensor.arr; 639 | // A sample layer 640 | matrix3d output_tensor; 641 | output_tensor.col = input_tensor.col; 642 | output_tensor.row = input_tensor.row; 643 | output_tensor.channel = input_tensor.channel; 644 | output_tensor.arr = new float [input_tensor.col* input_tensor.row * input_tensor.channel]; 645 | xnor_convolution(input_tensor, binary_weight_tensor, output_tensor, weight_tensor.row, weight_tensor.col ,padding); 646 | 647 | delete[] input_tensor.arr; 648 | delete[] binary_weight_tensor.arr; 649 | delete[] output_tensor.arr; 650 | return 0; 651 | } 652 | 653 | 654 | 655 | --------------------------------------------------------------------------------
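For reference, each source file builds as a standalone benchmark with a typical toolchain; the commands below are illustrative (GCC with OpenMP for the CPU versions, the CUDA 10.1 toolkit noted in the readme for the GPU version; output names are arbitrary), e.g. `g++ -O3 -fopenmp vanilla_conv_cpu/vanila_convolution.cpp -o vanilla_conv`, `g++ -O3 -fopenmp xnor_cpu/xnor_cpu_final.cpp -o xnor_cpu`, and `nvcc -O3 xnor_gpu/xnor_gpu_v2.cu -o xnor_gpu`. Each program prints its timing measurements to stdout.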