├── README.md ├── Reduction ├── cudastart.h ├── readme.md ├── reduction └── reduction.cu ├── Reduction2 ├── cudastart.h ├── readme.md ├── reduction2 └── reduction2.cu ├── Reduction3 ├── cudastart.h ├── readme.md ├── reduction3 └── reduction3.cu └── Sum_Matrix ├── cudastart.h ├── readme.md └── sum_martix.cu /README.md: -------------------------------------------------------------------------------- 1 | # CUDA_study 2 | 3 | Here are the codes of https://zhuanlan.zhihu.com/c_1188568938097819648 4 | -------------------------------------------------------------------------------- /Reduction/cudastart.h: -------------------------------------------------------------------------------- 1 | #ifndef CUDASTART_H 2 | #define CUDASTART_H 3 | #define CHECK(call)\ 4 | {\ 5 | const cudaError_t error=call;\ 6 | if(error!=cudaSuccess)\ 7 | {\ 8 | printf("ERROR: %s:%d,",__FILE__,__LINE__);\ 9 | printf("code:%d,reason:%s\n",error,cudaGetErrorString(error));\ 10 | exit(1);\ 11 | }\ 12 | } 13 | 14 | 15 | #include 16 | #ifdef _WIN32 17 | # include 18 | #else 19 | # include 20 | #endif 21 | 22 | double cpuSecond() 23 | { 24 | struct timeval tp; 25 | gettimeofday(&tp,NULL); 26 | return((double)tp.tv_sec+(double)tp.tv_usec*1e-6); 27 | 28 | } 29 | 30 | void initialData(float* ip,int size) 31 | { 32 | time_t t; 33 | srand((unsigned )time(&t)); 34 | for(int i=0;iepsilon) 66 | { 67 | printf("Results don\'t match!\n"); 68 | printf("%f(hostRef[%d] )!= %f(gpuRef[%d])\n",hostRef[i],i,gpuRef[i],i); 69 | return; 70 | } 71 | } 72 | printf("Check result success!\n"); 73 | } 74 | 75 | #endif 76 | 77 | -------------------------------------------------------------------------------- /Reduction/readme.md: -------------------------------------------------------------------------------- 1 | 2 | https://zhuanlan.zhihu.com/p/98190609 3 | -------------------------------------------------------------------------------- /Reduction/reduction: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZihaoZhao/CUDA_study/1fc8a7bc770593e4e23ce2f08b6546ad6c111e71/Reduction/reduction -------------------------------------------------------------------------------- /Reduction/reduction.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include "cudastart.h" 4 | int recursiveReduce(int *data, int const size) 5 | { 6 | // terminate check 7 | if (size == 1) return data[0]; 8 | // renew the stride 9 | int const stride = size / 2; 10 | if (size % 2 == 1) 11 | { 12 | for (int i = 0; i < stride; i++) 13 | { 14 | data[i] += data[i + stride]; 15 | } 16 | data[0] += data[size - 1]; 17 | } 18 | else 19 | { 20 | for (int i = 0; i < stride; i++) 21 | { 22 | data[i] += data[i + stride]; 23 | } 24 | } 25 | // call 26 | return recursiveReduce(data, stride); 27 | } 28 | 29 | 30 | 31 | __global__ void reduceNeighbored(int * g_idata,int * g_odata,unsigned int n) 32 | { 33 | //set thread ID 34 | unsigned int tid = threadIdx.x; 35 | //boundary check 36 | if (tid >= n) return; 37 | //convert global data pointer to the 38 | int *idata = g_idata + blockIdx.x*blockDim.x; 39 | //in-place reduction in global memory 40 | for (int stride = 1; stride < blockDim.x; stride *= 2) 41 | { 42 | if ((tid % (2 * stride)) == 0) 43 | { 44 | idata[tid] += idata[tid + stride]; 45 | } 46 | //synchronize within block 47 | __syncthreads(); 48 | } 49 | //write result for this block to global mem 50 | if (tid == 0) 51 | g_odata[blockIdx.x] = idata[0]; 52 | 53 | } 54 | 55 | int main(int argc,char** argv) 56 | { 57 | initDevice(0); 58 | 59 | //initialization 60 | 61 | int size = 1 << 24; 62 | printf(" with array size %d ", size); 63 | 64 | //execution configuration 65 | int blocksize = 1024; 66 | if (argc > 1) 67 | { 68 | blocksize = atoi(argv[1]); //从命令行输入设置block大小 69 | } 70 | dim3 block(blocksize, 1); 71 | dim3 grid((size - 1) / block.x + 1, 1); 72 | printf("grid %d block %d \n", grid.x, block.x); 73 | 74 | //allocate host memory 75 | size_t bytes = size * sizeof(int); 76 | int *idata_host = (int*)malloc(bytes); 77 | int *odata_host = (int*)malloc(grid.x * sizeof(int)); 78 | int * tmp = (int*)malloc(bytes); 79 | 80 | //initialize the array 81 | initialData_int(idata_host, size); 82 | 83 | memcpy(tmp, idata_host, bytes); 84 | double timeStart, timeElaps; 85 | int gpu_sum = 0; 86 | 87 | // device memory 88 | int * idata_dev = NULL; 89 | int * odata_dev = NULL; 90 | CHECK(cudaMalloc((void**)&idata_dev, bytes)); 91 | CHECK(cudaMalloc((void**)&odata_dev, grid.x * sizeof(int))); 92 | 93 | //cpu reduction 对照组 94 | int cpu_sum = 0; 95 | timeStart = cpuSecond(); 96 | //cpu_sum = recursiveReduce(tmp, size); 97 | for (int i = 0; i < size; i++) 98 | cpu_sum += tmp[i]; 99 | timeElaps = 1000*(cpuSecond() - timeStart); 100 | 101 | printf("cpu sum:%d \n", cpu_sum); 102 | printf("cpu reduction elapsed %lf ms cpu_sum: %d\n", timeElaps, cpu_sum); 103 | 104 | 105 | //kernel reduceNeighbored 106 | 107 | CHECK(cudaMemcpy(idata_dev, idata_host, bytes, cudaMemcpyHostToDevice)); 108 | CHECK(cudaDeviceSynchronize()); 109 | timeStart = cpuSecond(); 110 | reduceNeighbored <<>>(idata_dev, odata_dev, size); 111 | cudaDeviceSynchronize(); 112 | cudaMemcpy(odata_host, odata_dev, grid.x * sizeof(int), cudaMemcpyDeviceToHost); 113 | gpu_sum = 0; 114 | for (int i = 0; i < grid.x; i++) 115 | gpu_sum += odata_host[i]; 116 | timeElaps = 1000*(cpuSecond() - timeStart); 117 | 118 | printf("gpu sum:%d \n", gpu_sum); 119 | printf("gpu reduceNeighbored elapsed %lf ms <<>>\n", 120 | timeElaps, grid.x, block.x); 121 | 122 | // free host memory 123 | 124 | free(idata_host); 125 | free(odata_host); 126 | CHECK(cudaFree(idata_dev)); 127 | CHECK(cudaFree(odata_dev)); 128 | 129 | //reset device 130 | cudaDeviceReset(); 131 | 132 | //check the results 133 | if (gpu_sum == cpu_sum) 134 | { 135 | printf("Test success!\n"); 136 | } 137 | return EXIT_SUCCESS; 138 | } 139 | -------------------------------------------------------------------------------- /Reduction2/cudastart.h: -------------------------------------------------------------------------------- 1 | #ifndef CUDASTART_H 2 | #define CUDASTART_H 3 | #define CHECK(call)\ 4 | {\ 5 | const cudaError_t error=call;\ 6 | if(error!=cudaSuccess)\ 7 | {\ 8 | printf("ERROR: %s:%d,",__FILE__,__LINE__);\ 9 | printf("code:%d,reason:%s\n",error,cudaGetErrorString(error));\ 10 | exit(1);\ 11 | }\ 12 | } 13 | 14 | 15 | #include 16 | #ifdef _WIN32 17 | # include 18 | #else 19 | # include 20 | #endif 21 | 22 | double cpuSecond() 23 | { 24 | struct timeval tp; 25 | gettimeofday(&tp,NULL); 26 | return((double)tp.tv_sec+(double)tp.tv_usec*1e-6); 27 | 28 | } 29 | 30 | void initialData(float* ip,int size) 31 | { 32 | time_t t; 33 | srand((unsigned )time(&t)); 34 | for(int i=0;iepsilon) 66 | { 67 | printf("Results don\'t match!\n"); 68 | printf("%f(hostRef[%d] )!= %f(gpuRef[%d])\n",hostRef[i],i,gpuRef[i],i); 69 | return; 70 | } 71 | } 72 | printf("Check result success!\n"); 73 | } 74 | 75 | #endif 76 | 77 | -------------------------------------------------------------------------------- /Reduction2/readme.md: -------------------------------------------------------------------------------- 1 | https://zhuanlan.zhihu.com/p/98416987 2 | -------------------------------------------------------------------------------- /Reduction2/reduction2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZihaoZhao/CUDA_study/1fc8a7bc770593e4e23ce2f08b6546ad6c111e71/Reduction2/reduction2 -------------------------------------------------------------------------------- /Reduction2/reduction2.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include "cudastart.h" 4 | int recursiveReduce(int *data, int const size) 5 | { 6 | // terminate check 7 | if (size == 1) return data[0]; 8 | // renew the stride 9 | int const stride = size / 2; 10 | if (size % 2 == 1) 11 | { 12 | for (int i = 0; i < stride; i++) 13 | { 14 | data[i] += data[i + stride]; 15 | } 16 | data[0] += data[size - 1]; 17 | } 18 | else 19 | { 20 | for (int i = 0; i < stride; i++) 21 | { 22 | data[i] += data[i + stride]; 23 | } 24 | } 25 | // call 26 | return recursiveReduce(data, stride); 27 | } 28 | 29 | 30 | 31 | __global__ void reduceNeighbored(int * g_idata,int * g_odata,unsigned int n) 32 | { 33 | //set thread ID 34 | unsigned int tid = threadIdx.x; 35 | //boundary check 36 | if (tid >= n) return; 37 | //convert global data pointer to the 38 | int *idata = g_idata + blockIdx.x*blockDim.x; 39 | //in-place reduction in global memory 40 | for (int stride = 1; stride < blockDim.x; stride *= 2) 41 | { 42 | if ((tid % (2 * stride)) == 0) 43 | { 44 | idata[tid] += idata[tid + stride]; 45 | } 46 | //synchronize within block 47 | __syncthreads(); 48 | } 49 | //write result for this block to global mem 50 | if (tid == 0) 51 | g_odata[blockIdx.x] = idata[0]; 52 | 53 | } 54 | 55 | __global__ void reduceNeighboredLess(int * g_idata,int *g_odata,unsigned int n) 56 | { 57 | unsigned int tid = threadIdx.x; 58 | unsigned idx = blockIdx.x*blockDim.x + threadIdx.x; 59 | // convert global data pointer to the local point of this block 60 | int *idata = g_idata + blockIdx.x*blockDim.x; 61 | if (idx > n) 62 | return; 63 | //in-place reduction in global memory 64 | for (int stride = 1; stride < blockDim.x; stride *= 2) 65 | { 66 | //convert tid into local array index 67 | int index = 2 * stride *tid; 68 | if (index < blockDim.x) 69 | { 70 | idata[index] += idata[index + stride]; 71 | } 72 | __syncthreads(); 73 | } 74 | //write result for this block to global men 75 | if (tid == 0) 76 | g_odata[blockIdx.x] = idata[0]; 77 | } 78 | 79 | __global__ void reduceInterleaved(int * g_idata, int *g_odata, unsigned int n) 80 | { 81 | unsigned int tid = threadIdx.x; 82 | unsigned idx = blockIdx.x*blockDim.x + threadIdx.x; 83 | // convert global data pointer to the local point of this block 84 | int *idata = g_idata + blockIdx.x*blockDim.x; 85 | if (idx >= n) 86 | return; 87 | //in-place reduction in global memory 88 | for (int stride = blockDim.x/2; stride >0; stride >>=1) 89 | { 90 | 91 | if (tid 1) 114 | { 115 | blocksize = atoi(argv[1]); //从命令行输入设置block大小 116 | } 117 | dim3 block(blocksize, 1); 118 | dim3 grid((size - 1) / block.x + 1, 1); 119 | printf("grid %d block %d \n", grid.x, block.x); 120 | 121 | //allocate host memory 122 | size_t bytes = size * sizeof(int); 123 | int *idata_host = (int*)malloc(bytes); 124 | int *odata_host = (int*)malloc(grid.x * sizeof(int)); 125 | int * tmp = (int*)malloc(bytes); 126 | 127 | //initialize the array 128 | initialData_int(idata_host, size); 129 | 130 | memcpy(tmp, idata_host, bytes); 131 | double timeStart, timeElaps; 132 | int gpu_sum = 0; 133 | 134 | // device memory 135 | int * idata_dev = NULL; 136 | int * odata_dev = NULL; 137 | CHECK(cudaMalloc((void**)&idata_dev, bytes)); 138 | CHECK(cudaMalloc((void**)&odata_dev, grid.x * sizeof(int))); 139 | 140 | //cpu reduction 对照组 141 | int cpu_sum = 0; 142 | timeStart = cpuSecond(); 143 | //cpu_sum = recursiveReduce(tmp, size); 144 | for (int i = 0; i < size; i++) 145 | cpu_sum += tmp[i]; 146 | timeElaps = 1000*(cpuSecond() - timeStart); 147 | 148 | printf("cpu sum:%d \n", cpu_sum); 149 | printf("cpu reduction elapsed %lf ms cpu_sum: %d\n", timeElaps, cpu_sum); 150 | 151 | 152 | //kernel 1 reduceNeighbored 153 | 154 | CHECK(cudaMemcpy(idata_dev, idata_host, bytes, cudaMemcpyHostToDevice)); 155 | CHECK(cudaDeviceSynchronize()); 156 | timeStart = cpuSecond(); 157 | reduceNeighbored <<>>(idata_dev, odata_dev, size); 158 | cudaDeviceSynchronize(); 159 | cudaMemcpy(odata_host, odata_dev, grid.x * sizeof(int), cudaMemcpyDeviceToHost); 160 | gpu_sum = 0; 161 | for (int i = 0; i < grid.x; i++) 162 | gpu_sum += odata_host[i]; 163 | timeElaps = 1000*(cpuSecond() - timeStart); 164 | 165 | printf("gpu sum:%d \n", gpu_sum); 166 | printf("gpu reduceNeighbored elapsed %lf ms <<>>\n", 167 | timeElaps, grid.x, block.x); 168 | 169 | //kernel 2 reduceNeighboredless 170 | CHECK(cudaMemcpy(idata_dev, idata_host, bytes, cudaMemcpyHostToDevice)); 171 | CHECK(cudaDeviceSynchronize()); 172 | timeStart = cpuSecond(); 173 | reduceNeighboredLess <<>>(idata_dev, odata_dev, size); 174 | cudaDeviceSynchronize(); 175 | cudaMemcpy(odata_host, odata_dev, grid.x * sizeof(int), cudaMemcpyDeviceToHost); 176 | gpu_sum = 0; 177 | for (int i = 0; i < grid.x; i++) 178 | gpu_sum += odata_host[i]; 179 | timeElaps = 1000*(cpuSecond() - timeStart); 180 | 181 | printf("gpu sum:%d \n", gpu_sum); 182 | printf("gpu reduceNeighboredless elapsed %lf ms <<>>\n", 183 | timeElaps, grid.x, block.x); 184 | 185 | //kernel 3 reduceInterleaved 186 | CHECK(cudaMemcpy(idata_dev, idata_host, bytes, cudaMemcpyHostToDevice)); 187 | CHECK(cudaDeviceSynchronize()); 188 | timeStart = cpuSecond(); 189 | reduceInterleaved <<>>(idata_dev, odata_dev, size); 190 | cudaDeviceSynchronize(); 191 | cudaMemcpy(odata_host, odata_dev, grid.x * sizeof(int), cudaMemcpyDeviceToHost); 192 | gpu_sum = 0; 193 | for (int i = 0; i < grid.x; i++) 194 | gpu_sum += odata_host[i]; 195 | timeElaps = 1000*(cpuSecond() - timeStart); 196 | 197 | printf("gpu sum:%d \n", gpu_sum); 198 | printf("gpu reduceInterleaved elapsed %lf ms <<>>\n", 199 | timeElaps, grid.x, block.x); 200 | 201 | // free host memory 202 | 203 | free(idata_host); 204 | free(odata_host); 205 | CHECK(cudaFree(idata_dev)); 206 | CHECK(cudaFree(odata_dev)); 207 | 208 | //reset device 209 | cudaDeviceReset(); 210 | 211 | //check the results 212 | if (gpu_sum == cpu_sum) 213 | { 214 | printf("Test success!\n"); 215 | } 216 | return EXIT_SUCCESS; 217 | } 218 | -------------------------------------------------------------------------------- /Reduction3/cudastart.h: -------------------------------------------------------------------------------- 1 | #ifndef CUDASTART_H 2 | #define CUDASTART_H 3 | #define CHECK(call)\ 4 | {\ 5 | const cudaError_t error=call;\ 6 | if(error!=cudaSuccess)\ 7 | {\ 8 | printf("ERROR: %s:%d,",__FILE__,__LINE__);\ 9 | printf("code:%d,reason:%s\n",error,cudaGetErrorString(error));\ 10 | exit(1);\ 11 | }\ 12 | } 13 | 14 | 15 | #include 16 | #ifdef _WIN32 17 | # include 18 | #else 19 | # include 20 | #endif 21 | 22 | double cpuSecond() 23 | { 24 | struct timeval tp; 25 | gettimeofday(&tp,NULL); 26 | return((double)tp.tv_sec+(double)tp.tv_usec*1e-6); 27 | 28 | } 29 | 30 | void initialData(float* ip,int size) 31 | { 32 | time_t t; 33 | srand((unsigned )time(&t)); 34 | for(int i=0;iepsilon) 66 | { 67 | printf("Results don\'t match!\n"); 68 | printf("%f(hostRef[%d] )!= %f(gpuRef[%d])\n",hostRef[i],i,gpuRef[i],i); 69 | return; 70 | } 71 | } 72 | printf("Check result success!\n"); 73 | } 74 | 75 | #endif 76 | 77 | -------------------------------------------------------------------------------- /Reduction3/readme.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /Reduction3/reduction3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZihaoZhao/CUDA_study/1fc8a7bc770593e4e23ce2f08b6546ad6c111e71/Reduction3/reduction3 -------------------------------------------------------------------------------- /Reduction3/reduction3.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include "cudastart.h" 4 | int recursiveReduce(int *data, int const size) 5 | { 6 | // terminate check 7 | if (size == 1) return data[0]; 8 | // renew the stride 9 | int const stride = size / 2; 10 | if (size % 2 == 1) 11 | { 12 | for (int i = 0; i < stride; i++) 13 | { 14 | data[i] += data[i + stride]; 15 | } 16 | data[0] += data[size - 1]; 17 | } 18 | else 19 | { 20 | for (int i = 0; i < stride; i++) 21 | { 22 | data[i] += data[i + stride]; 23 | } 24 | } 25 | // call 26 | return recursiveReduce(data, stride); 27 | } 28 | 29 | 30 | 31 | __global__ void reduceNeighbored(int * g_idata,int * g_odata,unsigned int n) 32 | { 33 | //set thread ID 34 | unsigned int tid = threadIdx.x; 35 | //boundary check 36 | if (tid >= n) return; 37 | //convert global data pointer to the 38 | int *idata = g_idata + blockIdx.x*blockDim.x; 39 | //in-place reduction in global memory 40 | for (int stride = 1; stride < blockDim.x; stride *= 2) 41 | { 42 | if ((tid % (2 * stride)) == 0) 43 | { 44 | idata[tid] += idata[tid + stride]; 45 | } 46 | //synchronize within block 47 | __syncthreads(); 48 | } 49 | //write result for this block to global mem 50 | if (tid == 0) 51 | g_odata[blockIdx.x] = idata[0]; 52 | 53 | } 54 | 55 | __global__ void reduceNeighboredLess(int * g_idata,int *g_odata,unsigned int n) 56 | { 57 | unsigned int tid = threadIdx.x; 58 | unsigned idx = blockIdx.x*blockDim.x + threadIdx.x; 59 | // convert global data pointer to the local point of this block 60 | int *idata = g_idata + blockIdx.x*blockDim.x; 61 | if (idx > n) 62 | return; 63 | //in-place reduction in global memory 64 | for (int stride = 1; stride < blockDim.x; stride *= 2) 65 | { 66 | //convert tid into local array index 67 | int index = 2 * stride *tid; 68 | if (index < blockDim.x) 69 | { 70 | idata[index] += idata[index + stride]; 71 | } 72 | __syncthreads(); 73 | } 74 | //write result for this block to global men 75 | if (tid == 0) 76 | g_odata[blockIdx.x] = idata[0]; 77 | } 78 | 79 | __global__ void reduceInterleaved(int * g_idata, int *g_odata, unsigned int n) 80 | { 81 | unsigned int tid = threadIdx.x; 82 | unsigned idx = blockIdx.x*blockDim.x + threadIdx.x; 83 | // convert global data pointer to the local point of this block 84 | int *idata = g_idata + blockIdx.x*blockDim.x; 85 | if (idx >= n) 86 | return; 87 | //in-place reduction in global memory 88 | for (int stride = blockDim.x/2; stride >0; stride >>=1) 89 | { 90 | 91 | if (tid = n) return; 110 | //convert global data pointer to the 111 | int *idata = g_idata + blockIdx.x*blockDim.x*2; 112 | if(idx+blockDim.x0 ; stride >>=1) 120 | { 121 | if (tid = n) return; 141 | //convert global data pointer to the 142 | int *idata = g_idata + blockIdx.x*blockDim.x*4; 143 | if(idx+blockDim.x0 ; stride >>=1) 152 | { 153 | if (tid = n) return; 173 | //convert global data pointer to the 174 | int *idata = g_idata + blockIdx.x*blockDim.x*8; 175 | if(idx+blockDim.x0 ; stride >>=1) 189 | { 190 | if (tid = n) return; 211 | //convert global data pointer to the 212 | int *idata = g_idata + blockIdx.x*blockDim.x*8; 213 | //unrolling 8; 214 | if(idx+7 * blockDim.x32; stride >>=1) 230 | { 231 | if (tid = n) return; 263 | //convert global data pointer to the 264 | int *idata = g_idata + blockIdx.x*blockDim.x*8; 265 | if(idx+7 * blockDim.x=1024 && tid <512) 281 | idata[tid]+=idata[tid+512]; 282 | __syncthreads(); 283 | if(blockDim.x>=512 && tid <256) 284 | idata[tid]+=idata[tid+256]; 285 | __syncthreads(); 286 | if(blockDim.x>=256 && tid <128) 287 | idata[tid]+=idata[tid+128]; 288 | __syncthreads(); 289 | if(blockDim.x>=128 && tid <64) 290 | idata[tid]+=idata[tid+64]; 291 | __syncthreads(); 292 | //write result for this block to global mem 293 | if(tid<32) 294 | { 295 | volatile int *vmem = idata; 296 | vmem[tid]+=vmem[tid+32]; 297 | vmem[tid]+=vmem[tid+16]; 298 | vmem[tid]+=vmem[tid+8]; 299 | vmem[tid]+=vmem[tid+4]; 300 | vmem[tid]+=vmem[tid+2]; 301 | vmem[tid]+=vmem[tid+1]; 302 | 303 | } 304 | 305 | if (tid == 0) 306 | g_odata[blockIdx.x] = idata[0]; 307 | 308 | } 309 | 310 | int main(int argc,char** argv) 311 | { 312 | initDevice(0); 313 | 314 | //initialization 315 | 316 | int size = 1 << 24; 317 | printf(" with array size %d ", size); 318 | 319 | //execution configuration 320 | int blocksize = 1024; 321 | if (argc > 1) 322 | { 323 | blocksize = atoi(argv[1]); //从命令行输入设置block大小 324 | } 325 | dim3 block(blocksize, 1); 326 | dim3 grid((size - 1) / block.x + 1, 1); 327 | printf("grid %d block %d \n", grid.x, block.x); 328 | 329 | //allocate host memory 330 | size_t bytes = size * sizeof(int); 331 | int *idata_host = (int*)malloc(bytes); 332 | int *odata_host = (int*)malloc(grid.x * sizeof(int)); 333 | int * tmp = (int*)malloc(bytes); 334 | 335 | //initialize the array 336 | initialData_int(idata_host, size); 337 | 338 | memcpy(tmp, idata_host, bytes); 339 | double timeStart, timeElaps; 340 | int gpu_sum = 0; 341 | 342 | // device memory 343 | int * idata_dev = NULL; 344 | int * odata_dev = NULL; 345 | CHECK(cudaMalloc((void**)&idata_dev, bytes)); 346 | CHECK(cudaMalloc((void**)&odata_dev, grid.x * sizeof(int))); 347 | 348 | //cpu reduction 对照组 349 | int cpu_sum = 0; 350 | timeStart = cpuSecond(); 351 | //cpu_sum = recursiveReduce(tmp, size); 352 | for (int i = 0; i < size; i++) 353 | cpu_sum += tmp[i]; 354 | timeElaps = 1000*(cpuSecond() - timeStart); 355 | 356 | printf("cpu sum:%d \n", cpu_sum); 357 | printf("cpu reduction elapsed %lf ms cpu_sum: %d\n", timeElaps, cpu_sum); 358 | 359 | 360 | //kernel 1 reduceNeighbored 361 | 362 | CHECK(cudaMemcpy(idata_dev, idata_host, bytes, cudaMemcpyHostToDevice)); 363 | CHECK(cudaDeviceSynchronize()); 364 | timeStart = cpuSecond(); 365 | reduceNeighbored <<>>(idata_dev, odata_dev, size); 366 | cudaDeviceSynchronize(); 367 | cudaMemcpy(odata_host, odata_dev, grid.x * sizeof(int), cudaMemcpyDeviceToHost); 368 | gpu_sum = 0; 369 | for (int i = 0; i < grid.x; i++) 370 | gpu_sum += odata_host[i]; 371 | timeElaps = 1000*(cpuSecond() - timeStart); 372 | 373 | printf("gpu sum:%d \n", gpu_sum); 374 | printf("gpu reduceNeighbored elapsed %lf ms <<>>\n", 375 | timeElaps, grid.x, block.x); 376 | 377 | //kernel 2 reduceNeighboredless 378 | CHECK(cudaMemcpy(idata_dev, idata_host, bytes, cudaMemcpyHostToDevice)); 379 | CHECK(cudaDeviceSynchronize()); 380 | timeStart = cpuSecond(); 381 | reduceNeighboredLess <<>>(idata_dev, odata_dev, size); 382 | cudaDeviceSynchronize(); 383 | cudaMemcpy(odata_host, odata_dev, grid.x * sizeof(int), cudaMemcpyDeviceToHost); 384 | gpu_sum = 0; 385 | for (int i = 0; i < grid.x; i++) 386 | gpu_sum += odata_host[i]; 387 | timeElaps = 1000*(cpuSecond() - timeStart); 388 | 389 | printf("gpu sum:%d \n", gpu_sum); 390 | printf("gpu reduceNeighboredless elapsed %lf ms <<>>\n", 391 | timeElaps, grid.x, block.x); 392 | 393 | //kernel 3 reduceInterleaved 394 | CHECK(cudaMemcpy(idata_dev, idata_host, bytes, cudaMemcpyHostToDevice)); 395 | CHECK(cudaDeviceSynchronize()); 396 | timeStart = cpuSecond(); 397 | reduceInterleaved <<>>(idata_dev, odata_dev, size); 398 | cudaDeviceSynchronize(); 399 | cudaMemcpy(odata_host, odata_dev, grid.x * sizeof(int), cudaMemcpyDeviceToHost); 400 | gpu_sum = 0; 401 | for (int i = 0; i < grid.x; i++) 402 | gpu_sum += odata_host[i]; 403 | timeElaps = 1000*(cpuSecond() - timeStart); 404 | 405 | printf("gpu sum:%d \n", gpu_sum); 406 | printf("gpu reduceInterleaved elapsed %lf ms <<>>\n", 407 | timeElaps, grid.x, block.x); 408 | 409 | //kernel 4 reduceUnroll2 410 | CHECK(cudaMemcpy(idata_dev, idata_host, bytes, cudaMemcpyHostToDevice)); 411 | CHECK(cudaDeviceSynchronize()); 412 | timeStart = cpuSecond(); 413 | reduceUnroll2 <<>>(idata_dev, odata_dev, size); 414 | cudaDeviceSynchronize(); 415 | cudaMemcpy(odata_host, odata_dev, grid.x * sizeof(int), cudaMemcpyDeviceToHost); 416 | gpu_sum = 0; 417 | for (int i = 0; i < grid.x/2; i++) 418 | gpu_sum += odata_host[i]; 419 | timeElaps = 1000*(cpuSecond() - timeStart); 420 | 421 | printf("gpu sum:%d \n", gpu_sum); 422 | printf("gpu reduceUnroll2 elapsed %lf ms <<>>\n", 423 | timeElaps, grid.x/2, block.x); 424 | 425 | 426 | //kernel 5 reduceUnroll4 427 | CHECK(cudaMemcpy(idata_dev, idata_host, bytes, cudaMemcpyHostToDevice)); 428 | CHECK(cudaDeviceSynchronize()); 429 | timeStart = cpuSecond(); 430 | reduceUnroll4 <<>>(idata_dev, odata_dev, size); 431 | cudaDeviceSynchronize(); 432 | cudaMemcpy(odata_host, odata_dev, grid.x * sizeof(int), cudaMemcpyDeviceToHost); 433 | gpu_sum = 0; 434 | for (int i = 0; i < grid.x/4; i++) 435 | gpu_sum += odata_host[i]; 436 | timeElaps = 1000*(cpuSecond() - timeStart); 437 | 438 | printf("gpu sum:%d \n", gpu_sum); 439 | printf("gpu reduceUnroll4 elapsed %lf ms <<>>\n", 440 | timeElaps, grid.x/4, block.x); 441 | 442 | 443 | //kernel 6 reduceUnroll8 444 | CHECK(cudaMemcpy(idata_dev, idata_host, bytes, cudaMemcpyHostToDevice)); 445 | CHECK(cudaDeviceSynchronize()); 446 | timeStart = cpuSecond(); 447 | reduceUnroll8 <<>>(idata_dev, odata_dev, size); 448 | cudaDeviceSynchronize(); 449 | cudaMemcpy(odata_host, odata_dev, grid.x * sizeof(int), cudaMemcpyDeviceToHost); 450 | gpu_sum = 0; 451 | for (int i = 0; i < grid.x/8; i++) 452 | gpu_sum += odata_host[i]; 453 | timeElaps = 1000*(cpuSecond() - timeStart); 454 | 455 | printf("gpu sum:%d \n", gpu_sum); 456 | printf("gpu reduceUnroll8 elapsed %lf ms <<>>\n", 457 | timeElaps, grid.x/8, block.x); 458 | 459 | 460 | //kernel 7 reduceUnrollWarp8 461 | CHECK(cudaMemcpy(idata_dev, idata_host, bytes, cudaMemcpyHostToDevice)); 462 | CHECK(cudaDeviceSynchronize()); 463 | timeStart = cpuSecond(); 464 | reduceUnrollWarp8 <<>>(idata_dev, odata_dev, size); 465 | cudaDeviceSynchronize(); 466 | cudaMemcpy(odata_host, odata_dev, grid.x * sizeof(int), cudaMemcpyDeviceToHost); 467 | gpu_sum = 0; 468 | for (int i = 0; i < grid.x/8; i++) 469 | gpu_sum += odata_host[i]; 470 | timeElaps = 1000*(cpuSecond() - timeStart); 471 | 472 | printf("gpu sum:%d \n", gpu_sum); 473 | printf("gpu reduceUnrollWarp8 elapsed %lf ms <<>>\n", 474 | timeElaps, grid.x/8, block.x); 475 | 476 | 477 | //kernel 8 reduceCompleteUnrollWarp8 478 | CHECK(cudaMemcpy(idata_dev, idata_host, bytes, cudaMemcpyHostToDevice)); 479 | CHECK(cudaDeviceSynchronize()); 480 | timeStart = cpuSecond(); 481 | reduceCompleteUnrollWarp8 <<>>(idata_dev, odata_dev, size); 482 | cudaDeviceSynchronize(); 483 | cudaMemcpy(odata_host, odata_dev, grid.x * sizeof(int), cudaMemcpyDeviceToHost); 484 | gpu_sum = 0; 485 | for (int i = 0; i < grid.x/8; i++) 486 | gpu_sum += odata_host[i]; 487 | timeElaps = 1000*(cpuSecond() - timeStart); 488 | 489 | printf("gpu sum:%d \n", gpu_sum); 490 | printf("gpu reduceCompleteUnrollWarp8 elapsed %lf ms <<>>\n", 491 | timeElaps, grid.x/8, block.x); 492 | 493 | 494 | 495 | // free host memory 496 | 497 | free(idata_host); 498 | free(odata_host); 499 | CHECK(cudaFree(idata_dev)); 500 | CHECK(cudaFree(odata_dev)); 501 | 502 | //reset device 503 | cudaDeviceReset(); 504 | 505 | //check the results 506 | if (gpu_sum == cpu_sum) 507 | { 508 | printf("Test success!\n"); 509 | } 510 | return EXIT_SUCCESS; 511 | } 512 | -------------------------------------------------------------------------------- /Sum_Matrix/cudastart.h: -------------------------------------------------------------------------------- 1 | #ifndef CUDASTART_H 2 | #define CUDASTART_H 3 | #define CHECK(call)\ 4 | {\ 5 | const cudaError_t error=call;\ 6 | if(error!=cudaSuccess)\ 7 | {\ 8 | printf("ERROR: %s:%d,",__FILE__,__LINE__);\ 9 | printf("code:%d,reason:%s\n",error,cudaGetErrorString(error));\ 10 | exit(1);\ 11 | }\ 12 | } 13 | 14 | 15 | #include 16 | #ifdef _WIN32 17 | # include 18 | #else 19 | # include 20 | #endif 21 | 22 | double cpuSecond() 23 | { 24 | struct timeval tp; 25 | gettimeofday(&tp,NULL); 26 | return((double)tp.tv_sec+(double)tp.tv_usec*1e-6); 27 | 28 | } 29 | 30 | void initialData(float* ip,int size) 31 | { 32 | time_t t; 33 | srand((unsigned )time(&t)); 34 | for(int i=0;iepsilon) 55 | { 56 | printf("Results don\'t match!\n"); 57 | printf("%f(hostRef[%d] )!= %f(gpuRef[%d])\n",hostRef[i],i,gpuRef[i],i); 58 | return; 59 | } 60 | } 61 | printf("Check result success!\n"); 62 | } 63 | 64 | #endif 65 | 66 | -------------------------------------------------------------------------------- /Sum_Matrix/readme.md: -------------------------------------------------------------------------------- 1 | 2 | https://zhuanlan.zhihu.com/p/97192227 3 | -------------------------------------------------------------------------------- /Sum_Matrix/sum_martix.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include "cudastart.h" 4 | 5 | 6 | //CPU对照组,用于对比加速比 7 | void sumMatrix2DonCPU(float * MatA,float * MatB,float * MatC,int nx,int ny) 8 | { 9 | float* a = MatA; 10 | float* b = MatB; 11 | float* c = MatC; 12 | for(int j=0; j>>(A_dev, B_dev, C_dev, nx, ny); 77 | CHECK(cudaDeviceSynchronize()); 78 | double gpuTime = cpuSecond() - gpuStart; 79 | printf("GPU Execution Time: %f sec\n", gpuTime); 80 | 81 | //在CPU上完成相同的任务 82 | cudaMemcpy(C_from_gpu, C_dev, nBytes, cudaMemcpyDeviceToHost); 83 | double cpuStart=cpuSecond(); 84 | sumMatrix2DonCPU(A_host, B_host, C_host, nx, ny); 85 | double cpuTime = cpuSecond() - cpuStart; 86 | printf("CPU Execution Time: %f sec\n", cpuTime); 87 | 88 | //检查GPU与CPU计算结果是否相同 89 | CHECK(cudaMemcpy(C_from_gpu, C_dev, nBytes, cudaMemcpyDeviceToHost)); 90 | checkResult(C_host, C_from_gpu, nx*ny); 91 | 92 | cudaFree(A_dev); 93 | cudaFree(B_dev); 94 | cudaFree(C_dev); 95 | free(A_host); 96 | free(B_host); 97 | free(C_host); 98 | free(C_from_gpu); 99 | cudaDeviceReset(); 100 | return 0; 101 | } 102 | --------------------------------------------------------------------------------