├── .gitignore ├── adder ├── CMakeLists.txt └── main.cu ├── atomic ├── .vscode │ └── launch.json ├── CMakeLists.txt └── main.cu ├── basis ├── CMakeLists.txt └── main.cu ├── conv ├── .vscode │ └── launch.json ├── CMakeLists.txt └── main.cu ├── dot_product ├── CMakeLists.txt └── main.cu ├── generate.py ├── mallocPitch ├── CMakeLists.txt └── main.cu ├── matrix_multiplication ├── CMakeLists.txt └── main.cu ├── pi ├── .vscode │ └── launch.json ├── CMakeLists.txt └── main.cu ├── shared ├── CMakeLists.txt └── main.cu ├── sum ├── CMakeLists.txt └── main.cu ├── thrust ├── CMakeLists.txt └── main.cu └── time ├── .vscode └── launch.json ├── CMakeLists.txt └── main.cu /.gitignore: -------------------------------------------------------------------------------- 1 | build 2 | .vscode 3 | -------------------------------------------------------------------------------- /adder/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | CMAKE_MINIMUM_REQUIRED(VERSION 2.8) 2 | PROJECT(adder) 3 | SET(CMAKE_CXX_FLAGS "-std=c++11") 4 | FIND_PACKAGE(CUDA REQUIRED) 5 | CUDA_ADD_EXECUTABLE(adder main.cu) 6 | TARGET_LINK_LIBRARIES(adder) -------------------------------------------------------------------------------- /adder/main.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | typedef bool TYPE; 7 | bool single_adder(TYPE A, TYPE B, TYPE& Ci, TYPE& Si) 8 | { 9 | Ci = A & B; // 与门表示进位 10 | Si = A ^ B; // 异或门表示和 11 | printf("%d + %d = %d, Ci: %d\n",A,B,Si,Ci); 12 | return 1; 13 | } 14 | bool multi_adder(TYPE A, TYPE B, TYPE Ci0, TYPE& Ci, TYPE& Si) 15 | { 16 | Si = A ^ B ^ Ci0; // 异或门表示和 17 | Ci = A & B; // 进位产生信号 18 | Ci = ((A ^ B) & Ci0) | Ci; // 进位传递信号 19 | // Ci = ((A | B) & Ci0) | Ci; // 直接考虑AB其中一个是1且传递的进位符也是1的情况决定是否进位 20 | printf("%d + %d + %d = %d, Ci: %d\n",A,B,Ci0,Si,Ci); 21 | return 1; 22 | } 23 | 24 | __global__ void full_adder(TYPE *num1, TYPE *num2, TYPE *result, TYPE* Ci0_, int startIdx, int length) 25 | { 26 | int index = blockIdx.x * blockDim.x + threadIdx.x; 27 | if(index >= length) 28 | { 29 | return; 30 | } 31 | TYPE Ci0 = Ci0_[0]; 32 | TYPE Ci = 0; 33 | for(int i = 0; i < index; i++) 34 | { 35 | TYPE A = num1[i + startIdx]; 36 | TYPE B = num2[i + startIdx]; 37 | Ci = A & B; 38 | Ci = ((A ^ B) & Ci0) | Ci; // 进位传递信号 39 | Ci0 = Ci; 40 | } 41 | int i = index; 42 | TYPE A = num1[i + startIdx]; 43 | TYPE B = num2[i + startIdx]; 44 | TYPE Si = A ^ B ^ Ci0; // 异或门表示和 45 | result[index + startIdx] = Si; 46 | if(index == length - 1) 47 | { 48 | Ci = A & B; 49 | Ci = ((A ^ B) & Ci0) | Ci; // 进位传递信号 50 | Ci0_[0] = Ci; 51 | } 52 | } 53 | int convert_string_to_array(std::string num1, std::string num2, TYPE* &A, TYPE* &B) 54 | { 55 | // 如果他俩不一样长就给短的补0 56 | int zero_num = num1.length() - num2.length(); 57 | std::string* temp; 58 | if(zero_num > 0) 59 | { 60 | temp = &num2; 61 | } 62 | else if(zero_num < 0) 63 | { 64 | temp = &num1; 65 | } 66 | for(int i = 0; i < abs(zero_num); i++) 67 | { 68 | *temp = "0" + *temp; 69 | } 70 | int length = num1.length(); 71 | A = new TYPE[length]; 72 | B = new TYPE[length]; 73 | for(int i = 0; i < length; i++) 74 | { 75 | A[i] = num1[length - 1 - i] - '0'; 76 | B[i] = num2[length - 1 - i] - '0'; 77 | } 78 | return length; 79 | } 80 | int main() 81 | { 82 | std::string num1 = "10101"; 83 | std::string num2 = "11111"; 84 | TYPE *ACpu; 85 | TYPE *BCpu; 86 | int length = convert_string_to_array(num1, num2, ACpu, BCpu); 87 | for(int i = 0; i < length; i++) 88 | { 89 | printf("%d ", ACpu[i]); 90 | } 91 | printf("\n"); 92 | 93 | for(int i = 0; i < length; i++) 94 | { 95 | printf("%d ", BCpu[i]); 96 | } 97 | printf("\n"); 98 | 99 | TYPE *AGpu; 100 | cudaMalloc((void**)&AGpu, length * sizeof(TYPE)); 101 | cudaMemcpy(AGpu, ACpu, length * sizeof(TYPE), cudaMemcpyHostToDevice); 102 | TYPE *BGpu; 103 | cudaMalloc((void**)&BGpu, length * sizeof(TYPE)); 104 | cudaMemcpy(BGpu, BCpu, length * sizeof(TYPE), cudaMemcpyHostToDevice); 105 | 106 | TYPE *resultGpu; 107 | cudaMalloc((void**)&resultGpu, length * sizeof(TYPE)); 108 | 109 | TYPE *CiCpu = new TYPE[1]; 110 | CiCpu = 0; 111 | TYPE *CiGpu; 112 | cudaMalloc((void**)&CiGpu, 1 * sizeof(TYPE)); 113 | cudaMemcpy(CiCpu, CiGpu, 1 * sizeof(TYPE), cudaMemcpyHostToDevice); 114 | 115 | 116 | int threadNum = 1; 117 | int blockNum = 4; 118 | int totalNum = threadNum * blockNum; 119 | for(int i = 0; totalNum * i < length; i++) 120 | { 121 | printf("i: %d, i2: %d\n", totalNum * i, min(length - totalNum * i, totalNum)); 122 | full_adder<<> >(AGpu, BGpu, resultGpu, CiGpu, totalNum * i, min(length - totalNum * i, totalNum)); 123 | } 124 | 125 | TYPE *result = new TYPE[length]; 126 | cudaMemcpy(result, resultGpu, length * sizeof(TYPE), cudaMemcpyDeviceToHost); 127 | for(int i = 0; i < length; i++) 128 | { 129 | printf("%d ", result[length - 1 - i]); 130 | } 131 | printf("\n"); 132 | return 0; 133 | } -------------------------------------------------------------------------------- /atomic/.vscode/launch.json: -------------------------------------------------------------------------------- 1 | { 2 | // Use IntelliSense to learn about possible attributes. 3 | // Hover to view descriptions of existing attributes. 4 | // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387 5 | "version": "0.2.0", 6 | "configurations": [ 7 | { 8 | "name": "(gdb) Launch", 9 | "type": "cppdbg", 10 | "request": "launch", 11 | "program": "enter program name, for example ${workspaceFolder}/a.out", 12 | "args": [], 13 | "stopAtEntry": false, 14 | "cwd": "${workspaceFolder}", 15 | "environment": [], 16 | "externalConsole": false, 17 | "MIMode": "gdb", 18 | "setupCommands": [ 19 | { 20 | "description": "Enable pretty-printing for gdb", 21 | "text": "-enable-pretty-printing", 22 | "ignoreFailures": true 23 | } 24 | ] 25 | } 26 | ] 27 | } -------------------------------------------------------------------------------- /atomic/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | CMAKE_MINIMUM_REQUIRED(VERSION 2.8) 2 | PROJECT(sum) 3 | FIND_PACKAGE(CUDA REQUIRED) 4 | CUDA_ADD_EXECUTABLE(sum main.cu) 5 | TARGET_LINK_LIBRARIES(sum) -------------------------------------------------------------------------------- /atomic/main.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | 5 | __global__ void sum(float *a, float *b) 6 | { 7 | int tid = threadIdx.x; 8 | 9 | __shared__ float sData[16]; 10 | sData[tid] = a[tid]; 11 | __syncthreads(); 12 | for(int i = 8; i > 0; i /= 2) 13 | { 14 | if(tid < i) 15 | { 16 | sData[tid] = sData[tid] + sData[tid + i]; 17 | } 18 | __syncthreads(); 19 | } 20 | if(tid == 0) 21 | { 22 | b[0] = sData[0]; 23 | } 24 | } 25 | 26 | __global__ void get_hist(float *a, int *hist) 27 | { 28 | int tid = threadIdx.x; 29 | int bid = blockIdx.x; 30 | int idx = tid + bid * blockDim.x; 31 | 32 | atomicAdd(&hist[(int)a[idx]], 1); 33 | } 34 | 35 | int main() 36 | { 37 | int size = 32000000; 38 | float *a = new float[size]; 39 | 40 | int length = 10; 41 | for(int i = 0; i < size; i++) 42 | { 43 | a[i] = i*(i+1) % length; 44 | } 45 | 46 | int hist[length] = {0}; 47 | 48 | float *aGpu; 49 | cudaMalloc((void**)&aGpu, size * sizeof(float)); 50 | cudaMemcpy(aGpu, a, size * sizeof(float), cudaMemcpyHostToDevice); 51 | 52 | int *histGpu; 53 | cudaMalloc((void**)&histGpu, length * sizeof(int)); 54 | cudaMemcpy(histGpu, hist, length * sizeof(int), cudaMemcpyHostToDevice); 55 | 56 | struct timeval startTime, endTime; 57 | gettimeofday(&startTime, NULL); 58 | // get_hist<<<1, size>> >(aGpu, histGpu); 59 | get_hist<<> >(aGpu, histGpu); 60 | gettimeofday(&endTime, NULL); 61 | printf("cuda use time: %d\n", 62 | (endTime.tv_sec - startTime.tv_sec)*1000000 + (endTime.tv_usec - startTime.tv_usec)); 63 | 64 | gettimeofday(&startTime, NULL); 65 | for(int i = 0; i < size; i++) 66 | { 67 | hist[(int)a[i]] += 1; 68 | } 69 | gettimeofday(&endTime, NULL); 70 | printf("cpu use time: %d\n", 71 | (endTime.tv_sec - startTime.tv_sec)*1000000 + (endTime.tv_usec - startTime.tv_usec)); 72 | 73 | // printf("\ncpu:\n"); 74 | // for(int i = 0; i < length; i++) 75 | // { 76 | // printf("%.6d ",hist[i]); 77 | // } 78 | 79 | // cudaMemcpy(hist, histGpu, length * sizeof(int), cudaMemcpyDeviceToHost); 80 | // printf("\ngpu:\n"); 81 | // for(int i = 0; i < length; i++) 82 | // { 83 | // printf("%.6d ",hist[i]); 84 | // } 85 | 86 | return 0; 87 | } -------------------------------------------------------------------------------- /basis/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | CMAKE_MINIMUM_REQUIRED(VERSION 2.8) 2 | PROJECT(basis) 3 | FIND_PACKAGE(CUDA REQUIRED) 4 | CUDA_ADD_EXECUTABLE(basis main.cu) 5 | TARGET_LINK_LIBRARIES(basis) -------------------------------------------------------------------------------- /basis/main.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | __global__ void add(int* a, int* b, int* c, int num) 5 | { 6 | int i = threadIdx.x; 7 | if(i < num) 8 | { 9 | c[i] = a[i] + b[i]; 10 | } 11 | } 12 | 13 | int main(void) 14 | { 15 | // init data 16 | int num = 10; 17 | int a[num], b[num], c[num]; 18 | int *a_gpu, *b_gpu, *c_gpu; 19 | 20 | for(int i = 0; i < num; i++) 21 | { 22 | a[i] = i; 23 | b[i] = i * i; 24 | } 25 | 26 | cudaMalloc((void **)&a_gpu, num * sizeof(int)); 27 | cudaMalloc((void **)&b_gpu, num * sizeof(int)); 28 | cudaMalloc((void **)&c_gpu, num * sizeof(int)); 29 | 30 | // copy data 31 | cudaMemcpy(a_gpu, a, num * sizeof(int), cudaMemcpyHostToDevice); 32 | cudaMemcpy(b_gpu, b, num * sizeof(int), cudaMemcpyHostToDevice); 33 | 34 | 35 | // get data 36 | cudaMemcpy(c, c_gpu, num * sizeof(int), cudaMemcpyDeviceToHost); 37 | 38 | // visualization 39 | for(int i = 0; i < num; i++) 40 | { 41 | printf("%d + %d = %d\n", a[i], b[i], c[i]); 42 | } 43 | 44 | return 0; 45 | } -------------------------------------------------------------------------------- /conv/.vscode/launch.json: -------------------------------------------------------------------------------- 1 | { 2 | // Use IntelliSense to learn about possible attributes. 3 | // Hover to view descriptions of existing attributes. 4 | // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387 5 | "version": "0.2.0", 6 | "configurations": [ 7 | { 8 | "name": "(gdb) Launch", 9 | "type": "cppdbg", 10 | "request": "launch", 11 | "program": "${workspaceFolder}/build/conv", 12 | "args": [], 13 | "stopAtEntry": false, 14 | "cwd": "${workspaceFolder}", 15 | "environment": [], 16 | "externalConsole": false, 17 | "MIMode": "gdb", 18 | "setupCommands": [ 19 | { 20 | "description": "Enable pretty-printing for gdb", 21 | "text": "-enable-pretty-printing", 22 | "ignoreFailures": true 23 | } 24 | ] 25 | } 26 | ] 27 | } -------------------------------------------------------------------------------- /conv/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | CMAKE_MINIMUM_REQUIRED(VERSION 2.8) 2 | PROJECT(conv) 3 | FIND_PACKAGE(CUDA REQUIRED) 4 | CUDA_ADD_EXECUTABLE(conv main.cu) 5 | TARGET_LINK_LIBRARIES(conv) -------------------------------------------------------------------------------- /conv/main.cu: -------------------------------------------------------------------------------- 1 | #include "stdio.h" 2 | 3 | static void HandleError(cudaError_t err, 4 | const char *file, 5 | int line) 6 | { 7 | if(err != cudaSuccess) 8 | { 9 | printf("%s in %s at line %d\n", 10 | cudaGetErrorString(err), 11 | file, line); 12 | exit(EXIT_FAILURE); 13 | } 14 | } 15 | #define HANDLE_ERROR(err) (HandleError(err, __FILE__, __LINE__)) 16 | 17 | int getThreadNum() 18 | { 19 | cudaDeviceProp prop; 20 | int count; 21 | 22 | HANDLE_ERROR(cudaGetDeviceCount(&count)); 23 | printf("gpu num %d\n", count); 24 | HANDLE_ERROR(cudaGetDeviceProperties(&prop, 0)); 25 | printf("max thread num: %d\n", prop.maxThreadsPerBlock); 26 | printf("max grid dimensions: %d, %d, %d)\n", 27 | prop.maxGridSize[0], prop.maxGridSize[1], prop.maxGridSize[2]); 28 | return prop.maxThreadsPerBlock; 29 | } 30 | 31 | __global__ void conv(float *img, float *kernel, float *result, 32 | int width, int height, int kernelSize) 33 | { 34 | int ti = threadIdx.x; 35 | int bi = blockIdx.x; 36 | int id = (bi * blockDim.x + ti); 37 | if(id >= width * height) 38 | { 39 | return; 40 | } 41 | int row = id / width; 42 | int col = id % width; 43 | for(int i = 0; i < kernelSize; ++i) 44 | { 45 | for(int j = 0; j < kernelSize; ++j) 46 | { 47 | float imgValue = 0; 48 | int curRow = row - kernelSize / 2 + i; 49 | int curCol = col - kernelSize / 2 + j; 50 | if(curRow < 0 || curCol < 0 || curRow >= height || curCol >= width) 51 | {} 52 | else 53 | { 54 | imgValue = img[curRow * width + curCol]; 55 | } 56 | result[id] += kernel[i * kernelSize + j] * imgValue; 57 | } 58 | 59 | } 60 | } 61 | 62 | int main() 63 | { 64 | int width = 1920; 65 | int height = 1080; 66 | float *img = new float[width * height]; 67 | for(int row = 0; row < height; ++row) 68 | { 69 | for(int col = 0; col < width; ++col) 70 | { 71 | img[col + row * width] = (col + row) % 256; 72 | } 73 | } 74 | 75 | int kernelSize = 3; 76 | float *kernel = new float[kernelSize * kernelSize]; 77 | for(int i = 0; i < kernelSize * kernelSize; ++i) 78 | { 79 | kernel[i] = i % kernelSize - 1; 80 | } 81 | 82 | float *imgGpu; 83 | float *kernelGpu; 84 | float *resultGpu; 85 | 86 | HANDLE_ERROR(cudaMalloc((void**)&imgGpu, width * height * sizeof(float))); 87 | HANDLE_ERROR(cudaMalloc((void**)&kernelGpu, kernelSize * kernelSize * sizeof(float))); 88 | HANDLE_ERROR(cudaMalloc((void**)&resultGpu, width * height * sizeof(float))); 89 | 90 | HANDLE_ERROR(cudaMemcpy(imgGpu, img, 91 | width * height * sizeof(float), cudaMemcpyHostToDevice)); 92 | HANDLE_ERROR(cudaMemcpy(kernelGpu, kernel, 93 | kernelSize * kernelSize * sizeof(float), cudaMemcpyHostToDevice)); 94 | 95 | int threadNum = getThreadNum(); 96 | int blockNum = (width * height - 0.5) / threadNum + 1; 97 | 98 | conv<<> > 99 | (imgGpu, kernelGpu, resultGpu, width, height, kernelSize); 100 | 101 | float *result = new float[width * height]; 102 | HANDLE_ERROR(cudaMemcpy(result, resultGpu, 103 | width * height * sizeof(float), cudaMemcpyDeviceToHost)); 104 | 105 | // visualization 106 | printf("img\n"); 107 | for(int row = 0; row < 10; ++row) 108 | { 109 | for(int col = 0; col < 10; ++col) 110 | { 111 | printf("%2.0f ", img[col + row * width]); 112 | } 113 | printf("\n"); 114 | } 115 | printf("kernel\n"); 116 | for(int row = 0; row < kernelSize; ++row) 117 | { 118 | for(int col = 0; col < kernelSize; ++col) 119 | { 120 | printf("%2.0f ", kernel[col + row * kernelSize]); 121 | } 122 | printf("\n"); 123 | } 124 | 125 | printf("result\n"); 126 | for(int row = 0; row < 10; ++row) 127 | { 128 | for(int col = 0; col < 10; ++col) 129 | { 130 | printf("%2.0f ", result[col + row * width]); 131 | } 132 | printf("\n"); 133 | } 134 | 135 | 136 | return 0; 137 | } -------------------------------------------------------------------------------- /dot_product/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | CMAKE_MINIMUM_REQUIRED(VERSION 2.8) 2 | PROJECT(dot_product) 3 | FIND_PACKAGE(CUDA REQUIRED) 4 | CUDA_ADD_EXECUTABLE(dot_product main.cu) 5 | TARGET_LINK_LIBRARIES(dot_product) -------------------------------------------------------------------------------- /dot_product/main.cu: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #define LENGTH 16 4 | #define BLOCKNUM 2 5 | #define THREADNUM 4 6 | __global__ void dot_product(float *a, float *b, float* r) 7 | { 8 | int tid = threadIdx.x; 9 | int bid = blockIdx.x; 10 | int total_thread_num = THREADNUM * BLOCKNUM; 11 | 12 | __shared__ float sData[THREADNUM]; 13 | int global_id = tid + bid * THREADNUM; 14 | sData[tid] = 0; 15 | while(global_id < LENGTH) 16 | { 17 | sData[tid] += a[global_id] * b[global_id]; 18 | global_id += total_thread_num; 19 | } 20 | __syncthreads(); 21 | for(int i = THREADNUM/2; i > 0; i /= 2) 22 | { 23 | if(tid < i) 24 | { 25 | sData[tid] = sData[tid] + sData[tid + i]; 26 | } 27 | __syncthreads(); 28 | } 29 | if(tid == 0) 30 | { 31 | r[bid] = sData[0]; 32 | } 33 | } 34 | 35 | int main() 36 | { 37 | float a[LENGTH]; 38 | float b[LENGTH]; 39 | for(int i = 0; i < LENGTH; i++) 40 | { 41 | a[i] = i*(i+1); 42 | b[i] = i*(i-2); 43 | } 44 | float *aGpu; 45 | cudaMalloc((void**)&aGpu, LENGTH * sizeof(float)); 46 | cudaMemcpy(aGpu, a, LENGTH * sizeof(float), cudaMemcpyHostToDevice); 47 | 48 | float *bGpu; 49 | cudaMalloc((void**)&bGpu, LENGTH * sizeof(float)); 50 | cudaMemcpy(bGpu, b, LENGTH * sizeof(float), cudaMemcpyHostToDevice); 51 | 52 | float *rGpu; 53 | cudaMalloc((void**)&rGpu, BLOCKNUM * sizeof(float)); 54 | dot_product<<> >(aGpu, bGpu, rGpu); 55 | 56 | float r[BLOCKNUM]; 57 | cudaMemcpy(r, rGpu, BLOCKNUM * sizeof(float), cudaMemcpyDeviceToHost); 58 | 59 | float result = 0; 60 | for(int i = 0; i < BLOCKNUM; i++) 61 | { 62 | result += r[i]; 63 | } 64 | printf("result: %f\n",result); 65 | return 0; 66 | } -------------------------------------------------------------------------------- /generate.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | import os 3 | import sys 4 | if len(sys.argv) < 2: 5 | print('please input a project name!') 6 | exit(0) 7 | project = sys.argv[1] 8 | os.mkdir(project) 9 | os.mkdir(os.path.join(project,'build')) 10 | file = open(os.path.join(project, 'CMakeLists.txt'), 'w') 11 | file.write('CMAKE_MINIMUM_REQUIRED(VERSION 2.8)\n') 12 | file.write('PROJECT(%s)\n'%project) 13 | file.write('FIND_PACKAGE(CUDA REQUIRED)\n') 14 | file.write('CUDA_ADD_EXECUTABLE(%s main.cu)\n'%project) 15 | file.write('TARGET_LINK_LIBRARIES(%s)'%project) 16 | file.close() -------------------------------------------------------------------------------- /mallocPitch/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | CMAKE_MINIMUM_REQUIRED(VERSION 2.8) 2 | PROJECT(mallocPitch) 3 | FIND_PACKAGE(CUDA REQUIRED) 4 | CUDA_ADD_EXECUTABLE(mallocPitch main.cu) 5 | TARGET_LINK_LIBRARIES(mallocPitch) -------------------------------------------------------------------------------- /mallocPitch/main.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | __global__ void kernel(float * d_matrix, size_t pitch, size_t rows, size_t cols) { 7 | int count = 1; 8 | for (int j = blockIdx.y * blockDim.y + threadIdx.y; j < rows; j += blockDim.y * gridDim.y) 9 | { 10 | float* row_d_matrix = (float*)((char*)d_matrix + j*pitch); 11 | for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < cols; i += blockDim.x * gridDim.x) 12 | { 13 | row_d_matrix[i] = count; 14 | count++; 15 | } 16 | } 17 | // d_matrix[0] = 1; 18 | } 19 | 20 | int main(int argc, char **argv) 21 | { 22 | // device pointers. 23 | float *d_pitch; 24 | float *d_normal; 25 | 26 | // matrix size. 27 | size_t cols = 128; 28 | size_t rows = 16; 29 | 30 | size_t pitch = 0; 31 | 32 | // alloc the data form gpu memory. 33 | cudaMallocPitch((void**)&d_pitch, &pitch, cols*sizeof(float), rows); 34 | cudaMalloc((void**)(&d_normal), rows*cols*sizeof(float)); 35 | 36 | // test the data address. 37 | fprintf(stdout, "row size(in bytes) = %.2f*128.\n", pitch/128.0f); 38 | std::cout<<"d_pitch:"<>>(d_matrix, pitch, rows, cols); 54 | // cudaMemcpy2D(dc_matrix, cols * sizeof(float), d_matrix, pitch, cols * sizeof(float), rows, cudaMemcpyDeviceToHost); 55 | cudaMemcpy(dc_matrix, d_matrix, rows*cols * sizeof(float), cudaMemcpyDeviceToHost); 56 | int count = 0; 57 | for(int i = 0; i < rows; i++) 58 | { 59 | for (int j = 0; j < cols; j++) 60 | { 61 | std::cout< 3 | #include 4 | typedef int DTYPE; 5 | 6 | void matrix_multiplication_serial_1(DTYPE* a, DTYPE* b, DTYPE* c, int m, int n, int l) 7 | { 8 | for(int i = 0; i < m; i++) 9 | { 10 | for(int j = 0; j < n; j++) 11 | { 12 | DTYPE temp = 0; 13 | for(int k = 0; k < l; k++) 14 | { 15 | temp += a[i*l+k] * b[k*n+j]; 16 | } 17 | c[i*n+j] = temp; 18 | } 19 | } 20 | } 21 | 22 | void matrix_multiplication_serial_2(DTYPE* a, DTYPE* b, DTYPE* c, int m, int n, int l) 23 | { 24 | //init c 25 | for(int row = 0; row < m; ++row) 26 | { 27 | for(int col = 0; col < n; ++col) 28 | { 29 | c[col + row * n] = 0; 30 | } 31 | } 32 | for(int i = 0; i < m; i++) 33 | { 34 | for(int k = 0; k < l; k++) 35 | { 36 | for(int j = 0; j < n; j++) 37 | { 38 | c[i*n+j] += a[i*l+k] * b[k*n+j]; 39 | } 40 | } 41 | } 42 | } 43 | 44 | void matrix_multiplication_serial_3(DTYPE* a, DTYPE* b, DTYPE* c, int m, int n, int l) 45 | { 46 | //transform b 47 | DTYPE* b1 = new DTYPE[n * l]; 48 | for(int row = 0; row < l; ++row) 49 | { 50 | for(int col = 0; col < n; ++col) 51 | { 52 | b1[col + row * n] = b[col + row * n]; 53 | } 54 | } 55 | for(int i = 0; i < m; i++) 56 | { 57 | for(int j = 0; j < n; j++) 58 | { 59 | c[i*n+j] = 0; 60 | for(int k = 0; k < l; k++) 61 | { 62 | c[i*n+j] += a[i*l+k] * b[k*n+j]; 63 | } 64 | } 65 | } 66 | } 67 | 68 | int main() 69 | { 70 | int m = 10; 71 | int l = 20; 72 | int n = 30; 73 | // a:m*l, b:l*n, c:m*n 74 | DTYPE *a = new DTYPE[m * l]; 75 | DTYPE *b = new DTYPE[l * n]; 76 | DTYPE *c = new DTYPE[m * n]; 77 | // init a 78 | printf("a:\n"); 79 | for(int row = 0; row < m; ++row) 80 | { 81 | for(int col = 0; col < l; ++col) 82 | { 83 | a[col + row * l] = (col + row) % 256; 84 | printf("%3d ",a[col + row * l]); 85 | } 86 | printf("\n"); 87 | } 88 | //init b 89 | printf("b:\n"); 90 | for(int row = 0; row < l; ++row) 91 | { 92 | for(int col = 0; col < n; ++col) 93 | { 94 | b[col + row * n] = (col * 2 + row + 3) % 256; 95 | printf("%3d ",b[col + row * n]); 96 | } 97 | printf("\n"); 98 | } 99 | 100 | struct timeval startTime, endTime; 101 | gettimeofday(&startTime, NULL); 102 | int loopNum = 10000; 103 | for(int i = 0; i < loopNum; i++) 104 | { 105 | matrix_multiplication_serial_1(a, b, c, m, n, l); 106 | } 107 | gettimeofday(&endTime, NULL); 108 | printf("matrix_multiplication_serial_1 use time: %d\n", 109 | (endTime.tv_sec - startTime.tv_sec)*1000000 + (endTime.tv_usec - startTime.tv_usec)); 110 | 111 | gettimeofday(&startTime, NULL); 112 | for(int i = 0; i < loopNum; i++) 113 | { 114 | matrix_multiplication_serial_2(a, b, c, m, n, l); 115 | } 116 | gettimeofday(&endTime, NULL); 117 | printf("matrix_multiplication_serial_2 use time: %d\n", 118 | (endTime.tv_sec - startTime.tv_sec)*1000000 + (endTime.tv_usec - startTime.tv_usec)); 119 | 120 | gettimeofday(&startTime, NULL); 121 | for(int i = 0; i < loopNum; i++) 122 | { 123 | matrix_multiplication_serial_3(a, b, c, m, n, l); 124 | } 125 | gettimeofday(&endTime, NULL); 126 | printf("matrix_multiplication_serial_3 use time: %d\n", 127 | (endTime.tv_sec - startTime.tv_sec)*1000000 + (endTime.tv_usec - startTime.tv_usec)); 128 | 129 | //result 130 | printf("result:\n"); 131 | for(int row = 0; row < m; ++row) 132 | { 133 | for(int col = 0; col < n; ++col) 134 | { 135 | printf("%5d ",c[col + row * n]); 136 | } 137 | printf("\n"); 138 | } 139 | return 0; 140 | } 141 | -------------------------------------------------------------------------------- /pi/.vscode/launch.json: -------------------------------------------------------------------------------- 1 | { 2 | // Use IntelliSense to learn about possible attributes. 3 | // Hover to view descriptions of existing attributes. 4 | // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387 5 | "version": "0.2.0", 6 | "configurations": [ 7 | { 8 | "name": "(gdb) Launch", 9 | "type": "cppdbg", 10 | "request": "launch", 11 | "program": "enter program name, for example ${workspaceFolder}/a.out", 12 | "args": [], 13 | "stopAtEntry": false, 14 | "cwd": "${workspaceFolder}", 15 | "environment": [], 16 | "externalConsole": false, 17 | "MIMode": "gdb", 18 | "setupCommands": [ 19 | { 20 | "description": "Enable pretty-printing for gdb", 21 | "text": "-enable-pretty-printing", 22 | "ignoreFailures": true 23 | } 24 | ] 25 | } 26 | ] 27 | } -------------------------------------------------------------------------------- /pi/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | CMAKE_MINIMUM_REQUIRED(VERSION 2.8) 2 | PROJECT(sum) 3 | FIND_PACKAGE(CUDA REQUIRED) 4 | CUDA_ADD_EXECUTABLE(sum main.cu) 5 | TARGET_LINK_LIBRARIES(sum) -------------------------------------------------------------------------------- /pi/main.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #define CUDA_KERNEL_LOOP(i, n) \ 4 | for (int i = blockIdx.x * blockDim.x + threadIdx.x; \ 5 | i < (n); \ 6 | i += blockDim.x * gridDim.x) 7 | 8 | __global__ void sum(int *a, int *b, int num) 9 | { 10 | int tid = threadIdx.x; 11 | b[0] = 0; 12 | __shared__ float sData[512]; 13 | for(int count = 0; count < ceilf(num / 512) ; count++) 14 | { 15 | if(tid + count * 512 < num) 16 | { 17 | sData[tid] = a[tid + count * 512]; 18 | __syncthreads(); 19 | } 20 | for(int i = 256; i > 0; i /= 2) 21 | { 22 | if(tid < i && tid + count * 512 < num) 23 | { 24 | sData[tid] = sData[tid] + sData[tid + i]; 25 | } 26 | __syncthreads(); 27 | } 28 | if(tid == 0) 29 | { 30 | b[0] += sData[0]; 31 | } 32 | } 33 | } 34 | 35 | __global__ void distance(float *x, float *y, int *result, int num) 36 | { 37 | CUDA_KERNEL_LOOP(index, num) 38 | { 39 | // if(index < num) 40 | { 41 | float temp = (x[index] - 1) * (x[index] - 1) + (y[index] - 1) * (y[index] - 1); 42 | if(temp < 1) 43 | { 44 | result[index] = 1; 45 | } 46 | else 47 | { 48 | result[index] = 0; 49 | } 50 | } 51 | } 52 | } 53 | 54 | int main() 55 | { 56 | int testNum = 100000000; 57 | srand((int)time(0)); 58 | float *xSquare = new float[testNum]; 59 | float *ySquare = new float[testNum]; 60 | 61 | for(int i = 0; i < testNum; i++) 62 | { 63 | xSquare[i] = rand()%10000 * 1.0 / 10000; 64 | ySquare[i] = rand()%10000 * 1.0 / 10000; 65 | } 66 | float *xSquareGpu; 67 | cudaMalloc((void**)&xSquareGpu, testNum * sizeof(float)); 68 | cudaMemcpy(xSquareGpu, xSquare, testNum * sizeof(float), cudaMemcpyHostToDevice); 69 | 70 | float *ySquareGpu; 71 | cudaMalloc((void**)&ySquareGpu, testNum * sizeof(float)); 72 | cudaMemcpy(ySquareGpu, ySquare, testNum * sizeof(float), cudaMemcpyHostToDevice); 73 | 74 | int threadNum = 1024; 75 | int blockNum = 512; 76 | int *resultGpu; 77 | cudaMalloc((void**)&resultGpu, testNum * sizeof(int)); 78 | distance<<> >(xSquareGpu, ySquareGpu, resultGpu, testNum); 79 | int *result = new int[testNum]; 80 | cudaMemcpy(result, resultGpu, testNum * sizeof(int), cudaMemcpyDeviceToHost); 81 | for(int i = 0; i < 10; i++) 82 | { 83 | printf("(%f, %f) -> %d\n", 1.0 - xSquare[i], 1.0 - ySquare[i], result[i]); 84 | } 85 | int *bGpu; 86 | cudaMalloc((void**)&bGpu, 1 * sizeof(int)); 87 | sum<<<1, 512>> >(resultGpu, bGpu, testNum); 88 | 89 | int b[1]; 90 | cudaMemcpy(b, bGpu, 1 * sizeof(int), cudaMemcpyDeviceToHost); 91 | printf("b: %d\n",b[0]); 92 | printf("PI: %f\n",b[0] * 4.0 / testNum); 93 | return 0; 94 | } -------------------------------------------------------------------------------- /shared/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | CMAKE_MINIMUM_REQUIRED(VERSION 2.8) 2 | PROJECT(basis) 3 | FIND_PACKAGE(CUDA REQUIRED) 4 | #SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11") 5 | #SET(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-gencode arch=compute_61,code=sm_61;-std=c++11;) 6 | CUDA_ADD_EXECUTABLE(basis main.cu) 7 | TARGET_LINK_LIBRARIES(basis) -------------------------------------------------------------------------------- /shared/main.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #define N 10 3 | #define HANDLE_ERROR( err ) (HandleError( err, __FILE__, __LINE__ )) 4 | static void HandleError( cudaError_t err, 5 | const char *file, 6 | int line ) { 7 | if (err != cudaSuccess) { 8 | printf( "%s in %s at line %d\n", cudaGetErrorString( err ), 9 | file, line ); 10 | exit( EXIT_FAILURE ); 11 | } 12 | } 13 | __global__ void add( int *a, int *b, int *c ) { 14 | int tid = threadIdx.x; // this thread handles the data at its thread id 15 | extern __shared__ int sh[]; 16 | int *x = (int *)sh; 17 | if (tid >= N) 18 | return; 19 | x[tid] = a[tid] + b[tid]; 20 | c[tid] = x[tid]; 21 | } 22 | 23 | int main( void ) { 24 | int a[N], b[N], c[N]; 25 | int *dev_a, *dev_b, *dev_c; 26 | 27 | // allocate the memory on the GPU 28 | HANDLE_ERROR( cudaMalloc( (void**)&dev_a, N * sizeof(int) ) ); 29 | HANDLE_ERROR( cudaMalloc( (void**)&dev_b, N * sizeof(int) ) ); 30 | HANDLE_ERROR( cudaMalloc( (void**)&dev_c, N * sizeof(int) ) ); 31 | 32 | // fill the arrays 'a' and 'b' on the CPU 33 | for (int i=0; i>>( dev_a, dev_b, dev_c ); 45 | 46 | // copy the array 'c' back from the GPU to the CPU 47 | HANDLE_ERROR( cudaMemcpy( c, dev_c, N * sizeof(int), 48 | cudaMemcpyDeviceToHost ) ); 49 | 50 | // display the results 51 | for (int i=0; i 2 | 3 | __global__ void sum(float *a, float *b) 4 | { 5 | int tid = threadIdx.x; 6 | 7 | __shared__ float sData[16]; 8 | sData[tid] = a[tid]; 9 | __syncthreads(); 10 | for(int i = 8; i > 0; i /= 2) 11 | { 12 | if(tid < i) 13 | { 14 | sData[tid] = sData[tid] + sData[tid + i]; 15 | } 16 | __syncthreads(); 17 | } 18 | if(tid == 0) 19 | { 20 | b[0] = sData[0]; 21 | } 22 | } 23 | 24 | int main() 25 | { 26 | float a[16]; 27 | for(int i = 0; i < 16; i++) 28 | { 29 | a[i] = i*(i+1); 30 | } 31 | float *aGpu; 32 | cudaMalloc((void**)&aGpu, 16 * sizeof(float)); 33 | cudaMemcpy(aGpu, a, 16 * sizeof(float), cudaMemcpyHostToDevice); 34 | 35 | float *bGpu; 36 | cudaMalloc((void**)&bGpu, 1 * sizeof(float)); 37 | sum<<<1, 16>> >(aGpu, bGpu); 38 | 39 | float b[1]; 40 | cudaMemcpy(b, bGpu, 1 * sizeof(float), cudaMemcpyDeviceToHost); 41 | printf("b: %f\n",b[0]); 42 | return 0; 43 | } -------------------------------------------------------------------------------- /thrust/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | CMAKE_MINIMUM_REQUIRED(VERSION 2.8) 2 | PROJECT(thrust) 3 | FIND_PACKAGE(CUDA REQUIRED) 4 | CUDA_ADD_EXECUTABLE(thrust main.cu) 5 | TARGET_LINK_LIBRARIES(thrust) -------------------------------------------------------------------------------- /thrust/main.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | struct saxpy_functor 7 | { 8 | const float a; 9 | 10 | saxpy_functor(float _a) : a(_a) {} 11 | 12 | __host__ __device__ 13 | float operator()(const float& x, const float& y) const 14 | { 15 | return y * x + a; 16 | } 17 | }; 18 | 19 | void saxpy_fast(float A, thrust::device_vector& X, thrust::device_vector& Y) 20 | { 21 | // Y <- A * X + Y 22 | thrust::transform(X.begin(), X.end(), Y.begin(), Y.begin(), saxpy_functor(A)); 23 | } 24 | 25 | void saxpy_slow(float A, thrust::device_vector& X, thrust::device_vector& Y) 26 | { 27 | thrust::device_vector temp(X.size()); 28 | 29 | // temp <- A 30 | thrust::fill(temp.begin(), temp.end(), A); 31 | 32 | // temp <- A * X 33 | thrust::transform(X.begin(), X.end(), temp.begin(), temp.begin(), thrust::multiplies()); 34 | 35 | // Y <- A * X + Y 36 | thrust::transform(temp.begin(), temp.end(), Y.begin(), Y.begin(), thrust::plus()); 37 | } 38 | 39 | int main(void) 40 | { 41 | thrust::host_vector H(4); 42 | std::vector I(4); 43 | 44 | // initialize individual elements 45 | H[0] = 14; 46 | H[1] = 20; 47 | H[2] = 38; 48 | H[3] = 46; 49 | // H.size() returns the size of vector H 50 | std::cout << "H has size " << H.size() << std::endl; 51 | 52 | // print contents of H 53 | for(int i = 0; i < H.size(); i++) 54 | { 55 | std::cout << "H[" << i << "] = " << H[i] << std::endl; 56 | } 57 | // Copy host_vector H to device_vector D 58 | thrust::device_vector D = H; 59 | // print contents of D 60 | for(int i = 0; i < D.size(); i++) 61 | { 62 | std::cout << "D[" << i << "] = " << D[i] << std::endl; 63 | } 64 | // elements of D can be modified 65 | D[0] = 99; 66 | D[1] = 88; 67 | std::cout<< "D values are changed!" << std::endl; 68 | for(int i = 0; i < D.size(); i++) 69 | { 70 | std::cout << "D[" << i << "] = " << D[i] << std::endl; 71 | } 72 | // print contents of H 73 | for(int i = 0; i < H.size(); i++) 74 | { 75 | std::cout << "H[" << i << "] = " << H[i] << std::endl; 76 | } 77 | // resize H 78 | H.resize(5); 79 | std::cout << "H now has size " << H.size() << std::endl; 80 | // set the elements of H to 0, 1, 2, 3, ... 81 | thrust::sequence(H.begin(), H.end()); 82 | // print contents of H 83 | for(int i = 0; i < H.size(); i++) 84 | { 85 | std::cout << "H[" << i << "] = " << H[i] << std::endl; 86 | } 87 | thrust::fill(D.begin(), D.begin() + 3, 9); 88 | for(int i = 0; i < D.size(); i++) 89 | { 90 | std::cout << "D[" << i << "] = " << D[i] << std::endl; 91 | } 92 | thrust::device_vector Y(4); 93 | // compute Y = -X 94 | thrust::transform(D.begin(), D.end(), Y.begin(), thrust::negate()); 95 | for(int i = 0; i < Y.size(); i++) 96 | { 97 | std::cout << "Y[" << i << "] = " << Y[i] << std::endl; 98 | } 99 | 100 | saxpy_fast(2.5, D, Y); 101 | for(int i = 0; i < Y.size(); i++) 102 | { 103 | std::cout << "Y[" << i << "] = " << Y[i] << std::endl; 104 | } 105 | return 0; 106 | 107 | } -------------------------------------------------------------------------------- /time/.vscode/launch.json: -------------------------------------------------------------------------------- 1 | { 2 | // Use IntelliSense to learn about possible attributes. 3 | // Hover to view descriptions of existing attributes. 4 | // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387 5 | "version": "0.2.0", 6 | "configurations": [ 7 | { 8 | "name": "(gdb) Launch", 9 | "type": "cppdbg", 10 | "request": "launch", 11 | "program": "enter program name, for example ${workspaceFolder}/a.out", 12 | "args": [], 13 | "stopAtEntry": false, 14 | "cwd": "${workspaceFolder}", 15 | "environment": [], 16 | "externalConsole": false, 17 | "MIMode": "gdb", 18 | "setupCommands": [ 19 | { 20 | "description": "Enable pretty-printing for gdb", 21 | "text": "-enable-pretty-printing", 22 | "ignoreFailures": true 23 | } 24 | ] 25 | } 26 | ] 27 | } -------------------------------------------------------------------------------- /time/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | CMAKE_MINIMUM_REQUIRED(VERSION 2.8) 2 | PROJECT(time) 3 | FIND_PACKAGE(CUDA REQUIRED) 4 | CUDA_ADD_EXECUTABLE(time main.cu) 5 | TARGET_LINK_LIBRARIES(time) -------------------------------------------------------------------------------- /time/main.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | __global__ void sum(float *a, float *b) 6 | { 7 | int tid = threadIdx.x; 8 | int bid = blockIdx.x; 9 | int threadNum = blockDim.x; 10 | 11 | __shared__ float sData[512]; 12 | sData[tid] = a[bid * threadNum + tid]; 13 | __syncthreads(); 14 | for(int i = threadNum / 2; i > 0; i /= 2) 15 | { 16 | if(tid < i) 17 | { 18 | sData[tid] = sData[tid] + sData[tid + i]; 19 | } 20 | __syncthreads(); 21 | } 22 | if(tid == 0) 23 | { 24 | b[bid] = sData[0]; 25 | } 26 | } 27 | 28 | void cpuSum(float *a, float *b, int sumNum) 29 | { 30 | for(int j = 0; j < sumNum; j++) 31 | { 32 | b[j] = 0; 33 | for(int i = 0; i < sumNum; i++) 34 | { 35 | b[j] += a[i]; 36 | } 37 | } 38 | } 39 | 40 | __global__ void add(int* a, int* b, int* c, int num) 41 | { 42 | int i; 43 | int tid = threadIdx.x; 44 | int bid = blockIdx.x; 45 | int threadNum = blockDim.x; 46 | i = bid * threadNum + tid; 47 | if(i < num) 48 | { 49 | c[i] = a[i] + b[i]; 50 | } 51 | } 52 | 53 | int testSum() 54 | { 55 | int sumNum = 512; 56 | int threadNum = 1; 57 | int blockNum = 512; 58 | float a[sumNum]; 59 | for(int i = 0; i < sumNum; i++) 60 | { 61 | a[i] = i*(i+1); 62 | } 63 | float *aGpu; 64 | cudaMalloc((void**)&aGpu, sumNum * sizeof(float)); 65 | cudaMemcpy(aGpu, a, sumNum * sizeof(float), cudaMemcpyHostToDevice); 66 | 67 | float *bGpu; 68 | cudaMalloc((void**)&bGpu, sumNum * sizeof(float)); 69 | struct timeval startTime, endTime; 70 | gettimeofday(&startTime, NULL); 71 | int loopNum = 1000; 72 | for(int i = 0; i < loopNum; i++) 73 | { 74 | sum<<> >(aGpu, bGpu); 75 | } 76 | // sum<<> >(aGpu, bGpu); 77 | gettimeofday(&endTime, NULL); 78 | printf("cuda use time: %d\n", 79 | (endTime.tv_sec - startTime.tv_sec)*1000000 + (endTime.tv_usec - startTime.tv_usec)); 80 | 81 | float b[sumNum]; 82 | cudaMemcpy(b, bGpu, sumNum * sizeof(float), cudaMemcpyDeviceToHost); 83 | // printf("b: %f\n",b[0]); 84 | 85 | gettimeofday(&startTime, NULL); 86 | for(int i = 0; i < loopNum; i++) 87 | { 88 | cpuSum(a, b, sumNum); 89 | } 90 | gettimeofday(&endTime, NULL); 91 | printf("cpu use time: %d\n", 92 | (endTime.tv_sec - startTime.tv_sec)*1000000 + (endTime.tv_usec - startTime.tv_usec)); 93 | 94 | 95 | return 0; 96 | } 97 | 98 | int testAdd(void) 99 | { 100 | // init data 101 | int num = 5120; 102 | int threadNum = 128; 103 | int blockNum = 40; 104 | int a[num], b[num], c[num]; 105 | int *a_gpu, *b_gpu, *c_gpu; 106 | 107 | for(int i = 0; i < num; i++) 108 | { 109 | a[i] = i; 110 | b[i] = i * i; 111 | } 112 | 113 | cudaMalloc((void **)&a_gpu, num * sizeof(int)); 114 | cudaMalloc((void **)&b_gpu, num * sizeof(int)); 115 | cudaMalloc((void **)&c_gpu, num * sizeof(int)); 116 | 117 | // copy data 118 | cudaMemcpy(a_gpu, a, num * sizeof(int), cudaMemcpyHostToDevice); 119 | cudaMemcpy(b_gpu, b, num * sizeof(int), cudaMemcpyHostToDevice); 120 | 121 | struct timeval startTime, endTime; 122 | gettimeofday(&startTime, NULL); 123 | int loopNum = 10000; 124 | for(int i = 0; i < loopNum; i++) 125 | { 126 | add<<> >(a_gpu, b_gpu, c_gpu, num); 127 | } 128 | gettimeofday(&endTime, NULL); 129 | printf("cuda use time: %d\n", 130 | (endTime.tv_sec - startTime.tv_sec)*1000000 + (endTime.tv_usec - startTime.tv_usec)); 131 | 132 | 133 | // get data 134 | cudaMemcpy(c, c_gpu, num * sizeof(int), cudaMemcpyDeviceToHost); 135 | 136 | // // visualization 137 | // for(int i = 0; i < num; i++) 138 | // { 139 | // printf("%d + %d = %d\n", a[i], b[i], c[i]); 140 | // } 141 | 142 | return 0; 143 | } 144 | 145 | int main() 146 | { 147 | testAdd(); 148 | return 0; 149 | } --------------------------------------------------------------------------------