.
├── Makefile
├── README
├── prof.h
├── sgemm
│   ├── Makefile
│   ├── interfaces.cu
│   ├── interfaces.h
│   ├── kernels.cuh
│   ├── kernels.h
│   └── sample.cpp
└── vadds
    ├── Makefile
    ├── interfaces.cu
    ├── interfaces.h
    ├── kernels.cuh
    └── sample.cpp

--------------------------------------------------------------------------------
/Makefile
--------------------------------------------------------------------------------
SAMPLES=vadds sgemm

all:$(SAMPLES)

.PHONY: vadds
vadds:
	@(cd ./vadds && make && cd .. && mv ./vadds/sample ./sample_vadds) || exit 1

.PHONY: sgemm
sgemm:
	@(cd ./sgemm && make && cd .. && mv ./sgemm/sample ./sample_sgemm) || exit 1

--------------------------------------------------------------------------------
/README
--------------------------------------------------------------------------------
ASC内培示例代码
for K80 平台
主要是让新人熟悉一下CUDA的写法
(ASC internal-training sample code for the K80 platform; intended to
familiarize newcomers with how CUDA is written.)

--------------------------------------------------------------------------------
/prof.h
--------------------------------------------------------------------------------
#ifndef __4ciu7ERJN3R8n398__
#define __4ciu7ERJN3R8n398__

// NOTE(review): the original include targets were stripped by the extraction;
// restored from the std::chrono / std::cout usage below — confirm against the
// original source.
#include <chrono>
#include <iostream>

// Simple host-side stopwatch: tic() marks the start, toc() prints the elapsed
// wall-clock time in milliseconds, labelled with the given name.
struct hs_timer
{
    std::chrono::time_point<std::chrono::high_resolution_clock> start;
    std::chrono::time_point<std::chrono::high_resolution_clock> end;

    void tic(const char *name)
    {
        (void)name; // label is only printed by toc()
        start = std::chrono::high_resolution_clock::now();
    }

    void toc(const char *name)
    {
        end = std::chrono::high_resolution_clock::now();
        std::cout << "[" << name << " time]: "
                  << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count()
                  << " ms\n";
    }
};

#endif

--------------------------------------------------------------------------------
/sgemm/Makefile
--------------------------------------------------------------------------------
CC=icpc
NVCC=nvcc
LD=nvcc
TARGET=sample

CFLAGS= -O3 -qopenmp -std=c++11
CUFLAGS= -O3 -arch=sm_37 -ccbin=$(CC) -std=c++11 -Xcompiler -qopenmp
LDFLAGS= -arch=sm_37 -ccbin=$(CC) -std=c++11 -lcublas -Xcompiler -qopenmp,-mkl

CPOBJS= sample.o
CUOBJS= interfaces.o

all:build

build:$(TARGET)

$(TARGET): $(CPOBJS) $(CUOBJS)
	$(LD) $(LDFLAGS) $(CPOBJS) $(CUOBJS) -o $(TARGET)
	rm -rf $(CPOBJS) $(CUOBJS)

$(CPOBJS): %.o: %.cpp
	$(CC) $(CFLAGS) -c $<

$(CUOBJS): %.o: %.cu
	$(NVCC) $(CUFLAGS) -c $<

clean:
	rm -rf $(CPOBJS) $(CUOBJS)

--------------------------------------------------------------------------------
/sgemm/interfaces.cu
--------------------------------------------------------------------------------
// NOTE(review): the six original #include targets were stripped by the
// extraction; the list below is reconstructed from the symbols used in this
// file (printf, getenv, cudaMalloc, cublasSgemm, cblas_sgemm, OpenMP) —
// confirm against the original source.
#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <mkl.h>
#include <omp.h>

#include "interfaces.h"
#include "kernels.cuh"
#include "kernels.h"
#include "../prof.h"

// Dispatch a single-precision GEMM (C = alpha*A*B + beta*C, row-major,
// A: MxK, B: KxN, C: MxN) to one of the custom kernels (kernel_type 0/1/2)
// or to cuBLAS (kernel_type 'b'). Timing includes allocation and transfers.
void gpu_sgemm(
    float *a, float *b, float *c,
    size_t N, size_t M, size_t K,
    float alpha, float beta, int kernel_type)
{
    float *dev_a = 0;
    float *dev_b = 0;
    float *dev_c = 0;

    cublasHandle_t handle;

    hs_timer timer;
    timer.tic("gpu sgemm");

    if (kernel_type == 'b')
        cublasCreate(&handle);

    cudaMalloc((void **)&dev_a, M * K * sizeof(float));
    cudaMalloc((void **)&dev_b, K * N * sizeof(float));
    cudaMalloc((void **)&dev_c, M * N * sizeof(float));

    cudaMemcpy(dev_a, a, M * K * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(dev_b, b, K * N * sizeof(float), cudaMemcpyHostToDevice);
    // C must be uploaded too because beta scales the existing C contents.
    cudaMemcpy(dev_c, c, M * N * sizeof(float), cudaMemcpyHostToDevice);

    switch (kernel_type)
    {
    case 0:
    {
        // Ceil-divide M and N by the 32x32 tile so partial tiles are covered;
        // the kernels bounds-check against M/N.
        int grid_r = M / 32;
        int grid_c = N / 32;
        if (M % 32 != 0)
            grid_r += 1;
        if (N % 32 != 0)
            grid_c += 1;
        dim3 grid_d(grid_r, grid_c, 1);
        dim3 block_d(32, 32, 1);
        cuda_kernel_sgemm_0<<<grid_d, block_d>>>(dev_a, dev_b, dev_c, N, M, K, alpha, beta);
        break;
    }
    case 1:
    {
        int grid_r = M / 32;
        int grid_c = N / 32;
        if (M % 32 != 0)
            grid_r += 1;
        if (N % 32 != 0)
            grid_c += 1;
        dim3 grid_d(grid_r, grid_c, 1);
        dim3 block_d(32, 32, 1);
        cuda_kernel_sgemm_1<<<grid_d, block_d>>>(dev_a, dev_b, dev_c, N, M, K, alpha, beta);
        break;
    }
    case 2:
    {
        int grid_r = M / 32;
        int grid_c = N / 32;
        if (M % 32 != 0)
            grid_r += 1;
        if (N % 32 != 0)
            grid_c += 1;
        dim3 grid_d(grid_r, grid_c, 1);
        dim3 block_d(32, 32, 1);
        cuda_kernel_sgemm_2<<<grid_d, block_d>>>(dev_a, dev_b, dev_c, N, M, K, alpha, beta);
        break;
    }
    case 'b':
    {
        // cuBLAS is column-major; computing B^T * A^T via swapped operands
        // yields row-major C = A * B without explicit transposes.
        cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, N, M, K, &alpha, dev_b, N, dev_a, K, &beta, dev_c, N);
        break;
    }
    }

    cudaDeviceSynchronize();

    cudaMemcpy(c, dev_c, M * N * sizeof(float), cudaMemcpyDeviceToHost);

    if (kernel_type == 'b')
        cublasDestroy(handle);

    cudaFree(dev_a);
    cudaFree(dev_b);
    cudaFree(dev_c);

    timer.toc("gpu sgemm");
}

// Launch a tiny throwaway kernel so later timings do not include CUDA
// context-creation / JIT overhead.
void gpu_warmup()
{
    float *dev_p = 0;

    hs_timer timer;
    timer.tic("gpu warmup");

    cudaMalloc((void **)&dev_p, 16 * 32 * sizeof(float));

    cuda_kernel_warmup<<<16, 32>>>(dev_p);

    cudaDeviceSynchronize();

    cudaFree(dev_p);

    timer.toc("gpu warmup");
}

// CPU-side SGEMM dispatch: kernel_type 0 = hand-written reference,
// 'm' = MKL cblas_sgemm (same column-major swap trick as the cuBLAS path).
void cpu_sgemm(
    float *a, float *b, float *c,
    size_t N, size_t M, size_t K,
    float alpha, float beta, int kernel_type)
{
    hs_timer timer;
    timer.tic("cpu sgemm");

    switch (kernel_type)
    {
    case 0:
    {
        cpu_kernel_sgemm_0(a, b, c, N, M, K, alpha, beta);
        break;
    }
    case 'm':
    {
        cblas_sgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, N, M, K, alpha, b, N, a, K, beta, c, N);
        break;
    }
    }
    timer.toc("cpu sgemm");
}

// Spin up the OpenMP thread pool so the first timed CPU loop is not charged
// for thread creation.
void cpu_warmup()
{
    hs_timer timer;
    timer.tic("cpu warmup");

    const size_t arr_size = 1024;
    float *p = new float[arr_size];

#pragma omp parallel for simd
    for (size_t i = 0; i < arr_size; i++)
    {
        float f = (float)i;
        p[i] = f * f * f;
    }

    delete[] p; // fixed: was `delete p` on a new[] array (undefined behavior)

    timer.toc("cpu warmup");
}

--------------------------------------------------------------------------------
/sgemm/interfaces.h
--------------------------------------------------------------------------------
#ifndef __ad93IFM09mf__
#define __ad93IFM09mf__

#include <cstddef> // NOTE(review): stripped include restored for size_t — confirm

void gpu_sgemm(
    float *a, float *b, float *c,
    size_t N, size_t M, size_t K,
    float alpha, float beta, int kernel_type);
void gpu_warmup();
void cpu_sgemm(
    float *a, float *b, float *c,
    size_t N, size_t M, size_t K,
    float alpha, float beta, int kernel_type);
void cpu_warmup();
#endif

--------------------------------------------------------------------------------
/sgemm/kernels.cuh
--------------------------------------------------------------------------------
#ifndef __GG43j7hdVFHUret__
#define __GG43j7hdVFHUret__

// NOTE(review): the two original include targets were stripped by the
// extraction; these are the usual suspects for a kernel header — confirm.
#include <cuda_runtime.h>
#include <device_launch_parameters.h>

__global__ void cuda_kernel_warmup(float *p)
{
    size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    float f = (float)idx;
    p[idx] = f * f * f;
}

// naive!!
// One thread per C element; expects a 32x32 block with rows on x, cols on y.
__global__ void cuda_kernel_sgemm_0(
    float *a, float *b, float *c,
    size_t N, size_t M, size_t K,
    float alpha, float beta)
{
    int ir = blockIdx.x * 32 + threadIdx.x; // row idx in global
    int ic = blockIdx.y * 32 + threadIdx.y; // col idx in global

    if (ir < M && ic < N)
    {
#define idx(ri, ci, nc) ((ri) * (nc) + (ci))
        float acc = 0.0f;
        for (int k = 0; k < K; ++k)
        {
            acc += a[idx(ir, k, K)] * b[idx(k, ic, N)];
        }
        c[idx(ir, ic, N)] = alpha * acc + beta * c[idx(ir, ic, N)];
#undef idx
    }
}

// use shared memory & tile
// Classic 32x32 shared-memory tiling; out-of-range tile entries are zero-filled
// so the inner product over the full 32-wide tile stays correct at the edges.
__global__ void cuda_kernel_sgemm_1(
    float *a, float *b, float *c,
    size_t N, size_t M, size_t K,
    float alpha, float beta)
{
    int tr = threadIdx.x;                   // row idx in block
    int tc = threadIdx.y;                   // col idx in block
    int ir = blockIdx.x * 32 + threadIdx.x; // row idx in global
    int ic = blockIdx.y * 32 + threadIdx.y; // col idx in global

    __shared__ float a_sub[32][32];
    __shared__ float b_sub[32][32];

    int load_size = K / 32; // number of K-tiles, rounded up below
    if (K % 32 != 0)
    {
        load_size += 1;
    }
    float acc = 0.0f;
    int a_ir = ir;
    int b_ic = ic;
#define idx(ri, ci, nc) ((ri) * (nc) + (ci))
    for (int l = 0; l < load_size; ++l)
    {
        int a_ic = l * 32 + tc;
        int b_ir = l * 32 + tr;
        a_sub[tr][tc] = 0.0f;
        b_sub[tr][tc] = 0.0f;
        if (a_ir < M && a_ic < K)
            a_sub[tr][tc] = a[idx(a_ir, a_ic, K)];
        if (b_ir < K && b_ic < N)
            b_sub[tr][tc] = b[idx(b_ir, b_ic, N)];

        __syncthreads(); // tile fully loaded before anyone reads it

#pragma unroll
        for (int k = 0; k < 32; ++k)
        {
            acc += a_sub[tr][k] * b_sub[k][tc];
        }

        __syncthreads(); // all reads done before the next tile overwrites
    }

    if (ir < M && ic < N)
        c[idx(ir, ic, N)] = alpha * acc + beta * c[idx(ir, ic, N)];
#undef idx
}

// use __ldg & avoid bank conflict
// Same tiling as kernel 1, plus read-only-cache loads and +1 column padding
// on the shared tiles so column accesses do not hit the same bank.
__global__ void cuda_kernel_sgemm_2(
    float *a, float *b, float *c,
    size_t N, size_t M, size_t K,
    float alpha, float beta)
{
    int tr = threadIdx.x;                   // row idx in block
    int tc = threadIdx.y;                   // col idx in block
    int ir = blockIdx.x * 32 + threadIdx.x; // row idx in global
    int ic = blockIdx.y * 32 + threadIdx.y; // col idx in global

    __shared__ float a_sub[32][32 + 1]; // avoid bank conflict
    __shared__ float b_sub[32][32 + 1];

    int load_size = K / 32;
    if (K % 32 != 0)
    {
        load_size += 1;
    }
    float acc = 0.0f;
    int a_ir = ir;
    int b_ic = ic;
#define idx(ri, ci, nc) ((ri) * (nc) + (ci))
    for (int l = 0; l < load_size; ++l)
    {
        int a_ic = l * 32 + tc;
        int b_ir = l * 32 + tr;
        a_sub[tr][tc] = 0.0f;
        b_sub[tr][tc] = 0.0f;
        if (a_ir < M && a_ic < K)
            a_sub[tr][tc] = __ldg(&a[idx(a_ir, a_ic, K)]); // cache
        if (b_ir < K && b_ic < N)
            b_sub[tr][tc] = __ldg(&b[idx(b_ir, b_ic, N)]);

        __syncthreads();

#pragma unroll
        for (int k = 0; k < 32; ++k)
        {
            acc += a_sub[tr][k] * b_sub[k][tc];
        }

        __syncthreads();
    }

    if (ir < M && ic < N)
        c[idx(ir, ic, N)] = alpha * acc + beta * c[idx(ir, ic, N)];
#undef idx
}

#endif

--------------------------------------------------------------------------------
/sgemm/kernels.h
--------------------------------------------------------------------------------
#ifndef __fHDId9q2KID9__
#define __fHDId9q2KID9__

#include <cstddef> // NOTE(review): stripped include restored for size_t — confirm

// CPU reference SGEMM (row-major, C = alpha*A*B + beta*C). B is transposed
// into a scratch buffer first so the innermost loop reads both operands
// contiguously.
void cpu_kernel_sgemm_0(
    float *a, float *b, float *c,
    size_t N, size_t M, size_t K,
    float alpha, float beta)
{
#define idx(ri, ci, nc) ((ri) * (nc) + (ci))
    float *bt = new float[K * N];
#pragma omp parallel for simd
    for (int n = 0; n < N; ++n)
    {
        for (int k = 0; k < K; ++k)
        {
            bt[idx(n, k, K)] = b[idx(k, n, N)];
        }
    }
#pragma omp parallel for simd
    for (int m = 0; m < M; ++m)
    {
        for (int n = 0; n < N; ++n)
        {
            float acc = 0.0f;
            for (int k = 0; k < K; ++k)
            {
                acc += a[idx(m, k, K)] * bt[idx(n, k, K)];
            }
            c[idx(m, n, N)] = alpha * acc + beta * c[idx(m, n, N)];
        }
    }
    delete[] bt; // fixed: was `delete bt` on a new[] array (undefined behavior)
#undef idx
}

#endif

--------------------------------------------------------------------------------
/sgemm/sample.cpp
--------------------------------------------------------------------------------
// NOTE(review): the four original #include targets were stripped by the
// extraction; reconstructed from printf/getenv/fabs/std::max usage — confirm.
#include <cstdio>
#include <cstdlib>
#include <cmath>
#include <algorithm>

#include "interfaces.h"

// Element-wise comparison of two arrays with a mixed relative/absolute
// tolerance (absolute for |a| <= 1, relative otherwise); reports the first
// mismatch, or success.
void verify(float *a, float *b, size_t arr_size, float eps)
{
    int pass = 1;
    for (size_t i = 0; i < arr_size; ++i)
    {
        // fixed: the original pure-relative check divided by a[i], which
        // blows up for near-zero reference values
        if (fabs(a[i] - b[i]) > eps * fmaxf(1.0f, fabs(a[i])))
        {
            pass = 0;
            printf("[wrong answer]: at= %zu, a= %f\t, b= %f\t\n", i, a[i], b[i]);
            break;
        }
    }
    if (pass)
        printf("[verify]: pass\n"); // fixed: pass flag was computed but never reported
}

int main()
{
    // dont use 2^n, it will cause cache crash on CPU
    // fixed: N/M/K were uninitialized when the env vars were absent
    size_t N = 320, M = 320, K = 320;
    char *N_s = getenv("N");
    char *M_s = getenv("M");
    char *K_s = getenv("K");
    if (N_s != NULL)
        N = std::max(atoi(N_s), 320);
    if (M_s != NULL)
        M = std::max(atoi(M_s), 320);
    if (K_s != NULL)
        K = std::max(atoi(K_s), 320);
    const float alpha = M_PI, beta = M_E;
    float *a = new float[M * K];
    float *b = new float[K * N];
    float *c1 = new float[M * N];
    float *c2 = new float[M * N];
    float *c3 = new float[M * N];
    float *c4 = new float[M * N];
    float *cb = new float[M * N];
    float *cm = new float[M * N];

    printf("[data size]: A(%zux%zu), B(%zux%zu)\n", M, K, K, N);

#pragma omp parallel
    {
#pragma omp for
        for (size_t i = 0; i < M * K; ++i)
        {
            float f = (float)i;
            a[i] = cosf(f) * cosf(f);
        }

#pragma omp for
        for (size_t i = 0; i < K * N; ++i)
        {
            float f = (float)i;
            b[i] = sinf(f) * sinf(f);
        }

#pragma omp for
        for (size_t i = 0; i < M * N; ++i)
        {
            float f = (float)i;
            c1[i] = cosf(f) * sinf(f);
            c2[i] = cosf(f) * sinf(f);
            c3[i] = cosf(f) * sinf(f);
            c4[i] = cosf(f) * sinf(f);
            cb[i] = cosf(f) * sinf(f);
            cm[i] = cosf(f) * sinf(f);
        }
    }

    gpu_warmup();
    cpu_warmup();
    printf("[cpu sgemm kernel 0]\n");
    cpu_sgemm(a, b, c1, N, M, K, alpha, beta, 0);

    float eps = 1e-5; // mkl's error is larger, why?
    printf("[cpu sgemm kernel mkl]\n");
    cpu_sgemm(a, b, cm, N, M, K, alpha, beta, 'm');
    verify(c1, cm, M * N, eps);

    eps = 1e-6;
    printf("[gpu sgemm kernel 0]\n");
    gpu_sgemm(a, b, c2, N, M, K, alpha, beta, 0);
    verify(c1, c2, M * N, eps);
    printf("[gpu sgemm kernel 1]\n");
    gpu_sgemm(a, b, c3, N, M, K, alpha, beta, 1);
    verify(c1, c3, M * N, eps);
    printf("[gpu sgemm kernel 2]\n");
    gpu_sgemm(a, b, c4, N, M, K, alpha, beta, 2);
    verify(c1, c4, M * N, eps);
    printf("[gpu sgemm kernel cublas]\n");
    gpu_sgemm(a, b, cb, N, M, K, alpha, beta, 'b');
    verify(c1, cb, M * N, eps);

    // fixed: all of these were `delete` on new[] arrays (undefined behavior)
    delete[] a;
    delete[] b;
    delete[] c1;
    delete[] c2;
    delete[] c3;
    delete[] c4;
    delete[] cb;
    delete[] cm;
    return 0;
}

--------------------------------------------------------------------------------
/vadds/Makefile
--------------------------------------------------------------------------------
CC=icpc
NVCC=nvcc
LD=nvcc
TARGET=sample

CFLAGS= -O3 -qopenmp -std=c++11
CUFLAGS= -O3 -arch=sm_37 -ccbin=$(CC) -std=c++11 -Xcompiler -qopenmp
LDFLAGS= -arch=sm_37 -ccbin=$(CC) -std=c++11 -Xcompiler -qopenmp

CPOBJS= sample.o
CUOBJS= interfaces.o

all:build

build:$(TARGET)

$(TARGET): $(CPOBJS) $(CUOBJS)
	$(LD) $(LDFLAGS) $(CPOBJS) $(CUOBJS) -o $(TARGET)
	rm -rf $(CPOBJS) $(CUOBJS)

$(CPOBJS): %.o: %.cpp
	$(CC) $(CFLAGS) -c $<

$(CUOBJS): %.o: %.cu
	$(NVCC) $(CUFLAGS) -c $<

clean:
	rm -rf $(CPOBJS) $(CUOBJS)

--------------------------------------------------------------------------------
/vadds/interfaces.cu
--------------------------------------------------------------------------------
// NOTE(review): the three original #include targets were stripped by the
// extraction; reconstructed from usage — confirm against the original source.
#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

#include "interfaces.h"
#include "kernels.cuh"
#include "../prof.h"

// Element-wise float vector add on the GPU. Device buffers are rounded up to
// a whole number of per-thread chunks (load_size elements each) so the kernel
// needs no bounds check; only arr_size elements are copied back.
void gpu_vadds(
    float *a, float *b, float *c, size_t arr_size,
    size_t grid_x, size_t block_x)
{
    float *dev_a = 0;
    float *dev_b = 0;
    float *dev_c = 0;

    size_t load_size = arr_size / (grid_x * block_x);
    if (load_size * grid_x * block_x != arr_size)
        load_size += 1; // ceil so every element is covered
    size_t tot_size = load_size * grid_x * block_x;

    hs_timer timer;
    timer.tic("gpu vadds");

    cudaMalloc((void **)&dev_a, tot_size * sizeof(float));
    cudaMalloc((void **)&dev_b, tot_size * sizeof(float));
    cudaMalloc((void **)&dev_c, tot_size * sizeof(float));

    cudaMemcpy(dev_a, a, arr_size * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(dev_b, b, arr_size * sizeof(float), cudaMemcpyHostToDevice);

    cuda_kernel_vadds<<<grid_x, block_x>>>(dev_a, dev_b, dev_c, load_size);

    cudaDeviceSynchronize();

    cudaMemcpy(c, dev_c, arr_size * sizeof(float), cudaMemcpyDeviceToHost);

    cudaFree(dev_a);
    cudaFree(dev_b);
    cudaFree(dev_c);

    timer.toc("gpu vadds");
}

// Launch a tiny throwaway kernel so later timings do not include CUDA
// context-creation / JIT overhead.
void gpu_warmup()
{
    float *dev_p = 0;

    hs_timer timer;
    timer.tic("gpu warmup");

    cudaMalloc((void **)&dev_p, 16 * 32 * sizeof(float));

    cuda_kernel_warmup<<<16, 32>>>(dev_p);

    cudaDeviceSynchronize();

    cudaFree(dev_p);

    timer.toc("gpu warmup");
}

// OpenMP+SIMD reference vector add on the CPU.
void cpu_vadds(float *a, float *b, float *c, size_t arr_size)
{
    hs_timer timer;
    timer.tic("cpu vadds");

#pragma omp parallel for simd
    for (size_t i = 0; i < arr_size; i++)
    {
        c[i] = a[i] + b[i];
    }

    timer.toc("cpu vadds");
}

// Spin up the OpenMP thread pool so the first timed CPU loop is not charged
// for thread creation.
void cpu_warmup()
{
    hs_timer timer;
    timer.tic("cpu warmup");

    const size_t arr_size = 1024;
    float *p = new float[arr_size];

#pragma omp parallel for simd
    for (size_t i = 0; i < arr_size; i++)
    {
        float f = (float)i;
        p[i] = f * f * f;
    }

    delete[] p; // fixed: was `delete p` on a new[] array (undefined behavior)

    timer.toc("cpu warmup");
}

--------------------------------------------------------------------------------
/vadds/interfaces.h
--------------------------------------------------------------------------------
#ifndef __ad93IFM09mf__
#define __ad93IFM09mf__

#include <cstddef> // NOTE(review): stripped include restored for size_t — confirm

void gpu_vadds(
    float *a, float *b, float *c, size_t arr_size,
    size_t grid_x, size_t block_x);
void gpu_warmup();
void cpu_vadds(float *a, float *b, float *c, size_t arr_size);
void cpu_warmup();
#endif

--------------------------------------------------------------------------------
/vadds/kernels.cuh
--------------------------------------------------------------------------------
#ifndef __GG43j7hdVFHUret__
#define __GG43j7hdVFHUret__

// NOTE(review): the two original include targets were stripped by the
// extraction; these are the usual suspects for a kernel header — confirm.
#include <cuda_runtime.h>
#include <device_launch_parameters.h>

__global__ void cuda_kernel_warmup(float *p)
{
    size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    float f = (float)idx;
    p[idx] = f * f * f;
}

// Each thread adds its own contiguous chunk of load_size elements.
// NOTE(review): this per-thread-chunk layout is uncoalesced — adjacent lanes
// touch addresses load_size floats apart; an interleaved/grid-stride layout
// would coalesce, but is left unchanged here to preserve the teaching example.
__global__ void cuda_kernel_vadds(float *a, float *b, float *c, size_t load_size)
{
    size_t load_idx = blockIdx.x * blockDim.x + threadIdx.x;
    float *_a = a + load_size * load_idx;
    float *_b = b + load_size * load_idx;
    float *_c = c + load_size * load_idx;

    for (size_t i = 0; i < load_size; ++i)
    {
        _c[i] = _a[i] + _b[i];
    }
}

#endif

--------------------------------------------------------------------------------
/vadds/sample.cpp
--------------------------------------------------------------------------------
// NOTE(review): the three original #include targets were stripped by the
// extraction; reconstructed from printf/fabs usage — confirm.
#include <cstdio>
#include <cstdlib>
#include <cmath>

#include "interfaces.h"

// Element-wise comparison with a fixed absolute tolerance; reports the first
// mismatch, or success.
void verify(float *a, float *b, size_t arr_size)
{
    int pass = 1;
    for (size_t i = 0; i < arr_size; ++i)
    {
        if (fabs(a[i] - b[i]) > 1e-7)
        {
            pass = 0;
            printf("[wrong answer]: at= %zu, a= %f\t, b= %f\t\n", i, a[i], b[i]);
            break;
        }
    }
    if (pass)
        printf("[verify]: pass\n"); // fixed: pass flag was computed but never reported
}

int main()
{
    const size_t arr_size = 500 * 1000 * 1000;
    float *a = new float[arr_size];
    float *b = new float[arr_size];
    float *c1 = new float[arr_size];
    float *c2 = new float[arr_size];

#pragma omp parallel for simd
    for (size_t i = 0; i < arr_size; ++i)
    {
        float f = (float)i;
        a[i] = sinf(f) * sinf(f);
        b[i] = cosf(f) * cosf(f);
    }
    gpu_warmup();
    cpu_warmup();
    for (int i = 1; i <= 10; ++i)
    {
        const size_t used_size = 50 * (size_t)i * 1000 * 1000;
        // fixed: %d overflowed for byte counts > 2 GiB; %zu matches size_t
        printf("[test case %d]: data_size= %zu B\n", i, used_size * sizeof(float));
        gpu_vadds(a, b, c1, used_size, 128, 1024);
        cpu_vadds(a, b, c2, used_size);
        verify(c1, c2, used_size);
    }

    // fixed: all of these were `delete` on new[] arrays (undefined behavior)
    delete[] a;
    delete[] b;
    delete[] c1;
    delete[] c2;
    return 0;
}