├── README
├── mmul_2.cu
└── mmul_1.cu

/README:
--------------------------------------------------------------------------------
Here you can find the code for "Matrix multiplication on GPU using CUDA with CUBLAS, CURAND and Thrust"; for more information visit the project webpage:

http://solarianprogrammer.com/2012/05/31/matrix-multiplication-cuda-cublas-curand-thrust/

You may use this program under the terms of GPL v3; for more details see:

http://www.gnu.org/copyleft/gpl.html

Copyright 2012 Sol from www.solarianprogrammer.com
--------------------------------------------------------------------------------

/mmul_2.cu:
--------------------------------------------------------------------------------
// High level matrix multiplication on GPU using CUDA with Thrust, CURAND and CUBLAS
// C(m,n) = A(m,k) * B(k,n)
//
// NOTE(review): the original dump had every #include target stripped (likely by
// HTML extraction); the headers below are reconstructed from the APIs used.
#include <iostream>
#include <cstdlib>
#include <ctime>
#include <cublas_v2.h>
#include <curand.h>

#include <thrust/device_vector.h>
#include <thrust/host_vector.h>


// Fill the device array A(nr_rows_A, nr_cols_A) with uniformly distributed
// random floats using CURAND. A must point to device memory with room for
// nr_rows_A * nr_cols_A floats.
void GPU_fill_rand(float *A, int nr_rows_A, int nr_cols_A) {
	// Create a pseudo-random number generator
	curandGenerator_t prng;
	curandCreateGenerator(&prng, CURAND_RNG_PSEUDO_DEFAULT);

	// Seed from the system clock. NOTE(review): two calls within the same
	// clock tick get identical seeds — use an explicit seed if distinct
	// sequences are required.
	curandSetPseudoRandomGeneratorSeed(prng, (unsigned long long) clock());

	// Fill the array with random numbers on the device
	curandGenerateUniform(prng, A, (size_t) nr_rows_A * nr_cols_A);

	// Fix: the original leaked the generator on every call
	curandDestroyGenerator(prng);
}

// Multiply the device arrays A and B on GPU and save the result in C.
// C(m,n) = A(m,k) * B(k,n); all matrices are column-major (cuBLAS convention).
void gpu_blas_mmul(const float *A, const float *B, float *C, const int m, const int k, const int n) {
	int lda = m, ldb = k, ldc = m;
	const float alf = 1.0f;
	const float bet = 0.0f;
	const float *alpha = &alf;
	const float *beta = &bet;

	// Create a handle for CUBLAS. NOTE(review): handle creation is expensive;
	// for repeated multiplies create it once in the caller and reuse it.
	cublasHandle_t handle;
	cublasCreate(&handle);

	// C = alpha * A * B + beta * C  (no transposes)
	cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc);

	// Destroy the handle
	cublasDestroy(handle);
}

// Print matrix A(nr_rows_A, nr_cols_A) stored in column-major format.
// Fix: the dump had lost the <float> template argument on device_vector.
void print_matrix(const thrust::device_vector<float> &A, int nr_rows_A, int nr_cols_A) {
	for (int i = 0; i < nr_rows_A; ++i) {
		for (int j = 0; j < nr_cols_A; ++j) {
			// Element access on a device_vector triggers a device->host
			// copy per element; acceptable for a 3x3 demo only.
			std::cout << A[j * nr_rows_A + i] << " ";
		}
		std::cout << std::endl;
	}
	std::cout << std::endl;
}

int main() {
	// Matrix dimensions; for simplicity we use square 3x3 arrays
	int nr_rows_A, nr_cols_A, nr_rows_B, nr_cols_B, nr_rows_C, nr_cols_C;
	nr_rows_A = nr_cols_A = nr_rows_B = nr_cols_B = nr_rows_C = nr_cols_C = 3;

	// Allocate 3 arrays on the GPU; device_vector frees them automatically
	// (fix: the dump had lost the <float> template arguments here too)
	thrust::device_vector<float> d_A(nr_rows_A * nr_cols_A), d_B(nr_rows_B * nr_cols_B), d_C(nr_rows_C * nr_cols_C);

	// Fill the arrays A and B on GPU with random numbers
	GPU_fill_rand(thrust::raw_pointer_cast(&d_A[0]), nr_rows_A, nr_cols_A);
	GPU_fill_rand(thrust::raw_pointer_cast(&d_B[0]), nr_rows_B, nr_cols_B);

	// Optionally we can print the data
	std::cout << "A =" << std::endl;
	print_matrix(d_A, nr_rows_A, nr_cols_A);
	std::cout << "B =" << std::endl;
	print_matrix(d_B, nr_rows_B, nr_cols_B);

	// Multiply A and B on GPU
	gpu_blas_mmul(thrust::raw_pointer_cast(&d_A[0]), thrust::raw_pointer_cast(&d_B[0]), thrust::raw_pointer_cast(&d_C[0]), nr_rows_A, nr_cols_A, nr_cols_B);

	// Print the result
	std::cout << "C =" << std::endl;
	print_matrix(d_C, nr_rows_C, nr_cols_C);

	return 0;
}
--------------------------------------------------------------------------------

/mmul_1.cu:
--------------------------------------------------------------------------------
// Low level matrix multiplication on GPU using CUDA with CURAND and CUBLAS
// C(m,n) = A(m,k) * B(k,n)
//
// NOTE(review): the original dump had every #include target stripped (likely by
// HTML extraction); the headers below are reconstructed from the APIs used.
#include <iostream>
#include <cstdlib>
#include <ctime>
#include <cublas_v2.h>
#include <curand.h>

// Fill the device array A(nr_rows_A, nr_cols_A) with uniformly distributed
// random floats using CURAND. A must point to device memory with room for
// nr_rows_A * nr_cols_A floats.
void GPU_fill_rand(float *A, int nr_rows_A, int nr_cols_A) {
	// Create a pseudo-random number generator
	curandGenerator_t prng;
	curandCreateGenerator(&prng, CURAND_RNG_PSEUDO_DEFAULT);

	// Seed from the system clock. NOTE(review): two calls within the same
	// clock tick get identical seeds — use an explicit seed if distinct
	// sequences are required.
	curandSetPseudoRandomGeneratorSeed(prng, (unsigned long long) clock());

	// Fill the array with random numbers on the device
	curandGenerateUniform(prng, A, (size_t) nr_rows_A * nr_cols_A);

	// Fix: the original leaked the generator on every call
	curandDestroyGenerator(prng);
}

// Multiply the device arrays A and B on GPU and save the result in C.
// C(m,n) = A(m,k) * B(k,n); all matrices are column-major (cuBLAS convention).
void gpu_blas_mmul(const float *A, const float *B, float *C, const int m, const int k, const int n) {
	int lda = m, ldb = k, ldc = m;
	const float alf = 1.0f;
	const float bet = 0.0f;
	const float *alpha = &alf;
	const float *beta = &bet;

	// Create a handle for CUBLAS. NOTE(review): handle creation is expensive;
	// for repeated multiplies create it once in the caller and reuse it.
	cublasHandle_t handle;
	cublasCreate(&handle);

	// C = alpha * A * B + beta * C  (no transposes)
	cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc);

	// Destroy the handle
	cublasDestroy(handle);
}


// Print matrix A(nr_rows_A, nr_cols_A), a host array stored in column-major
// format, to stdout.
void print_matrix(const float *A, int nr_rows_A, int nr_cols_A) {
	for (int i = 0; i < nr_rows_A; ++i) {
		for (int j = 0; j < nr_cols_A; ++j) {
			std::cout << A[j * nr_rows_A + i] << " ";
		}
		std::cout << std::endl;
	}
	std::cout << std::endl;
}

int main() {
	// Matrix dimensions; for simplicity we use square 3x3 arrays
	int nr_rows_A, nr_cols_A, nr_rows_B, nr_cols_B, nr_rows_C, nr_cols_C;
	nr_rows_A = nr_cols_A = nr_rows_B = nr_cols_B = nr_rows_C = nr_cols_C = 3;

	// Allocate 3 arrays on CPU
	float *h_A = (float *)malloc(nr_rows_A * nr_cols_A * sizeof(float));
	float *h_B = (float *)malloc(nr_rows_B * nr_cols_B * sizeof(float));
	float *h_C = (float *)malloc(nr_rows_C * nr_cols_C * sizeof(float));
	// Fix: the original never checked the malloc results
	if (!h_A || !h_B || !h_C) {
		std::cerr << "Host allocation failed" << std::endl;
		return 1;
	}

	// Allocate 3 arrays on GPU
	float *d_A, *d_B, *d_C;
	cudaMalloc(&d_A, nr_rows_A * nr_cols_A * sizeof(float));
	cudaMalloc(&d_B, nr_rows_B * nr_cols_B * sizeof(float));
	cudaMalloc(&d_C, nr_rows_C * nr_cols_C * sizeof(float));

	// If you already have useful values in A and B you can copy them to the GPU:
	// cudaMemcpy(d_A, h_A, nr_rows_A * nr_cols_A * sizeof(float), cudaMemcpyHostToDevice);
	// cudaMemcpy(d_B, h_B, nr_rows_B * nr_cols_B * sizeof(float), cudaMemcpyHostToDevice);

	// Fill the arrays A and B on GPU with random numbers
	GPU_fill_rand(d_A, nr_rows_A, nr_cols_A);
	GPU_fill_rand(d_B, nr_rows_B, nr_cols_B);

	// Optionally we can copy the data back on CPU and print the arrays
	// (cudaMemcpy is blocking, so the values are ready when it returns)
	cudaMemcpy(h_A, d_A, nr_rows_A * nr_cols_A * sizeof(float), cudaMemcpyDeviceToHost);
	cudaMemcpy(h_B, d_B, nr_rows_B * nr_cols_B * sizeof(float), cudaMemcpyDeviceToHost);
	std::cout << "A =" << std::endl;
	print_matrix(h_A, nr_rows_A, nr_cols_A);
	std::cout << "B =" << std::endl;
	print_matrix(h_B, nr_rows_B, nr_cols_B);

	// Multiply A and B on GPU
	gpu_blas_mmul(d_A, d_B, d_C, nr_rows_A, nr_cols_A, nr_cols_B);

	// Copy (and print) the result on host memory
	cudaMemcpy(h_C, d_C, nr_rows_C * nr_cols_C * sizeof(float), cudaMemcpyDeviceToHost);
	std::cout << "C =" << std::endl;
	print_matrix(h_C, nr_rows_C, nr_cols_C);

	// Free GPU memory
	cudaFree(d_A);
	cudaFree(d_B);
	cudaFree(d_C);

	// Free CPU memory
	free(h_A);
	free(h_B);
	free(h_C);

	return 0;
}
--------------------------------------------------------------------------------