├── README.md ├── common_methods ├── README.md ├── print_any.cu ├── shared_mem.cu ├── streams.cu ├── threads_hierarchy_calc.cu ├── um_demo.cu └── zero_copy.cu ├── matrix_multiply ├── Makefile ├── README.md ├── imgs │ ├── 2d_block_split.png │ ├── extended_cases.png │ ├── info.txt │ ├── matmul_use_shm.png │ ├── matrix_in_mem.png │ ├── perf_v100.png │ └── sub_matrix_mul.png ├── matMul.h ├── matMul1DKernel.cu ├── matMul2DKernel.cu ├── matMulCublasKernel.cu └── testMatMul.cu ├── memory_opt ├── Makefile ├── README.md ├── device2Device.cu ├── hostAndDeviceTrans.cu ├── memoryOpt.h ├── run.sh ├── sharedMemory.cu ├── timer.h └── zeroCopy.cu ├── nccl ├── Makefile ├── README.md ├── alltoall.cu ├── comm.h ├── multi_devices_per_thread.cu ├── nccl_with_mpi.cu ├── node_client.cu ├── node_server.cu ├── nonblocking_double_streams.cu └── one_device_per_thread.cu ├── pytorch ├── torch1.13_mem_rationale │ ├── CUDACachingAllocator.cpp │ ├── CUDACachingAllocator.h │ ├── Makefile │ ├── README.md │ ├── TestAllocator.cpp │ └── llvmMathExtras.h ├── torch_ext │ ├── README.md │ ├── binding_examples │ │ ├── README.md │ │ ├── basics │ │ │ ├── classes.cc │ │ │ ├── classes_call.py │ │ │ ├── function_call.py │ │ │ └── functions.cc │ │ └── bind_practices │ │ │ ├── classes_lib.cc │ │ │ ├── classes_lib.h │ │ │ ├── classes_lib_bind.cc │ │ │ ├── classes_practice.py │ │ │ ├── functions_lib.cc │ │ │ ├── functions_lib.h │ │ │ ├── functions_lib_bind.cc │ │ │ └── functions_practice.py │ ├── easy_jit │ │ ├── demo.cu │ │ └── run.py │ ├── easy_load │ │ ├── run_inline_v1.py │ │ ├── run_inline_v2.py │ │ └── run_inline_v3.py │ ├── easy_setup │ │ ├── my_extension.cpp │ │ ├── run.py │ │ └── setup.py │ ├── lltm_demo │ │ ├── lltm_cuda.cpp │ │ ├── lltm_cuda_kernel.cu │ │ ├── run_baseline.py │ │ ├── run_custom_lltm.py │ │ └── setup.py │ └── sum_array │ │ ├── glueCode.cpp │ │ ├── run.py │ │ ├── sumArray.cu │ │ └── sumArray.h └── torch_mem_snapshot │ ├── README.md │ ├── block_fragment.py │ ├── predict_text_original_code.py │ ├── predict_text_with_snapshot_example.py │ ├── segment.py │ ├── transformer_profile.py │ └── transformer_snapshot.py └── transformer └── fused_softmax ├── README.md ├── scaled_masked_softmax.cu ├── scaled_masked_softmax.h ├── setup.py ├── torch_interface.cpp ├── utils.h └── warp_example ├── README.md └── warp_reduce.cu /README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CalvinXKY/BasicCUDA/5bf47d64954274aa90bd0b9bb01c2de7462bf513/README.md -------------------------------------------------------------------------------- /common_methods/README.md: -------------------------------------------------------------------------------- 1 | 2 | ## compile 3 | Print information in kernel: 4 | ``` 5 | $ nvcc -lcuda print_any.cu -o print_any 6 | ``` 7 | 8 | Managed memory: 9 | ``` 10 | $ nvcc -lcuda um_demo.cu -o um_demo 11 | ``` 12 | 13 | Zero copy: 14 | ``` 15 | $ nvcc -lcuda -I../memory_opt/ zero_copy.cu -o zero_run 16 | ``` 17 | 18 | Shared memory: 19 | ``` 20 | $ nvcc -lcuda -I../memory_opt/ shared_mem.cu -o smem_run 21 | ``` 22 | 23 | Multi streams: 24 | ``` 25 | $ nvcc -lcuda streams.cu -o streamd_demo 26 | ``` 27 | 28 | ## run 29 | ``` 30 | $ ./print_any 31 | $ ./um_demo 32 | ``` 33 | 34 | ## profile 35 | ### CUDA nvprof 36 | Arch <= 7.5 e.g. Volta. 37 | ``` 38 | $ nvprof ./um_demo 39 | ``` 40 | 41 | Arch >= 8.0 e.g. 
Ampere: 42 | ``` 43 | $ nsys nvprof um_demo 44 | ``` 45 | ### gprof 46 | 47 | step1: compile with -pg 48 | ``` 49 | $ nvcc -pg -lcuda um_demo.cu -o um_demo 50 | ``` 51 | step2: run exe 52 | ``` 53 | $ ./um_demo 54 | ``` 55 | (will get a file: gmon.out) 56 | 57 | step3: print info 58 | ``` 59 | $ gprof ./um_demo 60 | ``` 61 | Result e.g.: 62 | ``` 63 | Flat profile: 64 | 65 | Each sample counts as 0.01 seconds. 66 | % cumulative self self total 67 | time seconds seconds calls ns/call ns/call name 68 | 62.50 0.03 0.03 1048576 23.84 23.84 std::fmax(float, float) 69 | 25.00 0.04 0.01 main 70 | 12.50 0.04 0.01 1048576 4.77 4.77 std::fabs(float) 71 | 0.00 0.04 0.00 2 0.00 0.00 cudaError cudaMallocManaged(float**, unsigned long, unsigned int) 72 | 0.00 0.04 0.00 2 0.00 0.00 dim3::dim3(unsigned int, unsigned int, unsigned int) 73 | 0.00 0.04 0.00 1 0.00 0.00 _GLOBAL__sub_I_main 74 | 0.00 0.04 0.00 1 0.00 0.00 cudaError cudaLaunchKernel(char const*, dim3, dim3, void**, unsigned long, CUstream_st*) 75 | 0.00 0.04 0.00 1 0.00 0.00 __device_stub__Z3addiPfS_(int, float*, float*) 76 | 0.00 0.04 0.00 1 0.00 0.00 add(int, float*, float*) 77 | 0.00 0.04 0.00 1 0.00 0.00 __static_initialization_and_destruction_0(int, int) 78 | 0.00 0.04 0.00 1 0.00 0.00 ____nv_dummy_param_ref(void*) 79 | 0.00 0.04 0.00 1 0.00 0.00 __sti____cudaRegisterAll() 80 | 0.00 0.04 0.00 1 0.00 0.00 __nv_cudaEntityRegisterCallback(void**) 81 | 0.00 0.04 0.00 1 0.00 0.00 __nv_save_fatbinhandle_for_managed_rt(void**) 82 | 83 | % the percentage of the total running time of the 84 | time program used by this function. 85 | 86 | cumulative a running sum of the number of seconds accounted 87 | seconds for by this function and those listed above it. 88 | 89 | self the number of seconds accounted for by this 90 | seconds function alone. This is the major sort for this 91 | listing. 92 | 93 | calls the number of times this function was invoked, if 94 | this function is profiled, else blank. 95 | 96 | self the average number of milliseconds spent in this 97 | ms/call function per call, if this function is profiled, 98 | else blank. 99 | 100 | total the average number of milliseconds spent in this 101 | ms/call function and its descendents per call, if this 102 | function is profiled, else blank. 103 | 104 | name the name of the function. This is the minor sort 105 | for this listing. The index shows the location of 106 | the function in the gprof listing. If the index is 107 | in parenthesis it shows where it would appear in 108 | the gprof listing if it were to be printed. 
109 | ``` 110 | 111 | -------------------------------------------------------------------------------- /common_methods/print_any.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include "cuda_runtime.h" 3 | #define N 8 4 | 5 | 6 | __global__ void kernel(int mark) 7 | { 8 | if (blockIdx.x == 0 && threadIdx.x == 0) { 9 | printf(" === kernel %d run info: gridDim.x: %d, blockDim.x: %d ===\n", \ 10 | mark, gridDim.x, blockDim.x); 11 | } 12 | __syncthreads(); 13 | printf(" blockIdx.x: %d threadIdx.x: %d\n", blockIdx.x, threadIdx.x); 14 | } 15 | 16 | __global__ void kernelCalcuDim(int dimNum) 17 | { 18 | if (threadIdx.x + threadIdx.y + threadIdx.z + blockIdx.x + blockIdx.y + blockIdx.z == 0) { 19 | printf("============= The grid shape: gridDim.x: %d gridDim.y: %d gridDim.z: %d\n",\ 20 | gridDim.x, gridDim.y, gridDim.z); 21 | printf("============= The block shape: blockDim.x: %d blockDim.y: %d blockDim.z: %d\n",\ 22 | blockDim.x, blockDim.y, blockDim.z); 23 | } 24 | __syncthreads(); 25 | int offset = 0; 26 | int x, y, z; 27 | switch (dimNum) { 28 | case 1: 29 | offset = threadIdx.x + blockIdx.x * blockDim.x; 30 | break; 31 | case 2: 32 | x = threadIdx.x + blockIdx.x * blockDim.x; 33 | y = threadIdx.y + blockIdx.y * blockDim.y; 34 | offset = x + y * blockDim.x * gridDim.x; 35 | // method 2: 36 | // offset = threadIdx.x + blockDim.x * threadIdx.y + \ 37 | // (blockIdx.x + blockIdx.y * gridDim.x) * (blockDim.x * blockDim.y); 38 | break; 39 | case 3: 40 | x = threadIdx.x + blockIdx.x * blockDim.x; 41 | y = threadIdx.y + blockIdx.y * blockDim.y; 42 | z = threadIdx.z + blockIdx.z * blockDim.z; 43 | offset = x + y * blockDim.x * gridDim.x + z * blockDim.x * blockDim.y * gridDim.x * gridDim.y; 44 | break; 45 | default: 46 | break; 47 | } 48 | 49 | printf(" blockIdx: x=%d y= %d z=%d threadIdx x=%d y=%d z=%d; offset= %d\n",\ 50 | blockIdx.x, blockIdx.y, blockIdx.z, threadIdx.x, threadIdx.y, threadIdx.z, offset); 51 | } 52 | 53 | 54 | int main() 55 | { 56 | printf("Case0: the diff between <<<1, N>>> with <<>>\n"); 57 | printf(" Kernel 0 invocation with N threads (1 blocks, N thread/block) N =%d\n" , N); 58 | kernel<<<1, N>>>(0); 59 | cudaDeviceSynchronize(); 60 | printf(" Kernel 1 invocation with N threads (N blocks, 1 thread/block) N =%d\n" , N); 61 | kernel<<>>(1); 62 | cudaDeviceSynchronize(); 63 | printf("\n\n"); 64 | 65 | printf("Case1: 1 dimension, grid: 2 block: 2 \n"); 66 | kernelCalcuDim<<<2, 2>>>(1); 67 | cudaDeviceSynchronize(); 68 | printf("\n"); 69 | 70 | printf("Case2: 2 dimension, grid: 2 x 1 block: 2 x 2 \n"); 71 | dim3 gridSize2D(2, 1); 72 | dim3 blockSize2D(2, 2); 73 | kernelCalcuDim<<>>(2); 74 | cudaDeviceSynchronize(); 75 | printf("\n"); 76 | 77 | printf("Case3: 3 dimension, grid: 2 x 1 x 2 block: 1 x 2 x 2 \n"); 78 | dim3 gridSize3D(2, 1, 2); 79 | dim3 blockSize3D(1, 2, 2); 80 | kernelCalcuDim<<>>(3); 81 | cudaDeviceSynchronize(); 82 | return 0; 83 | } -------------------------------------------------------------------------------- /common_methods/shared_mem.cu: -------------------------------------------------------------------------------- 1 | /** 2 | * Array sum calculation with or without shared memory in CUDA kernel. 3 | * 4 | * This demo code might be stale with the development of CUDA. 
5 | * To use the latest API operations, you could see NVIDIA guide: 6 | * https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html 7 | * https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY.html#group__CUDART__MEMORY 8 | * 9 | * Author: kevin.xie 10 | * Email: kaiyuanxie@yeah.net 11 | * */ 12 | 13 | #include 14 | 15 | #include "memoryOpt.h" 16 | #include "timer.h" 17 | 18 | #define THREAD_PER_BLOCK 256 19 | 20 | double sumArrayInBlockCPU(float *arrData, const unsigned int dataSize) 21 | { 22 | /* This function might help you understand the process of CUDA array sum. */ 23 | float *blockData = (float *)calloc(dataSize / THREAD_PER_BLOCK, sizeof(float)); 24 | int blockSize = dataSize / THREAD_PER_BLOCK; // get integer part 25 | int idxMax = blockSize * THREAD_PER_BLOCK; 26 | 27 | // Split the array into blocks and sum the blocks one by one. 28 | for (int i = 0; i < blockSize; i++) { 29 | for (int j = 0; j < THREAD_PER_BLOCK; j++) { 30 | int idx = i * THREAD_PER_BLOCK + j; 31 | while (idx < dataSize) { 32 | blockData[i] += arrData[idx]; 33 | idx += idxMax; 34 | } 35 | } 36 | } 37 | 38 | double rst = 0.0; 39 | // sum the all blocks result; 40 | for (int i = 0; i < blockSize; ++i) { 41 | rst += blockData[i]; 42 | } 43 | return rst; 44 | } 45 | 46 | __device__ int countSHM = 0; 47 | __global__ void arraySumWithSHMKernel(float *arrData, const int dataSize) 48 | { 49 | __shared__ float shm[THREAD_PER_BLOCK]; 50 | int thIdx = threadIdx.x + blockIdx.x * blockDim.x; 51 | if (thIdx == 0) { 52 | countSHM = 0; 53 | __threadfence(); 54 | } 55 | float val = 0.0; 56 | while (thIdx < dataSize) { 57 | val += arrData[thIdx]; 58 | thIdx += blockDim.x * gridDim.x; 59 | } 60 | shm[threadIdx.x] = val; 61 | __syncthreads(); 62 | 63 | for (int i = THREAD_PER_BLOCK / 2; i >= 1; i /= 2) { 64 | if (threadIdx.x < i) 65 | shm[threadIdx.x] += shm[threadIdx.x + i]; 66 | __syncthreads(); 67 | } 68 | 69 | __syncthreads(); 70 | bool isLast = false; 71 | thIdx = threadIdx.x + blockIdx.x * blockDim.x; 72 | if (threadIdx.x == 0) { 73 | arrData[blockIdx.x] = shm[0]; 74 | __threadfence(); 75 | int value = atomicAdd(&countSHM, 1); 76 | isLast = (value == gridDim.x - 1); 77 | } 78 | isLast = __syncthreads_or(isLast); 79 | if (isLast) { 80 | shm[threadIdx.x] = threadIdx.x < gridDim.x ? arrData[threadIdx.x] : 0; 81 | __syncthreads(); 82 | for (int i = THREAD_PER_BLOCK / 2; i >= 1; i /= 2) { 83 | if (threadIdx.x < i) 84 | shm[threadIdx.x] += shm[threadIdx.x + i]; 85 | __syncthreads(); 86 | } 87 | __syncthreads(); 88 | if (threadIdx.x == 0) 89 | arrData[0] = shm[0]; 90 | } 91 | __syncthreads(); 92 | } 93 | 94 | __global__ void arraySumKernel(float *arrData, float *oData, const int dataSize) 95 | { 96 | // The function needed to run twice if dataSize > threads per block. 
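    // Pass 1 (many blocks): each thread accumulates a grid-stride partial sum, the block then
    // reduces its slice in place in global memory, and thread 0 writes one partial per block
    // into oData. Pass 2 (a single block) reduces those per-block partials. This is the
    // global-memory counterpart of arraySumWithSHMKernel, which keeps the reduction in shared memory.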
97 | 98 | int thIdx = threadIdx.x + blockIdx.x * blockDim.x; 99 | float val = 0.0; 100 | while (thIdx < dataSize) { 101 | val += arrData[thIdx]; 102 | thIdx += blockDim.x * gridDim.x; 103 | } 104 | thIdx = threadIdx.x + blockIdx.x * blockDim.x; 105 | arrData[thIdx] = val; 106 | __syncthreads(); 107 | 108 | // Reduce process: 109 | for (int i = THREAD_PER_BLOCK / 2; i >= 1; i /= 2) { 110 | if (threadIdx.x < i) 111 | arrData[thIdx] += arrData[thIdx + i]; 112 | __syncthreads(); 113 | } 114 | __syncthreads(); 115 | 116 | if (threadIdx.x == 0) { 117 | oData[blockIdx.x] = arrData[thIdx]; 118 | } 119 | } 120 | 121 | float sumArrayGPU(const unsigned int dataSize, unsigned int iterNumber, bool useSHM) 122 | { 123 | int memSize = sizeof(float) * dataSize; 124 | float *hInData = (float *)malloc(memSize); 125 | if (hInData == 0) { 126 | fprintf(stderr, "Not enough memory avaialable on host to run test!\n"); 127 | exit(EXIT_FAILURE); 128 | } 129 | 130 | // Get the correct result for verifying. 131 | double sum = sumArrayInBlockCPU(hInData, dataSize); 132 | 133 | float *devInData, *devOutData; 134 | float devRst; 135 | float elapsedTimeInMs = 0.0f; 136 | if (!useSHM) { 137 | checkCudaErrors(cudaMalloc((void **)&devOutData, max(dataSize / THREAD_PER_BLOCK, THREAD_PER_BLOCK))); 138 | } 139 | checkCudaErrors(cudaMalloc((void **)&devInData, memSize)); 140 | checkCudaErrors(cudaMemcpy(devInData, hInData, memSize, cudaMemcpyHostToDevice)); 141 | 142 | cudaEvent_t start, stop; 143 | 144 | for (int i = 0; i < iterNumber; i++) { 145 | float onceTime = 0.0; 146 | checkCudaErrors(cudaMemcpy(devInData, hInData, memSize, cudaMemcpyHostToDevice)); 147 | if (useSHM) { 148 | TIME_ELAPSE((arraySumWithSHMKernel<<>>(devInData, dataSize)), 149 | onceTime, start, stop); 150 | } else { 151 | // Run twice to get the result. 152 | TIME_ELAPSE( 153 | (arraySumKernel<<>>(devInData, devOutData, dataSize)), 154 | onceTime, start, stop); 155 | elapsedTimeInMs += onceTime; 156 | TIME_ELAPSE((arraySumKernel<<<1, THREAD_PER_BLOCK>>>(devOutData, devOutData, dataSize / THREAD_PER_BLOCK)), 157 | onceTime, start, stop); 158 | } 159 | checkCudaErrors(cudaDeviceSynchronize()); 160 | elapsedTimeInMs += onceTime; 161 | } 162 | 163 | if (useSHM) { 164 | checkCudaErrors(cudaMemcpy(&devRst, devInData, sizeof(float), cudaMemcpyDeviceToHost)); 165 | } else { 166 | checkCudaErrors(cudaMemcpy(&devRst, devOutData, sizeof(float), cudaMemcpyDeviceToHost)); 167 | } 168 | 169 | if (fabs(devRst - sum) > 1.e-6) { 170 | printf("Result error! GPU: %f CPU: %f\n", devRst, sum); 171 | exit(EXIT_FAILURE); 172 | } 173 | free(hInData); 174 | checkCudaErrors(cudaFree(devInData)); 175 | if (!useSHM) { 176 | checkCudaErrors(cudaFree(devOutData)); 177 | } 178 | 179 | return elapsedTimeInMs / iterNumber; 180 | } 181 | 182 | int main(int argc, char **argv) 183 | { 184 | printf("[Shared Memory Application: Array Sum.] - Starting...\n"); 185 | if (checkCmdLineFlag(argc, (const char **)argv, "help") || checkCmdLineFlag(argc, (const char **)argv, "?")) { 186 | printf("Usage -device=n (n >= 0 for deviceID)\n"); 187 | printf(" -size=The size of numElements for testing in bytes. Default: 5000)\n"); 188 | printf(" -iter=n Iteration numbers of trans. Default:100 \n"); 189 | printf("Note: The size has a limitation. 
Consider float type range.)\n"); 190 | exit(EXIT_SUCCESS); 191 | } 192 | unsigned int numElements = 5000; 193 | unsigned int gpuID = 0; 194 | unsigned int iterNumber = 100; 195 | 196 | if (checkCmdLineFlag(argc, (const char **)argv, "device")) { 197 | gpuID = getCmdLineArgumentInt(argc, (const char **)argv, "device"); 198 | } 199 | if (checkCmdLineFlag(argc, (const char **)argv, "size")) { 200 | numElements = getCmdLineArgumentInt(argc, (const char **)argv, "size"); 201 | } 202 | if (numElements < 256 || numElements > 10000) { 203 | printf("The size of numElements is not allowed! Support range:256~10000.\n"); 204 | printf("You could modify the source code to extend the range.\n"); 205 | exit(EXIT_FAILURE); 206 | } 207 | if (checkCmdLineFlag(argc, (const char **)argv, "iter")) { 208 | iterNumber = getCmdLineArgumentInt(argc, (const char **)argv, "iter"); 209 | } 210 | 211 | checkCudaErrors(cudaSetDevice(gpuID)); 212 | printf("Sum array with shared memory. Elapsed time: %f ms \n", sumArrayGPU(numElements, iterNumber, true)); 213 | printf("Sum array without shared memory. Elapsed time: %f ms \n", sumArrayGPU(numElements, iterNumber, false)); 214 | 215 | exit(EXIT_SUCCESS); 216 | } -------------------------------------------------------------------------------- /common_methods/threads_hierarchy_calc.cu: -------------------------------------------------------------------------------- 1 | /** 2 | * threads hierarchy calculation example. 3 | * Author: kevin.xie 4 | * Email: kaiyuanxie@yeah.net 5 | * */ 6 | 7 | #include 8 | #include 9 | #include 10 | 11 | const float EPSILON = 1e-6; 12 | 13 | bool areFloatsEqual(float a, float b) { 14 | return std::fabs(a - b) < EPSILON; 15 | } 16 | 17 | template void check(T result, char const *const func, const char *const file, int const line) 18 | { 19 | if (result) { 20 | fprintf(stderr, "CUDA error at %s:%d code=%d(%s) \"%s\" \n", file, line, static_cast(result), 21 | cudaGetErrorString(result), func); 22 | exit(EXIT_FAILURE); 23 | } 24 | } 25 | #define checkCudaErrors(val) check((val), #val, __FILE__, __LINE__) 26 | 27 | 28 | __global__ void kernelAddOne3D3D(float *input, int dataNum) 29 | { 30 | int threadInBlock = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.x*blockDim.y; 31 | int blockInGrid = blockIdx.x + blockIdx.y*gridDim.x + blockIdx.z*gridDim.x*gridDim.y; 32 | int oneBlockSize = blockDim.x*blockDim.y*blockDim.z; 33 | int i = threadInBlock + oneBlockSize*blockInGrid; 34 | while(i < dataNum) { 35 | input[i] += 1; 36 | i += oneBlockSize * gridDim.x*gridDim.y*gridDim.z; 37 | } 38 | } 39 | 40 | 41 | __global__ void kernelAddOne2D2D(float *input, int dataNum) 42 | { 43 | // int threadInBlock = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.x*blockDim.y; 44 | // int blockInGrid = blockIdx.x + blockIdx.y*gridDim.x + blockIdx.z*gridDim.x*gridDim.y; 45 | // int oneBlockSize = blockDim.x*blockDim.y*blockDim.z; 46 | // int i = threadInBlock + oneBlockSize*blockInGrid; 47 | // when: 48 | // threadIdx.z = 0; blockIdx.z = 0; 49 | // blockDim.z = 1; gridDim.z = 1; 50 | // then: 51 | // int threadInBlock = threadIdx.x + threadIdx.y*blockDim.x; 52 | // int blockInGrid = blockIdx.x + blockIdx.y*gridDim.x; 53 | // int oneBlockSize = blockDim.x*blockDim.y; 54 | int i = threadIdx.x + threadIdx.y*blockDim.x + blockDim.x*blockDim.y*(blockIdx.x + blockIdx.y*gridDim.x); 55 | 56 | while(i < dataNum) { 57 | input[i] += 1; 58 | i += blockDim.x*blockDim.y*gridDim.x*gridDim.y; 59 | } 60 | // thread overflow offset = 
blockDim.x*blockDim.y*gridDim.x*gridDim.y; 61 | } 62 | 63 | __global__ void printIdx2D2D() 64 | { 65 | int i = threadIdx.x + threadIdx.y*blockDim.x + blockDim.x*blockDim.y*(blockIdx.x + blockIdx.y*gridDim.x); 66 | printf("Global idx %d, threadIdx.x: %d, threadIdx.y: %d threadIdx.z: %d, blockIdx.x: %d, blockIdx.y: %d, blockIdx.z: %d \n",\ 67 | i, threadIdx.x, threadIdx.y, threadIdx.z, blockIdx.x, blockIdx.y, blockIdx.z); 68 | } 69 | 70 | __global__ void kernelAddOne1D1D(float *input, int dataNum) 71 | { 72 | // int threadInBlock = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.x*blockDim.y; 73 | // int blockInGrid = blockIdx.x + blockIdx.y*gridDim.x + blockIdx.z*gridDim.x*gridDim.y; 74 | // int oneBlockSize = blockDim.x*blockDim.y*blockDim.z; 75 | // int i = threadInBlock + oneBlockSize*blockInGrid; 76 | // when: 77 | // threadIdx.y = 0; threadIdx.z = 0; blockIdx.y= 0; blockIdx.z = 0; 78 | // blockDim.y = 1; blockDim.z = 1; gridDim.y = 1; gridDim.z = 1; 79 | // then: 80 | // int threadInBlock = threadIdx.x; 81 | // int blockInGrid = blockIdx.x; 82 | // int oneBlockSize = blockDim.x; 83 | int i = threadIdx.x + blockIdx.x * blockDim.x; 84 | 85 | while(i < dataNum) { 86 | input[i] += 1; 87 | i += blockDim.x*gridDim.x; 88 | } 89 | // thread overflow offset = blockDim.x*gridDim.x; 90 | } 91 | 92 | #define TOTAL_SIZE 5000 93 | #define N 4 94 | #define M 4 95 | using kernel = void (*)(float *, int); 96 | 97 | bool test(kernel func, dim3 BlocksPerGrid, dim3 threadsPerBlock) { 98 | unsigned int totalSize = TOTAL_SIZE; 99 | float* hostData = (float*) malloc(sizeof(float) * totalSize); 100 | float* checkData = (float*) malloc(sizeof(float) * totalSize); 101 | float* devicePtr; 102 | checkCudaErrors(cudaMalloc((void**)&devicePtr, sizeof(float) * totalSize)); 103 | for (int i =0; i < totalSize; ++i) { 104 | hostData[i] = i; 105 | checkData[i] = i + 1; 106 | } 107 | checkCudaErrors(cudaMemcpy(devicePtr, hostData, totalSize * sizeof(float), cudaMemcpyHostToDevice)); 108 | func<<>>(devicePtr, totalSize); 109 | checkCudaErrors(cudaMemcpy(hostData, devicePtr, totalSize * sizeof(float), cudaMemcpyDeviceToHost)); 110 | // check result: 111 | bool rst = true; 112 | for (int i =0; i < totalSize; ++i) { 113 | if (!areFloatsEqual(checkData[i], hostData[i])) { 114 | rst = false; 115 | printf("The result not equal in data index %d. 
expect:%f result:%f\n", i, checkData[i], hostData[i]); 116 | break; 117 | } 118 | } 119 | checkCudaErrors(cudaFree (devicePtr)); 120 | free(hostData); 121 | free(checkData); 122 | return rst; 123 | } 124 | 125 | 126 | int main() { 127 | printf("This example is for threads hierachy calculation.\n"); 128 | // 3D3D: 129 | dim3 BlocksPerGrid(N, N, N); // 对应gridDim.x、gridDim.y、gridDim.z 130 | dim3 threadsPerBlock(M, M, M); // 对应blockDim.x、blockDim.y、blockDim.z 131 | // test(kernelAddOne3D3D, BlocksPerGrid, threadsPerBlock) 132 | 133 | // 2D2D: 134 | dim3 BlocksPerGrid2D(N, N); 135 | dim3 threadsPerBlock2D(M, M); 136 | // test(kernelAddOne2D2D, BlocksPerGrid2D, threadsPerBlock2D) 137 | 138 | // 1D1D: 139 | // test(kernelAddOne1D1D, N, M) 140 | 141 | // print the idx in threads, 2D2D example: 142 | printIdx2D2D<<>>(); 143 | 144 | bool rst = test(kernelAddOne3D3D, BlocksPerGrid, threadsPerBlock) && \ 145 | test(kernelAddOne2D2D, BlocksPerGrid2D, threadsPerBlock2D) && \ 146 | test(kernelAddOne1D1D, N, M); 147 | if(rst) { 148 | printf("The test OK.\n"); 149 | } else { 150 | printf("The test Failed.\n"); 151 | } 152 | return 0; 153 | } 154 | -------------------------------------------------------------------------------- /common_methods/um_demo.cu: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include 4 | 5 | // CUDA kernel to add elements of two arrays 6 | __global__ void add(int n, float *x, float *y) 7 | { 8 | int index = blockIdx.x * blockDim.x + threadIdx.x; 9 | int stride = blockDim.x * gridDim.x; 10 | while (index < n) { 11 | y[index] = x[index] + y[index]; 12 | index += stride; 13 | } 14 | } 15 | 16 | int main(void) 17 | { 18 | int N = 1 << 20; 19 | float *x, *y; 20 | 21 | // Allocate Unified Memory -- accessible from CPU or GPU 22 | cudaMallocManaged(&x, N * sizeof(float)); 23 | cudaMallocManaged(&y, N * sizeof(float)); 24 | 25 | // initialize x and y arrays on the host 26 | for (int i = 0; i < N; i++) { 27 | x[i] = 1.0f; 28 | y[i] = 2.0f; 29 | } 30 | 31 | // Launch kernel on 1M elements on the GPU 32 | int blockSize = 256; 33 | int numBlocks = (N + blockSize - 1) / blockSize; 34 | add<<>>(N, x, y); 35 | 36 | // Wait for GPU to finish before accessing on host 37 | cudaDeviceSynchronize(); 38 | 39 | // Check for errors (all values should be 3.0f) 40 | float maxError = 0.0f; 41 | for (int i = 0; i < N; i++) 42 | maxError = fmax(maxError, fabs(y[i] - 3.0f)); 43 | std::cout << "Max error: " << maxError << std::endl; 44 | 45 | // Free memory 46 | cudaFree(x); 47 | cudaFree(y); 48 | return 0; 49 | } -------------------------------------------------------------------------------- /common_methods/zero_copy.cu: -------------------------------------------------------------------------------- 1 | 2 | /* 3 | * zero copy using in vectorAdd case. 4 | * 5 | * This demo code might be stale with the development of CUDA. 
6 | * To use the latest API operations, you could see NVIDIA guide: 7 | * https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html 8 | * https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY.html#group__CUDART__MEMORY 9 | * 10 | * Author: kevin.xie 11 | * Email: kaiyuanxie@yeah.net 12 | */ 13 | 14 | #include "memoryOpt.h" 15 | #include "timer.h" 16 | 17 | __global__ void vectorAdd(const float *A, const float *B, float *C, const int numElements) 18 | { 19 | int i = blockDim.x * blockIdx.x + threadIdx.x; 20 | if (i < numElements) { 21 | C[i] = A[i] + B[i] + 0.0f; 22 | } 23 | } 24 | 25 | float vectorAddViaGlobalMemory(const unsigned int numElements, const unsigned int iterNum) 26 | { 27 | 28 | StopWatchInterface *timer = NULL; 29 | float elapsedTimeInMs = 0.0f; 30 | float throughputInGBs = 0.0f; 31 | 32 | sdkCreateTimer(&timer); 33 | size_t memSize = numElements * sizeof(float); 34 | 35 | // Launch the Vector Add CUDA Kernel 36 | int threadsPerBlock = 256; 37 | int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; 38 | 39 | // Allocate the host input vector A, B, C 40 | float *h_A = (float *)malloc(memSize); 41 | float *h_B = (float *)malloc(memSize); 42 | float *h_C = (float *)malloc(memSize); 43 | 44 | // Verify that allocations succeeded 45 | if (h_A == NULL || h_B == NULL || h_C == NULL) { 46 | fprintf(stderr, "Failed to allocate host vectors!\n"); 47 | exit(EXIT_FAILURE); 48 | } 49 | 50 | // Initialize the host input vectors 51 | for (int i = 0; i < numElements; ++i) { 52 | h_A[i] = rand() / (float)RAND_MAX; 53 | h_B[i] = rand() / (float)RAND_MAX; 54 | } 55 | 56 | // Allocate the device input vector: 57 | float *d_A = NULL; 58 | float *d_B = NULL; 59 | float *d_C = NULL; 60 | checkCudaErrors(cudaMalloc((void **)&d_A, memSize)); 61 | checkCudaErrors(cudaMalloc((void **)&d_B, memSize)); 62 | checkCudaErrors(cudaMalloc((void **)&d_C, memSize)); 63 | 64 | for (unsigned int i = 0; i < iterNum; i++) { 65 | sdkStartTimer(&timer); 66 | checkCudaErrors(cudaMemcpy(d_A, h_A, memSize, cudaMemcpyHostToDevice)); 67 | checkCudaErrors(cudaMemcpy(d_B, h_B, memSize, cudaMemcpyHostToDevice)); 68 | vectorAdd<<>>(d_A, d_B, d_C, numElements); 69 | checkCudaErrors(cudaGetLastError()); 70 | // Copy the device result vector in device memory to the host result vector in host memory. 71 | checkCudaErrors(cudaMemcpy(h_C, d_C, memSize, cudaMemcpyDeviceToHost)); 72 | sdkStopTimer(&timer); 73 | elapsedTimeInMs += sdkGetTimerValue(&timer); 74 | sdkResetTimer(&timer); 75 | } 76 | 77 | // Verify that the result vector is correct 78 | for (int i = 0; i < numElements; ++i) { 79 | if (fabs(h_A[i] + h_B[i] - h_C[i]) > 1e-5) { 80 | fprintf(stderr, "Result verification failed at element %d!\n", i); 81 | exit(EXIT_FAILURE); 82 | } 83 | } 84 | 85 | // calculate throughput in GB/s. Note: use 1000(not 1024)unit. 
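    // The timed loop above covers the two host-to-device copies, the kernel, and the
    // device-to-host copy, so the figure reported is effective end-to-end bandwidth:
    //   throughputInGBs = (memSize * iterNum / 1e9) / (elapsedTimeInMs / 1e3)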
86 | double time_s = elapsedTimeInMs / 1e3; 87 | throughputInGBs = (memSize * (float)iterNum) / (double)1e9; 88 | throughputInGBs = throughputInGBs / time_s; 89 | sdkDeleteTimer(&timer); 90 | 91 | // Free device global memory 92 | checkCudaErrors(cudaFree(d_A)); 93 | checkCudaErrors(cudaFree(d_B)); 94 | checkCudaErrors(cudaFree(d_C)); 95 | 96 | // Free host memory 97 | free(h_A); 98 | free(h_B); 99 | free(h_C); 100 | 101 | return throughputInGBs; 102 | } 103 | 104 | float vectorAddViaZeroCopy(const unsigned int numElements, const unsigned int iterNum) 105 | { 106 | 107 | StopWatchInterface *timer = NULL; 108 | float elapsedTimeInMs = 0.0f; 109 | float throughputInGBs = 0.0f; 110 | 111 | sdkCreateTimer(&timer); 112 | size_t memSize = numElements * sizeof(float); 113 | 114 | // Launch the Vector Add CUDA Kernel 115 | int threadsPerBlock = 256; 116 | int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; 117 | 118 | checkCudaErrors(cudaSetDeviceFlags(cudaDeviceMapHost)); 119 | // Allocate the host input vector A, B, C 120 | float *h_A = NULL; 121 | float *h_B = NULL; 122 | float *h_C = NULL; 123 | float *map_A, *map_B, *map_C; 124 | // Policy1: 125 | // checkCudaErrors(cudaMallocHost((void **)&h_A, memSize)); 126 | // checkCudaErrors(cudaMallocHost((void **)&h_B, memSize)); 127 | // checkCudaErrors(cudaMallocHost((void **)&h_C, memSize)); 128 | 129 | // Policy2: 130 | checkCudaErrors(cudaHostAlloc((void **)&h_A, memSize, cudaHostAllocMapped)); 131 | checkCudaErrors(cudaHostAlloc((void **)&h_B, memSize, cudaHostAllocMapped)); 132 | checkCudaErrors(cudaHostAlloc((void **)&h_C, memSize, cudaHostAllocMapped)); 133 | 134 | // Verify that allocations succeeded 135 | if (h_A == NULL || h_B == NULL || h_C == NULL) { 136 | fprintf(stderr, "Failed to allocate host vectors!\n"); 137 | exit(EXIT_FAILURE); 138 | } 139 | // Get the device pointers for the pinned CPU memory mapped into the GPU memory space. 140 | checkCudaErrors(cudaHostGetDevicePointer(&map_A, h_A, 0)); 141 | checkCudaErrors(cudaHostGetDevicePointer(&map_B, h_B, 0)); 142 | checkCudaErrors(cudaHostGetDevicePointer(&map_C, h_C, 0)); 143 | 144 | // Initialize the host input vectors 145 | for (int i = 0; i < numElements; ++i) { 146 | h_A[i] = rand() / (float)RAND_MAX; 147 | h_B[i] = rand() / (float)RAND_MAX; 148 | } 149 | 150 | // Copy the host input vectors A and B in host memory to the device input vectors in device memory 151 | for (unsigned int i = 0; i < iterNum; i++) { 152 | sdkStartTimer(&timer); 153 | vectorAdd<<>>(map_A, map_B, map_C, numElements); 154 | checkCudaErrors(cudaGetLastError()); 155 | // Copy the device result vector in device memory to the host result vector in host memory. 156 | sdkStopTimer(&timer); 157 | elapsedTimeInMs += sdkGetTimerValue(&timer); 158 | sdkResetTimer(&timer); 159 | } 160 | 161 | checkCudaErrors(cudaDeviceSynchronize()); 162 | // Verify that the result vector is correct 163 | for (int i = 0; i < numElements; ++i) { 164 | if (fabs(h_A[i] + h_B[i] - h_C[i]) > 1e-5) { 165 | fprintf(stderr, "Result verification failed at element %d!\n", i); 166 | exit(EXIT_FAILURE); 167 | } 168 | } 169 | 170 | // calculate throughput in GB/s. Note: use 1000(not 1024)unit. 
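    // In the zero-copy variant the timed loop contains only the (asynchronous) kernel launches:
    // the kernel dereferences the mapped host pointers directly, so data crosses PCIe on demand
    // and no explicit cudaMemcpy is needed. The same formula as above gives its throughput.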
171 | double time_s = elapsedTimeInMs / 1e3; 172 | throughputInGBs = (memSize * (float)iterNum) / (double)1e9; 173 | throughputInGBs = throughputInGBs / time_s; 174 | sdkDeleteTimer(&timer); 175 | 176 | // Free host memory 177 | checkCudaErrors(cudaFreeHost(h_A)); 178 | checkCudaErrors(cudaFreeHost(h_B)); 179 | checkCudaErrors(cudaFreeHost(h_C)); 180 | 181 | return throughputInGBs; 182 | } 183 | 184 | int main(int argc, char **argv) 185 | { 186 | printf("[Zero Copy Opt Vector Add] - Starting...\n"); 187 | if (checkCmdLineFlag(argc, (const char **)argv, "help") || checkCmdLineFlag(argc, (const char **)argv, "?")) { 188 | printf("Usage -device=n (n >= 0 for deviceID)\n"); 189 | printf(" -size=The size of numElements for testing in bytes. Default: 5000000)\n"); 190 | printf(" -iter=n Iteration numbers of trans. Default:1 \n"); 191 | exit(EXIT_SUCCESS); 192 | } 193 | unsigned int numElements = 5000000; 194 | unsigned int iterNumbers = 1; 195 | unsigned int gpuID = 0; 196 | 197 | if (checkCmdLineFlag(argc, (const char **)argv, "device")) { 198 | gpuID = getCmdLineArgumentInt(argc, (const char **)argv, "device"); 199 | } 200 | if (checkCmdLineFlag(argc, (const char **)argv, "size")) { 201 | numElements = getCmdLineArgumentInt(argc, (const char **)argv, "size"); 202 | } 203 | 204 | if (checkCmdLineFlag(argc, (const char **)argv, "iter")) { 205 | iterNumbers = getCmdLineArgumentInt(argc, (const char **)argv, "iter"); 206 | } 207 | 208 | checkCudaErrors(cudaSetDevice(gpuID)); 209 | cudaDeviceProp prop; 210 | cudaGetDeviceProperties(&prop, gpuID); 211 | if (!prop.canMapHostMemory) 212 | exit(EXIT_FAILURE); 213 | printf(">. Data tranfer via global memory. VectorAdd throughput: %f GB/s\n", 214 | vectorAddViaGlobalMemory(numElements, iterNumbers)); 215 | printf(">. Data tranfer via zero copy. VectorAdd throughput: %f GB/s\n", 216 | vectorAddViaZeroCopy(numElements, iterNumbers)); 217 | 218 | exit(EXIT_SUCCESS); 219 | } -------------------------------------------------------------------------------- /matrix_multiply/Makefile: -------------------------------------------------------------------------------- 1 | 2 | 3 | CUDA_PATH ?= /usr/local/cuda 4 | 5 | 6 | # architecture 7 | HOST_ARCH := $(shell uname -m) 8 | TARGET_ARCH ?= $(HOST_ARCH) 9 | ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 sbsa ppc64le armv7l)) 10 | ifneq ($(TARGET_ARCH),$(HOST_ARCH)) 11 | ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 sbsa ppc64le)) 12 | TARGET_SIZE := 64 13 | else ifneq (,$(filter $(TARGET_ARCH),armv7l)) 14 | TARGET_SIZE := 32 15 | endif 16 | else 17 | TARGET_SIZE := $(shell getconf LONG_BIT) 18 | endif 19 | else 20 | $(error ERROR - unsupported value $(TARGET_ARCH) for TARGET_ARCH!) 21 | endif 22 | 23 | # sbsa and aarch64 systems look similar. Need to differentiate them at host level for now. 24 | ifeq ($(HOST_ARCH),aarch64) 25 | ifeq ($(CUDA_PATH)/targets/sbsa-linux,$(shell ls -1d $(CUDA_PATH)/targets/sbsa-linux 2>/dev/null)) 26 | HOST_ARCH := sbsa 27 | TARGET_ARCH := sbsa 28 | endif 29 | endif 30 | 31 | ifneq ($(TARGET_ARCH),$(HOST_ARCH)) 32 | ifeq (,$(filter $(HOST_ARCH)-$(TARGET_ARCH),aarch64-armv7l x86_64-armv7l x86_64-aarch64 x86_64-sbsa x86_64-ppc64le)) 33 | $(error ERROR - cross compiling from $(HOST_ARCH) to $(TARGET_ARCH) is not supported!) 
34 | endif 35 | endif 36 | 37 | # When on native aarch64 system with userspace of 32-bit, change TARGET_ARCH to armv7l 38 | ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_SIZE),aarch64-aarch64-32) 39 | TARGET_ARCH = armv7l 40 | endif 41 | 42 | # operating system 43 | HOST_OS := $(shell uname -s 2>/dev/null | tr "[:upper:]" "[:lower:]") 44 | TARGET_OS ?= $(HOST_OS) 45 | ifeq (,$(filter $(TARGET_OS),linux darwin qnx android)) 46 | $(error ERROR - unsupported value $(TARGET_OS) for TARGET_OS!) 47 | endif 48 | 49 | # host compiler 50 | ifeq ($(TARGET_OS),darwin) 51 | ifeq ($(shell expr `xcodebuild -version | grep -i xcode | awk '{print $$2}' | cut -d'.' -f1` \>= 5),1) 52 | HOST_COMPILER ?= clang++ 53 | endif 54 | else ifneq ($(TARGET_ARCH),$(HOST_ARCH)) 55 | ifeq ($(HOST_ARCH)-$(TARGET_ARCH),x86_64-armv7l) 56 | ifeq ($(TARGET_OS),linux) 57 | HOST_COMPILER ?= arm-linux-gnueabihf-g++ 58 | else ifeq ($(TARGET_OS),qnx) 59 | ifeq ($(QNX_HOST),) 60 | $(error ERROR - QNX_HOST must be passed to the QNX host toolchain) 61 | endif 62 | ifeq ($(QNX_TARGET),) 63 | $(error ERROR - QNX_TARGET must be passed to the QNX target toolchain) 64 | endif 65 | export QNX_HOST 66 | export QNX_TARGET 67 | HOST_COMPILER ?= $(QNX_HOST)/usr/bin/arm-unknown-nto-qnx6.6.0eabi-g++ 68 | else ifeq ($(TARGET_OS),android) 69 | HOST_COMPILER ?= arm-linux-androideabi-g++ 70 | endif 71 | else ifeq ($(TARGET_ARCH),aarch64) 72 | ifeq ($(TARGET_OS), linux) 73 | HOST_COMPILER ?= aarch64-linux-gnu-g++ 74 | else ifeq ($(TARGET_OS),qnx) 75 | ifeq ($(QNX_HOST),) 76 | $(error ERROR - QNX_HOST must be passed to the QNX host toolchain) 77 | endif 78 | ifeq ($(QNX_TARGET),) 79 | $(error ERROR - QNX_TARGET must be passed to the QNX target toolchain) 80 | endif 81 | export QNX_HOST 82 | export QNX_TARGET 83 | HOST_COMPILER ?= $(QNX_HOST)/usr/bin/q++ 84 | else ifeq ($(TARGET_OS), android) 85 | HOST_COMPILER ?= aarch64-linux-android-clang++ 86 | endif 87 | else ifeq ($(TARGET_ARCH),sbsa) 88 | HOST_COMPILER ?= aarch64-linux-gnu-g++ 89 | else ifeq ($(TARGET_ARCH),ppc64le) 90 | HOST_COMPILER ?= powerpc64le-linux-gnu-g++ 91 | endif 92 | endif 93 | HOST_COMPILER ?= g++ 94 | NVCC := $(CUDA_PATH)/bin/nvcc -ccbin $(HOST_COMPILER) 95 | 96 | # internal flags 97 | NVCCFLAGS := -m${TARGET_SIZE} 98 | CCFLAGS := 99 | LDFLAGS := 100 | 101 | # build flags 102 | ifeq ($(TARGET_OS),darwin) 103 | LDFLAGS += -rpath $(CUDA_PATH)/lib 104 | CCFLAGS += -arch $(HOST_ARCH) 105 | else ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_OS),x86_64-armv7l-linux) 106 | LDFLAGS += --dynamic-linker=/lib/ld-linux-armhf.so.3 107 | CCFLAGS += -mfloat-abi=hard 108 | else ifeq ($(TARGET_OS),android) 109 | LDFLAGS += -pie 110 | CCFLAGS += -fpie -fpic -fexceptions 111 | endif 112 | 113 | ifneq ($(TARGET_ARCH),$(HOST_ARCH)) 114 | ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux) 115 | ifneq ($(TARGET_FS),) 116 | GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6) 117 | ifeq ($(GCCVERSIONLTEQ46),1) 118 | CCFLAGS += --sysroot=$(TARGET_FS) 119 | endif 120 | LDFLAGS += --sysroot=$(TARGET_FS) 121 | LDFLAGS += -rpath-link=$(TARGET_FS)/lib 122 | LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib 123 | LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/arm-linux-gnueabihf 124 | endif 125 | endif 126 | ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux) 127 | ifneq ($(TARGET_FS),) 128 | GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6) 129 | ifeq ($(GCCVERSIONLTEQ46),1) 130 | CCFLAGS += --sysroot=$(TARGET_FS) 131 | endif 132 | LDFLAGS += --sysroot=$(TARGET_FS) 133 | LDFLAGS += 
-rpath-link=$(TARGET_FS)/lib -L$(TARGET_FS)/lib 134 | LDFLAGS += -rpath-link=$(TARGET_FS)/lib/aarch64-linux-gnu -L$(TARGET_FS)/lib/aarch64-linux-gnu 135 | LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib -L$(TARGET_FS)/usr/lib 136 | LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/aarch64-linux-gnu -L$(TARGET_FS)/usr/lib/aarch64-linux-gnu 137 | LDFLAGS += --unresolved-symbols=ignore-in-shared-libs 138 | CCFLAGS += -isystem=$(TARGET_FS)/usr/include -I$(TARGET_FS)/usr/include -I$(TARGET_FS)/usr/include/libdrm 139 | CCFLAGS += -isystem=$(TARGET_FS)/usr/include/aarch64-linux-gnu -I$(TARGET_FS)/usr/include/aarch64-linux-gnu 140 | endif 141 | endif 142 | ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx) 143 | NVCCFLAGS += -D_QNX_SOURCE 144 | NVCCFLAGS += --qpp-config 8.3.0,gcc_ntoaarch64le 145 | CCFLAGS += -DWIN_INTERFACE_CUSTOM -I/usr/include/aarch64-qnx-gnu 146 | LDFLAGS += -lsocket 147 | LDFLAGS += -L/usr/lib/aarch64-qnx-gnu 148 | CCFLAGS += "-Wl\,-rpath-link\,/usr/lib/aarch64-qnx-gnu" 149 | ifdef TARGET_OVERRIDE 150 | LDFLAGS += -lslog2 151 | endif 152 | 153 | ifneq ($(TARGET_FS),) 154 | LDFLAGS += -L$(TARGET_FS)/usr/lib 155 | CCFLAGS += "-Wl\,-rpath-link\,$(TARGET_FS)/usr/lib" 156 | LDFLAGS += -L$(TARGET_FS)/usr/libnvidia 157 | CCFLAGS += "-Wl\,-rpath-link\,$(TARGET_FS)/usr/libnvidia" 158 | CCFLAGS += -I$(TARGET_FS)/../include 159 | endif 160 | endif 161 | endif 162 | 163 | ifdef TARGET_OVERRIDE # cuda toolkit targets override 164 | NVCCFLAGS += -target-dir $(TARGET_OVERRIDE) 165 | endif 166 | 167 | # Debug build flags 168 | ifeq ($(dbg),1) 169 | NVCCFLAGS += -g -G 170 | BUILD_TYPE := debug 171 | else 172 | BUILD_TYPE := release 173 | endif 174 | 175 | ALL_CCFLAGS := 176 | ALL_CCFLAGS += $(NVCCFLAGS) 177 | ALL_CCFLAGS += $(EXTRA_NVCCFLAGS) 178 | ALL_CCFLAGS += $(addprefix -Xcompiler ,$(CCFLAGS)) 179 | ALL_CCFLAGS += $(addprefix -Xcompiler ,$(EXTRA_CCFLAGS)) 180 | 181 | SAMPLE_ENABLED := 1 182 | 183 | ALL_LDFLAGS := 184 | ALL_LDFLAGS += $(ALL_CCFLAGS) 185 | ALL_LDFLAGS += $(addprefix -Xlinker ,$(LDFLAGS)) 186 | ALL_LDFLAGS += $(addprefix -Xlinker ,$(EXTRA_LDFLAGS)) 187 | 188 | # Common includes and paths for CUDA 189 | INCLUDES := -I./ 190 | LIBRARIES := 191 | 192 | ################################################################################ 193 | 194 | # Gencode arguments 195 | ifeq ($(TARGET_ARCH),$(filter $(TARGET_ARCH),armv7l aarch64 sbsa)) 196 | SMS ?= 53 61 70 72 75 80 86 87 197 | else 198 | SMS ?= 50 52 60 61 70 75 199 | endif 200 | 201 | ifeq ($(SMS),) 202 | $(info >>> WARNING - no SM architectures have been specified - waiving sample <<<) 203 | SAMPLE_ENABLED := 0 204 | endif 205 | 206 | ifeq ($(GENCODE_FLAGS),) 207 | # Generate SASS code for each SM architecture listed in $(SMS) 208 | $(foreach sm,$(SMS),$(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm))) 209 | 210 | # Generate PTX code from the highest SM architecture in $(SMS) to guarantee forward-compatibility 211 | HIGHEST_SM := $(lastword $(sort $(SMS))) 212 | ifneq ($(HIGHEST_SM),) 213 | GENCODE_FLAGS += -gencode arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM) 214 | endif 215 | endif 216 | 217 | ALL_CCFLAGS += --threads 0 --std=c++11 218 | LIBRARIES += -lcublas 219 | 220 | ifeq ($(SAMPLE_ENABLED),0) 221 | EXEC ?= @echo "[@]" 222 | endif 223 | 224 | ################################################################################ 225 | 226 | all: matMul 227 | 228 | check.deps: 229 | ifeq ($(SAMPLE_ENABLED),0) 230 | @echo "Sample will be waived due to the above missing dependencies" 231 | else 232 | @echo "Sample is 
ready - all dependencies have been met" 233 | endif 234 | 235 | matMul: matMul1DKernel.cu matMul2DKernel.cu matMulCublasKernel.cu testMatMul.cu 236 | $(EXEC) $(NVCC) $(ALL_LDFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES) 237 | 238 | clean: 239 | rm -f matMul matMul.o 240 | 241 | clobber: clean 242 | -------------------------------------------------------------------------------- /matrix_multiply/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CalvinXKY/BasicCUDA/5bf47d64954274aa90bd0b9bb01c2de7462bf513/matrix_multiply/README.md -------------------------------------------------------------------------------- /matrix_multiply/imgs/2d_block_split.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CalvinXKY/BasicCUDA/5bf47d64954274aa90bd0b9bb01c2de7462bf513/matrix_multiply/imgs/2d_block_split.png -------------------------------------------------------------------------------- /matrix_multiply/imgs/extended_cases.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CalvinXKY/BasicCUDA/5bf47d64954274aa90bd0b9bb01c2de7462bf513/matrix_multiply/imgs/extended_cases.png -------------------------------------------------------------------------------- /matrix_multiply/imgs/info.txt: -------------------------------------------------------------------------------- 1 | Show images. 2 | -------------------------------------------------------------------------------- /matrix_multiply/imgs/matmul_use_shm.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CalvinXKY/BasicCUDA/5bf47d64954274aa90bd0b9bb01c2de7462bf513/matrix_multiply/imgs/matmul_use_shm.png -------------------------------------------------------------------------------- /matrix_multiply/imgs/matrix_in_mem.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CalvinXKY/BasicCUDA/5bf47d64954274aa90bd0b9bb01c2de7462bf513/matrix_multiply/imgs/matrix_in_mem.png -------------------------------------------------------------------------------- /matrix_multiply/imgs/perf_v100.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CalvinXKY/BasicCUDA/5bf47d64954274aa90bd0b9bb01c2de7462bf513/matrix_multiply/imgs/perf_v100.png -------------------------------------------------------------------------------- /matrix_multiply/imgs/sub_matrix_mul.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CalvinXKY/BasicCUDA/5bf47d64954274aa90bd0b9bb01c2de7462bf513/matrix_multiply/imgs/sub_matrix_mul.png -------------------------------------------------------------------------------- /matrix_multiply/matMul.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | // System includes 3 | #include 4 | #include 5 | 6 | // CUDA runtime 7 | #include 8 | #include 9 | 10 | #if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) 11 | #define STRCASECMP _stricmp 12 | #define STRNCASECMP _strnicmp 13 | #else 14 | #define STRCASECMP strcasecmp 15 | #define STRNCASECMP strncasecmp 16 | #endif 17 | 18 | #define checkCuBLASErrors(status) \ 19 | do { \ 20 | if (status != CUBLAS_STATUS_SUCCESS) { \ 21 | fprintf(stderr, "CUBLAS error: %d at 
%s:%d\n", status, __FILE__, __LINE__); \ 22 | exit(EXIT_FAILURE); \ 23 | } \ 24 | } while (0) 25 | 26 | template void check(T result, char const *const func, const char *const file, int const line) 27 | { 28 | if (result) { 29 | fprintf(stderr, "CUDA error at %s:%d code=%d(%s) \"%s\" \n", file, line, static_cast(result), 30 | cudaGetErrorString(result), func); 31 | exit(EXIT_FAILURE); 32 | } 33 | } 34 | #define checkCudaErrors(val) check((val), #val, __FILE__, __LINE__) 35 | 36 | inline int stringRemoveDelimiter(char delimiter, const char *string) 37 | { 38 | int string_start = 0; 39 | 40 | while (string[string_start] == delimiter) { 41 | string_start++; 42 | } 43 | 44 | if (string_start >= static_cast(strlen(string) - 1)) { 45 | return 0; 46 | } 47 | 48 | return string_start; 49 | } 50 | 51 | inline bool checkCmdLineFlag(const int argc, const char **argv, const char *string_ref) 52 | { 53 | bool bFound = false; 54 | 55 | if (argc >= 1) { 56 | for (int i = 1; i < argc; i++) { 57 | int string_start = stringRemoveDelimiter('-', argv[i]); 58 | const char *string_argv = &argv[i][string_start]; 59 | 60 | const char *equal_pos = strchr(string_argv, '='); 61 | int argv_length = static_cast(equal_pos == 0 ? strlen(string_argv) : equal_pos - string_argv); 62 | 63 | int length = static_cast(strlen(string_ref)); 64 | 65 | if (length == argv_length && !STRNCASECMP(string_argv, string_ref, length)) { 66 | bFound = true; 67 | continue; 68 | } 69 | } 70 | } 71 | 72 | return bFound; 73 | } 74 | 75 | inline int getCmdLineArgumentInt(const int argc, const char **argv, const char *string_ref) 76 | { 77 | bool bFound = false; 78 | int value = -1; 79 | 80 | if (argc >= 1) { 81 | for (int i = 1; i < argc; i++) { 82 | int string_start = stringRemoveDelimiter('-', argv[i]); 83 | const char *string_argv = &argv[i][string_start]; 84 | int length = static_cast(strlen(string_ref)); 85 | 86 | if (!STRNCASECMP(string_argv, string_ref, length)) { 87 | if (length + 1 <= static_cast(strlen(string_argv))) { 88 | int auto_inc = (string_argv[length] == '=') ? 1 : 0; 89 | value = atoi(&string_argv[length + auto_inc]); 90 | } else { 91 | value = 0; 92 | } 93 | 94 | bFound = true; 95 | continue; 96 | } 97 | } 98 | } 99 | 100 | if (bFound) { 101 | return value; 102 | } else { 103 | return 0; 104 | } 105 | } 106 | 107 | inline void ConstantInit(float *data, int size, float val) 108 | { 109 | for (int i = 0; i < size; ++i) { 110 | data[i] = val; 111 | } 112 | } 113 | 114 | inline bool ResultCheck(float *h_C,int sizeC, int wA, const float valB) { 115 | printf("Checking computed result for correctness: "); 116 | bool correct = true; 117 | 118 | // test relative error by the formula 119 | // |_cpu - _gpu|/<|x|, |y|> < eps 120 | double eps = 1.e-6; // machine zero 121 | 122 | for (int i = 0; i < sizeC; i++) { 123 | double abs_err = fabs(h_C[i] - (wA* valB)); 124 | double dot_length = wA; 125 | double abs_val = fabs(h_C[i]); 126 | double rel_err = abs_err / abs_val / dot_length; 127 | 128 | if (rel_err > eps) { 129 | printf("Error! Matrix[%05d]=%.8f, ref=%.8f error term is > %E\n", i, h_C[i], wA * valB, eps); 130 | correct = false; 131 | } 132 | } 133 | 134 | printf("%s\n", correct ? 
"Result = PASS" : "Result = FAIL"); 135 | return correct; 136 | } 137 | 138 | int MatrixMul1DTest(int argc, char **argv, int threadSize, int iterNum, const dim3 &dimsA, const dim3 &dimsB, 139 | bool useShMem); 140 | 141 | int MatMul2DTest(int argc, char **argv, int thblockSize, int iterNum, const dim3 &dimsA, const dim3 &dimsB, 142 | bool useAnySize); 143 | 144 | int MatMulCublasTest(int argc, char **argv, int blockSize, int iterNum, const dim3 &dimsA, const dim3 &dimsB); 145 | -------------------------------------------------------------------------------- /matrix_multiply/matMul1DKernel.cu: -------------------------------------------------------------------------------- 1 | 2 | #include "matMul.h" 3 | 4 | __global__ void MatMulKernel1D(float *C, float *A, float *B, const int wh, const int wC, const int hC) 5 | { 6 | const int totalSize = wC * hC; 7 | int thID = threadIdx.x + blockIdx.x * blockDim.x; 8 | while (thID < totalSize) { 9 | int Cx = thID / wC; 10 | int Cy = thID % wC; 11 | float rst = 0.0; 12 | for (int i = 0; i < wh; i++) { 13 | rst += A[Cx * wh + i] * B[i * wC + Cy]; 14 | } 15 | C[Cx * wC + Cy] = rst; 16 | thID += gridDim.x * blockDim.x; 17 | } 18 | __syncthreads(); 19 | } 20 | 21 | template 22 | __global__ void MatMulKernel1DWithShMem(float *C, float *A, float *B, const int wA, const int wC, const int hC) 23 | { 24 | __shared__ float sRow[shWASize]; // shared wA 25 | int blockID = blockIdx.x; 26 | while (blockID < hC) { 27 | int thIdx = threadIdx.x; 28 | while (thIdx < wA) { 29 | sRow[thIdx] = A[blockID * wA + thIdx]; 30 | thIdx += blockDim.x; 31 | } 32 | __syncthreads(); 33 | 34 | thIdx = threadIdx.x; 35 | while (thIdx < wC) { // wB = wC; 36 | float sum = 0.0; 37 | for (int i = 0; i < wA; i++) { 38 | sum += sRow[i] * B[wC * i + thIdx]; 39 | } 40 | C[blockID * wC + thIdx] = sum; 41 | thIdx += blockDim.x; 42 | } 43 | blockID += gridDim.x; 44 | } 45 | } 46 | 47 | 48 | /* 49 | * Run a simple test of matrix multiplication with 1D blocks. 
50 | */ 51 | int MatrixMul1DTest(int argc, char **argv, int threadSize, int iterNum, const dim3 &dimsA, const dim3 &dimsB, 52 | bool useShMem) 53 | { 54 | // Allocate host memory for matrices A and B 55 | unsigned int size_A = dimsA.x * dimsA.y; 56 | unsigned int mem_size_A = sizeof(float) * size_A; 57 | float *h_A; 58 | checkCudaErrors(cudaMallocHost(&h_A, mem_size_A)); 59 | unsigned int size_B = dimsB.x * dimsB.y; 60 | unsigned int mem_size_B = sizeof(float) * size_B; 61 | float *h_B; 62 | checkCudaErrors(cudaMallocHost(&h_B, mem_size_B)); 63 | cudaStream_t stream; 64 | 65 | // Initialize host memory 66 | const float valB = 0.01f; 67 | ConstantInit(h_A, size_A, 1.0f); 68 | ConstantInit(h_B, size_B, valB); 69 | 70 | // Allocate device memory 71 | float *d_A, *d_B, *d_C; 72 | 73 | // Allocate host matrix C 74 | dim3 dimsC(dimsB.x, dimsA.y, 1); 75 | unsigned int mem_size_C = dimsC.x * dimsC.y * sizeof(float); 76 | float *h_C; 77 | checkCudaErrors(cudaMallocHost(&h_C, mem_size_C)); 78 | 79 | if (h_C == NULL) { 80 | fprintf(stderr, "Failed to allocate host matrix C!\n"); 81 | exit(EXIT_FAILURE); 82 | } 83 | 84 | checkCudaErrors(cudaMalloc(reinterpret_cast(&d_A), mem_size_A)); 85 | checkCudaErrors(cudaMalloc(reinterpret_cast(&d_B), mem_size_B)); 86 | checkCudaErrors(cudaMalloc(reinterpret_cast(&d_C), mem_size_C)); 87 | // Allocate CUDA events that we'll use for timing 88 | cudaEvent_t start, stop; 89 | checkCudaErrors(cudaEventCreate(&start)); 90 | checkCudaErrors(cudaEventCreate(&stop)); 91 | 92 | checkCudaErrors(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); 93 | 94 | // copy host memory to device 95 | checkCudaErrors(cudaMemcpyAsync(d_A, h_A, mem_size_A, cudaMemcpyHostToDevice, stream)); 96 | checkCudaErrors(cudaMemcpyAsync(d_B, h_B, mem_size_B, cudaMemcpyHostToDevice, stream)); 97 | 98 | // Setup execution parameters 99 | int grid = dimsC.x * dimsC.y / threadSize; 100 | // dim3 grid(4, 4); 101 | 102 | // Create and start timer 103 | printf("Computing result using MatrixMul1DTest Shared Mem: %d\n", useShMem); 104 | 105 | // select diff shared memory size in blocks; 106 | void (*MMKernel1DWithShMemExe)(float *C, float *A, float *B, const int wA, const int wC, const int hC); 107 | if (dimsA.x <= 256) { 108 | MMKernel1DWithShMemExe = MatMulKernel1DWithShMem<256>; 109 | } else if (dimsA.x <= 1024) { 110 | MMKernel1DWithShMemExe = MatMulKernel1DWithShMem<1024>; 111 | } else if (dimsA.x <= 2048) { 112 | MMKernel1DWithShMemExe = MatMulKernel1DWithShMem<2048>; 113 | } else if (dimsA.x <= 4096) { 114 | MMKernel1DWithShMemExe = MatMulKernel1DWithShMem<4096>; 115 | } else { 116 | // shared mem has limitation. Change the size according to your scenarios. 
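        // 8192 floats = 32 KB of static shared memory per block, close to the 48 KB default
        // limit on most architectures; wider rows would need tiling or dynamic shared memory.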
117 | MMKernel1DWithShMemExe = MatMulKernel1DWithShMem<8192>; 118 | } 119 | 120 | // Performs warmup operation using matrixMul CUDA kernel 121 | if (useShMem) { 122 | MMKernel1DWithShMemExe<<>>(d_C, d_A, d_B, dimsA.x, dimsC.x, dimsC.y); 123 | } else { 124 | MatMulKernel1D<<>>(d_C, d_A, d_B, dimsA.x, dimsC.x, dimsC.y); 125 | } 126 | printf("Warmup operation done\n"); 127 | checkCudaErrors(cudaStreamSynchronize(stream)); 128 | 129 | // Record the start event 130 | checkCudaErrors(cudaEventRecord(start, stream)); 131 | 132 | // Execute the kernel 133 | for (int j = 0; j < iterNum; j++) { 134 | if (useShMem) { 135 | MMKernel1DWithShMemExe<<>>(d_C, d_A, d_B, dimsA.x, dimsC.x, dimsC.y); 136 | } else { 137 | MatMulKernel1D<<>>(d_C, d_A, d_B, dimsA.x, dimsC.x, dimsC.y); 138 | } 139 | } 140 | 141 | // Record the stop event 142 | checkCudaErrors(cudaEventRecord(stop, stream)); 143 | 144 | // Wait for the stop event to complete 145 | checkCudaErrors(cudaEventSynchronize(stop)); 146 | 147 | float msecTotal = 0.0f; 148 | checkCudaErrors(cudaEventElapsedTime(&msecTotal, start, stop)); 149 | 150 | // Compute and print the performance 151 | float msecPerMatrixMul = msecTotal / iterNum; 152 | double flopsPerMatrixMul = 153 | 2.0 * static_cast(dimsA.x) * static_cast(dimsA.y) * static_cast(dimsB.x); 154 | double gigaFlops = (flopsPerMatrixMul * 1.0e-9f) / (msecPerMatrixMul / 1000.0f); 155 | printf("Performance= %.2f GFlop/s, Time= %.3f msec, Size= %.0f Ops," 156 | " WorkgroupSize= %u threads/block\n", 157 | gigaFlops, msecPerMatrixMul, flopsPerMatrixMul, threadSize); 158 | 159 | // Copy result from device to host 160 | checkCudaErrors(cudaMemcpyAsync(h_C, d_C, mem_size_C, cudaMemcpyDeviceToHost, stream)); 161 | checkCudaErrors(cudaStreamSynchronize(stream)); 162 | 163 | bool ret = ResultCheck(h_C, static_cast(dimsC.x * dimsC.y), dimsA.x, valB); 164 | 165 | // Clean up memory 166 | checkCudaErrors(cudaFreeHost(h_A)); 167 | checkCudaErrors(cudaFreeHost(h_B)); 168 | checkCudaErrors(cudaFreeHost(h_C)); 169 | checkCudaErrors(cudaFree(d_A)); 170 | checkCudaErrors(cudaFree(d_B)); 171 | checkCudaErrors(cudaFree(d_C)); 172 | checkCudaErrors(cudaEventDestroy(start)); 173 | checkCudaErrors(cudaEventDestroy(stop)); 174 | checkCudaErrors(cudaStreamDestroy(stream)); 175 | 176 | if (ret) { 177 | return EXIT_SUCCESS; 178 | } else { 179 | return EXIT_FAILURE; 180 | } 181 | } 182 | -------------------------------------------------------------------------------- /matrix_multiply/matMulCublasKernel.cu: -------------------------------------------------------------------------------- 1 | 2 | #include "matMul.h" 3 | #include 4 | 5 | /** 6 | * Run a simple test of matrix multiplication using CUBLAS Sgemm. 
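 * cuBLAS assumes column-major storage, so the row-major buffers are passed with the operand
 * order swapped: computing B*A in the column-major view yields C^T in column-major layout,
 * which is byte-for-byte the row-major C that ResultCheck expects (see the call-site note).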
7 | */ 8 | int MatMulCublasTest(int argc, char **argv, int blockSize, int iterNum, const dim3 &dimsA, const dim3 &dimsB) 9 | { 10 | // Allocate host memory for matrices A and B 11 | unsigned int size_A = dimsA.x * dimsA.y; 12 | unsigned int mem_size_A = sizeof(float) * size_A; 13 | float *h_A; 14 | checkCudaErrors(cudaMallocHost(&h_A, mem_size_A)); 15 | unsigned int size_B = dimsB.x * dimsB.y; 16 | unsigned int mem_size_B = sizeof(float) * size_B; 17 | float *h_B; 18 | checkCudaErrors(cudaMallocHost(&h_B, mem_size_B)); 19 | cudaStream_t stream; 20 | 21 | // Initialize host memory 22 | const float valB = 0.01f; 23 | ConstantInit(h_A, size_A, 1.0f); 24 | ConstantInit(h_B, size_B, valB); 25 | 26 | // Allocate device memory 27 | float *d_A, *d_B, *d_C; 28 | 29 | // Allocate host matrix C 30 | dim3 dimsC(dimsB.x, dimsA.y, 1); 31 | unsigned int mem_size_C = dimsC.x * dimsC.y * sizeof(float); 32 | float *h_C; 33 | checkCudaErrors(cudaMallocHost(&h_C, mem_size_C)); 34 | 35 | if (h_C == NULL) { 36 | fprintf(stderr, "Failed to allocate host matrix C!\n"); 37 | exit(EXIT_FAILURE); 38 | } 39 | 40 | checkCudaErrors(cudaMalloc(reinterpret_cast(&d_A), mem_size_A)); 41 | checkCudaErrors(cudaMalloc(reinterpret_cast(&d_B), mem_size_B)); 42 | checkCudaErrors(cudaMalloc(reinterpret_cast(&d_C), mem_size_C)); 43 | // Allocate CUDA events that we'll use for timing 44 | cudaEvent_t start, stop; 45 | checkCudaErrors(cudaEventCreate(&start)); 46 | checkCudaErrors(cudaEventCreate(&stop)); 47 | 48 | checkCudaErrors(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); 49 | 50 | // copy host memory to device 51 | checkCudaErrors(cudaMemcpyAsync(d_A, h_A, mem_size_A, cudaMemcpyHostToDevice, stream)); 52 | checkCudaErrors(cudaMemcpyAsync(d_B, h_B, mem_size_B, cudaMemcpyHostToDevice, stream)); 53 | 54 | const float alpha = 1.0f; 55 | const float beta = 0.0f; 56 | cublasHandle_t handle; 57 | checkCuBLASErrors(cublasCreate(&handle)); 58 | 59 | // Create and start timer 60 | printf("Computing result using CUBLAS Sgemmm Kernel. \n"); 61 | checkCuBLASErrors(cublasSgemm( 62 | handle, CUBLAS_OP_N, CUBLAS_OP_N, dimsB.x, dimsA.y, 63 | dimsA.x, &alpha, d_B, dimsB.x, d_A, 64 | dimsA.x, &beta, d_C, dimsB.x)); 65 | 66 | printf("Warmup operation done\n"); 67 | checkCudaErrors(cudaStreamSynchronize(stream)); 68 | 69 | // Record the start event 70 | checkCudaErrors(cudaEventRecord(start, stream)); 71 | 72 | // Execute the kernel 73 | for (int j = 0; j < iterNum; j++) { 74 | // note cublas is column primary! 
75 | // need to transpose the order 76 | checkCuBLASErrors(cublasSgemm( 77 | handle, CUBLAS_OP_N, CUBLAS_OP_N, dimsB.x, dimsA.y, 78 | dimsA.x, &alpha, d_B, dimsB.x, d_A, 79 | dimsA.x, &beta, d_C, dimsB.x)); 80 | } 81 | 82 | // Record the stop event 83 | checkCudaErrors(cudaEventRecord(stop, stream)); 84 | 85 | // Wait for the stop event to complete 86 | checkCudaErrors(cudaEventSynchronize(stop)); 87 | 88 | float msecTotal = 0.0f; 89 | checkCudaErrors(cudaEventElapsedTime(&msecTotal, start, stop)); 90 | 91 | // Compute and print the performance 92 | float msecPerMatrixMul = msecTotal / iterNum; 93 | double flopsPerMatrixMul = 94 | 2.0 * static_cast(dimsA.x) * static_cast(dimsA.y) * static_cast(dimsB.x); 95 | double gigaFlops = (flopsPerMatrixMul * 1.0e-9f) / (msecPerMatrixMul / 1000.0f); 96 | printf("Performance= %.2f GFlop/s, Time= %.3f msec, Size= %.0f Ops,", 97 | gigaFlops, msecPerMatrixMul, flopsPerMatrixMul); 98 | 99 | // Copy result from device to host 100 | checkCudaErrors(cudaMemcpyAsync(h_C, d_C, mem_size_C, cudaMemcpyDeviceToHost, stream)); 101 | checkCudaErrors(cudaStreamSynchronize(stream)); 102 | checkCuBLASErrors(cublasDestroy(handle)); 103 | 104 | bool ret = ResultCheck(h_C, static_cast(dimsC.x * dimsC.y), dimsA.x, valB); 105 | 106 | // Clean up memory 107 | checkCudaErrors(cudaFreeHost(h_A)); 108 | checkCudaErrors(cudaFreeHost(h_B)); 109 | checkCudaErrors(cudaFreeHost(h_C)); 110 | checkCudaErrors(cudaFree(d_A)); 111 | checkCudaErrors(cudaFree(d_B)); 112 | checkCudaErrors(cudaFree(d_C)); 113 | checkCudaErrors(cudaEventDestroy(start)); 114 | checkCudaErrors(cudaEventDestroy(stop)); 115 | checkCudaErrors(cudaStreamDestroy(stream)); 116 | 117 | if (ret) { 118 | return EXIT_SUCCESS; 119 | } else { 120 | return EXIT_FAILURE; 121 | } 122 | } 123 | -------------------------------------------------------------------------------- /matrix_multiply/testMatMul.cu: -------------------------------------------------------------------------------- 1 | /** 2 | * Test different version of matrix multiply. 3 | * A x B A[hA, wA] B[hB, wB] 4 | * e.g. ./matMul wA=1000 hA=312 wB=11 hB=1000 5 | * 6 | * This demo code might be stale with the development of CUDA. 7 | * To use the latest API operations, you could see NVIDIA guide: 8 | * https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html 9 | * https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY.html#group__CUDART__MEMORY 10 | * Author: kevin.xie 11 | * Email: kaiyuanxie@yeah.net 12 | */ 13 | 14 | #include "matMul.h" 15 | 16 | enum ALGO_TYPE { 17 | DEFAULT_MODEL, 18 | MatMul_1D_KERENL, 19 | MatMul_1D_KERNEL_WITH_SHARED_MEMORY, 20 | MatMul_2D_KERENEL_BLOCK_MULTIPLES_SIZE, 21 | MatMul_2D_KERNEL_ANY_SIZE, 22 | MatMul_CUBLAS_SGEMM_KERNEL, 23 | }; 24 | 25 | /** 26 | * Program main 27 | */ 28 | 29 | void checkResult(int ret) 30 | { 31 | if (ret != EXIT_SUCCESS) { 32 | checkCudaErrors(cudaProfilerStop()); 33 | exit(ret); 34 | } 35 | } 36 | 37 | int main(int argc, char **argv) 38 | { 39 | printf("[Matrix Multiply Test] - Starting...\n"); 40 | printf("\nNOTE: The CUDA Samples are not meant for performance " 41 | "measurements. 
Results may vary when GPU Boost is enabled.\n"); 42 | if (checkCmdLineFlag(argc, (const char **)argv, "help") || checkCmdLineFlag(argc, (const char **)argv, "?")) { 43 | printf("Usage -device=n (n >= 0 for deviceID)\n"); 44 | printf(" -wA=WidthA -hA=HeightA (Width x Height of Matrix A)\n"); 45 | printf(" -wB=WidthB -hB=HeightB (Width x Height of Matrix B)\n"); 46 | printf(" -iter=n Iteration numbers of algorithm. Default:500 \n"); 47 | printf(" -algo=[0|1|2|3|4|5] 0: Test all, 1: MatMul_1D_KERENL, 2:MatMul_1D_KERNEL_WITH_SHARED_MEMORY, " 48 | "3: MatMul_2D_KERENEL_BLOCK_MULTIPLES_SIZE, 4: MatMul_2D_KERNEL_ANY_SIZE, 5:MatMul_CUBLAS_SGEMM_KERNEL\n"); 49 | printf("Note: Outer matrix dimensions of A & B matrices" 50 | " must be equal.\n"); 51 | 52 | exit(EXIT_SUCCESS); 53 | } 54 | 55 | // int dev = 0; 56 | int blockSize = 32; 57 | int threadsPerBlock = blockSize * blockSize; 58 | 59 | // select algorithem: 60 | int algo = 0; 61 | int iterationNum = 500; 62 | 63 | // example case: 64 | dim3 dimsA(5 * 2 * blockSize, 5 * 2 * blockSize, 1); 65 | dim3 dimsB(5 * 4 * blockSize, 5 * 2 * blockSize, 1); 66 | 67 | // width of Matrix A 68 | if (checkCmdLineFlag(argc, (const char **)argv, "wA")) { 69 | dimsA.x = getCmdLineArgumentInt(argc, (const char **)argv, "wA"); 70 | } 71 | 72 | // height of Matrix A 73 | if (checkCmdLineFlag(argc, (const char **)argv, "hA")) { 74 | dimsA.y = getCmdLineArgumentInt(argc, (const char **)argv, "hA"); 75 | } 76 | 77 | // width of Matrix B 78 | if (checkCmdLineFlag(argc, (const char **)argv, "wB")) { 79 | dimsB.x = getCmdLineArgumentInt(argc, (const char **)argv, "wB"); 80 | } 81 | 82 | // height of Matrix B 83 | if (checkCmdLineFlag(argc, (const char **)argv, "hB")) { 84 | dimsB.y = getCmdLineArgumentInt(argc, (const char **)argv, "hB"); 85 | } 86 | 87 | if (checkCmdLineFlag(argc, (const char **)argv, "iter")) { 88 | iterationNum = getCmdLineArgumentInt(argc, (const char **)argv, "iter"); 89 | } 90 | 91 | if (checkCmdLineFlag(argc, (const char **)argv, "algo")) { 92 | algo = getCmdLineArgumentInt(argc, (const char **)argv, "algo"); 93 | } 94 | 95 | if (dimsA.x != dimsB.y) { 96 | printf("Error: outer matrix dimensions must be equal. 
(%d != %d)\n", dimsA.x, dimsB.y); 97 | exit(EXIT_FAILURE); 98 | } 99 | 100 | printf("MatrixA(%d,%d), MatrixB(%d,%d)\n", dimsA.x, dimsA.y, dimsB.x, dimsB.y); 101 | 102 | // int matrix_result = MatrixMul1DTest(argc, argv, 256, iterationNum, dimsA, dimsB, false); 103 | checkCudaErrors(cudaProfilerStart()); 104 | switch (algo) { 105 | case MatMul_1D_KERENL: 106 | checkResult(MatrixMul1DTest(argc, argv, threadsPerBlock, iterationNum, dimsA, dimsB, false)); 107 | break; 108 | case MatMul_1D_KERNEL_WITH_SHARED_MEMORY: 109 | checkResult(MatrixMul1DTest(argc, argv, threadsPerBlock, iterationNum, dimsA, dimsB, true)); 110 | break; 111 | case MatMul_2D_KERENEL_BLOCK_MULTIPLES_SIZE: 112 | if (dimsA.x % blockSize != 0) { 113 | printf("dim of wA must be divided by blockSize: %d\n", blockSize); 114 | exit(EXIT_FAILURE); 115 | } 116 | checkResult(MatMul2DTest(argc, argv, blockSize, iterationNum, dimsA, dimsB, false)); 117 | break; 118 | case MatMul_2D_KERNEL_ANY_SIZE: 119 | checkResult(MatMul2DTest(argc, argv, blockSize, iterationNum, dimsA, dimsB, true)); 120 | break; 121 | case MatMul_CUBLAS_SGEMM_KERNEL: 122 | checkResult(MatMulCublasTest(argc, argv, blockSize, iterationNum, dimsA, dimsB)); 123 | break; 124 | default: 125 | printf("========================= 1D blocks without shared memory =================\n"); 126 | checkResult(MatrixMul1DTest(argc, argv, threadsPerBlock, iterationNum, dimsA, dimsB, false)); 127 | printf("========================= 1D blocks with shared memory ===================\n"); 128 | checkResult(MatrixMul1DTest(argc, argv, threadsPerBlock, iterationNum, dimsA, dimsB, true)); 129 | if (dimsA.x % blockSize == 0) { 130 | printf("========================= 2D blocks with block multiples size =============\n"); 131 | checkResult(MatMul2DTest(argc, argv, blockSize, iterationNum, dimsA, dimsB, false)); 132 | } 133 | printf("========================= 2D blocks with any size ========================\n"); 134 | checkResult(MatMul2DTest(argc, argv, blockSize, iterationNum, dimsA, dimsB, true)); 135 | printf("========================= CUBLAS Sgemm kernel ========================\n"); 136 | checkResult(MatMulCublasTest(argc, argv, blockSize, iterationNum, dimsA, dimsB)); 137 | break; 138 | } 139 | 140 | checkCudaErrors(cudaProfilerStop()); 141 | exit(EXIT_SUCCESS); 142 | } 143 | -------------------------------------------------------------------------------- /memory_opt/Makefile: -------------------------------------------------------------------------------- 1 | 2 | 3 | CUDA_PATH ?= /usr/local/cuda 4 | 5 | 6 | # architecture 7 | HOST_ARCH := $(shell uname -m) 8 | TARGET_ARCH ?= $(HOST_ARCH) 9 | ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 sbsa ppc64le armv7l)) 10 | ifneq ($(TARGET_ARCH),$(HOST_ARCH)) 11 | ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 sbsa ppc64le)) 12 | TARGET_SIZE := 64 13 | else ifneq (,$(filter $(TARGET_ARCH),armv7l)) 14 | TARGET_SIZE := 32 15 | endif 16 | else 17 | TARGET_SIZE := $(shell getconf LONG_BIT) 18 | endif 19 | else 20 | $(error ERROR - unsupported value $(TARGET_ARCH) for TARGET_ARCH!) 21 | endif 22 | 23 | # sbsa and aarch64 systems look similar. Need to differentiate them at host level for now. 
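# (Detection below: if the CUDA toolkit ships a targets/sbsa-linux directory,
# e.g. $(CUDA_PATH)/targets/sbsa-linux, the host is treated as sbsa.)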
24 | ifeq ($(HOST_ARCH),aarch64) 25 | ifeq ($(CUDA_PATH)/targets/sbsa-linux,$(shell ls -1d $(CUDA_PATH)/targets/sbsa-linux 2>/dev/null)) 26 | HOST_ARCH := sbsa 27 | TARGET_ARCH := sbsa 28 | endif 29 | endif 30 | 31 | ifneq ($(TARGET_ARCH),$(HOST_ARCH)) 32 | ifeq (,$(filter $(HOST_ARCH)-$(TARGET_ARCH),aarch64-armv7l x86_64-armv7l x86_64-aarch64 x86_64-sbsa x86_64-ppc64le)) 33 | $(error ERROR - cross compiling from $(HOST_ARCH) to $(TARGET_ARCH) is not supported!) 34 | endif 35 | endif 36 | 37 | # When on native aarch64 system with userspace of 32-bit, change TARGET_ARCH to armv7l 38 | ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_SIZE),aarch64-aarch64-32) 39 | TARGET_ARCH = armv7l 40 | endif 41 | 42 | # operating system 43 | HOST_OS := $(shell uname -s 2>/dev/null | tr "[:upper:]" "[:lower:]") 44 | TARGET_OS ?= $(HOST_OS) 45 | ifeq (,$(filter $(TARGET_OS),linux darwin qnx android)) 46 | $(error ERROR - unsupported value $(TARGET_OS) for TARGET_OS!) 47 | endif 48 | 49 | # host compiler 50 | ifeq ($(TARGET_OS),darwin) 51 | ifeq ($(shell expr `xcodebuild -version | grep -i xcode | awk '{print $$2}' | cut -d'.' -f1` \>= 5),1) 52 | HOST_COMPILER ?= clang++ 53 | endif 54 | else ifneq ($(TARGET_ARCH),$(HOST_ARCH)) 55 | ifeq ($(HOST_ARCH)-$(TARGET_ARCH),x86_64-armv7l) 56 | ifeq ($(TARGET_OS),linux) 57 | HOST_COMPILER ?= arm-linux-gnueabihf-g++ 58 | else ifeq ($(TARGET_OS),qnx) 59 | ifeq ($(QNX_HOST),) 60 | $(error ERROR - QNX_HOST must be passed to the QNX host toolchain) 61 | endif 62 | ifeq ($(QNX_TARGET),) 63 | $(error ERROR - QNX_TARGET must be passed to the QNX target toolchain) 64 | endif 65 | export QNX_HOST 66 | export QNX_TARGET 67 | HOST_COMPILER ?= $(QNX_HOST)/usr/bin/arm-unknown-nto-qnx6.6.0eabi-g++ 68 | else ifeq ($(TARGET_OS),android) 69 | HOST_COMPILER ?= arm-linux-androideabi-g++ 70 | endif 71 | else ifeq ($(TARGET_ARCH),aarch64) 72 | ifeq ($(TARGET_OS), linux) 73 | HOST_COMPILER ?= aarch64-linux-gnu-g++ 74 | else ifeq ($(TARGET_OS),qnx) 75 | ifeq ($(QNX_HOST),) 76 | $(error ERROR - QNX_HOST must be passed to the QNX host toolchain) 77 | endif 78 | ifeq ($(QNX_TARGET),) 79 | $(error ERROR - QNX_TARGET must be passed to the QNX target toolchain) 80 | endif 81 | export QNX_HOST 82 | export QNX_TARGET 83 | HOST_COMPILER ?= $(QNX_HOST)/usr/bin/q++ 84 | else ifeq ($(TARGET_OS), android) 85 | HOST_COMPILER ?= aarch64-linux-android-clang++ 86 | endif 87 | else ifeq ($(TARGET_ARCH),sbsa) 88 | HOST_COMPILER ?= aarch64-linux-gnu-g++ 89 | else ifeq ($(TARGET_ARCH),ppc64le) 90 | HOST_COMPILER ?= powerpc64le-linux-gnu-g++ 91 | endif 92 | endif 93 | HOST_COMPILER ?= g++ 94 | NVCC := $(CUDA_PATH)/bin/nvcc -ccbin $(HOST_COMPILER) 95 | 96 | # internal flags 97 | NVCCFLAGS := -m${TARGET_SIZE} 98 | CCFLAGS := 99 | LDFLAGS := 100 | 101 | # build flags 102 | ifeq ($(TARGET_OS),darwin) 103 | LDFLAGS += -rpath $(CUDA_PATH)/lib 104 | CCFLAGS += -arch $(HOST_ARCH) 105 | else ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_OS),x86_64-armv7l-linux) 106 | LDFLAGS += --dynamic-linker=/lib/ld-linux-armhf.so.3 107 | CCFLAGS += -mfloat-abi=hard 108 | else ifeq ($(TARGET_OS),android) 109 | LDFLAGS += -pie 110 | CCFLAGS += -fpie -fpic -fexceptions 111 | endif 112 | 113 | ifneq ($(TARGET_ARCH),$(HOST_ARCH)) 114 | ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux) 115 | ifneq ($(TARGET_FS),) 116 | GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6) 117 | ifeq ($(GCCVERSIONLTEQ46),1) 118 | CCFLAGS += --sysroot=$(TARGET_FS) 119 | endif 120 | LDFLAGS += --sysroot=$(TARGET_FS) 121 | LDFLAGS += 
-rpath-link=$(TARGET_FS)/lib 122 | LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib 123 | LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/arm-linux-gnueabihf 124 | endif 125 | endif 126 | ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux) 127 | ifneq ($(TARGET_FS),) 128 | GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6) 129 | ifeq ($(GCCVERSIONLTEQ46),1) 130 | CCFLAGS += --sysroot=$(TARGET_FS) 131 | endif 132 | LDFLAGS += --sysroot=$(TARGET_FS) 133 | LDFLAGS += -rpath-link=$(TARGET_FS)/lib -L$(TARGET_FS)/lib 134 | LDFLAGS += -rpath-link=$(TARGET_FS)/lib/aarch64-linux-gnu -L$(TARGET_FS)/lib/aarch64-linux-gnu 135 | LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib -L$(TARGET_FS)/usr/lib 136 | LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/aarch64-linux-gnu -L$(TARGET_FS)/usr/lib/aarch64-linux-gnu 137 | LDFLAGS += --unresolved-symbols=ignore-in-shared-libs 138 | CCFLAGS += -isystem=$(TARGET_FS)/usr/include -I$(TARGET_FS)/usr/include -I$(TARGET_FS)/usr/include/libdrm 139 | CCFLAGS += -isystem=$(TARGET_FS)/usr/include/aarch64-linux-gnu -I$(TARGET_FS)/usr/include/aarch64-linux-gnu 140 | endif 141 | endif 142 | ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx) 143 | NVCCFLAGS += -D_QNX_SOURCE 144 | NVCCFLAGS += --qpp-config 8.3.0,gcc_ntoaarch64le 145 | CCFLAGS += -DWIN_INTERFACE_CUSTOM -I/usr/include/aarch64-qnx-gnu 146 | LDFLAGS += -lsocket 147 | LDFLAGS += -L/usr/lib/aarch64-qnx-gnu 148 | CCFLAGS += "-Wl\,-rpath-link\,/usr/lib/aarch64-qnx-gnu" 149 | ifdef TARGET_OVERRIDE 150 | LDFLAGS += -lslog2 151 | endif 152 | 153 | ifneq ($(TARGET_FS),) 154 | LDFLAGS += -L$(TARGET_FS)/usr/lib 155 | CCFLAGS += "-Wl\,-rpath-link\,$(TARGET_FS)/usr/lib" 156 | LDFLAGS += -L$(TARGET_FS)/usr/libnvidia 157 | CCFLAGS += "-Wl\,-rpath-link\,$(TARGET_FS)/usr/libnvidia" 158 | CCFLAGS += -I$(TARGET_FS)/../include 159 | endif 160 | endif 161 | endif 162 | 163 | ifdef TARGET_OVERRIDE # cuda toolkit targets override 164 | NVCCFLAGS += -target-dir $(TARGET_OVERRIDE) 165 | endif 166 | 167 | # Debug build flags 168 | ifeq ($(dbg),1) 169 | NVCCFLAGS += -g -G 170 | BUILD_TYPE := debug 171 | else 172 | BUILD_TYPE := release 173 | endif 174 | 175 | ALL_CCFLAGS := 176 | ALL_CCFLAGS += $(NVCCFLAGS) 177 | ALL_CCFLAGS += $(EXTRA_NVCCFLAGS) 178 | ALL_CCFLAGS += $(addprefix -Xcompiler ,$(CCFLAGS)) 179 | ALL_CCFLAGS += $(addprefix -Xcompiler ,$(EXTRA_CCFLAGS)) 180 | 181 | SAMPLE_ENABLED := 1 182 | 183 | ALL_LDFLAGS := 184 | ALL_LDFLAGS += $(ALL_CCFLAGS) 185 | ALL_LDFLAGS += $(addprefix -Xlinker ,$(LDFLAGS)) 186 | ALL_LDFLAGS += $(addprefix -Xlinker ,$(EXTRA_LDFLAGS)) 187 | 188 | # Common includes and paths for CUDA 189 | INCLUDES := -I./ 190 | LIBRARIES := 191 | 192 | ################################################################################ 193 | 194 | # Gencode arguments 195 | ifeq ($(TARGET_ARCH),$(filter $(TARGET_ARCH),armv7l aarch64 sbsa)) 196 | SMS ?= 53 61 70 72 75 80 86 87 197 | else 198 | SMS ?= 35 37 50 52 60 61 70 75 199 | endif 200 | 201 | ifeq ($(SMS),) 202 | $(info >>> WARNING - no SM architectures have been specified - waiving sample <<<) 203 | SAMPLE_ENABLED := 0 204 | endif 205 | 206 | ifeq ($(GENCODE_FLAGS),) 207 | # Generate SASS code for each SM architecture listed in $(SMS) 208 | $(foreach sm,$(SMS),$(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm))) 209 | 210 | # Generate PTX code from the highest SM architecture in $(SMS) to guarantee forward-compatibility 211 | HIGHEST_SM := $(lastword $(sort $(SMS))) 212 | ifneq ($(HIGHEST_SM),) 213 | GENCODE_FLAGS += -gencode 
arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM) 214 | endif 215 | endif 216 | 217 | ALL_CCFLAGS += --threads 0 --std=c++11 218 | LIBRARIES += -lcublas 219 | 220 | ifeq ($(SAMPLE_ENABLED),0) 221 | EXEC ?= @echo "[@]" 222 | endif 223 | 224 | ################################################################################ 225 | 226 | all: testHost2Device testDevice2Device testSharedMemory testZeroCopy 227 | 228 | check.deps: 229 | ifeq ($(SAMPLE_ENABLED),0) 230 | @echo "Sample will be waived due to the above missing dependencies" 231 | else 232 | @echo "Sample is ready - all dependencies have been met" 233 | endif 234 | 235 | testHost2Device: hostAndDeviceTrans.cu 236 | $(EXEC) $(NVCC) $(ALL_LDFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES) 237 | 238 | testDevice2Device: device2Device.cu 239 | $(EXEC) $(NVCC) $(ALL_LDFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES) 240 | 241 | testSharedMemory: sharedMemory.cu 242 | $(EXEC) $(NVCC) $(ALL_LDFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES) 243 | 244 | testZeroCopy: zeroCopy.cu 245 | $(EXEC) $(NVCC) $(ALL_LDFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES) 246 | 247 | clean: 248 | rm -f testHost2Device testDevice2Device testSharedMemory testZeroCopy 249 | 250 | clobber: clean 251 | -------------------------------------------------------------------------------- /memory_opt/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CalvinXKY/BasicCUDA/5bf47d64954274aa90bd0b9bb01c2de7462bf513/memory_opt/README.md -------------------------------------------------------------------------------- /memory_opt/device2Device.cu: -------------------------------------------------------------------------------- 1 | /* 2 | * Memory transfer between device and device example to help you understand the process. 3 | * 4 | * This demo code might be stale with the development of CUDA. 5 | * To use the latest API operations, you could see NVIDIA guide: 6 | * https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html 7 | * https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY.html#group__CUDART__MEMORY 8 | * 9 | * Author: kevin.xie 10 | * Email: kaiyuanxie@yeah.net 11 | */ 12 | 13 | #include 14 | #include 15 | #include 16 | 17 | #include "memoryOpt.h" 18 | #include "timer.h" 19 | 20 | /* 21 | * transfer data in device itself. 
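 * The reported bandwidth counts both the read and the write on the same GPU,
 * hence the 2x factor in the calculation below.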
22 | */ 23 | float deviceToItself(const unsigned int memSize, const unsigned int iterNum) 24 | { 25 | 26 | float elapsedTimeInMs = 0.0f; 27 | float bandwidthInGBs = 0.0f; 28 | unsigned char *devInData; 29 | unsigned char *devOutData; 30 | cudaEvent_t start, stop; 31 | checkCudaErrors(cudaEventCreate(&start)); 32 | checkCudaErrors(cudaEventCreate(&stop)); 33 | 34 | // allocate host memory 35 | unsigned char *hInData = (unsigned char *)malloc(memSize); 36 | 37 | if (hInData == 0) { 38 | fprintf(stderr, "Not enough memory avaialable on host to run test!\n"); 39 | exit(EXIT_FAILURE); 40 | } 41 | 42 | for (unsigned int i = 0; i < memSize / sizeof(unsigned char); i++) { 43 | hInData[i] = (unsigned char)(i & 0xff); 44 | } 45 | 46 | 47 | checkCudaErrors(cudaMalloc((void **)&devInData, memSize)); 48 | checkCudaErrors(cudaMalloc((void **)&devOutData, memSize)); 49 | checkCudaErrors(cudaMemcpy(devInData, hInData, memSize, cudaMemcpyHostToDevice)); 50 | checkCudaErrors(cudaEventRecord(start, 0)); 51 | 52 | for (unsigned int i = 0; i < iterNum; i++) { 53 | checkCudaErrors(cudaMemcpy(devOutData, devInData, memSize, cudaMemcpyDeviceToDevice)); 54 | } 55 | 56 | checkCudaErrors(cudaEventRecord(stop, 0)); 57 | checkCudaErrors(cudaDeviceSynchronize()); 58 | checkCudaErrors(cudaEventElapsedTime(&elapsedTimeInMs, start, stop)); 59 | 60 | // In and Out, mutilpy 2.0 factor. Note: use 1000(not 1024)unit. 61 | double time_s = elapsedTimeInMs / 1e3; 62 | bandwidthInGBs = (2.0f * memSize * (float)iterNum) / (double)1e9; 63 | bandwidthInGBs = bandwidthInGBs / time_s; 64 | 65 | free(hInData); 66 | checkCudaErrors(cudaEventDestroy(stop)); 67 | checkCudaErrors(cudaEventDestroy(start)); 68 | checkCudaErrors(cudaFree(devInData)); 69 | checkCudaErrors(cudaFree(devOutData)); 70 | return bandwidthInGBs; 71 | } 72 | 73 | /* 74 | * transfer data from one devcie to another without peer-to-peer opt. 75 | */ 76 | float deviceToDeviceWithoutP2P(const unsigned int memSize, const unsigned int iterNum, const unsigned int GPUA, 77 | const unsigned int GPUB) 78 | { 79 | 80 | float elapsedTimeInMs = 0.0f; 81 | float bandwidthInGBs = 0.0f; 82 | unsigned char *devInData; 83 | unsigned char *devOutData; 84 | cudaEvent_t start, stop; 85 | checkCudaErrors(cudaEventCreate(&start)); 86 | checkCudaErrors(cudaEventCreate(&stop)); 87 | 88 | // allocate host memory 89 | unsigned char *hInData = (unsigned char *)malloc(memSize); 90 | 91 | if (hInData == 0) { 92 | fprintf(stderr, "Not enough memory avaialable on host to run test!\n"); 93 | exit(EXIT_FAILURE); 94 | } 95 | 96 | for (unsigned int i = 0; i < memSize / sizeof(unsigned char); i++) { 97 | hInData[i] = (unsigned char)(i & 0xff); 98 | } 99 | 100 | cudaSetDevice(GPUA); 101 | checkCudaErrors(cudaMalloc((void **)&devInData, memSize)); 102 | cudaSetDevice(GPUB); 103 | checkCudaErrors(cudaMalloc((void **)&devOutData, memSize)); 104 | cudaSetDevice(GPUA); 105 | checkCudaErrors(cudaMemcpy(devInData, hInData, memSize, cudaMemcpyHostToDevice)); 106 | 107 | checkCudaErrors(cudaEventRecord(start, 0)); 108 | for (unsigned int i = 0; i < iterNum; i++) { 109 | checkCudaErrors(cudaMemcpy(devOutData, devInData, memSize, cudaMemcpyDeviceToDevice)); 110 | } 111 | 112 | checkCudaErrors(cudaEventRecord(stop, 0)); 113 | checkCudaErrors(cudaDeviceSynchronize()); 114 | checkCudaErrors(cudaEventElapsedTime(&elapsedTimeInMs, start, stop)); 115 | 116 | // In and Out. Note: use 1000(not 1024)unit. 
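    // Unlike deviceToItself, only memSize bytes cross the GPU-to-GPU path per
    // iteration, so no 2x factor is applied here.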
117 | double time_s = elapsedTimeInMs / 1e3; 118 | bandwidthInGBs = (memSize * (float)iterNum) / (double)1e9; 119 | bandwidthInGBs = bandwidthInGBs / time_s; 120 | 121 | free(hInData); 122 | checkCudaErrors(cudaEventDestroy(stop)); 123 | checkCudaErrors(cudaEventDestroy(start)); 124 | checkCudaErrors(cudaFree(devInData)); 125 | checkCudaErrors(cudaFree(devOutData)); 126 | return bandwidthInGBs; 127 | } 128 | 129 | /* 130 | * transfer data from one devcie to another with peer-to-peer opt. 131 | */ 132 | float deviceToDeviceWithP2P(const unsigned int memSize, const unsigned int iterNum, const unsigned int GPUA, 133 | const unsigned int GPUB) 134 | { 135 | 136 | float elapsedTimeInMs = 0.0f; 137 | float bandwidthInGBs = 0.0f; 138 | unsigned char *devInData; 139 | unsigned char *devOutData; 140 | 141 | cudaEvent_t start, stop; 142 | checkCudaErrors(cudaEventCreate(&start)); 143 | checkCudaErrors(cudaEventCreate(&stop)); 144 | 145 | // allocate host memory 146 | unsigned char *hInData = (unsigned char *)malloc(memSize); 147 | 148 | if (hInData == 0) { 149 | fprintf(stderr, "Not enough memory avaialable on host to run test!\n"); 150 | exit(EXIT_FAILURE); 151 | } 152 | 153 | for (unsigned int i = 0; i < memSize / sizeof(unsigned char); i++) { 154 | hInData[i] = (unsigned char)(i & 0xff); 155 | } 156 | checkCudaErrors(cudaSetDevice(GPUA)); 157 | 158 | // enable GPUA access GPUB 159 | checkCudaErrors(cudaDeviceEnablePeerAccess(GPUB, 0)); 160 | checkCudaErrors(cudaMalloc((void **)&devInData, memSize)); 161 | checkCudaErrors(cudaSetDevice(GPUB)); 162 | // enable GPUB access GPUA 163 | checkCudaErrors(cudaDeviceEnablePeerAccess(GPUA, 0)); 164 | checkCudaErrors(cudaMalloc((void **)&devOutData, memSize)); 165 | checkCudaErrors(cudaSetDevice(GPUA)); 166 | checkCudaErrors(cudaMemcpy(devInData, hInData, memSize, cudaMemcpyHostToDevice)); 167 | 168 | 169 | checkCudaErrors(cudaEventRecord(start, 0)); 170 | for (unsigned int i = 0; i < iterNum; i++) { 171 | checkCudaErrors(cudaMemcpy(devOutData, devInData, memSize, cudaMemcpyDeviceToDevice)); 172 | } 173 | checkCudaErrors(cudaEventRecord(stop, 0)); 174 | checkCudaErrors(cudaDeviceSynchronize()); 175 | checkCudaErrors(cudaEventElapsedTime(&elapsedTimeInMs, start, stop)); 176 | 177 | // In and Out. Note: use 1000(not 1024)unit. 178 | double time_s = elapsedTimeInMs / 1e3; 179 | bandwidthInGBs = (memSize * (float)iterNum) / (double)1e9; 180 | bandwidthInGBs = bandwidthInGBs / time_s; 181 | 182 | free(hInData); 183 | checkCudaErrors(cudaEventDestroy(stop)); 184 | checkCudaErrors(cudaEventDestroy(start)); 185 | checkCudaErrors(cudaFree(devInData)); 186 | checkCudaErrors(cudaFree(devOutData)); 187 | return bandwidthInGBs; 188 | } 189 | 190 | int main(int argc, char **argv) 191 | { 192 | printf("[Device to Device Memory Opt Demo:] - Starting...\n"); 193 | if (checkCmdLineFlag(argc, (const char **)argv, "help") || checkCmdLineFlag(argc, (const char **)argv, "?")) { 194 | printf("Usage -deviceA=n (n >= 0 for deviceID A. Default:0)\n"); 195 | printf(" -deviceB=n (n >= 0 for deviceID B. Default:1)\n"); 196 | printf(" -size=The size of memory for testing in bytes. Default: 20*1024*1024)\n"); 197 | printf(" -iter=n Iteration numbers of trans. 
Default:100 \n"); 198 | exit(EXIT_SUCCESS); 199 | } 200 | unsigned int memSize = 1024 * 1024 * 20; 201 | unsigned int iterNumbers = 100; 202 | unsigned int GPUA = 0; 203 | unsigned int GPUB = 1; 204 | 205 | if (checkCmdLineFlag(argc, (const char **)argv, "deviceA")) { 206 | GPUA = getCmdLineArgumentInt(argc, (const char **)argv, "deviceA"); 207 | } 208 | if (checkCmdLineFlag(argc, (const char **)argv, "deviceB")) { 209 | GPUB = getCmdLineArgumentInt(argc, (const char **)argv, "deviceB"); 210 | } 211 | if (checkCmdLineFlag(argc, (const char **)argv, "size")) { 212 | memSize = getCmdLineArgumentInt(argc, (const char **)argv, "size"); 213 | } 214 | 215 | if (checkCmdLineFlag(argc, (const char **)argv, "iter")) { 216 | iterNumbers = getCmdLineArgumentInt(argc, (const char **)argv, "iter"); 217 | } 218 | checkCudaErrors(cudaSetDevice(GPUA)); 219 | printf(">. Device to itself transfer. Bandwith: %f GB/s\n", deviceToItself(memSize, iterNumbers)); 220 | printf(">. Device to device transfer without p2p. Bandwith: %f GB/s\n", 221 | deviceToDeviceWithoutP2P(memSize, iterNumbers, GPUA, GPUB)); 222 | printf(">. Device to device transfer with p2p Bandwith: %f GB/s\n", 223 | deviceToDeviceWithP2P(memSize, iterNumbers, GPUA, GPUB)); 224 | 225 | exit(EXIT_SUCCESS); 226 | } -------------------------------------------------------------------------------- /memory_opt/memoryOpt.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | // System includes 3 | #include 4 | #include 5 | 6 | // CUDA runtime 7 | #include 8 | 9 | #if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) 10 | #define STRCASECMP _stricmp 11 | #define STRNCASECMP _strnicmp 12 | #else 13 | #define STRCASECMP strcasecmp 14 | #define STRNCASECMP strncasecmp 15 | #endif 16 | 17 | template void check(T result, char const *const func, const char *const file, int const line) 18 | { 19 | if (result) { 20 | fprintf(stderr, "CUDA error at %s:%d code=%d(%s) \"%s\" \n", file, line, static_cast(result), 21 | cudaGetErrorString(result), func); 22 | exit(EXIT_FAILURE); 23 | } 24 | } 25 | 26 | #define checkCudaErrors(val) check((val), #val, __FILE__, __LINE__) 27 | 28 | #define CACHE_CLEAR_SIZE (16 * (1e6)) // 16 M 29 | 30 | #define TIME_ELAPSE(func, elapsedTime, start, stop) \ 31 | cudaEventCreate(&start); \ 32 | cudaEventCreate(&stop); \ 33 | cudaEventRecord(start, 0); \ 34 | (func); \ 35 | cudaEventRecord(stop, 0); \ 36 | cudaEventSynchronize(stop); \ 37 | cudaEventElapsedTime(&elapsedTime, start, stop); \ 38 | cudaEventDestroy(start); \ 39 | cudaEventDestroy(stop); 40 | 41 | inline int stringRemoveDelimiter(char delimiter, const char *string) 42 | { 43 | int string_start = 0; 44 | 45 | while (string[string_start] == delimiter) { 46 | string_start++; 47 | } 48 | 49 | if (string_start >= static_cast(strlen(string) - 1)) { 50 | return 0; 51 | } 52 | 53 | return string_start; 54 | } 55 | 56 | inline bool checkCmdLineFlag(const int argc, const char **argv, const char *string_ref) 57 | { 58 | bool bFound = false; 59 | 60 | if (argc >= 1) { 61 | for (int i = 1; i < argc; i++) { 62 | int string_start = stringRemoveDelimiter('-', argv[i]); 63 | const char *string_argv = &argv[i][string_start]; 64 | 65 | const char *equal_pos = strchr(string_argv, '='); 66 | int argv_length = static_cast(equal_pos == 0 ? 
strlen(string_argv) : equal_pos - string_argv); 67 | 68 | int length = static_cast(strlen(string_ref)); 69 | 70 | if (length == argv_length && !STRNCASECMP(string_argv, string_ref, length)) { 71 | bFound = true; 72 | continue; 73 | } 74 | } 75 | } 76 | 77 | return bFound; 78 | } 79 | 80 | inline int getCmdLineArgumentInt(const int argc, const char **argv, const char *string_ref) 81 | { 82 | bool bFound = false; 83 | int value = -1; 84 | 85 | if (argc >= 1) { 86 | for (int i = 1; i < argc; i++) { 87 | int string_start = stringRemoveDelimiter('-', argv[i]); 88 | const char *string_argv = &argv[i][string_start]; 89 | int length = static_cast(strlen(string_ref)); 90 | 91 | if (!STRNCASECMP(string_argv, string_ref, length)) { 92 | if (length + 1 <= static_cast(strlen(string_argv))) { 93 | int auto_inc = (string_argv[length] == '=') ? 1 : 0; 94 | value = atoi(&string_argv[length + auto_inc]); 95 | } else { 96 | value = 0; 97 | } 98 | 99 | bFound = true; 100 | continue; 101 | } 102 | } 103 | } 104 | 105 | if (bFound) { 106 | return value; 107 | } else { 108 | return 0; 109 | } 110 | } 111 | -------------------------------------------------------------------------------- /memory_opt/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # GPU memory operation demo. 3 | # Author: kevin.xie 4 | # Email: kaiyuanxie@yeah.net 5 | 6 | 7 | set -e 8 | current_path=$(cd `dirname $0`; pwd) 9 | make 10 | echo "Run all demo:" 11 | 12 | if [ ! -f ${current_path}/testHost2Device ] 13 | then 14 | echo "testHost2Device exe file not found!" 15 | exit 1 16 | fi 17 | ./testHost2Device 18 | echo "[Next]" 19 | 20 | if [ ! -f ${current_path}/testDevice2Device ] 21 | then 22 | echo "testDevice2Device exe file not found!" 23 | exit 1 24 | fi 25 | ./testDevice2Device 26 | echo "[Next]" 27 | 28 | if [ ! -f ${current_path}/testSharedMemory ] 29 | then 30 | echo "testSharedMemory exe file not found!" 31 | exit 1 32 | fi 33 | ./testSharedMemory 34 | echo "[Next]" 35 | 36 | if [ ! -f ${current_path}/testZeroCopy ] 37 | then 38 | echo "testZeroCopy exe file not found!" 39 | exit 1 40 | fi 41 | ./testZeroCopy 42 | exit 0 43 | 44 | -------------------------------------------------------------------------------- /memory_opt/sharedMemory.cu: -------------------------------------------------------------------------------- 1 | /** 2 | * Array sum calculation with or without shared memory in CUDA kernel. 3 | * 4 | * This demo code might be stale with the development of CUDA. 5 | * To use the latest API operations, you could see NVIDIA guide: 6 | * https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html 7 | * https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY.html#group__CUDART__MEMORY 8 | * 9 | * Author: kevin.xie 10 | * Email: kaiyuanxie@yeah.net 11 | * */ 12 | 13 | #include 14 | 15 | #include "memoryOpt.h" 16 | #include "timer.h" 17 | 18 | #define THREAD_PER_BLOCK 256 19 | 20 | double sumArrayInBlockCPU(float *arrData, const unsigned int dataSize) 21 | { 22 | /* This function might help you understand the process of CUDA array sum. */ 23 | float *blockData = (float *)calloc(dataSize / THREAD_PER_BLOCK, sizeof(float)); 24 | int blockSize = dataSize / THREAD_PER_BLOCK; // get integer part 25 | int idxMax = blockSize * THREAD_PER_BLOCK; 26 | 27 | // Split the array into blocks and sum the blocks one by one. 
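    // Each (i, j) pair plays the role of one GPU thread: it strides through the
    // array in steps of blockSize * THREAD_PER_BLOCK, mirroring the grid-stride
    // loops in the kernels below.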
28 | for (int i = 0; i < blockSize; i++) { 29 | for (int j = 0; j < THREAD_PER_BLOCK; j++) { 30 | int idx = i * THREAD_PER_BLOCK + j; 31 | while (idx < dataSize) { 32 | blockData[i] += arrData[idx]; 33 | idx += idxMax; 34 | } 35 | } 36 | } 37 | 38 | double rst = 0.0; 39 | // sum the all blocks result; 40 | for (int i = 0; i < blockSize; ++i) { 41 | rst += blockData[i]; 42 | } 43 | return rst; 44 | } 45 | 46 | __device__ int countSHM = 0; 47 | __global__ void arraySumWithSHMKernel(float *arrData, const int dataSize) 48 | { 49 | __shared__ float shm[THREAD_PER_BLOCK]; 50 | int thIdx = threadIdx.x + blockIdx.x * blockDim.x; 51 | if (thIdx == 0) { 52 | countSHM = 0; 53 | __threadfence(); 54 | } 55 | float val = 0.0; 56 | while (thIdx < dataSize) { 57 | val += arrData[thIdx]; 58 | thIdx += blockDim.x * gridDim.x; 59 | } 60 | shm[threadIdx.x] = val; 61 | __syncthreads(); 62 | 63 | for (int i = THREAD_PER_BLOCK / 2; i >= 1; i /= 2) { 64 | if (threadIdx.x < i) 65 | shm[threadIdx.x] += shm[threadIdx.x + i]; 66 | __syncthreads(); 67 | } 68 | 69 | __syncthreads(); 70 | bool isLast = false; 71 | thIdx = threadIdx.x + blockIdx.x * blockDim.x; 72 | if (threadIdx.x == 0) { 73 | arrData[blockIdx.x] = shm[0]; 74 | __threadfence(); 75 | int value = atomicAdd(&countSHM, 1); 76 | isLast = (value == gridDim.x - 1); 77 | } 78 | isLast = __syncthreads_or(isLast); 79 | if (isLast) { 80 | shm[threadIdx.x] = threadIdx.x < gridDim.x ? arrData[threadIdx.x] : 0; 81 | __syncthreads(); 82 | for (int i = THREAD_PER_BLOCK / 2; i >= 1; i /= 2) { 83 | if (threadIdx.x < i) 84 | shm[threadIdx.x] += shm[threadIdx.x + i]; 85 | __syncthreads(); 86 | } 87 | __syncthreads(); 88 | if (threadIdx.x == 0) 89 | arrData[0] = shm[0]; 90 | } 91 | __syncthreads(); 92 | } 93 | 94 | __global__ void arraySumKernel(float *arrData, float *oData, const int dataSize) 95 | { 96 | // The function needed to run twice if dataSize > threads per block. 97 | 98 | int thIdx = threadIdx.x + blockIdx.x * blockDim.x; 99 | float val = 0.0; 100 | while (thIdx < dataSize) { 101 | val += arrData[thIdx]; 102 | thIdx += blockDim.x * gridDim.x; 103 | } 104 | thIdx = threadIdx.x + blockIdx.x * blockDim.x; 105 | arrData[thIdx] = val; 106 | __syncthreads(); 107 | 108 | // Reduce process: 109 | for (int i = THREAD_PER_BLOCK / 2; i >= 1; i /= 2) { 110 | if (threadIdx.x < i) 111 | arrData[thIdx] += arrData[thIdx + i]; 112 | __syncthreads(); 113 | } 114 | __syncthreads(); 115 | 116 | if (threadIdx.x == 0) { 117 | oData[blockIdx.x] = arrData[thIdx]; 118 | } 119 | } 120 | 121 | float sumArrayGPU(const unsigned int dataSize, unsigned int iterNumber, bool useSHM) 122 | { 123 | int memSize = sizeof(float) * dataSize; 124 | float *hInData = (float *)malloc(memSize); 125 | if (hInData == 0) { 126 | fprintf(stderr, "Not enough memory avaialable on host to run test!\n"); 127 | exit(EXIT_FAILURE); 128 | } 129 | 130 | // Get the correct result for verifying. 
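    // CPU reference sum; the GPU result is checked against it (1e-6 tolerance) below.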
131 | double sum = sumArrayInBlockCPU(hInData, dataSize); 132 | 133 | float *devInData, *devOutData; 134 | float devRst; 135 | float elapsedTimeInMs = 0.0f; 136 | if (!useSHM) { 137 | checkCudaErrors(cudaMalloc((void **)&devOutData, max(dataSize / THREAD_PER_BLOCK, THREAD_PER_BLOCK))); 138 | } 139 | checkCudaErrors(cudaMalloc((void **)&devInData, memSize)); 140 | checkCudaErrors(cudaMemcpy(devInData, hInData, memSize, cudaMemcpyHostToDevice)); 141 | 142 | cudaEvent_t start, stop; 143 | 144 | for (int i = 0; i < iterNumber; i++) { 145 | float onceTime = 0.0; 146 | checkCudaErrors(cudaMemcpy(devInData, hInData, memSize, cudaMemcpyHostToDevice)); 147 | if (useSHM) { 148 | TIME_ELAPSE((arraySumWithSHMKernel<<>>(devInData, dataSize)), 149 | onceTime, start, stop); 150 | } else { 151 | // Run twice to get the result. 152 | TIME_ELAPSE( 153 | (arraySumKernel<<>>(devInData, devOutData, dataSize)), 154 | onceTime, start, stop); 155 | elapsedTimeInMs += onceTime; 156 | TIME_ELAPSE((arraySumKernel<<<1, THREAD_PER_BLOCK>>>(devOutData, devOutData, dataSize / THREAD_PER_BLOCK)), 157 | onceTime, start, stop); 158 | } 159 | checkCudaErrors(cudaDeviceSynchronize()); 160 | elapsedTimeInMs += onceTime; 161 | } 162 | 163 | if (useSHM) { 164 | checkCudaErrors(cudaMemcpy(&devRst, devInData, sizeof(float), cudaMemcpyDeviceToHost)); 165 | } else { 166 | checkCudaErrors(cudaMemcpy(&devRst, devOutData, sizeof(float), cudaMemcpyDeviceToHost)); 167 | } 168 | 169 | if (fabs(devRst - sum) > 1.e-6) { 170 | printf("Result error! GPU: %f CPU: %f\n", devRst, sum); 171 | exit(EXIT_FAILURE); 172 | } 173 | free(hInData); 174 | checkCudaErrors(cudaFree(devInData)); 175 | if (!useSHM) { 176 | checkCudaErrors(cudaFree(devOutData)); 177 | } 178 | 179 | return elapsedTimeInMs / iterNumber; 180 | } 181 | 182 | int main(int argc, char **argv) 183 | { 184 | printf("[Shared Memory Application: Array Sum.] - Starting...\n"); 185 | if (checkCmdLineFlag(argc, (const char **)argv, "help") || checkCmdLineFlag(argc, (const char **)argv, "?")) { 186 | printf("Usage -device=n (n >= 0 for deviceID)\n"); 187 | printf(" -size=The size of numElements for testing in bytes. Default: 5000)\n"); 188 | printf(" -iter=n Iteration numbers of trans. Default:100 \n"); 189 | printf("Note: The size has a limitation. Consider float type range.)\n"); 190 | exit(EXIT_SUCCESS); 191 | } 192 | unsigned int numElements = 5000; 193 | unsigned int gpuID = 0; 194 | unsigned int iterNumber = 100; 195 | 196 | if (checkCmdLineFlag(argc, (const char **)argv, "device")) { 197 | gpuID = getCmdLineArgumentInt(argc, (const char **)argv, "device"); 198 | } 199 | if (checkCmdLineFlag(argc, (const char **)argv, "size")) { 200 | numElements = getCmdLineArgumentInt(argc, (const char **)argv, "size"); 201 | } 202 | if (numElements < 256 || numElements > 10000) { 203 | printf("The size of numElements is not allowed! Support range:256~10000.\n"); 204 | printf("You could modify the source code to extend the range.\n"); 205 | exit(EXIT_FAILURE); 206 | } 207 | if (checkCmdLineFlag(argc, (const char **)argv, "iter")) { 208 | iterNumber = getCmdLineArgumentInt(argc, (const char **)argv, "iter"); 209 | } 210 | 211 | checkCudaErrors(cudaSetDevice(gpuID)); 212 | printf("Sum array with shared memory. Elapsed time: %f ms \n", sumArrayGPU(numElements, iterNumber, true)); 213 | printf("Sum array without shared memory. 
Elapsed time: %f ms \n", sumArrayGPU(numElements, iterNumber, false)); 214 | 215 | exit(EXIT_SUCCESS); 216 | } -------------------------------------------------------------------------------- /memory_opt/zeroCopy.cu: -------------------------------------------------------------------------------- 1 | 2 | /* 3 | * zero copy using in vectorAdd case. 4 | * 5 | * This demo code might be stale with the development of CUDA. 6 | * To use the latest API operations, you could see NVIDIA guide: 7 | * https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html 8 | * https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY.html#group__CUDART__MEMORY 9 | * 10 | * Author: kevin.xie 11 | * Email: kaiyuanxie@yeah.net 12 | */ 13 | 14 | #include "memoryOpt.h" 15 | #include "timer.h" 16 | 17 | __global__ void vectorAdd(const float *A, const float *B, float *C, const int numElements) 18 | { 19 | int i = blockDim.x * blockIdx.x + threadIdx.x; 20 | if (i < numElements) { 21 | C[i] = A[i] + B[i] + 0.0f; 22 | } 23 | } 24 | 25 | float vectorAddViaGlobalMemory(const unsigned int numElements, const unsigned int iterNum) 26 | { 27 | 28 | StopWatchInterface *timer = NULL; 29 | float elapsedTimeInMs = 0.0f; 30 | float throughputInGBs = 0.0f; 31 | 32 | sdkCreateTimer(&timer); 33 | size_t memSize = numElements * sizeof(float); 34 | 35 | // Launch the Vector Add CUDA Kernel 36 | int threadsPerBlock = 256; 37 | int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; 38 | 39 | // Allocate the host input vector A, B, C 40 | float *h_A = (float *)malloc(memSize); 41 | float *h_B = (float *)malloc(memSize); 42 | float *h_C = (float *)malloc(memSize); 43 | 44 | // Verify that allocations succeeded 45 | if (h_A == NULL || h_B == NULL || h_C == NULL) { 46 | fprintf(stderr, "Failed to allocate host vectors!\n"); 47 | exit(EXIT_FAILURE); 48 | } 49 | 50 | // Initialize the host input vectors 51 | for (int i = 0; i < numElements; ++i) { 52 | h_A[i] = rand() / (float)RAND_MAX; 53 | h_B[i] = rand() / (float)RAND_MAX; 54 | } 55 | 56 | // Allocate the device input vector: 57 | float *d_A = NULL; 58 | float *d_B = NULL; 59 | float *d_C = NULL; 60 | checkCudaErrors(cudaMalloc((void **)&d_A, memSize)); 61 | checkCudaErrors(cudaMalloc((void **)&d_B, memSize)); 62 | checkCudaErrors(cudaMalloc((void **)&d_C, memSize)); 63 | 64 | for (unsigned int i = 0; i < iterNum; i++) { 65 | sdkStartTimer(&timer); 66 | checkCudaErrors(cudaMemcpy(d_A, h_A, memSize, cudaMemcpyHostToDevice)); 67 | checkCudaErrors(cudaMemcpy(d_B, h_B, memSize, cudaMemcpyHostToDevice)); 68 | vectorAdd<<>>(d_A, d_B, d_C, numElements); 69 | checkCudaErrors(cudaGetLastError()); 70 | // Copy the device result vector in device memory to the host result vector in host memory. 71 | checkCudaErrors(cudaMemcpy(h_C, d_C, memSize, cudaMemcpyDeviceToHost)); 72 | sdkStopTimer(&timer); 73 | elapsedTimeInMs += sdkGetTimerValue(&timer); 74 | sdkResetTimer(&timer); 75 | } 76 | 77 | // Verify that the result vector is correct 78 | for (int i = 0; i < numElements; ++i) { 79 | if (fabs(h_A[i] + h_B[i] - h_C[i]) > 1e-5) { 80 | fprintf(stderr, "Result verification failed at element %d!\n", i); 81 | exit(EXIT_FAILURE); 82 | } 83 | } 84 | 85 | // calculate throughput in GB/s. Note: use 1000(not 1024)unit. 
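    // throughput = memSize bytes per iteration * iterNum / 1e9 / seconds; the timed
    // region above includes the H2D copies of A/B, the kernel, and the D2H copy of C.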
86 | double time_s = elapsedTimeInMs / 1e3; 87 | throughputInGBs = (memSize * (float)iterNum) / (double)1e9; 88 | throughputInGBs = throughputInGBs / time_s; 89 | sdkDeleteTimer(&timer); 90 | 91 | // Free device global memory 92 | checkCudaErrors(cudaFree(d_A)); 93 | checkCudaErrors(cudaFree(d_B)); 94 | checkCudaErrors(cudaFree(d_C)); 95 | 96 | // Free host memory 97 | free(h_A); 98 | free(h_B); 99 | free(h_C); 100 | 101 | return throughputInGBs; 102 | } 103 | 104 | float vectorAddViaZeroCopy(const unsigned int numElements, const unsigned int iterNum) 105 | { 106 | 107 | StopWatchInterface *timer = NULL; 108 | float elapsedTimeInMs = 0.0f; 109 | float throughputInGBs = 0.0f; 110 | 111 | sdkCreateTimer(&timer); 112 | size_t memSize = numElements * sizeof(float); 113 | 114 | // Launch the Vector Add CUDA Kernel 115 | int threadsPerBlock = 256; 116 | int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; 117 | 118 | checkCudaErrors(cudaSetDeviceFlags(cudaDeviceMapHost)); 119 | // Allocate the host input vector A, B, C 120 | float *h_A = NULL; 121 | float *h_B = NULL; 122 | float *h_C = NULL; 123 | float *map_A, *map_B, *map_C; 124 | // Policy1: 125 | // checkCudaErrors(cudaMallocHost((void **)&h_A, memSize)); 126 | // checkCudaErrors(cudaMallocHost((void **)&h_B, memSize)); 127 | // checkCudaErrors(cudaMallocHost((void **)&h_C, memSize)); 128 | 129 | // Policy2: 130 | checkCudaErrors(cudaHostAlloc((void **)&h_A, memSize, cudaHostAllocMapped)); 131 | checkCudaErrors(cudaHostAlloc((void **)&h_B, memSize, cudaHostAllocMapped)); 132 | checkCudaErrors(cudaHostAlloc((void **)&h_C, memSize, cudaHostAllocMapped)); 133 | 134 | // Verify that allocations succeeded 135 | if (h_A == NULL || h_B == NULL || h_C == NULL) { 136 | fprintf(stderr, "Failed to allocate host vectors!\n"); 137 | exit(EXIT_FAILURE); 138 | } 139 | // Get the device pointers for the pinned CPU memory mapped into the GPU memory space. 140 | checkCudaErrors(cudaHostGetDevicePointer(&map_A, h_A, 0)); 141 | checkCudaErrors(cudaHostGetDevicePointer(&map_B, h_B, 0)); 142 | checkCudaErrors(cudaHostGetDevicePointer(&map_C, h_C, 0)); 143 | 144 | // Initialize the host input vectors 145 | for (int i = 0; i < numElements; ++i) { 146 | h_A[i] = rand() / (float)RAND_MAX; 147 | h_B[i] = rand() / (float)RAND_MAX; 148 | } 149 | 150 | // Copy the host input vectors A and B in host memory to the device input vectors in device memory 151 | for (unsigned int i = 0; i < iterNum; i++) { 152 | sdkStartTimer(&timer); 153 | vectorAdd<<>>(map_A, map_B, map_C, numElements); 154 | checkCudaErrors(cudaGetLastError()); 155 | // Copy the device result vector in device memory to the host result vector in host memory. 156 | sdkStopTimer(&timer); 157 | elapsedTimeInMs += sdkGetTimerValue(&timer); 158 | sdkResetTimer(&timer); 159 | } 160 | 161 | checkCudaErrors(cudaDeviceSynchronize()); 162 | // Verify that the result vector is correct 163 | for (int i = 0; i < numElements; ++i) { 164 | if (fabs(h_A[i] + h_B[i] - h_C[i]) > 1e-5) { 165 | fprintf(stderr, "Result verification failed at element %d!\n", i); 166 | exit(EXIT_FAILURE); 167 | } 168 | } 169 | 170 | // calculate throughput in GB/s. Note: use 1000(not 1024)unit. 
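    // Same metric as above, but no explicit copies are issued: the kernel reads and
    // writes the mapped pinned host buffers (map_A/B/C) directly.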
171 | double time_s = elapsedTimeInMs / 1e3; 172 | throughputInGBs = (memSize * (float)iterNum) / (double)1e9; 173 | throughputInGBs = throughputInGBs / time_s; 174 | sdkDeleteTimer(&timer); 175 | 176 | // Free host memory 177 | checkCudaErrors(cudaFreeHost(h_A)); 178 | checkCudaErrors(cudaFreeHost(h_B)); 179 | checkCudaErrors(cudaFreeHost(h_C)); 180 | 181 | return throughputInGBs; 182 | } 183 | 184 | int main(int argc, char **argv) 185 | { 186 | printf("[Zero Copy Opt Vector Add] - Starting...\n"); 187 | if (checkCmdLineFlag(argc, (const char **)argv, "help") || checkCmdLineFlag(argc, (const char **)argv, "?")) { 188 | printf("Usage -device=n (n >= 0 for deviceID)\n"); 189 | printf(" -size=The size of numElements for testing in bytes. Default: 5000000)\n"); 190 | printf(" -iter=n Iteration numbers of trans. Default:1 \n"); 191 | exit(EXIT_SUCCESS); 192 | } 193 | unsigned int numElements = 5000000; 194 | unsigned int iterNumbers = 1; 195 | unsigned int gpuID = 0; 196 | 197 | if (checkCmdLineFlag(argc, (const char **)argv, "device")) { 198 | gpuID = getCmdLineArgumentInt(argc, (const char **)argv, "device"); 199 | } 200 | if (checkCmdLineFlag(argc, (const char **)argv, "size")) { 201 | numElements = getCmdLineArgumentInt(argc, (const char **)argv, "size"); 202 | } 203 | 204 | if (checkCmdLineFlag(argc, (const char **)argv, "iter")) { 205 | iterNumbers = getCmdLineArgumentInt(argc, (const char **)argv, "iter"); 206 | } 207 | 208 | checkCudaErrors(cudaSetDevice(gpuID)); 209 | cudaDeviceProp prop; 210 | cudaGetDeviceProperties(&prop, gpuID); 211 | if (!prop.canMapHostMemory) 212 | exit(EXIT_FAILURE); 213 | printf(">. Data tranfer via global memory. VectorAdd throughput: %f GB/s\n", 214 | vectorAddViaGlobalMemory(numElements, iterNumbers)); 215 | printf(">. Data tranfer via zero copy. VectorAdd throughput: %f GB/s\n", 216 | vectorAddViaZeroCopy(numElements, iterNumbers)); 217 | 218 | exit(EXIT_SUCCESS); 219 | } -------------------------------------------------------------------------------- /nccl/README.md: -------------------------------------------------------------------------------- 1 | # NCCL C++ Examples 2 | 3 | | **Cases** | **Node require** | **Description** | 4 | |----------------------------|------------------|-----------------------------------------------------------| 5 | | one_device_per_thread | 1 | One Device(1 GPU) per Process or Thread | 6 | | multi_devices_per_thread | 1 | Multiple Devices(more than one GPU) per Process or Thread | 7 | | nonblocking_double_streams | 1 | One rank has two communicators. 
| 8 | | nccl_with_mpi | 1 | Run with Open MPI | 9 | | node_server/node_client | 2 | Using socket for init | 10 | 11 | 12 | ## Compile 13 | 14 | Clone this git lib to your local env, such as /home/xky/ 15 | 16 | Requirements: 17 | * CUDA 18 | * NVIDIA NCCL (optimized for NVLink) 19 | * Open-MPI (option) 20 | 21 | Recommend using docker images: 22 | 23 | ```shell 24 | docker pull nvcr.io/nvidia/pytorch:24.07-py3 25 | ``` 26 | 27 | If there is docker-ce, run docker: 28 | ```shell 29 | sudo docker run --net=host --gpus=all -it -e UID=root --ipc host --shm-size="32g" \ 30 | -v /home/xky/:/home/xky \ 31 | -u 0 \ 32 | --name=nccl2 nvcr.io/nvidia/pytorch:24.07-py3 bash 33 | ``` 34 | Others: 35 | ```shell 36 | docker run \ 37 | --runtime=nvidia \ 38 | --privileged \ 39 | --device /dev/nvidia0:/dev/nvidia0 \ 40 | --device /dev/nvidia1:/dev/nvidia1 \ 41 | --device /dev/nvidia2:/dev/nvidia2 \ 42 | --device /dev/nvidia3:/dev/nvidia3 \ 43 | --device /dev/nvidia4:/dev/nvidia4 \ 44 | --device /dev/nvidia5:/dev/nvidia5 \ 45 | --device /dev/nvidia6:/dev/nvidia6 \ 46 | --device /dev/nvidia7:/dev/nvidia7 \ 47 | --device /dev/nvidiactl:/dev/nvidiactl \ 48 | --device /dev/nvidia-uvm:/dev/nvidia-uvm \ 49 | --device /dev/nvidia-uvm-tools:/dev/nvidia-uvm-tools \ 50 | --device /dev/infiniband:/dev/infiniband \ 51 | -v /usr/local/bin/:/usr/local/bin/ \ 52 | -v /opt/cloud/cce/nvidia/:/usr/local/nvidia/ \ 53 | -v /home/xky/:/home/xky \ 54 | --ipc host \ 55 | --net host \ 56 | -it \ 57 | -u root \ 58 | --name nccl_env \ 59 | nvcr.io/nvidia/pytorch:24.07-py3 bash 60 | ``` 61 | 62 | 63 | Enter the git directory and run makefile 64 | ```shell 65 | cd /home/xky/BasicCUDA/nccl/ 66 | make 67 | ``` 68 | If there is MPI lib in env, could compile MPI case: 69 | ```shell 70 | make mpi 71 | ``` 72 | 73 | ## Run 74 | 75 | ### Single node 76 | 77 | ```shell 78 | ./multi_devices_per_thread 79 | ./one_devices_per_thread 80 | ./nonblocking_double_streams 81 | ``` 82 | 83 | Set DEBUG=1 would print some debug information. 84 | Could change ranks number by set '--nranks'. e.g: 85 | 86 | ```shell 87 | DEBUG=1 ./nonblocking_double_streams --nranks 8 88 | ``` 89 | 90 | MPI case run: 91 | ```shell 92 | mpirun -n 6 --allow-run-as-root ./nccl_with_mpi 93 | ``` 94 | 95 | ### Multi nodes 96 | 97 | Two nodes case: using socket connection for nccl init. 98 | 99 | Server run in one: 100 | ```shell 101 | ./node_server 102 | ``` 103 | 104 | Client run in another one, e.g. 
Server IP: 10.10.1.1 105 | ```shell 106 | ./node_client --hostname 10.10.1.1 107 | ``` 108 | 109 | Add some envs: 110 | ```shell 111 | # server: 112 | NCCL_DEBUG=INFO NCCL_NET_PLUGIN=none NCCL_IB_DISABLE=1 ./node_server --port 8066 --nranks 8 113 | # client: 114 | NCCL_DEBUG=INFO NCCL_NET_PLUGIN=none NCCL_IB_DISABLE=1 ./node_client --hostname 10.10.1.1 --port 8066 --nranks 8 115 | ``` 116 | 117 | -------------------------------------------------------------------------------- /nccl/alltoall.cu: -------------------------------------------------------------------------------- 1 | /* 2 | * Compile: nvcc -lnccl -ccbin g++ -std=c++11 -O3 -g alltoall.cu -o alltoall 3 | * Test: ./alltoall 4 | * Profiling: nvprof --csv -o profile_output.csv ./alltoall 5 | * Author: kevin.xie 6 | * Email: kaiyuanxie@yeah.net 7 | */ 8 | 9 | #include "comm.h" 10 | 11 | ncclUniqueId id; 12 | pthread_mutex_t mutex; 13 | 14 | void dataPrint(float *hostData, int size, int gpu_id, int my_nranks, const char *status) 15 | { 16 | pthread_mutex_lock(&mutex); 17 | printf("GPU:%d %s data: ", gpu_id, status); 18 | for (int i = 0; i < size; ++i) { 19 | printf("%.0f ", hostData[i]); 20 | } 21 | printf("\n"); 22 | pthread_mutex_unlock(&mutex); 23 | } 24 | 25 | ncclResult_t AlltoAll(const void *sendbuff, void *recvbuff, size_t count, ncclDataType_t type, ncclComm_t comm, 26 | cudaStream_t stream) 27 | { 28 | int nRanks; 29 | NCCLCHECK(ncclCommCount(comm, &nRanks)); 30 | size_t rankOffset = count * wordSize(type); 31 | 32 | #if NCCL_MAJOR < 2 || NCCL_MINOR < 7 33 | printf("NCCL 2.7 or later is needed for alltoall. This test was compiled with %d.%d.\n", NCCL_MAJOR, NCCL_MINOR); 34 | return ncclInternalError; 35 | #else 36 | NCCLCHECK(ncclGroupStart()); 37 | for (int r = 0; r < nRanks; r++) { 38 | NCCLCHECK(ncclSend(((char *)sendbuff) + r * rankOffset, count, type, r, comm, stream)); 39 | NCCLCHECK(ncclRecv(((char *)recvbuff) + r * rankOffset, count, type, r, comm, stream)); 40 | } 41 | NCCLCHECK(ncclGroupEnd()); 42 | return ncclSuccess; 43 | #endif 44 | } 45 | 46 | void *threadAlltoAll(void *arg) 47 | { 48 | int count = 1; 49 | int size = my_nranks * count; 50 | int gpu_id = *(int *)arg; 51 | cudaSetDevice(gpu_id); 52 | 53 | ncclComm_t comm; 54 | NCCLCHECK(ncclCommInitRank(&comm, my_nranks, id, gpu_id)); 55 | 56 | float *sendbuff; 57 | float *recvbuff; 58 | float *hostData; 59 | cudaStream_t s; 60 | 61 | hostData = (float *)malloc(size * sizeof(float)); 62 | for (int i = 0; i < size; ++i) { 63 | // hostData[i] = float(gpu_id) * my_nranks + i; 64 | hostData[i] = float(gpu_id); 65 | } 66 | dataPrint(hostData, size, gpu_id, my_nranks, "input"); 67 | 68 | CUDACHECK(cudaMalloc(&sendbuff, size * sizeof(float))); 69 | CUDACHECK(cudaMalloc(&recvbuff, size * sizeof(float))); 70 | cudaMemcpy(sendbuff, hostData, size * sizeof(float), cudaMemcpyHostToDevice); 71 | CUDACHECK(cudaStreamCreate(&s)); 72 | 73 | NCCLCHECK(AlltoAll((const void *)sendbuff, (void *)recvbuff, count, ncclFloat, comm, s)); 74 | // completing NCCL operation by synchronizing on the CUDA stream 75 | CUDACHECK(cudaStreamSynchronize(s)); 76 | cudaMemcpy(hostData, recvbuff, size * sizeof(float), cudaMemcpyDeviceToHost); 77 | dataPrint(hostData, size, gpu_id, my_nranks, "output"); 78 | ncclCommDestroy(comm); 79 | 80 | CUDACHECK(cudaFree(sendbuff)); 81 | CUDACHECK(cudaFree(recvbuff)); 82 | free(hostData); 83 | 84 | return NULL; 85 | } 86 | 87 | void *threadAlltoAllIter(void *arg) 88 | { 89 | int count = 2 * 1024 * 1024; 90 | int size = my_nranks * count; 91 | int gpu_id = *(int 
*)arg; 92 | cudaSetDevice(gpu_id); 93 | 94 | ncclComm_t comm; 95 | NCCLCHECK(ncclCommInitRank(&comm, my_nranks, id, gpu_id)); 96 | 97 | float *sendbuff; 98 | float *recvbuff; 99 | float *hostData; 100 | cudaStream_t s; 101 | 102 | hostData = (float *)malloc(size * sizeof(float)); 103 | for (int i = 0; i < size; ++i) { 104 | // hostData[i] = float(gpu_id) * my_nranks + i; 105 | hostData[i] = float(gpu_id); 106 | } 107 | CUDACHECK(cudaMalloc(&sendbuff, size * sizeof(float))); 108 | CUDACHECK(cudaMalloc(&recvbuff, size * sizeof(float))); 109 | cudaMemcpy(sendbuff, hostData, size * sizeof(float), cudaMemcpyHostToDevice); 110 | CUDACHECK(cudaStreamCreate(&s)); 111 | 112 | for (int i = 0; i < 4; ++i) { 113 | NCCLCHECK(AlltoAll((const void *)sendbuff, (void *)recvbuff, count, ncclFloat, comm, s)); 114 | // Sync stream to avoid data chaos. 115 | CUDACHECK(cudaStreamSynchronize(s)); 116 | } 117 | 118 | cudaMemcpy(hostData, recvbuff, size * sizeof(float), cudaMemcpyDeviceToHost); 119 | ncclCommDestroy(comm); 120 | 121 | CUDACHECK(cudaFree(sendbuff)); 122 | CUDACHECK(cudaFree(recvbuff)); 123 | free(hostData); 124 | return NULL; 125 | } 126 | 127 | void runAlltoAll(ops threadFunc) 128 | { 129 | pthread_t threads[8]; 130 | printf("====== AlltoAll case begin =====\n"); 131 | NCCLCHECK(ncclGetUniqueId(&id)); 132 | for (int i = 0; i < my_nranks; ++i) { 133 | int *id_pointer = &gpu_ids[i]; 134 | pthread_create(&threads[i], NULL, threadFunc, id_pointer); 135 | } 136 | 137 | for (int i = 0; i < my_nranks; ++i) { 138 | pthread_join(threads[i], NULL); 139 | } 140 | printf("====== AlltoAll case end =====\n\n"); 141 | } 142 | 143 | ncclResult_t AlltoAllSplit(const void *sendbuff, void *recvbuff, const size_t *sendSplitList, 144 | const size_t *recvSplitList, ncclDataType_t type, ncclComm_t comm, cudaStream_t stream) 145 | { 146 | int nRanks; 147 | NCCLCHECK(ncclCommCount(comm, &nRanks)); 148 | size_t sendOffset = 0; 149 | size_t recvOffset = 0; 150 | NCCLCHECK(ncclGroupStart()); 151 | for (int r = 0; r < nRanks; r++) { 152 | NCCLCHECK(ncclSend(((char *)sendbuff) + sendOffset, sendSplitList[r], type, r, comm, stream)); 153 | NCCLCHECK(ncclRecv(((char *)recvbuff) + recvOffset, recvSplitList[r], type, r, comm, stream)); 154 | sendOffset += wordSize(type) * sendSplitList[r]; 155 | recvOffset += wordSize(type) * recvSplitList[r]; 156 | } 157 | NCCLCHECK(ncclGroupEnd()); 158 | return ncclSuccess; 159 | } 160 | 161 | const int countTotal = 15; 162 | const size_t sendArray[4][4] = {{1, 2, 3, 4}, {4, 2, 3, 1}, {3, 2, 1, 4}, {2, 3, 4, 1}}; 163 | 164 | const size_t recvArray[4][4] = {{1, 4, 3, 2}, {2, 2, 2, 3}, {3, 3, 1, 4}, {4, 1, 4, 1}}; 165 | 166 | /* 167 | input data: 168 | GPU:0 : 0 0 0 0 0 0 0 0 0 0 169 | GPU:1 : 1 1 1 1 1 1 1 1 1 1 170 | GPU:2 : 2 2 2 2 2 2 2 2 2 2 171 | GPU:3 : 3 3 3 3 3 3 3 3 3 3 172 | 173 | split array: 174 | sendArray set to: 175 | {{1, 2, 3, 4}, 176 | {4, 2, 3, 1}, 177 | {3, 2, 1, 4}, 178 | {2, 3, 4, 1}}; 179 | recvArray is equals to transpose(sendArray): 180 | {{1, 4, 3, 2}, 181 | {2, 2, 2, 3}, 182 | {3, 3, 1, 4}, 183 | {4, 1, 4, 1}}; 184 | 185 | output data: 186 | GPU:0 : 0 1 1 1 1 2 2 2 3 3 187 | GPU:1 : 0 0 1 1 2 2 3 3 3 188 | GPU:2 : 0 0 0 1 1 1 2 3 3 3 3 189 | GPU:3 : 0 0 0 0 1 2 2 2 2 3 190 | 191 | 192 | */ 193 | void *threadAlltoAllSplit(void *arg) 194 | { 195 | int size = countTotal; 196 | int gpu_id = *(int *)arg; 197 | cudaSetDevice(gpu_id); 198 | 199 | ncclComm_t comm; 200 | NCCLCHECK(ncclCommInitRank(&comm, my_nranks, id, gpu_id)); 201 | 202 | float *sendbuff; 203 | float 
*recvbuff; 204 | float *hostData; 205 | cudaStream_t s; 206 | 207 | hostData = (float *)malloc(size * sizeof(float)); 208 | 209 | for (int i = 0; i < size; ++i) { 210 | hostData[i] = float(gpu_id); 211 | } 212 | int sendDataNum = 0; 213 | int recvDataNum = 0; 214 | for (int i = 0; i < 4; ++i) { 215 | sendDataNum += sendArray[gpu_id][i]; 216 | recvDataNum += recvArray[gpu_id][i]; 217 | } 218 | 219 | dataPrint(hostData, sendDataNum, gpu_id, my_nranks, "input"); 220 | 221 | CUDACHECK(cudaMalloc(&sendbuff, size * sizeof(float))); 222 | CUDACHECK(cudaMalloc(&recvbuff, size * sizeof(float))); 223 | cudaMemcpy(sendbuff, hostData, size * sizeof(float), cudaMemcpyHostToDevice); 224 | CUDACHECK(cudaStreamCreate(&s)); 225 | 226 | NCCLCHECK(AlltoAllSplit((const void *)sendbuff, (void *)recvbuff, sendArray[gpu_id], recvArray[gpu_id], ncclFloat, 227 | comm, s)); 228 | // completing NCCL operation by synchronizing on the CUDA stream 229 | CUDACHECK(cudaStreamSynchronize(s)); 230 | cudaMemcpy(hostData, recvbuff, size * sizeof(float), cudaMemcpyDeviceToHost); 231 | dataPrint(hostData, recvDataNum, gpu_id, my_nranks, "output"); 232 | ncclCommDestroy(comm); 233 | CUDACHECK(cudaFree(sendbuff)); 234 | CUDACHECK(cudaFree(recvbuff)); 235 | free(hostData); 236 | return NULL; 237 | } 238 | 239 | void runAlltoAllSplit() 240 | { 241 | pthread_t threads[4]; 242 | NCCLCHECK(ncclGetUniqueId(&id)); 243 | if (my_nranks < 4) { 244 | printf("AlltoAllSplit demo requires nranks>=4, but got %d.\n", my_nranks); 245 | exit(-1); 246 | } 247 | // only support 4 ranks demo. 248 | my_nranks = 4; 249 | printf("====== AlltoAllSplit case begin =====\n"); 250 | for (int i = 0; i < my_nranks; ++i) { 251 | int *id_pointer = &gpu_ids[i]; 252 | pthread_create(&threads[i], NULL, threadAlltoAllSplit, id_pointer); 253 | } 254 | for (int i = 0; i < my_nranks; ++i) { 255 | pthread_join(threads[i], NULL); 256 | } 257 | printf("====== AlltoAllSplit case end =====\n\n"); 258 | } 259 | 260 | int main(int argc, char *argv[]) 261 | { 262 | env_init(argc, argv); 263 | runAlltoAll(threadAlltoAll); 264 | runAlltoAll(threadAlltoAllIter); 265 | runAlltoAllSplit(); 266 | printf("Finished successfully.\n"); 267 | return 0; 268 | } 269 | -------------------------------------------------------------------------------- /nccl/comm.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | 14 | #include "cuda_runtime.h" 15 | #include "nccl.h" 16 | 17 | #define CUDACHECK(cmd) \ 18 | do { \ 19 | cudaError_t err = cmd; \ 20 | if (err != cudaSuccess) { \ 21 | printf("Failed: Cuda error %s:%d '%s'\n", __FILE__, __LINE__, cudaGetErrorString(err)); \ 22 | exit(EXIT_FAILURE); \ 23 | } \ 24 | } while (0) 25 | 26 | #define NCCLCHECK(cmd) \ 27 | do { \ 28 | ncclResult_t res = cmd; \ 29 | if (res != ncclSuccess) { \ 30 | printf("Failed, NCCL error %s:%d '%s'\n", __FILE__, __LINE__, ncclGetErrorString(res)); \ 31 | exit(EXIT_FAILURE); \ 32 | } \ 33 | } while (0) 34 | 35 | #define DEFAULT_DEVICES_NUM 8 36 | int gpu_ids[8] = {0, 1, 2, 3, 4, 5, 6, 7}; 37 | char *if_debug = nullptr; 38 | std::string server_hostname = "127.0.0.1"; 39 | int server_port = 8099; 40 | int my_nranks = 6; 41 | 42 | typedef void *(*ops)(void *); 43 | 44 | #define DEBUG_PRINT(info) \ 45 | if (if_debug && strcasecmp(if_debug, "0") != 0) { \ 46 | printf("DEUBG INFO: %s\n", info); \ 47 | } 48 | 49 | const std::string help_info = 
"Usage: --nranks The number of ranks/GPU\n\ 50 | --hostname Server IP address.\n\ 51 | --port To specify a port. Default: 8099 \n\ 52 | E.g. ./run --nranks 4 --port 8096\n"; 53 | 54 | void env_init(int argc, char* argv[]) 55 | { 56 | if_debug = getenv("DEBUG"); 57 | std::map options; 58 | const std::set allow_options{"--nranks", "--hostname", "--port"}; 59 | 60 | for (int i = 1; i < argc; ++i) { 61 | std::string arg = argv[i]; 62 | 63 | if (arg.substr(0, 2) == "--") { 64 | std::string value; 65 | if (i + 1 < argc && argv[i + 1][0] != '-') { 66 | value = argv[i + 1]; 67 | i++; 68 | } 69 | options[arg] = value; 70 | } else { 71 | std::cout << "Unknown option: " << arg << std::endl; 72 | std::cout << help_info << std::endl; 73 | exit(-1); 74 | } 75 | } 76 | 77 | for (const auto &opt : options) { 78 | if (allow_options.find(opt.first) == allow_options.end()) { 79 | std::cout << "Unknown option: " << opt.first << std::endl << help_info; 80 | exit(-1); 81 | } 82 | } 83 | 84 | if (options.find("--nranks") != options.end()) { 85 | std::cout << "Local rank size: " << options["--nranks"] << std::endl; 86 | my_nranks = std::stoi(options["--nranks"]); 87 | } 88 | if (options.find("--hostname") != options.end()) { 89 | std::cout << "The hostname: " << options["--hostname"] << std::endl; 90 | server_hostname = options["--hostname"]; 91 | } 92 | if (options.find("--port") != options.end()) { 93 | std::cout << "The hostport: " << options["--port"] << std::endl; 94 | server_port = std::stoi(options["--port"]); 95 | } 96 | } 97 | 98 | static size_t wordSize(ncclDataType_t type) { 99 | switch(type) { 100 | case ncclChar: 101 | #if NCCL_MAJOR >= 2 102 | //case ncclInt8: 103 | case ncclUint8: 104 | #endif 105 | return 1; 106 | case ncclHalf: 107 | #if defined(__CUDA_BF16_TYPES_EXIST__) 108 | case ncclBfloat16: 109 | #endif 110 | //case ncclFloat16: 111 | return 2; 112 | case ncclInt: 113 | case ncclFloat: 114 | #if NCCL_MAJOR >= 2 115 | //case ncclInt32: 116 | case ncclUint32: 117 | //case ncclFloat32: 118 | #endif 119 | return 4; 120 | case ncclInt64: 121 | case ncclUint64: 122 | case ncclDouble: 123 | //case ncclFloat64: 124 | return 8; 125 | default: return 0; 126 | } 127 | } 128 | -------------------------------------------------------------------------------- /nccl/multi_devices_per_thread.cu: -------------------------------------------------------------------------------- 1 | /* 2 | * Ref: https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/examples.html 3 | * Compile: nvcc -lnccl -ccbin g++ -std=c++11 -O3 -g multi_devices_per_thread.cu -o multi_devices_per_thread 4 | */ 5 | 6 | #include "comm.h" 7 | 8 | int main(int argc, char *argv[]) 9 | { 10 | ncclComm_t comms[DEFAULT_DEVICES_NUM]; 11 | int nDev = DEFAULT_DEVICES_NUM; 12 | int size = 1024 * 1024; 13 | 14 | // allocating and initializing device buffers 15 | float **sendbuff = (float **)malloc(nDev * sizeof(float *)); 16 | float **recvbuff = (float **)malloc(nDev * sizeof(float *)); 17 | cudaStream_t *s = (cudaStream_t *)malloc(sizeof(cudaStream_t) * nDev); 18 | 19 | for (int i = 0; i < nDev; ++i) { 20 | CUDACHECK(cudaSetDevice(i)); 21 | CUDACHECK(cudaMalloc((void **)sendbuff + i, size * sizeof(float))); 22 | CUDACHECK(cudaMalloc((void **)recvbuff + i, size * sizeof(float))); 23 | CUDACHECK(cudaMemset(sendbuff[i], 1, size * sizeof(float))); 24 | CUDACHECK(cudaMemset(recvbuff[i], 0, size * sizeof(float))); 25 | CUDACHECK(cudaStreamCreate(s + i)); 26 | } 27 | 28 | // initializing NCCL 29 | NCCLCHECK(ncclCommInitAll(comms, nDev, gpu_ids)); 30 | 31 | 
// calling NCCL communication API. Group API is required when using 32 | // multiple devices per thread 33 | NCCLCHECK(ncclGroupStart()); 34 | for (int i = 0; i < nDev; ++i) 35 | NCCLCHECK( 36 | ncclAllReduce((const void *)sendbuff[i], (void *)recvbuff[i], size, ncclFloat, ncclSum, comms[i], s[i])); 37 | NCCLCHECK(ncclGroupEnd()); 38 | 39 | // synchronizing on CUDA streams to wait for completion of NCCL operation 40 | for (int i = 0; i < nDev; ++i) { 41 | CUDACHECK(cudaSetDevice(i)); 42 | CUDACHECK(cudaStreamSynchronize(s[i])); 43 | } 44 | 45 | // free device buffers 46 | for (int i = 0; i < nDev; ++i) { 47 | CUDACHECK(cudaSetDevice(i)); 48 | CUDACHECK(cudaFree(sendbuff[i])); 49 | CUDACHECK(cudaFree(recvbuff[i])); 50 | } 51 | 52 | // finalizing NCCL 53 | for (int i = 0; i < nDev; ++i) 54 | ncclCommDestroy(comms[i]); 55 | 56 | printf("Success \n"); 57 | return 0; 58 | } -------------------------------------------------------------------------------- /nccl/nccl_with_mpi.cu: -------------------------------------------------------------------------------- 1 | /* 2 | * Use MPI to connect nccl. 3 | * Source code: https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/examples.html 4 | * Compile: nvcc -lmpi -lnccl -I/usr/local/mpi/include/ -L/usr/local/mpi/lib/ -ccbin g++ -std=c++11 -O3 -g nccl_with_mpi.cu -o test 5 | * Test: mpirun -n 6 --allow-run-as-root ./nccl_with_mpi 6 | */ 7 | 8 | #include "comm.h" 9 | #include "mpi.h" 10 | 11 | #define MPICHECK(cmd) \ 12 | do { \ 13 | int e = cmd; \ 14 | if (e != MPI_SUCCESS) { \ 15 | printf("Failed: MPI error %s:%d '%d'\n", __FILE__, __LINE__, e); \ 16 | exit(EXIT_FAILURE); \ 17 | } \ 18 | } while (0) 19 | 20 | static uint64_t getHostHash(const char *string) 21 | { 22 | // Based on DJB2a, result = result * 33 ^ char 23 | uint64_t result = 5381; 24 | for (int c = 0; string[c] != '\0'; c++) { 25 | result = ((result << 5) + result) ^ string[c]; 26 | } 27 | return result; 28 | } 29 | 30 | static void getHostName(char *hostname, int maxlen) 31 | { 32 | gethostname(hostname, maxlen); 33 | for (int i = 0; i < maxlen; i++) { 34 | if (hostname[i] == '.') { 35 | hostname[i] = '\0'; 36 | return; 37 | } 38 | } 39 | } 40 | 41 | int main(int argc, char *argv[]) 42 | { 43 | int size = 32 * 1024 * 1024; 44 | 45 | int myRank, nRanks, localRank = 0; 46 | 47 | // initializing MPI 48 | MPICHECK(MPI_Init(&argc, &argv)); 49 | MPICHECK(MPI_Comm_rank(MPI_COMM_WORLD, &myRank)); 50 | MPICHECK(MPI_Comm_size(MPI_COMM_WORLD, &nRanks)); 51 | 52 | // calculating localRank based on hostname which is used in selecting a GPU 53 | uint64_t hostHashs[nRanks]; 54 | char hostname[1024]; 55 | getHostName(hostname, 1024); 56 | hostHashs[myRank] = getHostHash(hostname); 57 | MPICHECK(MPI_Allgather(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL, hostHashs, sizeof(uint64_t), MPI_BYTE, MPI_COMM_WORLD)); 58 | for (int p = 0; p < nRanks; p++) { 59 | if (p == myRank) 60 | break; 61 | if (hostHashs[p] == hostHashs[myRank]) 62 | localRank++; 63 | } 64 | 65 | ncclUniqueId id; 66 | ncclComm_t comm; 67 | float *sendbuff, *recvbuff; 68 | cudaStream_t s; 69 | 70 | // get NCCL unique ID at rank 0 and broadcast it to all others 71 | if (myRank == 0) 72 | ncclGetUniqueId(&id); 73 | MPICHECK(MPI_Bcast((void *)&id, sizeof(id), MPI_BYTE, 0, MPI_COMM_WORLD)); 74 | 75 | // picking a GPU based on localRank, allocate device buffers 76 | CUDACHECK(cudaSetDevice(localRank)); 77 | CUDACHECK(cudaMalloc(&sendbuff, size * sizeof(float))); 78 | CUDACHECK(cudaMalloc(&recvbuff, size * sizeof(float))); 79 | 
CUDACHECK(cudaStreamCreate(&s)); 80 | 81 | // initializing NCCL 82 | NCCLCHECK(ncclCommInitRank(&comm, nRanks, id, myRank)); 83 | 84 | // communicating using NCCL 85 | NCCLCHECK(ncclAllReduce((const void *)sendbuff, (void *)recvbuff, size, ncclFloat, ncclSum, comm, s)); 86 | 87 | // completing NCCL operation by synchronizing on the CUDA stream 88 | CUDACHECK(cudaStreamSynchronize(s)); 89 | 90 | // free device buffers 91 | CUDACHECK(cudaFree(sendbuff)); 92 | CUDACHECK(cudaFree(recvbuff)); 93 | 94 | // finalizing NCCL 95 | ncclCommDestroy(comm); 96 | 97 | // finalizing MPI 98 | MPICHECK(MPI_Finalize()); 99 | 100 | printf("[MPI Rank %d] Success \n", myRank); 101 | return 0; 102 | } -------------------------------------------------------------------------------- /nccl/node_client.cu: -------------------------------------------------------------------------------- 1 | /* 2 | * Implement two nodes communication via socket init. 3 | * Compile: nvcc -lnccl -ccbin g++ -std=c++11 -O3 -g node_client.cu -o node_client 4 | * Author: kevin.xie 5 | * Email: kaiyuanxie@yeah.net 6 | */ 7 | 8 | #include 9 | #include 10 | #include 11 | 12 | #include "comm.h" 13 | 14 | ncclUniqueId id; 15 | 16 | void *thread_function(void *arg) 17 | { 18 | int size = 32 * 1024; 19 | int gpu_id = *(int *)arg; 20 | cudaSetDevice(gpu_id); 21 | 22 | ncclComm_t comm; 23 | if (if_debug) 24 | std::cout << "Received from server: " << id.internal << std::endl; // debug 25 | 26 | NCCLCHECK(ncclCommInitRank(&comm, my_nranks*2, id, gpu_id + my_nranks)); 27 | 28 | float *sendbuff; 29 | float *recvbuff; 30 | float *hostData; 31 | cudaStream_t s; 32 | 33 | hostData = (float *)malloc(size * sizeof(float)); 34 | for (int i = 0; i < size; ++i) { 35 | hostData[i] = float(gpu_id); 36 | } 37 | 38 | CUDACHECK(cudaMalloc(&sendbuff, size * sizeof(float))); 39 | CUDACHECK(cudaMalloc(&recvbuff, size * sizeof(float))); 40 | cudaMemcpy(sendbuff, hostData, size * sizeof(float), cudaMemcpyHostToDevice); 41 | CUDACHECK(cudaStreamCreate(&s)); 42 | 43 | NCCLCHECK(ncclAllReduce((const void *)sendbuff, (void *)recvbuff, size, ncclFloat, ncclSum, comm, s)); 44 | DEBUG_PRINT("============ncclAllReduce == end=====.\n"); 45 | NCCLCHECK(ncclBroadcast((const void *)sendbuff, (void *)recvbuff, size, ncclFloat, 0, comm, s)); 46 | DEBUG_PRINT("============ncclBroadcast == end=====.\n"); 47 | 48 | // completing NCCL operation by synchronizing on the CUDA stream 49 | CUDACHECK(cudaStreamSynchronize(s)); 50 | cudaMemcpy(hostData, recvbuff, size * sizeof(float), cudaMemcpyDeviceToHost); 51 | printf("GPU:%d data: %f.\n", gpu_id, hostData[1]); 52 | 53 | ncclCommDestroy(comm); 54 | 55 | CUDACHECK(cudaFree(sendbuff)); 56 | CUDACHECK(cudaFree(recvbuff)); 57 | free(hostData); 58 | 59 | return NULL; 60 | } 61 | 62 | int main(int argc, char *argv[]) 63 | { 64 | env_init(argc, argv); 65 | int sock; 66 | struct sockaddr_in server_addr; 67 | const char *message = "Hello, server!"; 68 | 69 | sock = socket(AF_INET, SOCK_STREAM, 0); 70 | if (sock < 0) { 71 | std::cerr << "Cannot create socket" << std::endl; 72 | return 1; 73 | } 74 | 75 | server_addr.sin_family = AF_INET; 76 | server_addr.sin_port = htons(server_port); 77 | inet_pton(AF_INET, server_hostname.c_str(), &server_addr.sin_addr); 78 | 79 | if (connect(sock, (struct sockaddr *)&server_addr, sizeof(server_addr)) < 0) { 80 | std::cerr << "Cannot connect to the server" << std::endl; 81 | return 1; 82 | } 83 | 84 | std::cout << "Connected to the server" << std::endl; 85 | 86 | if (send(sock, message, strlen(message), 0) < 0) { 87 
| std::cerr << "Cannot send message" << std::endl; 88 | return 1; 89 | } 90 | 91 | ssize_t recv_size = recv(sock, id.internal, 128, 0); 92 | if (recv_size > 0 && if_debug) { 93 | std::cout << "Received from server: " << id.internal << std::endl; 94 | } 95 | close(sock); 96 | 97 | pthread_t threads[8]; 98 | for (int i = 0; i < my_nranks; ++i) { 99 | int *id_pointer = &gpu_ids[i]; 100 | pthread_create(&threads[i], NULL, thread_function, id_pointer); 101 | } 102 | 103 | for (int i = 0; i < my_nranks; ++i) { 104 | pthread_join(threads[i], NULL); 105 | } 106 | 107 | printf("All threads finished successfully.\n"); 108 | 109 | return 0; 110 | } -------------------------------------------------------------------------------- /nccl/node_server.cu: -------------------------------------------------------------------------------- 1 | 2 | /* 3 | * Implement two nodes communication via socket init. 4 | * Compile: nvcc -lnccl -ccbin g++ -std=c++11 -O3 -g node_server.cu -o node_server 5 | * Author: kevin.xie 6 | * Email: kaiyuanxie@yeah.net 7 | */ 8 | 9 | #include 10 | #include 11 | #include 12 | 13 | #include "comm.h" 14 | 15 | ncclUniqueId id; 16 | 17 | void *thread_function(void *arg) 18 | { 19 | int size = 32 * 1024; 20 | int gpu_id = *(int *)arg; 21 | cudaSetDevice(gpu_id); 22 | 23 | ncclComm_t comm; 24 | NCCLCHECK(ncclCommInitRank(&comm, my_nranks*2, id, gpu_id)); 25 | DEBUG_PRINT("============ncclCommInitRank: init end.=============\n"); // debug 26 | 27 | float *sendbuff; 28 | float *recvbuff; 29 | float *hostData; 30 | cudaStream_t s; 31 | 32 | hostData = (float *)malloc(size * sizeof(float)); 33 | for (int i = 0; i < size; ++i) { 34 | hostData[i] = float(gpu_id); 35 | } 36 | 37 | CUDACHECK(cudaMalloc(&sendbuff, size * sizeof(float))); 38 | CUDACHECK(cudaMalloc(&recvbuff, size * sizeof(float))); 39 | cudaMemcpy(sendbuff, hostData, size * sizeof(float), cudaMemcpyHostToDevice); 40 | CUDACHECK(cudaStreamCreate(&s)); 41 | 42 | NCCLCHECK(ncclAllReduce((const void *)sendbuff, (void *)recvbuff, size, ncclFloat, ncclSum, comm, s)); 43 | DEBUG_PRINT("============ncclAllReduce ===== end =====.\n"); 44 | 45 | NCCLCHECK(ncclBroadcast((const void *)recvbuff, (void *)recvbuff, size, ncclFloat, 0, comm, s)); 46 | DEBUG_PRINT("============ncclBroadcast ===== end =====.\n"); 47 | 48 | // completing NCCL operation by synchronizing on the CUDA stream 49 | CUDACHECK(cudaStreamSynchronize(s)); 50 | cudaMemcpy(hostData, recvbuff, size * sizeof(float), cudaMemcpyDeviceToHost); 51 | printf("GPU:%d data: %f.\n", gpu_id, hostData[1]); 52 | 53 | CUDACHECK(cudaStreamSynchronize(s)); 54 | ncclCommDestroy(comm); 55 | 56 | CUDACHECK(cudaFree(sendbuff)); 57 | CUDACHECK(cudaFree(recvbuff)); 58 | free(hostData); 59 | 60 | return NULL; 61 | } 62 | 63 | int main(int argc, char *argv[]) 64 | { 65 | env_init(argc, argv); 66 | int server_socket, client_socket; 67 | struct sockaddr_in server_addr, client_addr; 68 | socklen_t client_len = sizeof(client_addr); 69 | char buffer[1024]; 70 | 71 | server_socket = socket(AF_INET, SOCK_STREAM, 0); 72 | if (server_socket < 0) { 73 | std::cerr << "Cannot create socket" << std::endl; 74 | return 1; 75 | } 76 | 77 | int opt = 1; 78 | setsockopt(server_socket, SOL_SOCKET, SO_REUSEADDR, &opt, sizeof(opt)); 79 | 80 | server_addr.sin_family = AF_INET; 81 | server_addr.sin_addr.s_addr = INADDR_ANY; 82 | server_addr.sin_port = htons(server_port); 83 | 84 | if (bind(server_socket, (struct sockaddr *)&server_addr, sizeof(server_addr)) < 0) { 85 | std::cerr << "Cannot bind" << std::endl; 86 | return 1; 87 
| } 88 | 89 | if (listen(server_socket, 5) < 0) { 90 | std::cerr << "Cannot listen" << std::endl; 91 | return 1; 92 | } 93 | 94 | std::cout << "Server is listening on port " << server_port << std::endl; 95 | 96 | client_socket = accept(server_socket, (struct sockaddr *)&client_addr, &client_len); 97 | if (client_socket < 0) { 98 | std::cerr << "Cannot accept connection" << std::endl; 99 | return 1; 100 | } 101 | 102 | std::cout << "Accepted connection from " << inet_ntoa(client_addr.sin_addr) << std::endl; 103 | ssize_t recv_size = recv(client_socket, buffer, sizeof(buffer), 0); 104 | if (recv_size > 0) { 105 | buffer[recv_size] = '\0'; 106 | std::cout << "Received message: " << buffer << std::endl; 107 | } 108 | 109 | pthread_t threads[8]; 110 | 111 | NCCLCHECK(ncclGetUniqueId(&id)); 112 | if (if_debug) 113 | std::cout << "=================ncclGetUniqueId================" << buffer << std::endl; // debug 114 | 115 | if (send(client_socket, id.internal, 128, 0) < 0) { 116 | std::cerr << "Cannot send message to the client" << std::endl; 117 | } 118 | 119 | close(client_socket); 120 | close(server_socket); 121 | 122 | for (int i = 0; i < my_nranks; ++i) { 123 | int *id_pointer = &gpu_ids[i]; 124 | pthread_create(&threads[i], NULL, thread_function, id_pointer); 125 | } 126 | 127 | for (int i = 0; i < my_nranks; ++i) { 128 | pthread_join(threads[i], NULL); 129 | } 130 | 131 | printf("Server finished successfully.\n"); 132 | return 0; 133 | } -------------------------------------------------------------------------------- /nccl/nonblocking_double_streams.cu: -------------------------------------------------------------------------------- 1 | /* 2 | * Implement an non-blocking example of overlapping communication 3 | * Compile: nvcc -lnccl -ccbin g++ -std=c++11 -O3 -g nonblocking_double_streams.cu -o nonblocking_double_streams 4 | * Author: kevin.xie 5 | * Email: kaiyuanxie@yeah.net 6 | */ 7 | 8 | #include 9 | 10 | #include "comm.h" 11 | 12 | void *allReduceOps(void *args); 13 | 14 | struct ThreadArgs { 15 | int gpu_id; 16 | int global_size; 17 | ncclUniqueId *id; 18 | int uuid = -1; 19 | ThreadArgs(int gpu_id, int global_size, ncclUniqueId *id) 20 | : gpu_id(gpu_id) 21 | , id(id) 22 | , global_size(global_size) {}; 23 | ThreadArgs(int gpu_id, int global_size, ncclUniqueId *id, int uuid) 24 | : gpu_id(gpu_id) 25 | , id(id) 26 | , global_size(global_size) 27 | , uuid(uuid) {}; 28 | }; 29 | 30 | bool cmpID(ncclUniqueId *id1, ncclUniqueId *id2) 31 | { 32 | if (memcmp(id1->internal, id2->internal, sizeof(id2->internal)) == 0) { 33 | printf("id1:%p is same with id2:%p\n", id1, id2); 34 | return false; 35 | } else { 36 | for (int i = 0; i < 128; i++) { 37 | char id1_ch = (id1->internal)[i]; 38 | char id2_ch = (id2->internal)[i]; 39 | if (id1_ch != id2_ch) 40 | printf("Id diff internal idx_%d: id1:%c id2:%c\n", i, id1_ch, id2_ch); 41 | } 42 | return true; 43 | } 44 | } 45 | 46 | void printTimestamp(int gpu_id, int uuid, const char *s) 47 | { 48 | struct timeval now; 49 | struct tm timeinfo; 50 | if (gettimeofday(&now, NULL) == -1) { 51 | perror("gettimeofday"); 52 | } 53 | localtime_r(&(now.tv_sec), &timeinfo); 54 | char time_string[80]; 55 | strftime(time_string, sizeof(time_string), "%Y-%m-%d %H:%M:%S", &timeinfo); 56 | 57 | char time_string_with_ms[100]; 58 | snprintf(time_string_with_ms, sizeof(time_string_with_ms), "%s.%03ld", time_string, (long)now.tv_usec / 1000); 59 | printf("Group: %d GPU idx: %d. 
The %s time: %s\n", uuid, gpu_id, s, time_string_with_ms); 60 | } 61 | 62 | class CommExec { 63 | int gpu_nums; 64 | cudaStream_t s; 65 | pthread_t threads[8]; 66 | bool end_flag = true; 67 | 68 | public: 69 | ncclUniqueId *id_ref; 70 | CommExec(int gpu_nums) 71 | : gpu_nums(gpu_nums) 72 | { 73 | } 74 | 75 | void launch(ncclUniqueId &id, ops func) 76 | { 77 | NCCLCHECK(ncclGetUniqueId(&id)); 78 | id_ref = &id; 79 | for (int i = 0; i < gpu_nums; i++) { 80 | ThreadArgs *args = new ThreadArgs(i, gpu_nums, &id); 81 | pthread_create(&threads[i], NULL, func, (void *)args); 82 | } 83 | end_flag = false; 84 | } 85 | 86 | void launch(ncclUniqueId &id, int uuid, ops func) 87 | { 88 | NCCLCHECK(ncclGetUniqueId(&id)); 89 | id_ref = &id; 90 | for (int i = 0; i < gpu_nums; i++) { 91 | ThreadArgs *args = new ThreadArgs(i, gpu_nums, &id, uuid); 92 | pthread_create(&threads[i], NULL, func, (void *)args); 93 | } 94 | end_flag = false; 95 | } 96 | 97 | ncclUniqueId *get() 98 | { 99 | return id_ref; 100 | } 101 | 102 | void wait() 103 | { 104 | 105 | for (int i = 0; i < gpu_nums; ++i) { 106 | pthread_join(threads[i], NULL); 107 | } 108 | end_flag = true; 109 | } 110 | 111 | ~CommExec() 112 | { 113 | if (!end_flag) 114 | wait(); 115 | } 116 | }; 117 | 118 | void *allReduceOps(void *args) 119 | { 120 | size_t size = 2e9; 121 | ThreadArgs *threadArgs = (struct ThreadArgs *)args; 122 | int gpu_id = threadArgs->gpu_id; 123 | cudaSetDevice(gpu_id); 124 | ncclComm_t comm; 125 | ncclResult_t state; 126 | ncclConfig_t config = NCCL_CONFIG_INITIALIZER; 127 | config.blocking = 0; 128 | 129 | ncclCommInitRankConfig(&comm, threadArgs->global_size, *(threadArgs->id), gpu_id, &config); 130 | do { 131 | NCCLCHECK(ncclCommGetAsyncError(comm, &state)); 132 | } while (state == ncclInProgress); 133 | 134 | float *sendbuff; 135 | float *recvbuff; 136 | float *hostData; 137 | cudaStream_t s; 138 | 139 | hostData = (float *)malloc(size * sizeof(float)); 140 | for (int i = 0; i < 20; ++i) { 141 | hostData[i] = float(gpu_id); 142 | } 143 | 144 | CUDACHECK(cudaMalloc(&sendbuff, size * sizeof(float))); 145 | CUDACHECK(cudaMalloc(&recvbuff, size * sizeof(float))); 146 | cudaMemcpy(sendbuff, hostData, size * sizeof(float), cudaMemcpyHostToDevice); 147 | 148 | CUDACHECK(cudaStreamCreate(&s)); 149 | CUDACHECK(cudaDeviceSynchronize()); 150 | if (if_debug) 151 | printTimestamp(gpu_id, threadArgs->uuid, "start"); 152 | NCCLCHECK(ncclAllReduce((const void *)sendbuff, (void *)recvbuff, size, ncclFloat, ncclSum, comm, s)); 153 | // In non-blocking mode, the elapsed time has no reference. 
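    // Note (hedged, per the NCCL non-blocking docs): with config.blocking = 0 the ncclAllReduce()
    // above may return while the collective is still ncclInProgress, so a host-side timestamp taken
    // here only reflects the enqueue attempt, not the communication itself; strictly, completion
    // would be confirmed by polling ncclCommGetAsyncError() until it stops returning ncclInProgress
    // before relying on the cudaStreamSynchronize(s) further down.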
154 | if (if_debug) 155 | printTimestamp(gpu_id, threadArgs->uuid, "first iter end"); 156 | 157 | for (int i = 0; i < 50; ++i) 158 | NCCLCHECK(ncclAllReduce((const void *)sendbuff, (void *)recvbuff, size, ncclFloat, ncclSum, comm, s)); 159 | 160 | // completing NCCL operation by synchronizing on the CUDA stream 161 | CUDACHECK(cudaStreamSynchronize(s)); 162 | if (if_debug) 163 | printTimestamp(gpu_id, threadArgs->uuid, "end"); 164 | 165 | cudaMemcpy(hostData, recvbuff, size * sizeof(float), cudaMemcpyDeviceToHost); 166 | printf("GPU:%d data: %f.\n", gpu_id, hostData[1]); 167 | 168 | ncclCommDestroy(comm); 169 | CUDACHECK(cudaFree(sendbuff)); 170 | CUDACHECK(cudaFree(recvbuff)); 171 | free(hostData); 172 | return NULL; 173 | } 174 | 175 | int main(int argc, char *argv[]) 176 | { 177 | env_init(argc, argv); 178 | ncclUniqueId id1; 179 | ncclUniqueId id2; 180 | CommExec commexec1(my_nranks); 181 | CommExec commexec2(my_nranks); 182 | commexec1.launch(id1, 1, allReduceOps); 183 | commexec2.launch(id2, 2, allReduceOps); 184 | if (if_debug) 185 | cmpID(commexec1.get(), commexec2.get()); 186 | commexec2.wait(); 187 | commexec1.wait(); 188 | printf("All streams finished successfully.\n"); 189 | return 0; 190 | } 191 | -------------------------------------------------------------------------------- /nccl/one_device_per_thread.cu: -------------------------------------------------------------------------------- 1 | /* 2 | * Ref: https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/examples.html 3 | * Compile: nvcc -lnccl -ccbin g++ -std=c++11 -O3 -g one_devices_per_thread.cu.cu -o one_devices_per_thread 4 | */ 5 | 6 | #include "comm.h" 7 | 8 | ncclUniqueId id; 9 | 10 | void *thread_function(void *arg) 11 | { 12 | int size = 32 * 1024 * 1024; 13 | int gpu_id = *(int *)arg; 14 | cudaSetDevice(gpu_id); 15 | 16 | ncclComm_t comm; 17 | NCCLCHECK(ncclCommInitRank(&comm, my_nranks, id, gpu_id)); 18 | 19 | float *sendbuff; 20 | float *recvbuff; 21 | float *hostData; 22 | cudaStream_t s; 23 | 24 | hostData = (float *)malloc(size * sizeof(float)); 25 | for (int i = 0; i < size; ++i) { 26 | hostData[i] = float(gpu_id); 27 | } 28 | 29 | CUDACHECK(cudaMalloc(&sendbuff, size * sizeof(float))); 30 | CUDACHECK(cudaMalloc(&recvbuff, size * sizeof(float))); 31 | cudaMemcpy(sendbuff, hostData, size * sizeof(float), cudaMemcpyHostToDevice); 32 | CUDACHECK(cudaStreamCreate(&s)); 33 | 34 | NCCLCHECK(ncclAllReduce((const void *)sendbuff, (void *)recvbuff, size, ncclFloat, ncclSum, comm, s)); 35 | cudaMemcpy(hostData, recvbuff, size * sizeof(float), cudaMemcpyDeviceToHost); 36 | printf("GPU:%d data: %f.\n", gpu_id, hostData[1]); 37 | 38 | // completing NCCL operation by synchronizing on the CUDA stream 39 | CUDACHECK(cudaStreamSynchronize(s)); 40 | ncclCommDestroy(comm); 41 | 42 | CUDACHECK(cudaFree(sendbuff)); 43 | CUDACHECK(cudaFree(recvbuff)); 44 | free(hostData); 45 | 46 | return NULL; 47 | } 48 | 49 | int main(int argc, char *argv[]) 50 | { 51 | env_init(argc, argv); 52 | pthread_t threads[8]; 53 | NCCLCHECK(ncclGetUniqueId(&id)); 54 | for (int i = 0; i < my_nranks; ++i) { 55 | int *id_pointer = &gpu_ids[i]; 56 | pthread_create(&threads[i], NULL, thread_function, id_pointer); 57 | } 58 | 59 | for (int i = 0; i < my_nranks; ++i) { 60 | pthread_join(threads[i], NULL); 61 | } 62 | 63 | printf("Finished successfully.\n"); 64 | return 0; 65 | } -------------------------------------------------------------------------------- /pytorch/torch1.13_mem_rationale/README.md: 
-------------------------------------------------------------------------------- 1 | # PyTorch Memory Cuda Allocator Test 2 | *Objective of this Submodule*: Compiling and executing the cudaCachingAllocator derived from the PyTorch source code presents a noteworthy challenge, especially when attempting to test individual segments of the c10 cuda components. 3 | The primary aim of this submodule is to distill and streamline the source code to ensure it can be effortlessly executed within various testing frameworks. 4 | 5 | ## Compile & Run 6 | 7 | Compile: 8 | ``` 9 | make 10 | ``` 11 | You will get an "allocator_test" executable; then run it: 12 | 13 | ``` 14 | ./allocator_test 15 | ``` 16 | 17 | To add a GPU architecture, change SMS in the Makefile (L255): 18 | 19 | ```shell 20 | SMS ?= 35 37 50 52 60 61 70 75 80 86 90 21 | ``` 22 | e.g. SMS=80 supports A100/A800 23 | 24 | SMS=90 supports H100/H800 25 | 26 | 27 | Note: Your CUDA nvcc version might report an "unsupported gpu architecture 'compute_35'" error. If so, delete 35 from SMS. 28 | -------------------------------------------------------------------------------- /pytorch/torch1.13_mem_rationale/TestAllocator.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * The cuda caching allocator tests 3 | * Author: kevin.xie 4 | * Email: kaiyuanxie@yeah.net 5 | * */ 6 | 7 | #include "CUDACachingAllocator.h" 8 | 9 | int main() 10 | { 11 | testDeviceCachingAllocator(); 12 | testDeviceCachingAllocatorE2E(); 13 | testDeviceCachingAllocatorSmallManagement(); 14 | testDeviceCachingAllocatorFragment(); 15 | return 0; 16 | } 17 | -------------------------------------------------------------------------------- /pytorch/torch_ext/README.md: -------------------------------------------------------------------------------- 1 | # PyTorch Extension Custom C++/CUDA 2 | 3 | This module helps you learn how to bind or replace a C++/CUDA function in PyTorch Python code. 4 | 5 | ## Case 1: easy_jit 6 | 7 | Requirements: 8 | 1. PyTorch (> 1.8 is better) 9 | 2. Ninja 10 | 3. CUDA 11 | 12 | 13 | Run: 14 | ``` 15 | $ cd easy_jit 16 | $ python run.py 17 | ``` 18 | 19 | Common issues: 20 | 1. RuntimeError: Ninja is required to load C++ extensions 21 | 22 | Solution: pip install Ninja 23 | 24 | This example implements a custom C++ function that prints a tensor array. 25 | The code is fewer than 20 lines; it is the first step to help you 26 | understand the process. The key elements are as follows: 27 | 28 | 1. pybind11: binds custom code to Python. 29 | 2. #include <torch/extension.h>: includes PyTorch-defined funcs/params/kernels, e.g. torch::Tensor 30 | 3. from torch.utils.cpp_extension import load: calls the Ninja JIT to compile the code and import it into Python. 31 | 32 | ## Case 2: easy_setup 33 | 34 | Run: 35 | ``` 36 | $ cd easy_setup 37 | $ python setup.py install 38 | $ python run.py 39 | ``` 40 | The setup method does not recompile the code every time; it installs the extension as a 41 | python module. 42 | 43 | ## Case 3: sum_array 44 | 45 | Run: 46 | ``` 47 | $ cd sum_array 48 | $ python run.py 49 | ``` 50 | 51 | This example shows how to use a CUDA kernel to implement a custom sum of a tensor array. 52 | You might find the custom one runs faster than torch.sum(). 53 | 54 | The result looks like: 55 | 56 | ``` 57 | ... 58 | Loading extension module sum_array... 
59 | tensor(24969.7930, device='cuda:0') 60 | tensor(24969.7930, device='cuda:0') 61 | The torch original sum func test: 62 | Elapsed time: 0.07710027694702148 63 | The custom define sum func test: 64 | Elapsed time: 0.06388998031616211 65 | ``` 66 | 67 | ## Case 4: lltm_demo 68 | 69 | Run the custom lltm with JIT: 70 | ``` 71 | $ cd lltm_demo 72 | $ python run_custom_lltm.py 73 | ``` 74 | 75 | Run the PyTorch API baseline: 76 | ``` 77 | $ python run_baseline.py 78 | ``` 79 | Result e.g.: 80 | ``` 81 | ... 82 | Custom lltm_cuda result: 83 | Forward: min:0.130 ms avg:0.134 ms | Backward min: 0.240 ms avg: 0.246 ms 84 | ... 85 | PyTorch baseline result: 86 | Forward: min:0.121 ms avg:0.142 ms | Backward min: 0.427 ms avg: 0.483 ms 87 | ``` 88 | 89 | ### Chinese Doc 90 | [PyTorch Custom CUDA/C++](https://zhuanlan.zhihu.com/p/579395211) 91 | -------------------------------------------------------------------------------- /pytorch/torch_ext/binding_examples/README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | ## Step One (Basic examples) 4 | 5 | We connect C++ code to Python through a library: pybind11. Before you start, make sure it is installed correctly. 6 | 7 | The simplest way: 8 | ```shell 9 | pip install pybind11 10 | ``` 11 | ### Case 1: How to call a function. 12 | Related files in the "basics" folder: 13 | * "functions.cc": your C++ function implementation. 14 | * "function_call.py": a demo that calls the C++ functions. 15 | 16 | pybind11 provides a way to connect C++ with Python: 17 | 18 | The binding operations are all in a header file; it does not rely on any other lib. 19 | The syntax is as follows: 20 | 21 | Include the header file and reference the namespace: 22 | ```c++ 23 | #include <pybind11/pybind11.h> 24 | namespace py = pybind11; 25 | ``` 26 | Create the glue func/cls: 27 | ```c++ 28 | PYBIND11_MODULE(functions, m) { 29 | m.doc() = "pybind11 example plugin"; // optional module docstring. Could be printed by python help(). 30 | m.def("add", &add, "A function that adds two numbers"); 31 | } 32 | ``` 33 | Parameter explanation: 34 | * PYBIND11_MODULE() macro: Creates functions and classes for Python to call. 35 | * functions: The module name created for importing in the Python env. 36 | * m: A variable of type py::module_ which is the main interface for creating bindings. 37 | * .doc : Defines the module docstring. 38 | * .def : Defines a Python func for calling. 39 | * parameter-"add": The function name in Python; it can be changed to any other name. 40 | * parameter-&add: The C++ pointer to the target function. 41 | * parameter-"...": A description of this function. 42 | 43 | Running the snippet: 44 | 1. Use C++ to compile a Python lib: 45 | functions.so. 46 | ```shell 47 | # Compiler: g++/gcc. 48 | g++ -O3 -Wall -shared -std=c++11 -fPIC $(python3 -m pybind11 --includes) functions.cc -o functions.so 49 | ``` 50 | 51 | 2. Run "function_call.py"; you'll see the info below: 52 | 53 | ``` 54 | Add called, input numbers: i=3 j=4 55 | 7 56 | ``` 57 | 58 | **Note:** 59 | The pybind11 Python pkg provides the binding headers. You can see where they live with: 60 | ```shell 61 | echo $(python3 -m pybind11 --includes) 62 | ``` 63 | The console output looks like: 64 | ``` 65 | -I/home/kaiyuan/anaconda3/envs/py3.9/include/python3.9 -I/home/kaiyuan/anaconda3/envs/py3.9/lib/python3.9/site-packages/pybind11/include 66 | ``` 67 | 68 | ### Case 2: How to call objects. 69 | 70 | This example shows basic C++ class calling and how to deal with overloading and inheritance. 71 | 72 | Related files in the "basics" folder: 73 | * "classes.cc": your C++ classes implementation. 
74 | * "classes_call.py": a demo that calls the C++ objects. 75 | 76 | 1. Compile to get the .so: 77 | ```shell 78 | g++ -O3 -Wall -shared -std=c++11 -fPIC $(python3 -m pybind11 --includes) classes.cc -o classes.so 79 | ``` 80 | 81 | 2. Run "classes_call.py"; you'll see the info below: 82 | 83 | ``` 84 | The profile of this shape: 85 | Description:Basic example of c++ class 86 | Num:1 87 | Area:10.000000 88 | The profile of this shape: 89 | Description:Basic example of c++ class 90 | Num:2 91 | Area:20.000000 92 | 100.0 93 | ``` 94 | 95 | ## Advanced Practice 96 | 97 | 98 | ## Building 99 | 100 | Both Python setuptools and CMake are available; which one to use depends on your needs. 101 | 102 | There are two good examples on the pybind GitHub: 103 | 104 | * python example: https://github.com/pybind/python_example 105 | * cmake example: https://github.com/pybind/cmake_example 106 | 107 | -------------------------------------------------------------------------------- /pytorch/torch_ext/binding_examples/basics/classes.cc: -------------------------------------------------------------------------------- 1 | 2 | #include <pybind11/pybind11.h> 3 | #include <string> 4 | 5 | namespace py = pybind11; 6 | 7 | class Shape { 8 | int num; 9 | float area; 10 | 11 | public: 12 | std::string profile; 13 | Shape(int num, float area, const std::string& profile) : num(num), area(area), profile(profile) {} 14 | void setProperty(int num_) { num = num_; } 15 | void setProperty(float area_) { area = area_; } // function overload 16 | float getArea() const { return area; } 17 | std::string getProfile() const { return "Description:" + profile + "\nNum:" + std::to_string(num) + "\nArea:" + std::to_string(area); } 18 | }; 19 | 20 | class Rectangle : public Shape { 21 | int length; 22 | int width; 23 | public: 24 | Rectangle(int num, int length, int width, const std::string& profile) 25 | : Shape(num, length * width, profile), length(length), width(width) {} 26 | void resetSize(int length, int width) { 27 | this->length = length; 28 | this->width = width; 29 | } 30 | }; 31 | 32 | PYBIND11_MODULE(classes, m) { 33 | py::class_<Shape>(m, "Shape") 34 | .def(py::init<int, float, const std::string&>()) 35 | .def("setProperty", static_cast<void (Shape::*)(int)>(&Shape::setProperty), "Set the shape property of num.") 36 | .def("setProperty", static_cast<void (Shape::*)(float)>(&Shape::setProperty), "Set the shape property of area.") 37 | .def("__repr__", [](Shape& shape) { return "The profile of this shape:\n" + shape.getProfile(); }); 38 | py::class_<Rectangle, Shape>(m, "Rectangle") 39 | .def(py::init<int, int, int, const std::string&>()) 40 | .def("getArea", static_cast<float (Rectangle::*)() const>(&Rectangle::getArea), "Get the area.") 41 | .def("resetSize", static_cast<void (Rectangle::*)(int, int)>(&Rectangle::resetSize), "Reset size of rectangle.") 42 | .def_readwrite("profile", &Rectangle::profile); 43 | } 44 | -------------------------------------------------------------------------------- /pytorch/torch_ext/binding_examples/basics/classes_call.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | # The c++ .so pkg is needed before importing. 4 | import classes 5 | 6 | """ 7 | The snippet shows how to invoke a bound class in python, as below: 8 | """ 9 | 10 | # Create obj from c++ class. 11 | shape = classes.Shape(1, 10, "Basic example of c++ class") 12 | print(shape) 13 | 14 | # Change the area by invoking the "setProperty" func with a 'float' type input. 15 | shape.setProperty(20.0) 16 | 17 | # "setProperty" func has been overloaded, so it can also change the num. 
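# Note (added for clarity): pybind11 resolves overloads by trying the registered
# signatures in order, first without implicit argument conversions and then again
# allowing them, so the int argument below picks the int overload of setProperty.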
18 | shape.setProperty(2) 19 | 20 | # Then we check the info: 21 | print(shape) 22 | 23 | # It could raise error while call a function not defined in "PYBIND11_MODULE" 24 | # shape.getProfile() 25 | 26 | """ 27 | Inheritance example: 28 | """ 29 | 30 | rect = classes.Rectangle(1, 10, 10, "Inheritance example.") 31 | print(rect.getArea()) 32 | 33 | # Profile attribution could be read&write: 34 | rect.profile = "The profile description has been changed!" 35 | print(rect) 36 | -------------------------------------------------------------------------------- /pytorch/torch_ext/binding_examples/basics/function_call.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | import functions 4 | 5 | print(functions.add(3, 4)) 6 | 7 | # if you want to know info created by pybind using help(): 8 | # help(functions.add) 9 | -------------------------------------------------------------------------------- /pytorch/torch_ext/binding_examples/basics/functions.cc: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | int add(int i = 2, int j = 3) { 5 | std::cout << "Add called, input numbers: i=" << i << " j=" << j << std::endl; 6 | return i + j; 7 | } 8 | 9 | PYBIND11_MODULE(functions, m) { 10 | m.doc() = "pybind11 example plugin"; // optional module docstring. Could be printed by python help(). 11 | m.def("add", &add, "A function that adds two numbers"); 12 | } 13 | 14 | -------------------------------------------------------------------------------- /pytorch/torch_ext/binding_examples/bind_practices/classes_lib.cc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CalvinXKY/BasicCUDA/5bf47d64954274aa90bd0b9bb01c2de7462bf513/pytorch/torch_ext/binding_examples/bind_practices/classes_lib.cc -------------------------------------------------------------------------------- /pytorch/torch_ext/binding_examples/bind_practices/classes_lib.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CalvinXKY/BasicCUDA/5bf47d64954274aa90bd0b9bb01c2de7462bf513/pytorch/torch_ext/binding_examples/bind_practices/classes_lib.h -------------------------------------------------------------------------------- /pytorch/torch_ext/binding_examples/bind_practices/classes_lib_bind.cc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CalvinXKY/BasicCUDA/5bf47d64954274aa90bd0b9bb01c2de7462bf513/pytorch/torch_ext/binding_examples/bind_practices/classes_lib_bind.cc -------------------------------------------------------------------------------- /pytorch/torch_ext/binding_examples/bind_practices/classes_practice.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CalvinXKY/BasicCUDA/5bf47d64954274aa90bd0b9bb01c2de7462bf513/pytorch/torch_ext/binding_examples/bind_practices/classes_practice.py -------------------------------------------------------------------------------- /pytorch/torch_ext/binding_examples/bind_practices/functions_lib.cc: -------------------------------------------------------------------------------- 1 | 2 | #include 3 | #include 4 | 5 | namespace py = pybind11; 6 | 7 | /** 8 | * Function source code 9 | * 10 | * **/ 11 | 12 | int addTwoNum(int a, int b){ 13 | return a + b; 14 | } 15 | 16 | /** 17 | * overload functions 18 | * **/ 19 | 20 | void printInfo(int digis) 
{ 21 | std::cout << "Your input is integer:" << std::to_string(digis) << std::endl; 22 | } 23 | 24 | void printInfo(float digis) { 25 | std::cout << "Your input is string:" << std::to_string(digis) << std::endl; 26 | } 27 | 28 | /** 29 | * inplace case: 30 | **/ 31 | 32 | void inplaceAdd(int& src, int increment) { 33 | src += increment; 34 | } 35 | 36 | struct Data{ 37 | int num=0; 38 | }; 39 | 40 | void inplaceAddV2(Data& data, int increment) { 41 | data.num += increment; 42 | } 43 | 44 | void setDataPtr100(Data* data) { 45 | data->num = 100; 46 | } 47 | 48 | /** 49 | * global variable: 50 | **/ 51 | int worldCount = 9; 52 | 53 | /** 54 | * template function 55 | * **/ 56 | template 57 | T multiply(const T& a, const T& b) { 58 | return a * b; 59 | } 60 | 61 | /** 62 | * Allow/Prohibiting None arguments 63 | * **/ 64 | void showDataNum(Data* data) { 65 | if (data) { 66 | std::cout << "The data.num:" << data->num << std::endl; 67 | return; 68 | } 69 | std::cout << "No data input" << std::endl; 70 | } 71 | 72 | /** 73 | * recall function 74 | * **/ 75 | 76 | typedef int (*FUN)(int); 77 | 78 | int addOne(int a){ 79 | a += 1; 80 | return a; 81 | } 82 | 83 | void recallFunc(FUN f) { 84 | int a = 10; 85 | a = f(a); 86 | } 87 | 88 | 89 | PYBIND11_MODULE(functions, m) { 90 | m.def("add_two_num", &addTwoNum, "Input int a and int b,return a + b"); 91 | m.def("add_two_num_with_default", &addTwoNum, "default a=1, b=2", py::arg("a")=1, py::arg("b")=2); 92 | m.def("printInfo", static_cast(&printInfo), "Overload examples", py::arg("digis")); 93 | m.def("inplace_add", &inplaceAdd, "Expect: input(&a, b), a += b, but it does not work."); 94 | m.def("inplace_add_use_struct", &inplaceAddV2, "data.num += b"); 95 | py::class_(m, "Data") 96 | .def(py::init<>()) 97 | .def_readwrite("num", &Data::num); 98 | m.def("set_data_ptr_100", &setDataPtr100, "data->num= 100"); 99 | m.attr("worldCount")=worldCount; 100 | m.def("multiply", &multiply); 101 | m.def("multiply", &multiply); 102 | m.def("multiply_float", &multiply, py::arg("a").noconvert(), py::arg("b").noconvert()); 103 | m.def("show_data_num", &showDataNum, py::arg("data").none(false)); 104 | m.def("show_data_num_allow_none", &showDataNum, py::arg("data").none(true)); 105 | } 106 | 107 | -------------------------------------------------------------------------------- /pytorch/torch_ext/binding_examples/bind_practices/functions_lib.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CalvinXKY/BasicCUDA/5bf47d64954274aa90bd0b9bb01c2de7462bf513/pytorch/torch_ext/binding_examples/bind_practices/functions_lib.h -------------------------------------------------------------------------------- /pytorch/torch_ext/binding_examples/bind_practices/functions_lib_bind.cc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CalvinXKY/BasicCUDA/5bf47d64954274aa90bd0b9bb01c2de7462bf513/pytorch/torch_ext/binding_examples/bind_practices/functions_lib_bind.cc -------------------------------------------------------------------------------- /pytorch/torch_ext/binding_examples/bind_practices/functions_practice.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | import functions 4 | 5 | def divide_print(info): 6 | print("\n", "-"*40, "\n", "-"*5, info, "\n", "-"*40) 7 | 8 | # Inference will not work 9 | divide_print(" Add case") 10 | print("4+3=", functions.add_two_num(4, 3)) 11 | print("With default args, the 
result is:", functions.add_two_num_with_default()) 12 | 13 | # Functions overload, the last one works: 14 | divide_print(" overlaod case") 15 | functions.printInfo(10) 16 | 17 | 18 | # Certain basic Python types (like str, int, bool, float, etc.) are immutable. See "Limitations involving reference arguments" 19 | # Inference will not work 20 | divide_print(" Inplace case") 21 | num = 10 22 | print("Before inplace opt, num: ", num) 23 | functions.inplace_add(num, 10) 24 | print("After inplace opt, num: ", num) 25 | 26 | # Sturct data type is OK while using inplace operation 27 | divide_print("Inplace case (corrected):") 28 | data = functions.Data() 29 | data.num = 10 30 | print("Before inplace opt, data.num: ", data.num) 31 | functions.inplace_add_use_struct(data, 4) 32 | print("After inplace opt, data.num: ",data.num) 33 | 34 | # data pointer 35 | divide_print("Function with struct ptr variable in c++ called in python") 36 | data.num = 0 37 | functions.set_data_ptr_100(data) 38 | print(data.num) 39 | 40 | # Call a variable: 41 | # Sturct data type is OK while using inplace operation 42 | divide_print("Global variable:") 43 | print("Print the variable:",functions.worldCount) 44 | functions.worldCount = 2 45 | print("Change the variable:",functions.worldCount) 46 | 47 | # Template: 48 | divide_print("Template: multiply(T, T)") 49 | print("int * int:", functions.multiply(2, 3)) 50 | print("float * float:", functions.multiply(2.0, 3.0)) 51 | 52 | # Explicit args, no convert: 53 | print("float * float (not allow convert):") 54 | 55 | try: 56 | functions.multiply_float(1,3) 57 | except TypeError as e: 58 | print("TypeError Case: \n", e) 59 | 60 | # Allow/Prohibiting None arguments 61 | divide_print("Allow/Prohibiting None arguments") 62 | functions.show_data_num(data) 63 | try: 64 | # Run with None, will raise an error: 65 | functions.show_data_num(None) 66 | except TypeError as e: 67 | print("TypeError Case: \n", e) 68 | functions.show_data_num_allow_none(None) # That's ok. 
69 | 70 | # Recall function 71 | 72 | -------------------------------------------------------------------------------- /pytorch/torch_ext/easy_jit/demo.cu: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | void printArray(torch::Tensor input) { 4 | int *ptr = (int *)input.data_ptr(); 5 | for(int i=0; i < input.numel(); i++) { 6 | printf("%d\n", ptr[i]); 7 | } 8 | } 9 | 10 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 11 | m.def("print_array", &printArray, ""); 12 | } -------------------------------------------------------------------------------- /pytorch/torch_ext/easy_jit/run.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.utils.cpp_extension import load 3 | 4 | ext_module = load(name="demo", sources=["demo.cu"], verbose=True) 5 | print("Module directory: ", ext_module.__file__) 6 | ext_module.print_array(torch.tensor([4, 3, 2, 1], dtype=torch.int)) 7 | -------------------------------------------------------------------------------- /pytorch/torch_ext/easy_load/run_inline_v1.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.utils.cpp_extension import load_inline 3 | 4 | cpp_src = """ 5 | #include 6 | 7 | void printArray(torch::Tensor input) { 8 | int *ptr = (int *)input.data_ptr(); 9 | for(int i=0; i < input.numel(); i++) { 10 | printf("%d\\n", ptr[i]); 11 | } 12 | } 13 | 14 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 15 | m.def("print_array", &printArray, ""); 16 | } 17 | """ 18 | 19 | ext_module = load_inline(name="print_array", cpp_sources=cpp_src, verbose=True) 20 | ext_module.print_array(torch.tensor([4, 3, 2, 1], dtype=torch.int)) -------------------------------------------------------------------------------- /pytorch/torch_ext/easy_load/run_inline_v2.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.utils.cpp_extension import load_inline 3 | 4 | cpp_src = """ 5 | #include 6 | 7 | void printArray(torch::Tensor input) { 8 | int *ptr = (int *)input.data_ptr(); 9 | for(int i=0; i < input.numel(); i++) { 10 | printf("%d\\n", ptr[i]); 11 | } 12 | } 13 | 14 | static auto registry = torch::RegisterOperators("new_ops::print_array", &printArray); 15 | """ 16 | 17 | load_inline(name="print_array", cpp_sources=cpp_src, is_python_module=False, verbose=True) 18 | torch.ops.new_ops.print_array(torch.tensor([4, 3, 2, 1], dtype=torch.int)) -------------------------------------------------------------------------------- /pytorch/torch_ext/easy_load/run_inline_v3.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.utils.cpp_extension import load_inline 3 | 4 | cpp_src = """ 5 | #include 6 | 7 | void printArray(torch::Tensor input) { 8 | int *ptr = (int *)input.data_ptr(); 9 | for(int i = 0; i < input.numel(); i++) { 10 | printf("%d\\n", ptr[i]); 11 | } 12 | } 13 | 14 | void printReverseArray(torch::Tensor input) { 15 | int *ptr = (int *)input.data_ptr(); 16 | for(int i = input.numel()-1; i >= 0; --i) { 17 | printf("%d\\n", ptr[i]); 18 | } 19 | } 20 | 21 | static auto registry = torch::RegisterOperators("new_ops::print_array", &printArray) 22 | .op("new_ops::print_reverse_array", &printReverseArray); 23 | """ 24 | 25 | load_inline(name="print_array", cpp_sources=cpp_src, is_python_module=False, verbose=True) 26 | torch.ops.new_ops.print_array(torch.tensor([4, 3, 2, 1], 
dtype=torch.int)) 27 | print("Reverse:") 28 | torch.ops.new_ops.print_reverse_array(torch.tensor([4, 3, 2, 1], dtype=torch.int)) 29 | -------------------------------------------------------------------------------- /pytorch/torch_ext/easy_setup/my_extension.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | void printArray(torch::Tensor input) { 4 | int *ptr = (int *)input.data_ptr(); 5 | for(int i=0; i < input.numel(); i++) { 6 | printf("%d\n", ptr[i]); 7 | } 8 | } 9 | 10 | PYBIND11_MODULE(my_extension, m) { 11 | m.def("print_array", &printArray, ""); 12 | } 13 | -------------------------------------------------------------------------------- /pytorch/torch_ext/easy_setup/run.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import my_extension 3 | 4 | 5 | my_extension.print_array(torch.tensor([4, 3, 2, 1], dtype=torch.int)) 6 | -------------------------------------------------------------------------------- /pytorch/torch_ext/easy_setup/setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | from torch.utils.cpp_extension import BuildExtension, CUDAExtension 3 | 4 | setup(name='my_extension', 5 | ext_modules=[CUDAExtension('my_extension', ['my_extension.cpp']),], 6 | cmdclass={'build_ext': BuildExtension}) 7 | -------------------------------------------------------------------------------- /pytorch/torch_ext/lltm_demo/lltm_cuda.cpp: -------------------------------------------------------------------------------- 1 | /* torch cuda custom example */ 2 | #include 3 | 4 | #include 5 | 6 | // CUDA forward declarations 7 | 8 | std::vector lltm_cuda_forward( 9 | torch::Tensor input, 10 | torch::Tensor weights, 11 | torch::Tensor bias, 12 | torch::Tensor old_h, 13 | torch::Tensor old_cell); 14 | 15 | std::vector lltm_cuda_backward( 16 | torch::Tensor grad_h, 17 | torch::Tensor grad_cell, 18 | torch::Tensor new_cell, 19 | torch::Tensor input_gate, 20 | torch::Tensor output_gate, 21 | torch::Tensor candidate_cell, 22 | torch::Tensor X, 23 | torch::Tensor gate_weights, 24 | torch::Tensor weights); 25 | 26 | // C++ interface 27 | 28 | // NOTE: AT_ASSERT has become AT_CHECK on master after 0.4. 
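// Note: on recent PyTorch releases these checks would typically be written with
// TORCH_CHECK(x.is_cuda(), #x " must be a CUDA tensor"), since x.type().is_cuda() is
// deprecated; the AT_ASSERTM form below follows the original extension tutorial code.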
29 | #define CHECK_CUDA(x) AT_ASSERTM(x.type().is_cuda(), #x " must be a CUDA tensor") 30 | #define CHECK_CONTIGUOUS(x) AT_ASSERTM(x.is_contiguous(), #x " must be contiguous") 31 | #define CHECK_INPUT(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x) 32 | 33 | std::vector lltm_forward( 34 | torch::Tensor input, 35 | torch::Tensor weights, 36 | torch::Tensor bias, 37 | torch::Tensor old_h, 38 | torch::Tensor old_cell) { 39 | CHECK_INPUT(input); 40 | CHECK_INPUT(weights); 41 | CHECK_INPUT(bias); 42 | CHECK_INPUT(old_h); 43 | CHECK_INPUT(old_cell); 44 | 45 | return lltm_cuda_forward(input, weights, bias, old_h, old_cell); 46 | } 47 | 48 | std::vector lltm_backward( 49 | torch::Tensor grad_h, 50 | torch::Tensor grad_cell, 51 | torch::Tensor new_cell, 52 | torch::Tensor input_gate, 53 | torch::Tensor output_gate, 54 | torch::Tensor candidate_cell, 55 | torch::Tensor X, 56 | torch::Tensor gate_weights, 57 | torch::Tensor weights) { 58 | CHECK_INPUT(grad_h); 59 | CHECK_INPUT(grad_cell); 60 | CHECK_INPUT(input_gate); 61 | CHECK_INPUT(output_gate); 62 | CHECK_INPUT(candidate_cell); 63 | CHECK_INPUT(X); 64 | CHECK_INPUT(gate_weights); 65 | CHECK_INPUT(weights); 66 | 67 | return lltm_cuda_backward( 68 | grad_h, 69 | grad_cell, 70 | new_cell, 71 | input_gate, 72 | output_gate, 73 | candidate_cell, 74 | X, 75 | gate_weights, 76 | weights); 77 | } 78 | 79 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 80 | m.def("forward", &lltm_forward, "LLTM forward (CUDA)"); 81 | m.def("backward", &lltm_backward, "LLTM backward (CUDA)"); 82 | } 83 | -------------------------------------------------------------------------------- /pytorch/torch_ext/lltm_demo/lltm_cuda_kernel.cu: -------------------------------------------------------------------------------- 1 | /* torch cuda custom example */ 2 | 3 | #include 4 | 5 | #include 6 | #include 7 | 8 | #include 9 | 10 | namespace { 11 | template 12 | __device__ __forceinline__ scalar_t sigmoid(scalar_t z) { 13 | return 1.0 / (1.0 + exp(-z)); 14 | } 15 | 16 | template 17 | __device__ __forceinline__ scalar_t d_sigmoid(scalar_t z) { 18 | const auto s = sigmoid(z); 19 | return (1.0 - s) * s; 20 | } 21 | 22 | template 23 | __device__ __forceinline__ scalar_t d_tanh(scalar_t z) { 24 | const auto t = tanh(z); 25 | return 1 - (t * t); 26 | } 27 | 28 | template 29 | __device__ __forceinline__ scalar_t elu(scalar_t z, scalar_t alpha = 1.0) { 30 | return fmaxf(0.0, z) + fminf(0.0, alpha * (exp(z) - 1.0)); 31 | } 32 | 33 | template 34 | __device__ __forceinline__ scalar_t d_elu(scalar_t z, scalar_t alpha = 1.0) { 35 | const auto e = exp(z); 36 | const auto d_relu = z < 0.0 ? 0.0 : 1.0; 37 | return d_relu + (((alpha * (e - 1.0)) < 0.0) ? 
(alpha * e) : 0.0); 38 | } 39 | 40 | template 41 | __global__ void lltm_cuda_forward_kernel( 42 | const torch::PackedTensorAccessor gates, 43 | const torch::PackedTensorAccessor old_cell, 44 | torch::PackedTensorAccessor new_h, 45 | torch::PackedTensorAccessor new_cell, 46 | torch::PackedTensorAccessor input_gate, 47 | torch::PackedTensorAccessor output_gate, 48 | torch::PackedTensorAccessor candidate_cell) { 49 | //batch index 50 | const int n = blockIdx.y; 51 | // column index 52 | const int c = blockIdx.x * blockDim.x + threadIdx.x; 53 | if (c < gates.size(2)){ 54 | input_gate[n][c] = sigmoid(gates[n][0][c]); 55 | output_gate[n][c] = sigmoid(gates[n][1][c]); 56 | candidate_cell[n][c] = elu(gates[n][2][c]); 57 | new_cell[n][c] = 58 | old_cell[n][c] + candidate_cell[n][c] * input_gate[n][c]; 59 | new_h[n][c] = tanh(new_cell[n][c]) * output_gate[n][c]; 60 | } 61 | } 62 | 63 | template 64 | __global__ void lltm_cuda_backward_kernel( 65 | torch::PackedTensorAccessor d_old_cell, 66 | torch::PackedTensorAccessor d_gates, 67 | const torch::PackedTensorAccessor grad_h, 68 | const torch::PackedTensorAccessor grad_cell, 69 | const torch::PackedTensorAccessor new_cell, 70 | const torch::PackedTensorAccessor input_gate, 71 | const torch::PackedTensorAccessor output_gate, 72 | const torch::PackedTensorAccessor candidate_cell, 73 | const torch::PackedTensorAccessor gate_weights) { 74 | //batch index 75 | const int n = blockIdx.y; 76 | // column index 77 | const int c = blockIdx.x * blockDim.x + threadIdx.x; 78 | if (c < d_gates.size(2)){ 79 | const auto d_output_gate = tanh(new_cell[n][c]) * grad_h[n][c]; 80 | const auto d_tanh_new_cell = output_gate[n][c] * grad_h[n][c]; 81 | const auto d_new_cell = 82 | d_tanh(new_cell[n][c]) * d_tanh_new_cell + grad_cell[n][c]; 83 | 84 | 85 | d_old_cell[n][c] = d_new_cell; 86 | const auto d_candidate_cell = input_gate[n][c] * d_new_cell; 87 | const auto d_input_gate = candidate_cell[n][c] * d_new_cell; 88 | 89 | d_gates[n][0][c] = 90 | d_input_gate * d_sigmoid(gate_weights[n][0][c]); 91 | d_gates[n][1][c] = 92 | d_output_gate * d_sigmoid(gate_weights[n][1][c]); 93 | d_gates[n][2][c] = 94 | d_candidate_cell * d_elu(gate_weights[n][2][c]); 95 | } 96 | } 97 | } // namespace 98 | 99 | std::vector lltm_cuda_forward( 100 | torch::Tensor input, 101 | torch::Tensor weights, 102 | torch::Tensor bias, 103 | torch::Tensor old_h, 104 | torch::Tensor old_cell) { 105 | auto X = torch::cat({old_h, input}, /*dim=*/1); 106 | auto gate_weights = torch::addmm(bias, X, weights.transpose(0, 1)); 107 | 108 | const auto batch_size = old_cell.size(0); 109 | const auto state_size = old_cell.size(1); 110 | 111 | auto gates = gate_weights.reshape({batch_size, 3, state_size}); 112 | auto new_h = torch::zeros_like(old_cell); 113 | auto new_cell = torch::zeros_like(old_cell); 114 | auto input_gate = torch::zeros_like(old_cell); 115 | auto output_gate = torch::zeros_like(old_cell); 116 | auto candidate_cell = torch::zeros_like(old_cell); 117 | 118 | const int threads = 1024; 119 | const dim3 blocks((state_size + threads - 1) / threads, batch_size); 120 | 121 | AT_DISPATCH_FLOATING_TYPES(gates.type(), "lltm_forward_cuda", ([&] { 122 | lltm_cuda_forward_kernel<<>>( 123 | gates.packed_accessor(), 124 | old_cell.packed_accessor(), 125 | new_h.packed_accessor(), 126 | new_cell.packed_accessor(), 127 | input_gate.packed_accessor(), 128 | output_gate.packed_accessor(), 129 | candidate_cell.packed_accessor()); 130 | })); 131 | 132 | return {new_h, new_cell, input_gate, output_gate, candidate_cell, X, 
gates}; 133 | } 134 | 135 | std::vector lltm_cuda_backward( 136 | torch::Tensor grad_h, 137 | torch::Tensor grad_cell, 138 | torch::Tensor new_cell, 139 | torch::Tensor input_gate, 140 | torch::Tensor output_gate, 141 | torch::Tensor candidate_cell, 142 | torch::Tensor X, 143 | torch::Tensor gates, 144 | torch::Tensor weights) { 145 | auto d_old_cell = torch::zeros_like(new_cell); 146 | auto d_gates = torch::zeros_like(gates); 147 | 148 | const auto batch_size = new_cell.size(0); 149 | const auto state_size = new_cell.size(1); 150 | 151 | const int threads = 1024; 152 | const dim3 blocks((state_size + threads - 1) / threads, batch_size); 153 | 154 | AT_DISPATCH_FLOATING_TYPES(X.type(), "lltm_forward_cuda", ([&] { 155 | lltm_cuda_backward_kernel<<>>( 156 | d_old_cell.packed_accessor(), 157 | d_gates.packed_accessor(), 158 | grad_h.packed_accessor(), 159 | grad_cell.packed_accessor(), 160 | new_cell.packed_accessor(), 161 | input_gate.packed_accessor(), 162 | output_gate.packed_accessor(), 163 | candidate_cell.packed_accessor(), 164 | gates.packed_accessor()); 165 | })); 166 | 167 | auto d_gate_weights = d_gates.flatten(1, 2); 168 | auto d_weights = d_gate_weights.t().mm(X); 169 | auto d_bias = d_gate_weights.sum(/*dim=*/0, /*keepdim=*/true); 170 | 171 | auto d_X = d_gate_weights.mm(weights); 172 | auto d_old_h = d_X.slice(/*dim=*/1, 0, state_size); 173 | auto d_input = d_X.slice(/*dim=*/1, state_size); 174 | 175 | return {d_old_h, d_input, d_weights, d_bias, d_old_cell, d_gates}; 176 | } -------------------------------------------------------------------------------- /pytorch/torch_ext/lltm_demo/run_baseline.py: -------------------------------------------------------------------------------- 1 | # torch cuda custom example # 2 | 3 | import math 4 | import time 5 | 6 | from torch import nn 7 | from torch.autograd import Function 8 | import torch 9 | import torch.nn.functional as F 10 | 11 | 12 | def d_sigmoid(z): 13 | s = torch.sigmoid(z) 14 | return (1 - s) * s 15 | 16 | 17 | def d_tanh(z): 18 | t = torch.tanh(z) 19 | return 1 - (t * t) 20 | 21 | 22 | def d_elu(z, alpha=1.0): 23 | e = z.exp() 24 | mask = (alpha * (e - 1)) < 0 25 | return (z > 0).type_as(z) + mask.type_as(z) * (alpha * e) 26 | 27 | 28 | class LLTMFunction(Function): 29 | @staticmethod 30 | def forward(ctx, input, weights, bias, old_h, old_cell): 31 | X = torch.cat([old_h, input], dim=1) 32 | 33 | gate_weights = F.linear(X, weights, bias) 34 | gates = gate_weights.chunk(3, dim=1) 35 | 36 | input_gate = torch.sigmoid(gates[0]) 37 | output_gate = torch.sigmoid(gates[1]) 38 | candidate_cell = F.elu(gates[2]) 39 | 40 | new_cell = old_cell + candidate_cell * input_gate 41 | new_h = torch.tanh(new_cell) * output_gate 42 | 43 | ctx.save_for_backward(X, weights, input_gate, output_gate, old_cell, 44 | new_cell, candidate_cell, gate_weights) 45 | 46 | return new_h, new_cell 47 | 48 | @staticmethod 49 | def backward(ctx, grad_h, grad_cell): 50 | X, weights, input_gate, output_gate, old_cell = ctx.saved_variables[:5] 51 | new_cell, candidate_cell, gate_weights = ctx.saved_variables[5:] 52 | 53 | d_input = d_weights = d_bias = d_old_h = d_old_cell = None 54 | 55 | d_output_gate = torch.tanh(new_cell) * grad_h 56 | d_tanh_new_cell = output_gate * grad_h 57 | d_new_cell = d_tanh(new_cell) * d_tanh_new_cell + grad_cell 58 | 59 | d_old_cell = d_new_cell 60 | d_candidate_cell = input_gate * d_new_cell 61 | d_input_gate = candidate_cell * d_new_cell 62 | 63 | gates = gate_weights.chunk(3, dim=1) 64 | d_input_gate *= d_sigmoid(gates[0]) 65 | 
d_output_gate *= d_sigmoid(gates[1]) 66 | d_candidate_cell *= d_elu(gates[2]) 67 | 68 | d_gates = torch.cat( 69 | [d_input_gate, d_output_gate, d_candidate_cell], dim=1) 70 | 71 | if ctx.needs_input_grad[1]: 72 | d_weights = d_gates.t().mm(X) 73 | if ctx.needs_input_grad[2]: 74 | d_bias = d_gates.sum(dim=0, keepdim=True) 75 | if ctx.needs_input_grad[3] or ctx.needs_input_grad[4]: 76 | d_X = d_gates.mm(weights) 77 | state_size = grad_h.shape[1] 78 | d_old_h, d_input = d_X[:, :state_size], d_X[:, state_size:] 79 | 80 | return d_input, d_weights, d_bias, d_old_h, d_old_cell 81 | 82 | 83 | class LLTM(nn.Module): 84 | def __init__(self, input_features, state_size): 85 | super(LLTM, self).__init__() 86 | self.input_features = input_features 87 | self.state_size = state_size 88 | self.weights = nn.Parameter( 89 | torch.Tensor(3 * state_size, input_features + state_size)) 90 | self.bias = nn.Parameter(torch.Tensor(1, 3 * state_size)) 91 | self.reset_parameters() 92 | 93 | def reset_parameters(self): 94 | stdv = 1.0 / math.sqrt(self.state_size) 95 | for weight in self.parameters(): 96 | weight.data.uniform_(-stdv, +stdv) 97 | 98 | def forward(self, input, state): 99 | return LLTMFunction.apply(input, self.weights, self.bias, *state) 100 | 101 | 102 | if __name__ == "__main__": 103 | torch.manual_seed(42) 104 | device = torch.device("cuda") 105 | dtype = torch.float32 106 | kwargs = {'dtype': dtype, 107 | 'device': device, 108 | 'requires_grad': True} 109 | batch_size = 32 110 | features = 32 111 | state_size = 256 112 | iter_nums = 100 113 | 114 | X = torch.randn(batch_size, features, **kwargs) 115 | h = torch.randn(batch_size, state_size, **kwargs) 116 | C = torch.randn(batch_size, state_size, **kwargs) 117 | rnn = LLTM(features, state_size).to(device, dtype) 118 | # Force CUDA initialization 119 | new_h, new_C = rnn(X, (h, C)) 120 | (new_h.sum() + new_C.sum()).backward() 121 | 122 | forward_min = math.inf 123 | forward_time = 0 124 | backward_min = math.inf 125 | backward_time = 0 126 | 127 | for _ in range(iter_nums): 128 | rnn.zero_grad() 129 | start = time.time() 130 | new_h, new_C = rnn(X, (h, C)) 131 | elapsed = time.time() - start 132 | forward_min = min(forward_min, elapsed) 133 | forward_time += elapsed 134 | 135 | start = time.time() 136 | (new_h.sum() + new_C.sum()).backward() 137 | elapsed = time.time() - start 138 | backward_min = min(backward_min, elapsed) 139 | backward_time += elapsed 140 | 141 | forward_min *= 1000 142 | backward_min *= 1000 143 | forward_average = forward_time / iter_nums * 1000 144 | backward_average = backward_time / iter_nums * 1000 145 | 146 | print("PyTorch baseline result:") 147 | print('Forward: min:{0:.3f} ms avg:{1:.3f} ms | Backward min: {2:.3f} ' 148 | 'ms avg: {3:.3f} ms'.format(forward_min, forward_average, 149 | backward_min, backward_average,)) 150 | -------------------------------------------------------------------------------- /pytorch/torch_ext/lltm_demo/run_custom_lltm.py: -------------------------------------------------------------------------------- 1 | # torch cuda custom example # 2 | 3 | import math 4 | from torch import nn 5 | from torch.autograd import Function 6 | import torch 7 | import time 8 | try: 9 | import lltm_cuda 10 | except ImportError as e: 11 | print("lltm_cuda.so is not found! 
Use JIT compiling....") 12 | from torch.utils.cpp_extension import load 13 | lltm_cuda = load( 14 | 'lltm_cuda', ['lltm_cuda.cpp', 'lltm_cuda_kernel.cu']) # verbose=True 15 | print("lltm_cuda dir:", lltm_cuda.__file__) 16 | 17 | 18 | class LLTMFunction(Function): 19 | @staticmethod 20 | def forward(ctx, input, weights, bias, old_h, old_cell): 21 | outputs = lltm_cuda.forward(input, weights, bias, old_h, old_cell) 22 | new_h, new_cell = outputs[:2] 23 | variables = outputs[1:] + [weights] 24 | ctx.save_for_backward(*variables) 25 | 26 | return new_h, new_cell 27 | 28 | @staticmethod 29 | def backward(ctx, grad_h, grad_cell): 30 | outputs = lltm_cuda.backward( 31 | grad_h.contiguous(), grad_cell.contiguous(), *ctx.saved_tensors) 32 | d_old_h, d_input, d_weights, d_bias, d_old_cell, d_gates = outputs 33 | return d_input, d_weights, d_bias, d_old_h, d_old_cell 34 | 35 | 36 | class LLTM(nn.Module): 37 | def __init__(self, input_features, state_size): 38 | super(LLTM, self).__init__() 39 | self.input_features = input_features 40 | self.state_size = state_size 41 | self.weights = nn.Parameter( 42 | torch.Tensor(3 * state_size, input_features + state_size)) 43 | self.bias = nn.Parameter(torch.Tensor(1, 3 * state_size)) 44 | self.reset_parameters() 45 | 46 | def reset_parameters(self): 47 | stdv = 1.0 / math.sqrt(self.state_size) 48 | for weight in self.parameters(): 49 | weight.data.uniform_(-stdv, +stdv) 50 | 51 | def forward(self, input, state): 52 | return LLTMFunction.apply(input, self.weights, self.bias, *state) 53 | 54 | 55 | if __name__ == "__main__": 56 | torch.manual_seed(42) 57 | device = torch.device("cuda") 58 | dtype = torch.float32 59 | kwargs = {'dtype': dtype, 60 | 'device': device, 61 | 'requires_grad': True} 62 | batch_size = 32 63 | features = 32 64 | state_size = 256 65 | iter_nums = 100 66 | 67 | X = torch.randn(batch_size, features, **kwargs) 68 | h = torch.randn(batch_size, state_size, **kwargs) 69 | C = torch.randn(batch_size, state_size, **kwargs) 70 | rnn = LLTM(features, state_size).to(device, dtype) 71 | # Force CUDA initialization 72 | new_h, new_C = rnn(X, (h, C)) 73 | (new_h.sum() + new_C.sum()).backward() 74 | 75 | forward_min = math.inf 76 | forward_time = 0 77 | backward_min = math.inf 78 | backward_time = 0 79 | 80 | for _ in range(iter_nums): 81 | rnn.zero_grad() 82 | start = time.time() 83 | new_h, new_C = rnn(X, (h, C)) 84 | elapsed = time.time() - start 85 | forward_min = min(forward_min, elapsed) 86 | forward_time += elapsed 87 | 88 | start = time.time() 89 | (new_h.sum() + new_C.sum()).backward() 90 | elapsed = time.time() - start 91 | backward_min = min(backward_min, elapsed) 92 | backward_time += elapsed 93 | 94 | forward_min *= 1000 95 | backward_min *= 1000 96 | forward_average = forward_time / iter_nums * 1000 97 | backward_average = backward_time / iter_nums * 1000 98 | 99 | print("Custom lltm_cuda result: ") 100 | print('Forward: min:{0:.3f} ms avg:{1:.3f} ms | Backward min: {2:.3f} ' 101 | 'ms avg: {3:.3f} ms'.format(forward_min, forward_average, 102 | backward_min, backward_average,)) 103 | -------------------------------------------------------------------------------- /pytorch/torch_ext/lltm_demo/setup.py: -------------------------------------------------------------------------------- 1 | # torch cuda custom example # 2 | 3 | from setuptools import setup 4 | from torch.utils.cpp_extension import BuildExtension, CUDAExtension 5 | 6 | setup( 7 | name='lltm_cuda', 8 | ext_modules=[ 9 | CUDAExtension('lltm_cuda', [ 10 | 'lltm_cuda.cpp', 11 | 
'lltm_cuda_kernel.cu', 12 | ]), 13 | ], 14 | cmdclass={ 15 | 'build_ext': BuildExtension 16 | }) -------------------------------------------------------------------------------- /pytorch/torch_ext/sum_array/glueCode.cpp: -------------------------------------------------------------------------------- 1 | #include "sumArray.h" 2 | #include 3 | 4 | 5 | torch::Tensor torchSumArray(torch::Tensor input) { 6 | int dataSize = input.numel(); 7 | float* devInData = (float *)input.data_ptr(); 8 | arraySumCUDA(devInData, dataSize); 9 | return input[0]; 10 | } 11 | 12 | PYBIND11_MODULE(sum_array, m) { 13 | m.def("sum_array", &torchSumArray, ""); 14 | } -------------------------------------------------------------------------------- /pytorch/torch_ext/sum_array/run.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import time 3 | from torch.utils.cpp_extension import load 4 | 5 | ext_module = load(name="sum_array", 6 | extra_include_paths=["./"] , 7 | sources=["sumArray.cu", "glueCode.cpp"], 8 | verbose=True) 9 | 10 | 11 | def iter_test(func): 12 | delta_t = 0 13 | for _ in range(10000): 14 | _tensor = torch.rand(50000, dtype=torch.float, device='cuda') 15 | t1 = time.time() 16 | func(_tensor) 17 | t2 = time.time() 18 | delta_t += t2-t1 19 | print(" Elapsed time:", delta_t) 20 | return delta_t 21 | 22 | 23 | if __name__ == "__main__": 24 | # warm up: 25 | in_tensor = torch.rand(50000, dtype=torch.float, device='cuda') 26 | print(torch.sum(in_tensor)) 27 | print(ext_module.sum_array(in_tensor.clone())) 28 | 29 | # time test: 30 | print("The torch original sum func test:") 31 | iter_test(torch.sum) 32 | print("The custom define sum func test:") 33 | iter_test(ext_module.sum_array) 34 | -------------------------------------------------------------------------------- /pytorch/torch_ext/sum_array/sumArray.cu: -------------------------------------------------------------------------------- 1 | /** 2 | * PyTorch extension cuda example: sum array. 3 | * Author: kevin.xie 4 | * Email: kaiyuanxie@yeah.net 5 | * */ 6 | #include 7 | #include "sumArray.h" 8 | 9 | 10 | __device__ int countSHM = 0; 11 | __global__ void arraySumWithSHMKernel(float *arrData, const int dataSize) 12 | { 13 | __shared__ float shm[THREAD_PER_BLOCK]; 14 | int thIdx = threadIdx.x + blockIdx.x * blockDim.x; 15 | if (thIdx == 0) { 16 | countSHM = 0; 17 | __threadfence(); 18 | } 19 | float val = 0.0; 20 | while (thIdx < dataSize) { 21 | val += arrData[thIdx]; 22 | thIdx += blockDim.x * gridDim.x; 23 | } 24 | shm[threadIdx.x] = val; 25 | __syncthreads(); 26 | 27 | for (int i = THREAD_PER_BLOCK / 2; i >= 1; i /= 2) { 28 | if (threadIdx.x < i) 29 | shm[threadIdx.x] += shm[threadIdx.x + i]; 30 | __syncthreads(); 31 | } 32 | 33 | __syncthreads(); 34 | bool isLast = false; 35 | thIdx = threadIdx.x + blockIdx.x * blockDim.x; 36 | if (threadIdx.x == 0) { 37 | arrData[blockIdx.x] = shm[0]; 38 | __threadfence(); 39 | int value = atomicAdd(&countSHM, 1); 40 | isLast = (value == gridDim.x - 1); 41 | } 42 | isLast = __syncthreads_or(isLast); 43 | if (isLast) { 44 | shm[threadIdx.x] = threadIdx.x < gridDim.x ? 
arrData[threadIdx.x] : 0; 45 | __syncthreads(); 46 | for (int i = THREAD_PER_BLOCK / 2; i >= 1; i /= 2) { 47 | if (threadIdx.x < i) 48 | shm[threadIdx.x] += shm[threadIdx.x + i]; 49 | __syncthreads(); 50 | } 51 | __syncthreads(); 52 | if (threadIdx.x == 0) 53 | arrData[0] = shm[0]; 54 | } 55 | __syncthreads(); 56 | } 57 | 58 | void arraySumCUDA(float *arrData, const int dataSize) { 59 | int grid = max(dataSize / THREAD_PER_BLOCK, 1); 60 | arraySumWithSHMKernel<<>>(arrData, dataSize); 61 | } 62 | 63 | -------------------------------------------------------------------------------- /pytorch/torch_ext/sum_array/sumArray.h: -------------------------------------------------------------------------------- 1 | /** 2 | * PyTorch extension cuda example: sum array. 3 | * Author: kevin.xie 4 | * Email: kaiyuanxie@yeah.net 5 | * */ 6 | 7 | #pragma once 8 | 9 | // CUDA runtime 10 | #include 11 | #define THREAD_PER_BLOCK 256 12 | 13 | 14 | void arraySumCUDA(float *, const int); -------------------------------------------------------------------------------- /pytorch/torch_mem_snapshot/README.md: -------------------------------------------------------------------------------- 1 | # PyTorch memory snapshot examples 2 | 3 | ## API ref: 4 | 5 | * https://pytorch.org/docs/main/torch_cuda_memory.html#understanding-cuda-memory-usage 6 | * https://pytorch.org/docs/main/profiler.html 7 | 8 | ## To visualize the graph 9 | 10 | drag the pickle file to: “https://pytorch.org/memory_viz” -------------------------------------------------------------------------------- /pytorch/torch_mem_snapshot/block_fragment.py: -------------------------------------------------------------------------------- 1 | # Author: kevin.xie zhihu@kaiyuan 2 | 3 | import torch 4 | from datetime import datetime 5 | 6 | 7 | def segment_example(device="cuda:0"): 8 | tensor1 = torch.randn(size=(10,1024, 1024, 512), device=device) 9 | tensor1.to("cpu") 10 | # free tensor1 ,the segment will be freed as well. 11 | del tensor1 12 | torch.cuda.empty_cache() 13 | # create a new segment and a new block for tensor2 14 | tensor2 = torch.rand(size=(1, 1024, 512), device=device) 15 | tensor3 = torch.rand(size=(12, 1024, 512), device=device) 16 | 17 | 18 | def run(): 19 | # Start recording memory snapshot history 20 | torch.cuda.memory._record_memory_history(max_entries=100000) 21 | 22 | # example running: 23 | segment_example() 24 | 25 | timestamp = datetime.now().strftime('%Y_%m_%d_%H_%M_%S') 26 | file_name = f"visual_mem_{timestamp}.pickle" 27 | # save record: 28 | torch.cuda.memory._dump_snapshot(file_name) 29 | 30 | # Stop recording memory snapshot history: 31 | torch.cuda.memory._record_memory_history(enabled=None) 32 | 33 | 34 | if __name__ == "__main__": 35 | run() 36 | -------------------------------------------------------------------------------- /pytorch/torch_mem_snapshot/segment.py: -------------------------------------------------------------------------------- 1 | # Author: kevin.xie zhihu@kaiyuan 2 | 3 | import torch 4 | from datetime import datetime 5 | 6 | 7 | def segment_example(device="cuda:0"): 8 | tensor1 = torch.randn(size=(10,1024, 1024, 512), device=device) 9 | tensor1.to("cpu") 10 | # free tensor1 ,the segment will be freed as well. 
11 | del tensor1 12 | torch.cuda.empty_cache() 13 | # create a new segment and a new block for tensor2 14 | tensor2 = torch.rand(size=(1, 1024, 512), device=device) 15 | 16 | tensor_group = [] 17 | for _ in range(10): 18 | tensor_group.append(torch.rand(size=(1024, 1024, 512), device=device)) 19 | 20 | 21 | def run(): 22 | # Start recording memory snapshot history 23 | torch.cuda.memory._record_memory_history(max_entries=100000) 24 | 25 | # example running: 26 | segment_example() 27 | 28 | timestamp = datetime.now().strftime('%Y_%m_%d_%H_%M_%S') 29 | file_name = f"visual_mem_{timestamp}.pickle" 30 | # save record: 31 | torch.cuda.memory._dump_snapshot(file_name) 32 | 33 | # Stop recording memory snapshot history: 34 | torch.cuda.memory._record_memory_history(enabled=None) 35 | 36 | 37 | if __name__ == "__main__": 38 | run() 39 | 40 | -------------------------------------------------------------------------------- /pytorch/torch_mem_snapshot/transformer_profile.py: -------------------------------------------------------------------------------- 1 | # Author: kevin.xie zhihu@kaiyuan 2 | 3 | import torch 4 | from torch import nn 5 | from datetime import datetime 6 | from torch.autograd.profiler import record_function 7 | 8 | 9 | def trace_handler(prof: torch.profiler.profile): 10 | # Prefix for file names. 11 | timestamp = datetime.now().strftime('%Y_%m_%d_%H_%M_%S') 12 | file_name = f"visual_mem_{timestamp}.pickle" 13 | 14 | # Construct the trace file. 15 | prof.export_chrome_trace(f"{file_name}.json.gz") 16 | 17 | # Construct the memory timeline file. 18 | prof.export_memory_timeline(f"{file_name}.html", device="cuda:0") 19 | 20 | 21 | def train(num_iter=5, device="cuda:0"): 22 | model = nn.Transformer(d_model=512, nhead=2, num_encoder_layers=2, num_decoder_layers=2).to(device=device) 23 | x = torch.randn(size=(1, 1024, 512), device=device) 24 | tgt = torch.rand(size=(1, 1024, 512), device=device) 25 | model.train() 26 | labels = torch.rand_like(model(x, tgt)) 27 | criterion = torch.nn.CrossEntropyLoss() 28 | optimizer = torch.optim.Adam(model.parameters()) 29 | with torch.profiler.profile( 30 | activities=[torch.profiler.ProfilerActivity.CPU, torch.profiler.ProfilerActivity.CUDA], 31 | schedule=torch.profiler.schedule(wait=0, warmup=0, active=6, repeat=1), 32 | record_shapes=True, 33 | profile_memory=True, 34 | with_stack=True, 35 | on_trace_ready=trace_handler, 36 | ) as prof: 37 | for _ in range(num_iter): 38 | prof.step() 39 | with record_function("## forward ##"): 40 | y = model(x, tgt) 41 | 42 | with record_function("## backward ##"): 43 | loss = criterion(y, labels) 44 | loss.backward() 45 | print(loss.item()) 46 | 47 | with record_function("## optimizer ##"): 48 | optimizer.step() 49 | optimizer.zero_grad(set_to_none=True) 50 | 51 | 52 | if __name__ == "__main__": 53 | # warm-up: 54 | train(1) 55 | # run: 56 | train(3) 57 | 58 | -------------------------------------------------------------------------------- /pytorch/torch_mem_snapshot/transformer_snapshot.py: -------------------------------------------------------------------------------- 1 | # Author: kevin.xie zhihu@kaiyuan 2 | 3 | import torch 4 | from torch import nn 5 | from datetime import datetime 6 | 7 | 8 | def train(num_iter=5, device="cuda:0"): 9 | model = nn.Transformer(d_model=512, nhead=2, num_encoder_layers=2, num_decoder_layers=2).to(device=device) 10 | x = torch.randn(size=(1, 1024, 512), device=device) 11 | tgt = torch.rand(size=(1, 1024, 512), device=device) 12 | model.train() 13 | labels = 
torch.rand_like(model(x, tgt)) 14 | criterion = torch.nn.CrossEntropyLoss() 15 | optimizer = torch.optim.Adam(model.parameters()) 16 | for _ in range(num_iter): 17 | y = model(x, tgt) 18 | loss = criterion(y, labels) 19 | loss.backward() 20 | print(loss.item()) 21 | optimizer.step() 22 | optimizer.zero_grad(set_to_none=True) 23 | 24 | 25 | def run(): 26 | # Start recording memory snapshot history 27 | torch.cuda.memory._record_memory_history(max_entries=100000) 28 | 29 | # training running: 30 | train() 31 | 32 | timestamp = datetime.now().strftime('%Y_%m_%d_%H_%M_%S') 33 | file_name = f"visual_mem_{timestamp}.pickle" 34 | # save record: 35 | torch.cuda.memory._dump_snapshot(file_name) 36 | 37 | # Stop recording memory snapshot history: 38 | torch.cuda.memory._record_memory_history(enabled=None) 39 | 40 | 41 | if __name__ == "__main__": 42 | run() 43 | 44 | -------------------------------------------------------------------------------- /transformer/fused_softmax/README.md: -------------------------------------------------------------------------------- 1 | 2 | # Fused softmax 3 | 4 | ## Formula: 5 | 6 | The fused ops include: 7 | * op1: softmax 8 | * op2: scale 9 | * op3: mask 10 | 11 | **1 Softmax** 12 | 13 | * forward: y_i = e^{x_i - max(X)} / \sum_{j=1}^{n} e^{x_j - max(X)} 14 | * backward: dx_i = y_i * dy_i - y_i * \sum_{j=1}^{n} (y_j * dy_j) 15 | 16 | **2 Scale** 17 | 18 | output = input * scale 19 | 20 | **3 Mask** 21 | ```textmate 22 | if(mask[i] == 1) 23 | then 24 | val[i] = -VAL 25 | else 26 | do_something 27 | ``` 28 | 29 | input data shape: 30 | 31 | [batches, attn_heads, query_seq_len, key_seq_len] 32 | 33 | 34 | 35 | ## Requirements 36 | 37 | pytorch>=2.0 38 | 39 | cuda>=11.3 40 | 41 | hardware: GPU arch >= Volta 42 | 43 | ## compile 44 | 45 | ``` 46 | python setup.py build 47 | ``` 48 | 49 | ## Running 50 | 51 | ### Function invocation: 52 | ```python 53 | import transformer_softmax_lib 54 | # ... 
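# Illustrative note (not part of the extension API): following the formulas above, the fused call
# behaves roughly like the unfused PyTorch ops below, where mask == 1 marks masked-out positions
# and -10000.0 stands in for the large negative constant -VAL:
#   probs = torch.softmax((input_data * scale_factor).masked_fill(mask.bool(), -10000.0), dim=-1)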
55 | transformer_softmax_lib.scaled_masked_softmax_forward(input_data, mask, scale_factor) 56 | ``` 57 | 58 | ### A test example: 59 | 60 | Note: make sure the .so file is in your working directory: 61 | 62 | ```python 63 | import torch 64 | import transformer_softmax_lib 65 | from torch.autograd import Function 66 | 67 | class FusedSoftmax(Function): 68 | @staticmethod 69 | def forward(ctx, src, mask, scale_factor): 70 | 71 | output = transformer_softmax_lib.scaled_masked_softmax_forward(src, mask, scale_factor[0]) 72 | ctx.save_for_backward(output, scale_factor) 73 | return output 74 | 75 | @staticmethod 76 | def backward(ctx, grad_output): 77 | src, scale_factor = ctx.saved_tensors 78 | grad_in = transformer_softmax_lib.scaled_masked_softmax_backward(grad_output, src, scale_factor[0]) 79 | return grad_in, None, None # one gradient per forward input (src, mask, scale_factor) 80 | 81 | data_input = torch.randn([1,8,1024,1024], dtype=torch.float16, device='cuda', requires_grad=True) 82 | data_input_check = data_input.clone().detach() 83 | data_input_check.requires_grad_(True) 84 | factor = torch.tensor([1.0], requires_grad=False) 85 | mask = torch.zeros([1,1,1024,1024], dtype=torch.float16, device='cuda', requires_grad=False) 86 | check = torch.softmax(data_input_check, dim=-1) 87 | out_put = FusedSoftmax.apply(data_input, mask, factor) 88 | 89 | # forward check: 90 | print(torch.allclose(check, out_put, atol=1e-05, rtol=1e-05 )) # fp16 91 | 92 | # backward check: 93 | with torch.autocast(device_type='cuda', dtype=torch.float16, enabled=False): 94 | y=out_put.sum().backward() 95 | y_check=check.sum().backward() 96 | print(torch.allclose(data_input.grad, data_input_check.grad, atol=1e-05, rtol=1e-05 )) 97 | ``` 98 | -------------------------------------------------------------------------------- /transformer/fused_softmax/scaled_masked_softmax.cu: -------------------------------------------------------------------------------- 1 | /* coding=utf-8 2 | * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include 23 | #include "scaled_masked_softmax.h" 24 | #include "utils.h" 25 | 26 | namespace fused_softmax { 27 | namespace scaled_masked_softmax { 28 | 29 | int get_batch_per_block_cuda(int query_seq_len, int key_seq_len, int batches, int attn_heads){ 30 | return get_batch_per_block(query_seq_len, key_seq_len, batches, attn_heads); 31 | } 32 | 33 | 34 | torch::Tensor fwd_cuda( 35 | torch::Tensor const& input, 36 | torch::Tensor const& mask, 37 | float scale_factor) 38 | { 39 | // input is a 4d tensor with dimensions [batches, attn_heads, seq_len, seq_len] 40 | const int batches = input.size(0); 41 | const int pad_batches = mask.size(0); 42 | const int attn_heads = input.size(1); 43 | const int query_seq_len = input.size(2); 44 | const int key_seq_len = input.size(3); 45 | TORCH_INTERNAL_ASSERT(key_seq_len <= 8192); 46 | TORCH_INTERNAL_ASSERT(query_seq_len > 1); 47 | TORCH_INTERNAL_ASSERT(pad_batches == 1 || pad_batches == batches); 48 | TORCH_INTERNAL_ASSERT(mask.size(1) == 1); 49 | TORCH_INTERNAL_ASSERT(mask.size(2) == query_seq_len); 50 | TORCH_INTERNAL_ASSERT(mask.size(3) == key_seq_len); 51 | 52 | // Output 53 | auto act_options = input.options().requires_grad(false); 54 | torch::Tensor softmax_results = 55 | torch::empty({batches, attn_heads, query_seq_len, key_seq_len}, act_options); 56 | 57 | // Softmax Intermediate Result Ptr 58 | void* input_ptr = static_cast(input.data_ptr()); 59 | void* mask_ptr = static_cast(mask.data_ptr()); 60 | void* softmax_results_ptr = static_cast(softmax_results.data_ptr()); 61 | 62 | DISPATCH_HALF_AND_BFLOAT( 63 | input.scalar_type(), 64 | "dispatch_scaled_masked_softmax_forward", 65 | dispatch_scaled_masked_softmax_forward( 66 | reinterpret_cast(softmax_results_ptr), 67 | reinterpret_cast(input_ptr), 68 | reinterpret_cast(mask_ptr), 69 | scale_factor, 70 | query_seq_len, 71 | key_seq_len, 72 | batches, 73 | attn_heads, 74 | pad_batches 75 | ); 76 | ); 77 | return softmax_results; 78 | } 79 | 80 | torch::Tensor bwd_cuda( 81 | torch::Tensor const& output_grads_, 82 | torch::Tensor const& softmax_results_, 83 | float scale_factor) { 84 | 85 | auto output_grads = output_grads_.contiguous(); 86 | auto softmax_results = softmax_results_.contiguous(); 87 | 88 | //output grads is a 4d tensor with dimensions [batches, attn_heads, seq_len, seq_len] 89 | const int batches = output_grads.size(0); 90 | const int attn_heads = output_grads.size(1); 91 | const int query_seq_len = output_grads.size(2); 92 | const int key_seq_len = output_grads.size(3); 93 | 94 | auto act_options = output_grads.options().requires_grad(false); 95 | torch::Tensor input_grads = 96 | torch::empty({batches, attn_heads, query_seq_len, key_seq_len}, act_options); 97 | void* input_grads_ptr = static_cast(input_grads.data_ptr()); 98 | void* output_grads_ptr = static_cast(output_grads.data_ptr()); 99 | 100 | //Softmax Grad 101 | DISPATCH_HALF_AND_BFLOAT( 102 | output_grads_.scalar_type(), 103 | "dispatch_scaled_masked_softmax_backward", 104 | dispatch_scaled_masked_softmax_backward( 105 | reinterpret_cast(input_grads_ptr), 106 | reinterpret_cast(output_grads_ptr), 107 | reinterpret_cast(softmax_results.data_ptr()), 108 | scale_factor, 109 | query_seq_len, 110 | key_seq_len, 111 | batches, 112 | attn_heads 113 | ); 114 | ); 115 | return input_grads; 116 | } 117 | } 118 | } 119 | -------------------------------------------------------------------------------- /transformer/fused_softmax/setup.py: 
-------------------------------------------------------------------------------- 1 | # Modified from https://github.com/NVIDIA/apex/tree/master/csrc/megatron 2 | # create a baseline 3 | import os 4 | import subprocess 5 | 6 | from setuptools import setup 7 | from torch.utils.cpp_extension import BuildExtension, CUDAExtension, CUDA_HOME 8 | 9 | # print("enter setup") 10 | 11 | def get_cuda_bare_metal_version(cuda_dir): 12 | raw_output = subprocess.check_output([cuda_dir + "/bin/nvcc", "-V"], universal_newlines=True) 13 | output = raw_output.split() 14 | release_idx = output.index("release") + 1 15 | release = output[release_idx].split(".") 16 | bare_metal_major = release[0] 17 | bare_metal_minor = release[1][0] 18 | 19 | return raw_output, bare_metal_major, bare_metal_minor 20 | 21 | 22 | def append_nvcc_threads(nvcc_extra_args): 23 | _, bare_metal_major, bare_metal_minor = get_cuda_bare_metal_version(CUDA_HOME) 24 | if int(bare_metal_major) >= 11 and int(bare_metal_minor) >= 2: 25 | return nvcc_extra_args + ["--threads", "4"] 26 | return nvcc_extra_args 27 | 28 | 29 | cc_flag = [] 30 | # Support Volta: 31 | cc_flag.append("-gencode") 32 | cc_flag.append("arch=compute_70,code=sm_70") 33 | # Support Ampere: 34 | cc_flag.append("-gencode") 35 | cc_flag.append("arch=compute_80,code=sm_80") 36 | # Support Hopper: 37 | # cc_flag.append("-gencode") 38 | # cc_flag.append("arch=compute_90,code=sm_90") 39 | 40 | setup( 41 | name='transformer_softmax_lib', 42 | ext_modules=[ 43 | CUDAExtension( 44 | name='transformer_softmax_lib', 45 | sources=['torch_interface.cpp', 'scaled_masked_softmax.cu', ], 46 | extra_compile_args={ 47 | 'cxx': ['-O3',], 48 | 'nvcc': append_nvcc_threads(['-O3', '--use_fast_math'] + cc_flag) 49 | } 50 | ) 51 | ], 52 | cmdclass={ 53 | 'build_ext': BuildExtension 54 | }) -------------------------------------------------------------------------------- /transformer/fused_softmax/torch_interface.cpp: -------------------------------------------------------------------------------- 1 | /* coding=utf-8 2 | * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | #include 18 | #include 19 | #include 20 | 21 | 22 | namespace fused_softmax { 23 | namespace scaled_masked_softmax { 24 | 25 | torch::Tensor fwd_cuda( 26 | torch::Tensor const& input, 27 | torch::Tensor const& mask, 28 | float scale_factor); 29 | 30 | torch::Tensor bwd_cuda( 31 | torch::Tensor const& output_grads, 32 | torch::Tensor const& softmax_results, 33 | float scale_factor); 34 | 35 | int get_batch_per_block_cuda( 36 | int query_seq_len, 37 | int key_seq_len, 38 | int batches, 39 | int attn_heads); 40 | 41 | torch::Tensor fwd( 42 | torch::Tensor const& input, 43 | torch::Tensor const& mask, 44 | float scale_factor) { 45 | AT_ASSERTM(input.dim() == 4, "expected 4D tensor"); 46 | AT_ASSERTM((input.scalar_type() == at::ScalarType::Half) || 47 | (input.scalar_type() == at::ScalarType::BFloat16), 48 | "Only fp16 and bf16 are supported"); 49 | AT_ASSERTM(mask.dim() == 4, "expected 4D tensor"); 50 | 51 | return fwd_cuda(input, mask, scale_factor); 52 | } 53 | 54 | torch::Tensor bwd( 55 | torch::Tensor const& output_grads, 56 | torch::Tensor const& softmax_results, 57 | float scale_factor) { 58 | 59 | AT_ASSERTM(output_grads.dim() == 4, "expected 3D tensor"); 60 | AT_ASSERTM(softmax_results.dim() == 4, "expected 3D tensor"); 61 | 62 | AT_ASSERTM((output_grads.scalar_type() == at::ScalarType::Half) || 63 | (output_grads.scalar_type() == at::ScalarType::BFloat16), 64 | "Only fp16 and bf16 are supported"); 65 | AT_ASSERTM((softmax_results.scalar_type() == at::ScalarType::Half) || 66 | (softmax_results.scalar_type() == at::ScalarType::BFloat16), 67 | "Only fp16 and bf16 are supported"); 68 | 69 | return bwd_cuda(output_grads, softmax_results, scale_factor); 70 | } 71 | 72 | int get_batch_per_block( 73 | int query_seq_len, 74 | int key_seq_len, 75 | int batches, 76 | int attn_heads) { 77 | return get_batch_per_block_cuda(query_seq_len, key_seq_len, batches, attn_heads); 78 | } 79 | 80 | } // end namespace scaled_masked_softmax 81 | } // end namespace fused_softmax 82 | 83 | 84 | 85 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 86 | m.def("scaled_masked_softmax_forward", 87 | &fused_softmax::scaled_masked_softmax::fwd, 88 | "Self Multihead Attention scaled, time masked softmax -- Forward."); 89 | 90 | m.def("scaled_masked_softmax_backward", 91 | &fused_softmax::scaled_masked_softmax::bwd, 92 | "Self Multihead Attention scaled, time masked softmax -- Backward."); 93 | 94 | m.def("scaled_masked_softmax_get_batch_per_block", 95 | &fused_softmax::scaled_masked_softmax::get_batch_per_block, 96 | "Return Batch per block size." 97 | ); 98 | } -------------------------------------------------------------------------------- /transformer/fused_softmax/utils.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | #include 12 | // #define warpSize 32 13 | 14 | // ELEMENTS_PER_LDG = 4, using float2 copy 4 half data. 
half2 copy 4 uint8_t 15 | template 16 | __device__ __inline__ void copy_vector(Datatype *dst, const Datatype *src); 17 | 18 | template <> 19 | __device__ __inline__ void copy_vector(c10::BFloat16 *dst, const c10::BFloat16 *src) { *dst = *src; } 20 | 21 | template <> 22 | __device__ __inline__ void copy_vector(c10::BFloat16 *dst, const c10::BFloat16 *src) { *((float2*) dst) = *((float2*) src); } 23 | 24 | template <> 25 | __device__ __inline__ void copy_vector(c10::Half *dst, const c10::Half *src) { *dst = *src; } 26 | 27 | template <> 28 | __device__ __inline__ void copy_vector(c10::Half *dst, const c10::Half *src) { *((float2*) dst) = *((float2*) src); } 29 | 30 | template <> 31 | __device__ __inline__ void copy_vector(uint8_t *dst, const uint8_t *src) { *dst = *src; } 32 | 33 | template <> 34 | __device__ __inline__ void copy_vector(uint8_t *dst, const uint8_t *src) {*((half2*) dst) = *((half2*) src); } 35 | 36 | int log2_ceil(int value) { 37 | int log2_value = 0; 38 | while ((1 << log2_value) < value) ++log2_value; 39 | return log2_value; 40 | } 41 | 42 | template 43 | struct Add { 44 | __device__ __forceinline__ T operator()(T a, T b) const { 45 | return a + b; 46 | } 47 | }; 48 | 49 | template 50 | struct Max { 51 | __device__ __forceinline__ T operator()(T a, T b) const { 52 | return a < b ? b : a; 53 | } 54 | }; 55 | 56 | template 57 | __device__ __forceinline__ T WARP_SHFL_XOR_NATIVE(T value, int laneMask, int width = warpSize, unsigned int mask = 0xffffffff) 58 | { 59 | #if CUDA_VERSION >= 9000 60 | return __shfl_xor_sync(mask, value, laneMask, width); 61 | #else 62 | return __shfl_xor(value, laneMask, width); 63 | #endif 64 | } 65 | 66 | template class ReduceOp> 67 | __device__ __forceinline__ void warp_reduce(acc_t* sum) { 68 | ReduceOp r; 69 | #pragma unroll 70 | for (int offset = WARP_SIZE / 2; offset > 0; offset /= 2) { 71 | #pragma unroll 72 | for (int i = 0; i < WARP_BATCH; ++i) { 73 | acc_t b = WARP_SHFL_XOR_NATIVE(sum[i], offset, WARP_SIZE); 74 | sum[i] = r(sum[i], b); 75 | } 76 | } 77 | } 78 | 79 | // using: DISPATCH_HALF_AND_BFLOAT(TYPE, NAME, function(parameters...)) 80 | #define DISPATCH_HALF_AND_BFLOAT(TYPE, NAME, ...) \ 81 | switch(TYPE) \ 82 | { \ 83 | case at::ScalarType::Half: \ 84 | { \ 85 | using scalar_t = at::Half; \ 86 | __VA_ARGS__; \ 87 | break; \ 88 | } \ 89 | case at::ScalarType::BFloat16: \ 90 | { \ 91 | using scalar_t = at::BFloat16; \ 92 | __VA_ARGS__; \ 93 | break; \ 94 | } \ 95 | default: \ 96 | AT_ERROR(#NAME, " not implemented for '", toString(TYPE), "'"); \ 97 | } 98 | -------------------------------------------------------------------------------- /transformer/fused_softmax/warp_example/README.md: -------------------------------------------------------------------------------- 1 | 2 | An example shows how to use __shfl_xor_sync to get summary. 3 | 4 | Compile: 5 | ```bash 6 | nvcc -lcuda warp_reduce.cu -o test 7 | ``` 8 | 9 | Run: 10 | ``` 11 | ./test 12 | ``` -------------------------------------------------------------------------------- /transformer/fused_softmax/warp_example/warp_reduce.cu: -------------------------------------------------------------------------------- 1 | /** 2 | * warp reduce example. 
3 | * Author: kevin.xie 4 | * Email: kaiyuanxie@yeah.net 5 | * */ 6 | 7 | #include 8 | #include 9 | #include 10 | 11 | 12 | template void check(T result, char const *const func, const char *const file, int const line) 13 | { 14 | if (result) { 15 | fprintf(stderr, "CUDA error at %s:%d code=%d(%s) \"%s\" \n", file, line, static_cast(result), 16 | cudaGetErrorString(result), func); 17 | exit(EXIT_FAILURE); 18 | } 19 | } 20 | #define checkCudaErrors(val) check((val), #val, __FILE__, __LINE__) 21 | 22 | template 23 | struct Add { 24 | __device__ __forceinline__ T operator()(T a, T b) const { 25 | return a + b; 26 | } 27 | }; 28 | 29 | template 30 | struct Max { 31 | __device__ __forceinline__ T operator()(T a, T b) const { 32 | return a < b ? b : a; 33 | } 34 | }; 35 | 36 | template 37 | __device__ __forceinline__ T WARP_SHFL_XOR_NATIVE(T value, int laneMask, int width = warpSize, unsigned int mask = 0xffffffff) 38 | { 39 | #if CUDA_VERSION >= 9000 40 | return __shfl_xor_sync(mask, value, laneMask, width); 41 | #else 42 | return __shfl_xor_sync(mask, value, laneMask, width); 43 | //return __shfl_xor(value, laneMask, width); 44 | #endif 45 | } 46 | 47 | template class ReduceOp> 48 | __device__ __forceinline__ void warp_reduce(acc_t* sum) { 49 | ReduceOp r; 50 | #pragma unroll 51 | for (int offset = WARP_SIZE / 2; offset > 0; offset /= 2) { 52 | #pragma unroll 53 | for (int i = 0; i < WARP_BATCH; ++i) { 54 | acc_t b = WARP_SHFL_XOR_NATIVE(sum[i], offset, WARP_SIZE); 55 | sum[i] = r(sum[i], b); 56 | } 57 | } 58 | } 59 | 60 | #define WARP_BATCH 1 61 | 62 | template 63 | __global__ void launcher(data_t* src, int nums) { 64 | data_t tmp[WARP_BATCH] = {0}; 65 | int localIdx= threadIdx.x; 66 | while (localIdx < nums) { 67 | tmp[0] += src[localIdx]; 68 | localIdx += gridDim.x * blockDim.x; 69 | } 70 | 71 | warp_reduce(tmp); 72 | src[threadIdx.x] = tmp[0]; 73 | } 74 | 75 | int main() { 76 | unsigned int total_size = 100; 77 | float* input_data = (float*) malloc(sizeof(float) * total_size); 78 | float* device_ptr; 79 | checkCudaErrors(cudaMalloc((void**)&device_ptr, sizeof(float) *total_size)); 80 | for (int i =0; i < 90; ++i) { 81 | input_data[i] = i * 2; 82 | } 83 | checkCudaErrors(cudaMemcpy(device_ptr, input_data, total_size * sizeof(float), cudaMemcpyHostToDevice)); 84 | launcher<<<1, 32>>>(device_ptr, total_size); 85 | checkCudaErrors(cudaMemcpy(input_data, device_ptr, total_size * sizeof(float), cudaMemcpyDeviceToHost)); 86 | printf("Print all data:\n"); 87 | for (int i = 0;i < 10; ++i) { 88 | for (int k = 0; k < 10 ;++k) { 89 | printf("%f " ,input_data[i*10 + k]); 90 | } 91 | printf("\n"); 92 | } 93 | checkCudaErrors(cudaFree (device_ptr)); 94 | free(input_data); 95 | return 0; 96 | } --------------------------------------------------------------------------------
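
A closing note on warp_reduce.cu: with WARP_BATCH = 1 and a 32-lane warp, the `warp_reduce` loop above is a butterfly reduction. Offsets 16, 8, 4, 2, 1 pair lane i with lane i ^ offset, so after five `__shfl_xor_sync` steps every lane of the warp holds the full sum, which is why all 32 lanes can write the same result. The stripped-down sketch below shows the same pattern without the templates; it is not part of the repository, and the kernel name and launch shape are illustrative only.

```
// Minimal single-warp sum (sketch; assumes exactly 32 active lanes).
__global__ void warpSumSketch(float *data)
{
    float v = data[threadIdx.x];                    // one element per lane
    for (int offset = 16; offset > 0; offset /= 2) {
        // lane i adds the partial sum held by lane i ^ offset
        v += __shfl_xor_sync(0xffffffff, v, offset);
    }
    data[threadIdx.x] = v;                          // every lane now holds the total
}
// launch: warpSumSketch<<<1, 32>>>(devicePtr);
```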