├── README.md ├── common_methods ├── README.md ├── print_any.cu ├── shared_mem.cu ├── streams.cu ├── threads_hierarchy_calc.cu ├── um_demo.cu └── zero_copy.cu ├── matrix_multiply ├── Makefile ├── README.md ├── imgs │ ├── 2d_block_split.png │ ├── extended_cases.png │ ├── info.txt │ ├── matmul_use_shm.png │ ├── matrix_in_mem.png │ ├── perf_v100.png │ └── sub_matrix_mul.png ├── matMul.h ├── matMul1DKernel.cu ├── matMul2DKernel.cu ├── matMulCublasKernel.cu └── testMatMul.cu ├── memory_opt ├── Makefile ├── README.md ├── device2Device.cu ├── hostAndDeviceTrans.cu ├── memoryOpt.h ├── run.sh ├── sharedMemory.cu ├── timer.h └── zeroCopy.cu ├── nccl ├── Makefile ├── README.md ├── alltoall.cu ├── comm.h ├── multi_devices_per_thread.cu ├── nccl_with_mpi.cu ├── node_client.cu ├── node_server.cu ├── nonblocking_double_streams.cu └── one_device_per_thread.cu ├── pytorch ├── torch1.13_mem_rationale │ ├── CUDACachingAllocator.cpp │ ├── CUDACachingAllocator.h │ ├── Makefile │ ├── README.md │ ├── TestAllocator.cpp │ └── llvmMathExtras.h ├── torch_ext │ ├── README.md │ ├── binding_examples │ │ ├── README.md │ │ ├── basics │ │ │ ├── classes.cc │ │ │ ├── classes_call.py │ │ │ ├── function_call.py │ │ │ └── functions.cc │ │ └── bind_practices │ │ │ ├── classes_lib.cc │ │ │ ├── classes_lib.h │ │ │ ├── classes_lib_bind.cc │ │ │ ├── classes_practice.py │ │ │ ├── functions_lib.cc │ │ │ ├── functions_lib.h │ │ │ ├── functions_lib_bind.cc │ │ │ └── functions_practice.py │ ├── easy_jit │ │ ├── demo.cu │ │ └── run.py │ ├── easy_load │ │ ├── run_inline_v1.py │ │ ├── run_inline_v2.py │ │ └── run_inline_v3.py │ ├── easy_setup │ │ ├── my_extension.cpp │ │ ├── run.py │ │ └── setup.py │ ├── lltm_demo │ │ ├── lltm_cuda.cpp │ │ ├── lltm_cuda_kernel.cu │ │ ├── run_baseline.py │ │ ├── run_custom_lltm.py │ │ └── setup.py │ └── sum_array │ │ ├── glueCode.cpp │ │ ├── run.py │ │ ├── sumArray.cu │ │ └── sumArray.h └── torch_mem_snapshot │ ├── README.md │ ├── block_fragment.py │ ├── predict_text_original_code.py │ ├── predict_text_with_snapshot_example.py │ ├── segment.py │ ├── transformer_profile.py │ └── transformer_snapshot.py └── transformer └── fused_softmax ├── README.md ├── scaled_masked_softmax.cu ├── scaled_masked_softmax.h ├── setup.py ├── torch_interface.cpp ├── utils.h └── warp_example ├── README.md └── warp_reduce.cu /README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CalvinXKY/BasicCUDA/5bf47d64954274aa90bd0b9bb01c2de7462bf513/README.md -------------------------------------------------------------------------------- /common_methods/README.md: -------------------------------------------------------------------------------- 1 | 2 | ## compile 3 | Print information in kernel: 4 | ``` 5 | $ nvcc -lcuda print_any.cu -o print_any 6 | ``` 7 | 8 | Managed memory: 9 | ``` 10 | $ nvcc -lcuda um_demo.cu -o um_demo 11 | ``` 12 | 13 | Zero copy: 14 | ``` 15 | $ nvcc -lcuda -I../memory_opt/ zero_copy.cu -o zero_run 16 | ``` 17 | 18 | Shared memory: 19 | ``` 20 | $ nvcc -lcuda -I../memory_opt/ shared_mem.cu -o smem_run 21 | ``` 22 | 23 | Multi streams: 24 | ``` 25 | $ nvcc -lcuda streams.cu -o streamd_demo 26 | ``` 27 | 28 | ## run 29 | ``` 30 | $ ./print_any 31 | $ ./um_demo 32 | ``` 33 | 34 | ## profile 35 | ### CUDA nvprof 36 | Arch <= 7.5 e.g. Volta. 37 | ``` 38 | $ nvprof ./um_demo 39 | ``` 40 | 41 | Arch >= 8.0 e.g. 
Ampere: 42 | ``` 43 | $ nsys nvprof um_demo 44 | ``` 45 | ### gprof 46 | 47 | step1: compile with -pg 48 | ``` 49 | $ nvcc -pg -lcuda um_demo.cu -o um_demo 50 | ``` 51 | step2: run exe 52 | ``` 53 | $ ./um_demo 54 | ``` 55 | (will get a file: gmon.out) 56 | 57 | step3: print info 58 | ``` 59 | $ gprof ./um_demo 60 | ``` 61 | Result e.g.: 62 | ``` 63 | Flat profile: 64 | 65 | Each sample counts as 0.01 seconds. 66 | % cumulative self self total 67 | time seconds seconds calls ns/call ns/call name 68 | 62.50 0.03 0.03 1048576 23.84 23.84 std::fmax(float, float) 69 | 25.00 0.04 0.01 main 70 | 12.50 0.04 0.01 1048576 4.77 4.77 std::fabs(float) 71 | 0.00 0.04 0.00 2 0.00 0.00 cudaError cudaMallocManaged(float**, unsigned long, unsigned int) 72 | 0.00 0.04 0.00 2 0.00 0.00 dim3::dim3(unsigned int, unsigned int, unsigned int) 73 | 0.00 0.04 0.00 1 0.00 0.00 _GLOBAL__sub_I_main 74 | 0.00 0.04 0.00 1 0.00 0.00 cudaError cudaLaunchKernel(char const*, dim3, dim3, void**, unsigned long, CUstream_st*) 75 | 0.00 0.04 0.00 1 0.00 0.00 __device_stub__Z3addiPfS_(int, float*, float*) 76 | 0.00 0.04 0.00 1 0.00 0.00 add(int, float*, float*) 77 | 0.00 0.04 0.00 1 0.00 0.00 __static_initialization_and_destruction_0(int, int) 78 | 0.00 0.04 0.00 1 0.00 0.00 ____nv_dummy_param_ref(void*) 79 | 0.00 0.04 0.00 1 0.00 0.00 __sti____cudaRegisterAll() 80 | 0.00 0.04 0.00 1 0.00 0.00 __nv_cudaEntityRegisterCallback(void**) 81 | 0.00 0.04 0.00 1 0.00 0.00 __nv_save_fatbinhandle_for_managed_rt(void**) 82 | 83 | % the percentage of the total running time of the 84 | time program used by this function. 85 | 86 | cumulative a running sum of the number of seconds accounted 87 | seconds for by this function and those listed above it. 88 | 89 | self the number of seconds accounted for by this 90 | seconds function alone. This is the major sort for this 91 | listing. 92 | 93 | calls the number of times this function was invoked, if 94 | this function is profiled, else blank. 95 | 96 | self the average number of milliseconds spent in this 97 | ms/call function per call, if this function is profiled, 98 | else blank. 99 | 100 | total the average number of milliseconds spent in this 101 | ms/call function and its descendents per call, if this 102 | function is profiled, else blank. 103 | 104 | name the name of the function. This is the minor sort 105 | for this listing. The index shows the location of 106 | the function in the gprof listing. If the index is 107 | in parenthesis it shows where it would appear in 108 | the gprof listing if it were to be printed. 
109 | ``` 110 | 111 | -------------------------------------------------------------------------------- /common_methods/print_any.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include "cuda_runtime.h" 3 | #define N 8 4 | 5 | 6 | __global__ void kernel(int mark) 7 | { 8 | if (blockIdx.x == 0 && threadIdx.x == 0) { 9 | printf(" === kernel %d run info: gridDim.x: %d, blockDim.x: %d ===\n", \ 10 | mark, gridDim.x, blockDim.x); 11 | } 12 | __syncthreads(); 13 | printf(" blockIdx.x: %d threadIdx.x: %d\n", blockIdx.x, threadIdx.x); 14 | } 15 | 16 | __global__ void kernelCalcuDim(int dimNum) 17 | { 18 | if (threadIdx.x + threadIdx.y + threadIdx.z + blockIdx.x + blockIdx.y + blockIdx.z == 0) { 19 | printf("============= The grid shape: gridDim.x: %d gridDim.y: %d gridDim.z: %d\n",\ 20 | gridDim.x, gridDim.y, gridDim.z); 21 | printf("============= The block shape: blockDim.x: %d blockDim.y: %d blockDim.z: %d\n",\ 22 | blockDim.x, blockDim.y, blockDim.z); 23 | } 24 | __syncthreads(); 25 | int offset = 0; 26 | int x, y, z; 27 | switch (dimNum) { 28 | case 1: 29 | offset = threadIdx.x + blockIdx.x * blockDim.x; 30 | break; 31 | case 2: 32 | x = threadIdx.x + blockIdx.x * blockDim.x; 33 | y = threadIdx.y + blockIdx.y * blockDim.y; 34 | offset = x + y * blockDim.x * gridDim.x; 35 | // method 2: 36 | // offset = threadIdx.x + blockDim.x * threadIdx.y + \ 37 | // (blockIdx.x + blockIdx.y * gridDim.x) * (blockDim.x * blockDim.y); 38 | break; 39 | case 3: 40 | x = threadIdx.x + blockIdx.x * blockDim.x; 41 | y = threadIdx.y + blockIdx.y * blockDim.y; 42 | z = threadIdx.z + blockIdx.z * blockDim.z; 43 | offset = x + y * blockDim.x * gridDim.x + z * blockDim.x * blockDim.y * gridDim.x * gridDim.y; 44 | break; 45 | default: 46 | break; 47 | } 48 | 49 | printf(" blockIdx: x=%d y= %d z=%d threadIdx x=%d y=%d z=%d; offset= %d\n",\ 50 | blockIdx.x, blockIdx.y, blockIdx.z, threadIdx.x, threadIdx.y, threadIdx.z, offset); 51 | } 52 | 53 | 54 | int main() 55 | { 56 | printf("Case0: the diff between <<<1, N>>> with <<>>\n"); 57 | printf(" Kernel 0 invocation with N threads (1 blocks, N thread/block) N =%d\n" , N); 58 | kernel<<<1, N>>>(0); 59 | cudaDeviceSynchronize(); 60 | printf(" Kernel 1 invocation with N threads (N blocks, 1 thread/block) N =%d\n" , N); 61 | kernel<<>>(1); 62 | cudaDeviceSynchronize(); 63 | printf("\n\n"); 64 | 65 | printf("Case1: 1 dimension, grid: 2 block: 2 \n"); 66 | kernelCalcuDim<<<2, 2>>>(1); 67 | cudaDeviceSynchronize(); 68 | printf("\n"); 69 | 70 | printf("Case2: 2 dimension, grid: 2 x 1 block: 2 x 2 \n"); 71 | dim3 gridSize2D(2, 1); 72 | dim3 blockSize2D(2, 2); 73 | kernelCalcuDim<<>>(2); 74 | cudaDeviceSynchronize(); 75 | printf("\n"); 76 | 77 | printf("Case3: 3 dimension, grid: 2 x 1 x 2 block: 1 x 2 x 2 \n"); 78 | dim3 gridSize3D(2, 1, 2); 79 | dim3 blockSize3D(1, 2, 2); 80 | kernelCalcuDim<<>>(3); 81 | cudaDeviceSynchronize(); 82 | return 0; 83 | } -------------------------------------------------------------------------------- /common_methods/shared_mem.cu: -------------------------------------------------------------------------------- 1 | /** 2 | * Array sum calculation with or without shared memory in CUDA kernel. 3 | * 4 | * This demo code might be stale with the development of CUDA. 
5 | * To use the latest API operations, you could see NVIDIA guide: 6 | * https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html 7 | * https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY.html#group__CUDART__MEMORY 8 | * 9 | * Author: kevin.xie 10 | * Email: kaiyuanxie@yeah.net 11 | * */ 12 | 13 | #include 14 | 15 | #include "memoryOpt.h" 16 | #include "timer.h" 17 | 18 | #define THREAD_PER_BLOCK 256 19 | 20 | double sumArrayInBlockCPU(float *arrData, const unsigned int dataSize) 21 | { 22 | /* This function might help you understand the process of CUDA array sum. */ 23 | float *blockData = (float *)calloc(dataSize / THREAD_PER_BLOCK, sizeof(float)); 24 | int blockSize = dataSize / THREAD_PER_BLOCK; // get integer part 25 | int idxMax = blockSize * THREAD_PER_BLOCK; 26 | 27 | // Split the array into blocks and sum the blocks one by one. 28 | for (int i = 0; i < blockSize; i++) { 29 | for (int j = 0; j < THREAD_PER_BLOCK; j++) { 30 | int idx = i * THREAD_PER_BLOCK + j; 31 | while (idx < dataSize) { 32 | blockData[i] += arrData[idx]; 33 | idx += idxMax; 34 | } 35 | } 36 | } 37 | 38 | double rst = 0.0; 39 | // sum the all blocks result; 40 | for (int i = 0; i < blockSize; ++i) { 41 | rst += blockData[i]; 42 | } 43 | return rst; 44 | } 45 | 46 | __device__ int countSHM = 0; 47 | __global__ void arraySumWithSHMKernel(float *arrData, const int dataSize) 48 | { 49 | __shared__ float shm[THREAD_PER_BLOCK]; 50 | int thIdx = threadIdx.x + blockIdx.x * blockDim.x; 51 | if (thIdx == 0) { 52 | countSHM = 0; 53 | __threadfence(); 54 | } 55 | float val = 0.0; 56 | while (thIdx < dataSize) { 57 | val += arrData[thIdx]; 58 | thIdx += blockDim.x * gridDim.x; 59 | } 60 | shm[threadIdx.x] = val; 61 | __syncthreads(); 62 | 63 | for (int i = THREAD_PER_BLOCK / 2; i >= 1; i /= 2) { 64 | if (threadIdx.x < i) 65 | shm[threadIdx.x] += shm[threadIdx.x + i]; 66 | __syncthreads(); 67 | } 68 | 69 | __syncthreads(); 70 | bool isLast = false; 71 | thIdx = threadIdx.x + blockIdx.x * blockDim.x; 72 | if (threadIdx.x == 0) { 73 | arrData[blockIdx.x] = shm[0]; 74 | __threadfence(); 75 | int value = atomicAdd(&countSHM, 1); 76 | isLast = (value == gridDim.x - 1); 77 | } 78 | isLast = __syncthreads_or(isLast); 79 | if (isLast) { 80 | shm[threadIdx.x] = threadIdx.x < gridDim.x ? arrData[threadIdx.x] : 0; 81 | __syncthreads(); 82 | for (int i = THREAD_PER_BLOCK / 2; i >= 1; i /= 2) { 83 | if (threadIdx.x < i) 84 | shm[threadIdx.x] += shm[threadIdx.x + i]; 85 | __syncthreads(); 86 | } 87 | __syncthreads(); 88 | if (threadIdx.x == 0) 89 | arrData[0] = shm[0]; 90 | } 91 | __syncthreads(); 92 | } 93 | 94 | __global__ void arraySumKernel(float *arrData, float *oData, const int dataSize) 95 | { 96 | // The function needed to run twice if dataSize > threads per block. 
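    // Pass 1 (many blocks): each thread accumulates a grid-stride partial sum, the block then
    // reduces its slice in place in global memory, and thread 0 writes one partial per block
    // into oData. Pass 2 (a single block) reduces those per-block partials. This is the
    // global-memory counterpart of arraySumWithSHMKernel, which keeps the reduction in shared memory.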
97 | 98 | int thIdx = threadIdx.x + blockIdx.x * blockDim.x; 99 | float val = 0.0; 100 | while (thIdx < dataSize) { 101 | val += arrData[thIdx]; 102 | thIdx += blockDim.x * gridDim.x; 103 | } 104 | thIdx = threadIdx.x + blockIdx.x * blockDim.x; 105 | arrData[thIdx] = val; 106 | __syncthreads(); 107 | 108 | // Reduce process: 109 | for (int i = THREAD_PER_BLOCK / 2; i >= 1; i /= 2) { 110 | if (threadIdx.x < i) 111 | arrData[thIdx] += arrData[thIdx + i]; 112 | __syncthreads(); 113 | } 114 | __syncthreads(); 115 | 116 | if (threadIdx.x == 0) { 117 | oData[blockIdx.x] = arrData[thIdx]; 118 | } 119 | } 120 | 121 | float sumArrayGPU(const unsigned int dataSize, unsigned int iterNumber, bool useSHM) 122 | { 123 | int memSize = sizeof(float) * dataSize; 124 | float *hInData = (float *)malloc(memSize); 125 | if (hInData == 0) { 126 | fprintf(stderr, "Not enough memory avaialable on host to run test!\n"); 127 | exit(EXIT_FAILURE); 128 | } 129 | 130 | // Get the correct result for verifying. 131 | double sum = sumArrayInBlockCPU(hInData, dataSize); 132 | 133 | float *devInData, *devOutData; 134 | float devRst; 135 | float elapsedTimeInMs = 0.0f; 136 | if (!useSHM) { 137 | checkCudaErrors(cudaMalloc((void **)&devOutData, max(dataSize / THREAD_PER_BLOCK, THREAD_PER_BLOCK))); 138 | } 139 | checkCudaErrors(cudaMalloc((void **)&devInData, memSize)); 140 | checkCudaErrors(cudaMemcpy(devInData, hInData, memSize, cudaMemcpyHostToDevice)); 141 | 142 | cudaEvent_t start, stop; 143 | 144 | for (int i = 0; i < iterNumber; i++) { 145 | float onceTime = 0.0; 146 | checkCudaErrors(cudaMemcpy(devInData, hInData, memSize, cudaMemcpyHostToDevice)); 147 | if (useSHM) { 148 | TIME_ELAPSE((arraySumWithSHMKernel<<>>(devInData, dataSize)), 149 | onceTime, start, stop); 150 | } else { 151 | // Run twice to get the result. 152 | TIME_ELAPSE( 153 | (arraySumKernel<<>>(devInData, devOutData, dataSize)), 154 | onceTime, start, stop); 155 | elapsedTimeInMs += onceTime; 156 | TIME_ELAPSE((arraySumKernel<<<1, THREAD_PER_BLOCK>>>(devOutData, devOutData, dataSize / THREAD_PER_BLOCK)), 157 | onceTime, start, stop); 158 | } 159 | checkCudaErrors(cudaDeviceSynchronize()); 160 | elapsedTimeInMs += onceTime; 161 | } 162 | 163 | if (useSHM) { 164 | checkCudaErrors(cudaMemcpy(&devRst, devInData, sizeof(float), cudaMemcpyDeviceToHost)); 165 | } else { 166 | checkCudaErrors(cudaMemcpy(&devRst, devOutData, sizeof(float), cudaMemcpyDeviceToHost)); 167 | } 168 | 169 | if (fabs(devRst - sum) > 1.e-6) { 170 | printf("Result error! GPU: %f CPU: %f\n", devRst, sum); 171 | exit(EXIT_FAILURE); 172 | } 173 | free(hInData); 174 | checkCudaErrors(cudaFree(devInData)); 175 | if (!useSHM) { 176 | checkCudaErrors(cudaFree(devOutData)); 177 | } 178 | 179 | return elapsedTimeInMs / iterNumber; 180 | } 181 | 182 | int main(int argc, char **argv) 183 | { 184 | printf("[Shared Memory Application: Array Sum.] - Starting...\n"); 185 | if (checkCmdLineFlag(argc, (const char **)argv, "help") || checkCmdLineFlag(argc, (const char **)argv, "?")) { 186 | printf("Usage -device=n (n >= 0 for deviceID)\n"); 187 | printf(" -size=The size of numElements for testing in bytes. Default: 5000)\n"); 188 | printf(" -iter=n Iteration numbers of trans. Default:100 \n"); 189 | printf("Note: The size has a limitation. 
Consider float type range.)\n"); 190 | exit(EXIT_SUCCESS); 191 | } 192 | unsigned int numElements = 5000; 193 | unsigned int gpuID = 0; 194 | unsigned int iterNumber = 100; 195 | 196 | if (checkCmdLineFlag(argc, (const char **)argv, "device")) { 197 | gpuID = getCmdLineArgumentInt(argc, (const char **)argv, "device"); 198 | } 199 | if (checkCmdLineFlag(argc, (const char **)argv, "size")) { 200 | numElements = getCmdLineArgumentInt(argc, (const char **)argv, "size"); 201 | } 202 | if (numElements < 256 || numElements > 10000) { 203 | printf("The size of numElements is not allowed! Support range:256~10000.\n"); 204 | printf("You could modify the source code to extend the range.\n"); 205 | exit(EXIT_FAILURE); 206 | } 207 | if (checkCmdLineFlag(argc, (const char **)argv, "iter")) { 208 | iterNumber = getCmdLineArgumentInt(argc, (const char **)argv, "iter"); 209 | } 210 | 211 | checkCudaErrors(cudaSetDevice(gpuID)); 212 | printf("Sum array with shared memory. Elapsed time: %f ms \n", sumArrayGPU(numElements, iterNumber, true)); 213 | printf("Sum array without shared memory. Elapsed time: %f ms \n", sumArrayGPU(numElements, iterNumber, false)); 214 | 215 | exit(EXIT_SUCCESS); 216 | } -------------------------------------------------------------------------------- /common_methods/threads_hierarchy_calc.cu: -------------------------------------------------------------------------------- 1 | /** 2 | * threads hierarchy calculation example. 3 | * Author: kevin.xie 4 | * Email: kaiyuanxie@yeah.net 5 | * */ 6 | 7 | #include 8 | #include 9 | #include 10 | 11 | const float EPSILON = 1e-6; 12 | 13 | bool areFloatsEqual(float a, float b) { 14 | return std::fabs(a - b) < EPSILON; 15 | } 16 | 17 | template void check(T result, char const *const func, const char *const file, int const line) 18 | { 19 | if (result) { 20 | fprintf(stderr, "CUDA error at %s:%d code=%d(%s) \"%s\" \n", file, line, static_cast(result), 21 | cudaGetErrorString(result), func); 22 | exit(EXIT_FAILURE); 23 | } 24 | } 25 | #define checkCudaErrors(val) check((val), #val, __FILE__, __LINE__) 26 | 27 | 28 | __global__ void kernelAddOne3D3D(float *input, int dataNum) 29 | { 30 | int threadInBlock = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.x*blockDim.y; 31 | int blockInGrid = blockIdx.x + blockIdx.y*gridDim.x + blockIdx.z*gridDim.x*gridDim.y; 32 | int oneBlockSize = blockDim.x*blockDim.y*blockDim.z; 33 | int i = threadInBlock + oneBlockSize*blockInGrid; 34 | while(i < dataNum) { 35 | input[i] += 1; 36 | i += oneBlockSize * gridDim.x*gridDim.y*gridDim.z; 37 | } 38 | } 39 | 40 | 41 | __global__ void kernelAddOne2D2D(float *input, int dataNum) 42 | { 43 | // int threadInBlock = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.x*blockDim.y; 44 | // int blockInGrid = blockIdx.x + blockIdx.y*gridDim.x + blockIdx.z*gridDim.x*gridDim.y; 45 | // int oneBlockSize = blockDim.x*blockDim.y*blockDim.z; 46 | // int i = threadInBlock + oneBlockSize*blockInGrid; 47 | // when: 48 | // threadIdx.z = 0; blockIdx.z = 0; 49 | // blockDim.z = 1; gridDim.z = 1; 50 | // then: 51 | // int threadInBlock = threadIdx.x + threadIdx.y*blockDim.x; 52 | // int blockInGrid = blockIdx.x + blockIdx.y*gridDim.x; 53 | // int oneBlockSize = blockDim.x*blockDim.y; 54 | int i = threadIdx.x + threadIdx.y*blockDim.x + blockDim.x*blockDim.y*(blockIdx.x + blockIdx.y*gridDim.x); 55 | 56 | while(i < dataNum) { 57 | input[i] += 1; 58 | i += blockDim.x*blockDim.y*gridDim.x*gridDim.y; 59 | } 60 | // thread overflow offset = 
blockDim.x*blockDim.y*gridDim.x*gridDim.y; 61 | } 62 | 63 | __global__ void printIdx2D2D() 64 | { 65 | int i = threadIdx.x + threadIdx.y*blockDim.x + blockDim.x*blockDim.y*(blockIdx.x + blockIdx.y*gridDim.x); 66 | printf("Global idx %d, threadIdx.x: %d, threadIdx.y: %d threadIdx.z: %d, blockIdx.x: %d, blockIdx.y: %d, blockIdx.z: %d \n",\ 67 | i, threadIdx.x, threadIdx.y, threadIdx.z, blockIdx.x, blockIdx.y, blockIdx.z); 68 | } 69 | 70 | __global__ void kernelAddOne1D1D(float *input, int dataNum) 71 | { 72 | // int threadInBlock = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.x*blockDim.y; 73 | // int blockInGrid = blockIdx.x + blockIdx.y*gridDim.x + blockIdx.z*gridDim.x*gridDim.y; 74 | // int oneBlockSize = blockDim.x*blockDim.y*blockDim.z; 75 | // int i = threadInBlock + oneBlockSize*blockInGrid; 76 | // when: 77 | // threadIdx.y = 0; threadIdx.z = 0; blockIdx.y= 0; blockIdx.z = 0; 78 | // blockDim.y = 1; blockDim.z = 1; gridDim.y = 1; gridDim.z = 1; 79 | // then: 80 | // int threadInBlock = threadIdx.x; 81 | // int blockInGrid = blockIdx.x; 82 | // int oneBlockSize = blockDim.x; 83 | int i = threadIdx.x + blockIdx.x * blockDim.x; 84 | 85 | while(i < dataNum) { 86 | input[i] += 1; 87 | i += blockDim.x*gridDim.x; 88 | } 89 | // thread overflow offset = blockDim.x*gridDim.x; 90 | } 91 | 92 | #define TOTAL_SIZE 5000 93 | #define N 4 94 | #define M 4 95 | using kernel = void (*)(float *, int); 96 | 97 | bool test(kernel func, dim3 BlocksPerGrid, dim3 threadsPerBlock) { 98 | unsigned int totalSize = TOTAL_SIZE; 99 | float* hostData = (float*) malloc(sizeof(float) * totalSize); 100 | float* checkData = (float*) malloc(sizeof(float) * totalSize); 101 | float* devicePtr; 102 | checkCudaErrors(cudaMalloc((void**)&devicePtr, sizeof(float) * totalSize)); 103 | for (int i =0; i < totalSize; ++i) { 104 | hostData[i] = i; 105 | checkData[i] = i + 1; 106 | } 107 | checkCudaErrors(cudaMemcpy(devicePtr, hostData, totalSize * sizeof(float), cudaMemcpyHostToDevice)); 108 | func<<>>(devicePtr, totalSize); 109 | checkCudaErrors(cudaMemcpy(hostData, devicePtr, totalSize * sizeof(float), cudaMemcpyDeviceToHost)); 110 | // check result: 111 | bool rst = true; 112 | for (int i =0; i < totalSize; ++i) { 113 | if (!areFloatsEqual(checkData[i], hostData[i])) { 114 | rst = false; 115 | printf("The result not equal in data index %d. 
expect:%f result:%f\n", i, checkData[i], hostData[i]); 116 | break; 117 | } 118 | } 119 | checkCudaErrors(cudaFree (devicePtr)); 120 | free(hostData); 121 | free(checkData); 122 | return rst; 123 | } 124 | 125 | 126 | int main() { 127 | printf("This example is for threads hierachy calculation.\n"); 128 | // 3D3D: 129 | dim3 BlocksPerGrid(N, N, N); // 对应gridDim.x、gridDim.y、gridDim.z 130 | dim3 threadsPerBlock(M, M, M); // 对应blockDim.x、blockDim.y、blockDim.z 131 | // test(kernelAddOne3D3D, BlocksPerGrid, threadsPerBlock) 132 | 133 | // 2D2D: 134 | dim3 BlocksPerGrid2D(N, N); 135 | dim3 threadsPerBlock2D(M, M); 136 | // test(kernelAddOne2D2D, BlocksPerGrid2D, threadsPerBlock2D) 137 | 138 | // 1D1D: 139 | // test(kernelAddOne1D1D, N, M) 140 | 141 | // print the idx in threads, 2D2D example: 142 | printIdx2D2D<<>>(); 143 | 144 | bool rst = test(kernelAddOne3D3D, BlocksPerGrid, threadsPerBlock) && \ 145 | test(kernelAddOne2D2D, BlocksPerGrid2D, threadsPerBlock2D) && \ 146 | test(kernelAddOne1D1D, N, M); 147 | if(rst) { 148 | printf("The test OK.\n"); 149 | } else { 150 | printf("The test Failed.\n"); 151 | } 152 | return 0; 153 | } 154 | -------------------------------------------------------------------------------- /common_methods/um_demo.cu: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include 4 | 5 | // CUDA kernel to add elements of two arrays 6 | __global__ void add(int n, float *x, float *y) 7 | { 8 | int index = blockIdx.x * blockDim.x + threadIdx.x; 9 | int stride = blockDim.x * gridDim.x; 10 | while (index < n) { 11 | y[index] = x[index] + y[index]; 12 | index += stride; 13 | } 14 | } 15 | 16 | int main(void) 17 | { 18 | int N = 1 << 20; 19 | float *x, *y; 20 | 21 | // Allocate Unified Memory -- accessible from CPU or GPU 22 | cudaMallocManaged(&x, N * sizeof(float)); 23 | cudaMallocManaged(&y, N * sizeof(float)); 24 | 25 | // initialize x and y arrays on the host 26 | for (int i = 0; i < N; i++) { 27 | x[i] = 1.0f; 28 | y[i] = 2.0f; 29 | } 30 | 31 | // Launch kernel on 1M elements on the GPU 32 | int blockSize = 256; 33 | int numBlocks = (N + blockSize - 1) / blockSize; 34 | add<<>>(N, x, y); 35 | 36 | // Wait for GPU to finish before accessing on host 37 | cudaDeviceSynchronize(); 38 | 39 | // Check for errors (all values should be 3.0f) 40 | float maxError = 0.0f; 41 | for (int i = 0; i < N; i++) 42 | maxError = fmax(maxError, fabs(y[i] - 3.0f)); 43 | std::cout << "Max error: " << maxError << std::endl; 44 | 45 | // Free memory 46 | cudaFree(x); 47 | cudaFree(y); 48 | return 0; 49 | } -------------------------------------------------------------------------------- /common_methods/zero_copy.cu: -------------------------------------------------------------------------------- 1 | 2 | /* 3 | * zero copy using in vectorAdd case. 4 | * 5 | * This demo code might be stale with the development of CUDA. 
6 | * To use the latest API operations, you could see NVIDIA guide: 7 | * https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html 8 | * https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY.html#group__CUDART__MEMORY 9 | * 10 | * Author: kevin.xie 11 | * Email: kaiyuanxie@yeah.net 12 | */ 13 | 14 | #include "memoryOpt.h" 15 | #include "timer.h" 16 | 17 | __global__ void vectorAdd(const float *A, const float *B, float *C, const int numElements) 18 | { 19 | int i = blockDim.x * blockIdx.x + threadIdx.x; 20 | if (i < numElements) { 21 | C[i] = A[i] + B[i] + 0.0f; 22 | } 23 | } 24 | 25 | float vectorAddViaGlobalMemory(const unsigned int numElements, const unsigned int iterNum) 26 | { 27 | 28 | StopWatchInterface *timer = NULL; 29 | float elapsedTimeInMs = 0.0f; 30 | float throughputInGBs = 0.0f; 31 | 32 | sdkCreateTimer(&timer); 33 | size_t memSize = numElements * sizeof(float); 34 | 35 | // Launch the Vector Add CUDA Kernel 36 | int threadsPerBlock = 256; 37 | int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; 38 | 39 | // Allocate the host input vector A, B, C 40 | float *h_A = (float *)malloc(memSize); 41 | float *h_B = (float *)malloc(memSize); 42 | float *h_C = (float *)malloc(memSize); 43 | 44 | // Verify that allocations succeeded 45 | if (h_A == NULL || h_B == NULL || h_C == NULL) { 46 | fprintf(stderr, "Failed to allocate host vectors!\n"); 47 | exit(EXIT_FAILURE); 48 | } 49 | 50 | // Initialize the host input vectors 51 | for (int i = 0; i < numElements; ++i) { 52 | h_A[i] = rand() / (float)RAND_MAX; 53 | h_B[i] = rand() / (float)RAND_MAX; 54 | } 55 | 56 | // Allocate the device input vector: 57 | float *d_A = NULL; 58 | float *d_B = NULL; 59 | float *d_C = NULL; 60 | checkCudaErrors(cudaMalloc((void **)&d_A, memSize)); 61 | checkCudaErrors(cudaMalloc((void **)&d_B, memSize)); 62 | checkCudaErrors(cudaMalloc((void **)&d_C, memSize)); 63 | 64 | for (unsigned int i = 0; i < iterNum; i++) { 65 | sdkStartTimer(&timer); 66 | checkCudaErrors(cudaMemcpy(d_A, h_A, memSize, cudaMemcpyHostToDevice)); 67 | checkCudaErrors(cudaMemcpy(d_B, h_B, memSize, cudaMemcpyHostToDevice)); 68 | vectorAdd<<>>(d_A, d_B, d_C, numElements); 69 | checkCudaErrors(cudaGetLastError()); 70 | // Copy the device result vector in device memory to the host result vector in host memory. 71 | checkCudaErrors(cudaMemcpy(h_C, d_C, memSize, cudaMemcpyDeviceToHost)); 72 | sdkStopTimer(&timer); 73 | elapsedTimeInMs += sdkGetTimerValue(&timer); 74 | sdkResetTimer(&timer); 75 | } 76 | 77 | // Verify that the result vector is correct 78 | for (int i = 0; i < numElements; ++i) { 79 | if (fabs(h_A[i] + h_B[i] - h_C[i]) > 1e-5) { 80 | fprintf(stderr, "Result verification failed at element %d!\n", i); 81 | exit(EXIT_FAILURE); 82 | } 83 | } 84 | 85 | // calculate throughput in GB/s. Note: use 1000(not 1024)unit. 
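    // The timed loop above covers the two host-to-device copies, the kernel, and the
    // device-to-host copy, so the figure reported is effective end-to-end bandwidth:
    //   throughputInGBs = (memSize * iterNum / 1e9) / (elapsedTimeInMs / 1e3)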
86 | double time_s = elapsedTimeInMs / 1e3; 87 | throughputInGBs = (memSize * (float)iterNum) / (double)1e9; 88 | throughputInGBs = throughputInGBs / time_s; 89 | sdkDeleteTimer(&timer); 90 | 91 | // Free device global memory 92 | checkCudaErrors(cudaFree(d_A)); 93 | checkCudaErrors(cudaFree(d_B)); 94 | checkCudaErrors(cudaFree(d_C)); 95 | 96 | // Free host memory 97 | free(h_A); 98 | free(h_B); 99 | free(h_C); 100 | 101 | return throughputInGBs; 102 | } 103 | 104 | float vectorAddViaZeroCopy(const unsigned int numElements, const unsigned int iterNum) 105 | { 106 | 107 | StopWatchInterface *timer = NULL; 108 | float elapsedTimeInMs = 0.0f; 109 | float throughputInGBs = 0.0f; 110 | 111 | sdkCreateTimer(&timer); 112 | size_t memSize = numElements * sizeof(float); 113 | 114 | // Launch the Vector Add CUDA Kernel 115 | int threadsPerBlock = 256; 116 | int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; 117 | 118 | checkCudaErrors(cudaSetDeviceFlags(cudaDeviceMapHost)); 119 | // Allocate the host input vector A, B, C 120 | float *h_A = NULL; 121 | float *h_B = NULL; 122 | float *h_C = NULL; 123 | float *map_A, *map_B, *map_C; 124 | // Policy1: 125 | // checkCudaErrors(cudaMallocHost((void **)&h_A, memSize)); 126 | // checkCudaErrors(cudaMallocHost((void **)&h_B, memSize)); 127 | // checkCudaErrors(cudaMallocHost((void **)&h_C, memSize)); 128 | 129 | // Policy2: 130 | checkCudaErrors(cudaHostAlloc((void **)&h_A, memSize, cudaHostAllocMapped)); 131 | checkCudaErrors(cudaHostAlloc((void **)&h_B, memSize, cudaHostAllocMapped)); 132 | checkCudaErrors(cudaHostAlloc((void **)&h_C, memSize, cudaHostAllocMapped)); 133 | 134 | // Verify that allocations succeeded 135 | if (h_A == NULL || h_B == NULL || h_C == NULL) { 136 | fprintf(stderr, "Failed to allocate host vectors!\n"); 137 | exit(EXIT_FAILURE); 138 | } 139 | // Get the device pointers for the pinned CPU memory mapped into the GPU memory space. 140 | checkCudaErrors(cudaHostGetDevicePointer(&map_A, h_A, 0)); 141 | checkCudaErrors(cudaHostGetDevicePointer(&map_B, h_B, 0)); 142 | checkCudaErrors(cudaHostGetDevicePointer(&map_C, h_C, 0)); 143 | 144 | // Initialize the host input vectors 145 | for (int i = 0; i < numElements; ++i) { 146 | h_A[i] = rand() / (float)RAND_MAX; 147 | h_B[i] = rand() / (float)RAND_MAX; 148 | } 149 | 150 | // Copy the host input vectors A and B in host memory to the device input vectors in device memory 151 | for (unsigned int i = 0; i < iterNum; i++) { 152 | sdkStartTimer(&timer); 153 | vectorAdd<<>>(map_A, map_B, map_C, numElements); 154 | checkCudaErrors(cudaGetLastError()); 155 | // Copy the device result vector in device memory to the host result vector in host memory. 156 | sdkStopTimer(&timer); 157 | elapsedTimeInMs += sdkGetTimerValue(&timer); 158 | sdkResetTimer(&timer); 159 | } 160 | 161 | checkCudaErrors(cudaDeviceSynchronize()); 162 | // Verify that the result vector is correct 163 | for (int i = 0; i < numElements; ++i) { 164 | if (fabs(h_A[i] + h_B[i] - h_C[i]) > 1e-5) { 165 | fprintf(stderr, "Result verification failed at element %d!\n", i); 166 | exit(EXIT_FAILURE); 167 | } 168 | } 169 | 170 | // calculate throughput in GB/s. Note: use 1000(not 1024)unit. 
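    // In the zero-copy variant the timed loop contains only the (asynchronous) kernel launches:
    // the kernel dereferences the mapped host pointers directly, so data crosses PCIe on demand
    // and no explicit cudaMemcpy is needed. The same formula as above gives its throughput.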
171 | double time_s = elapsedTimeInMs / 1e3; 172 | throughputInGBs = (memSize * (float)iterNum) / (double)1e9; 173 | throughputInGBs = throughputInGBs / time_s; 174 | sdkDeleteTimer(&timer); 175 | 176 | // Free host memory 177 | checkCudaErrors(cudaFreeHost(h_A)); 178 | checkCudaErrors(cudaFreeHost(h_B)); 179 | checkCudaErrors(cudaFreeHost(h_C)); 180 | 181 | return throughputInGBs; 182 | } 183 | 184 | int main(int argc, char **argv) 185 | { 186 | printf("[Zero Copy Opt Vector Add] - Starting...\n"); 187 | if (checkCmdLineFlag(argc, (const char **)argv, "help") || checkCmdLineFlag(argc, (const char **)argv, "?")) { 188 | printf("Usage -device=n (n >= 0 for deviceID)\n"); 189 | printf(" -size=The size of numElements for testing in bytes. Default: 5000000)\n"); 190 | printf(" -iter=n Iteration numbers of trans. Default:1 \n"); 191 | exit(EXIT_SUCCESS); 192 | } 193 | unsigned int numElements = 5000000; 194 | unsigned int iterNumbers = 1; 195 | unsigned int gpuID = 0; 196 | 197 | if (checkCmdLineFlag(argc, (const char **)argv, "device")) { 198 | gpuID = getCmdLineArgumentInt(argc, (const char **)argv, "device"); 199 | } 200 | if (checkCmdLineFlag(argc, (const char **)argv, "size")) { 201 | numElements = getCmdLineArgumentInt(argc, (const char **)argv, "size"); 202 | } 203 | 204 | if (checkCmdLineFlag(argc, (const char **)argv, "iter")) { 205 | iterNumbers = getCmdLineArgumentInt(argc, (const char **)argv, "iter"); 206 | } 207 | 208 | checkCudaErrors(cudaSetDevice(gpuID)); 209 | cudaDeviceProp prop; 210 | cudaGetDeviceProperties(&prop, gpuID); 211 | if (!prop.canMapHostMemory) 212 | exit(EXIT_FAILURE); 213 | printf(">. Data tranfer via global memory. VectorAdd throughput: %f GB/s\n", 214 | vectorAddViaGlobalMemory(numElements, iterNumbers)); 215 | printf(">. Data tranfer via zero copy. VectorAdd throughput: %f GB/s\n", 216 | vectorAddViaZeroCopy(numElements, iterNumbers)); 217 | 218 | exit(EXIT_SUCCESS); 219 | } -------------------------------------------------------------------------------- /matrix_multiply/Makefile: -------------------------------------------------------------------------------- 1 | 2 | 3 | CUDA_PATH ?= /usr/local/cuda 4 | 5 | 6 | # architecture 7 | HOST_ARCH := $(shell uname -m) 8 | TARGET_ARCH ?= $(HOST_ARCH) 9 | ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 sbsa ppc64le armv7l)) 10 | ifneq ($(TARGET_ARCH),$(HOST_ARCH)) 11 | ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 sbsa ppc64le)) 12 | TARGET_SIZE := 64 13 | else ifneq (,$(filter $(TARGET_ARCH),armv7l)) 14 | TARGET_SIZE := 32 15 | endif 16 | else 17 | TARGET_SIZE := $(shell getconf LONG_BIT) 18 | endif 19 | else 20 | $(error ERROR - unsupported value $(TARGET_ARCH) for TARGET_ARCH!) 21 | endif 22 | 23 | # sbsa and aarch64 systems look similar. Need to differentiate them at host level for now. 24 | ifeq ($(HOST_ARCH),aarch64) 25 | ifeq ($(CUDA_PATH)/targets/sbsa-linux,$(shell ls -1d $(CUDA_PATH)/targets/sbsa-linux 2>/dev/null)) 26 | HOST_ARCH := sbsa 27 | TARGET_ARCH := sbsa 28 | endif 29 | endif 30 | 31 | ifneq ($(TARGET_ARCH),$(HOST_ARCH)) 32 | ifeq (,$(filter $(HOST_ARCH)-$(TARGET_ARCH),aarch64-armv7l x86_64-armv7l x86_64-aarch64 x86_64-sbsa x86_64-ppc64le)) 33 | $(error ERROR - cross compiling from $(HOST_ARCH) to $(TARGET_ARCH) is not supported!) 
34 | endif 35 | endif 36 | 37 | # When on native aarch64 system with userspace of 32-bit, change TARGET_ARCH to armv7l 38 | ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_SIZE),aarch64-aarch64-32) 39 | TARGET_ARCH = armv7l 40 | endif 41 | 42 | # operating system 43 | HOST_OS := $(shell uname -s 2>/dev/null | tr "[:upper:]" "[:lower:]") 44 | TARGET_OS ?= $(HOST_OS) 45 | ifeq (,$(filter $(TARGET_OS),linux darwin qnx android)) 46 | $(error ERROR - unsupported value $(TARGET_OS) for TARGET_OS!) 47 | endif 48 | 49 | # host compiler 50 | ifeq ($(TARGET_OS),darwin) 51 | ifeq ($(shell expr `xcodebuild -version | grep -i xcode | awk '{print $$2}' | cut -d'.' -f1` \>= 5),1) 52 | HOST_COMPILER ?= clang++ 53 | endif 54 | else ifneq ($(TARGET_ARCH),$(HOST_ARCH)) 55 | ifeq ($(HOST_ARCH)-$(TARGET_ARCH),x86_64-armv7l) 56 | ifeq ($(TARGET_OS),linux) 57 | HOST_COMPILER ?= arm-linux-gnueabihf-g++ 58 | else ifeq ($(TARGET_OS),qnx) 59 | ifeq ($(QNX_HOST),) 60 | $(error ERROR - QNX_HOST must be passed to the QNX host toolchain) 61 | endif 62 | ifeq ($(QNX_TARGET),) 63 | $(error ERROR - QNX_TARGET must be passed to the QNX target toolchain) 64 | endif 65 | export QNX_HOST 66 | export QNX_TARGET 67 | HOST_COMPILER ?= $(QNX_HOST)/usr/bin/arm-unknown-nto-qnx6.6.0eabi-g++ 68 | else ifeq ($(TARGET_OS),android) 69 | HOST_COMPILER ?= arm-linux-androideabi-g++ 70 | endif 71 | else ifeq ($(TARGET_ARCH),aarch64) 72 | ifeq ($(TARGET_OS), linux) 73 | HOST_COMPILER ?= aarch64-linux-gnu-g++ 74 | else ifeq ($(TARGET_OS),qnx) 75 | ifeq ($(QNX_HOST),) 76 | $(error ERROR - QNX_HOST must be passed to the QNX host toolchain) 77 | endif 78 | ifeq ($(QNX_TARGET),) 79 | $(error ERROR - QNX_TARGET must be passed to the QNX target toolchain) 80 | endif 81 | export QNX_HOST 82 | export QNX_TARGET 83 | HOST_COMPILER ?= $(QNX_HOST)/usr/bin/q++ 84 | else ifeq ($(TARGET_OS), android) 85 | HOST_COMPILER ?= aarch64-linux-android-clang++ 86 | endif 87 | else ifeq ($(TARGET_ARCH),sbsa) 88 | HOST_COMPILER ?= aarch64-linux-gnu-g++ 89 | else ifeq ($(TARGET_ARCH),ppc64le) 90 | HOST_COMPILER ?= powerpc64le-linux-gnu-g++ 91 | endif 92 | endif 93 | HOST_COMPILER ?= g++ 94 | NVCC := $(CUDA_PATH)/bin/nvcc -ccbin $(HOST_COMPILER) 95 | 96 | # internal flags 97 | NVCCFLAGS := -m${TARGET_SIZE} 98 | CCFLAGS := 99 | LDFLAGS := 100 | 101 | # build flags 102 | ifeq ($(TARGET_OS),darwin) 103 | LDFLAGS += -rpath $(CUDA_PATH)/lib 104 | CCFLAGS += -arch $(HOST_ARCH) 105 | else ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_OS),x86_64-armv7l-linux) 106 | LDFLAGS += --dynamic-linker=/lib/ld-linux-armhf.so.3 107 | CCFLAGS += -mfloat-abi=hard 108 | else ifeq ($(TARGET_OS),android) 109 | LDFLAGS += -pie 110 | CCFLAGS += -fpie -fpic -fexceptions 111 | endif 112 | 113 | ifneq ($(TARGET_ARCH),$(HOST_ARCH)) 114 | ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux) 115 | ifneq ($(TARGET_FS),) 116 | GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6) 117 | ifeq ($(GCCVERSIONLTEQ46),1) 118 | CCFLAGS += --sysroot=$(TARGET_FS) 119 | endif 120 | LDFLAGS += --sysroot=$(TARGET_FS) 121 | LDFLAGS += -rpath-link=$(TARGET_FS)/lib 122 | LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib 123 | LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/arm-linux-gnueabihf 124 | endif 125 | endif 126 | ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux) 127 | ifneq ($(TARGET_FS),) 128 | GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6) 129 | ifeq ($(GCCVERSIONLTEQ46),1) 130 | CCFLAGS += --sysroot=$(TARGET_FS) 131 | endif 132 | LDFLAGS += --sysroot=$(TARGET_FS) 133 | LDFLAGS += 
-rpath-link=$(TARGET_FS)/lib -L$(TARGET_FS)/lib 134 | LDFLAGS += -rpath-link=$(TARGET_FS)/lib/aarch64-linux-gnu -L$(TARGET_FS)/lib/aarch64-linux-gnu 135 | LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib -L$(TARGET_FS)/usr/lib 136 | LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/aarch64-linux-gnu -L$(TARGET_FS)/usr/lib/aarch64-linux-gnu 137 | LDFLAGS += --unresolved-symbols=ignore-in-shared-libs 138 | CCFLAGS += -isystem=$(TARGET_FS)/usr/include -I$(TARGET_FS)/usr/include -I$(TARGET_FS)/usr/include/libdrm 139 | CCFLAGS += -isystem=$(TARGET_FS)/usr/include/aarch64-linux-gnu -I$(TARGET_FS)/usr/include/aarch64-linux-gnu 140 | endif 141 | endif 142 | ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx) 143 | NVCCFLAGS += -D_QNX_SOURCE 144 | NVCCFLAGS += --qpp-config 8.3.0,gcc_ntoaarch64le 145 | CCFLAGS += -DWIN_INTERFACE_CUSTOM -I/usr/include/aarch64-qnx-gnu 146 | LDFLAGS += -lsocket 147 | LDFLAGS += -L/usr/lib/aarch64-qnx-gnu 148 | CCFLAGS += "-Wl\,-rpath-link\,/usr/lib/aarch64-qnx-gnu" 149 | ifdef TARGET_OVERRIDE 150 | LDFLAGS += -lslog2 151 | endif 152 | 153 | ifneq ($(TARGET_FS),) 154 | LDFLAGS += -L$(TARGET_FS)/usr/lib 155 | CCFLAGS += "-Wl\,-rpath-link\,$(TARGET_FS)/usr/lib" 156 | LDFLAGS += -L$(TARGET_FS)/usr/libnvidia 157 | CCFLAGS += "-Wl\,-rpath-link\,$(TARGET_FS)/usr/libnvidia" 158 | CCFLAGS += -I$(TARGET_FS)/../include 159 | endif 160 | endif 161 | endif 162 | 163 | ifdef TARGET_OVERRIDE # cuda toolkit targets override 164 | NVCCFLAGS += -target-dir $(TARGET_OVERRIDE) 165 | endif 166 | 167 | # Debug build flags 168 | ifeq ($(dbg),1) 169 | NVCCFLAGS += -g -G 170 | BUILD_TYPE := debug 171 | else 172 | BUILD_TYPE := release 173 | endif 174 | 175 | ALL_CCFLAGS := 176 | ALL_CCFLAGS += $(NVCCFLAGS) 177 | ALL_CCFLAGS += $(EXTRA_NVCCFLAGS) 178 | ALL_CCFLAGS += $(addprefix -Xcompiler ,$(CCFLAGS)) 179 | ALL_CCFLAGS += $(addprefix -Xcompiler ,$(EXTRA_CCFLAGS)) 180 | 181 | SAMPLE_ENABLED := 1 182 | 183 | ALL_LDFLAGS := 184 | ALL_LDFLAGS += $(ALL_CCFLAGS) 185 | ALL_LDFLAGS += $(addprefix -Xlinker ,$(LDFLAGS)) 186 | ALL_LDFLAGS += $(addprefix -Xlinker ,$(EXTRA_LDFLAGS)) 187 | 188 | # Common includes and paths for CUDA 189 | INCLUDES := -I./ 190 | LIBRARIES := 191 | 192 | ################################################################################ 193 | 194 | # Gencode arguments 195 | ifeq ($(TARGET_ARCH),$(filter $(TARGET_ARCH),armv7l aarch64 sbsa)) 196 | SMS ?= 53 61 70 72 75 80 86 87 197 | else 198 | SMS ?= 50 52 60 61 70 75 199 | endif 200 | 201 | ifeq ($(SMS),) 202 | $(info >>> WARNING - no SM architectures have been specified - waiving sample <<<) 203 | SAMPLE_ENABLED := 0 204 | endif 205 | 206 | ifeq ($(GENCODE_FLAGS),) 207 | # Generate SASS code for each SM architecture listed in $(SMS) 208 | $(foreach sm,$(SMS),$(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm))) 209 | 210 | # Generate PTX code from the highest SM architecture in $(SMS) to guarantee forward-compatibility 211 | HIGHEST_SM := $(lastword $(sort $(SMS))) 212 | ifneq ($(HIGHEST_SM),) 213 | GENCODE_FLAGS += -gencode arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM) 214 | endif 215 | endif 216 | 217 | ALL_CCFLAGS += --threads 0 --std=c++11 218 | LIBRARIES += -lcublas 219 | 220 | ifeq ($(SAMPLE_ENABLED),0) 221 | EXEC ?= @echo "[@]" 222 | endif 223 | 224 | ################################################################################ 225 | 226 | all: matMul 227 | 228 | check.deps: 229 | ifeq ($(SAMPLE_ENABLED),0) 230 | @echo "Sample will be waived due to the above missing dependencies" 231 | else 232 | @echo "Sample is 
ready - all dependencies have been met" 233 | endif 234 | 235 | matMul: matMul1DKernel.cu matMul2DKernel.cu matMulCublasKernel.cu testMatMul.cu 236 | $(EXEC) $(NVCC) $(ALL_LDFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES) 237 | 238 | clean: 239 | rm -f matMul matMul.o 240 | 241 | clobber: clean 242 | -------------------------------------------------------------------------------- /matrix_multiply/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CalvinXKY/BasicCUDA/5bf47d64954274aa90bd0b9bb01c2de7462bf513/matrix_multiply/README.md -------------------------------------------------------------------------------- /matrix_multiply/imgs/2d_block_split.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CalvinXKY/BasicCUDA/5bf47d64954274aa90bd0b9bb01c2de7462bf513/matrix_multiply/imgs/2d_block_split.png -------------------------------------------------------------------------------- /matrix_multiply/imgs/extended_cases.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CalvinXKY/BasicCUDA/5bf47d64954274aa90bd0b9bb01c2de7462bf513/matrix_multiply/imgs/extended_cases.png -------------------------------------------------------------------------------- /matrix_multiply/imgs/info.txt: -------------------------------------------------------------------------------- 1 | Show images. 2 | -------------------------------------------------------------------------------- /matrix_multiply/imgs/matmul_use_shm.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CalvinXKY/BasicCUDA/5bf47d64954274aa90bd0b9bb01c2de7462bf513/matrix_multiply/imgs/matmul_use_shm.png -------------------------------------------------------------------------------- /matrix_multiply/imgs/matrix_in_mem.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CalvinXKY/BasicCUDA/5bf47d64954274aa90bd0b9bb01c2de7462bf513/matrix_multiply/imgs/matrix_in_mem.png -------------------------------------------------------------------------------- /matrix_multiply/imgs/perf_v100.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CalvinXKY/BasicCUDA/5bf47d64954274aa90bd0b9bb01c2de7462bf513/matrix_multiply/imgs/perf_v100.png -------------------------------------------------------------------------------- /matrix_multiply/imgs/sub_matrix_mul.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CalvinXKY/BasicCUDA/5bf47d64954274aa90bd0b9bb01c2de7462bf513/matrix_multiply/imgs/sub_matrix_mul.png -------------------------------------------------------------------------------- /matrix_multiply/matMul.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | // System includes 3 | #include 4 | #include 5 | 6 | // CUDA runtime 7 | #include 8 | #include 9 | 10 | #if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) 11 | #define STRCASECMP _stricmp 12 | #define STRNCASECMP _strnicmp 13 | #else 14 | #define STRCASECMP strcasecmp 15 | #define STRNCASECMP strncasecmp 16 | #endif 17 | 18 | #define checkCuBLASErrors(status) \ 19 | do { \ 20 | if (status != CUBLAS_STATUS_SUCCESS) { \ 21 | fprintf(stderr, "CUBLAS error: %d at 
%s:%d\n", status, __FILE__, __LINE__); \ 22 | exit(EXIT_FAILURE); \ 23 | } \ 24 | } while (0) 25 | 26 | template void check(T result, char const *const func, const char *const file, int const line) 27 | { 28 | if (result) { 29 | fprintf(stderr, "CUDA error at %s:%d code=%d(%s) \"%s\" \n", file, line, static_cast(result), 30 | cudaGetErrorString(result), func); 31 | exit(EXIT_FAILURE); 32 | } 33 | } 34 | #define checkCudaErrors(val) check((val), #val, __FILE__, __LINE__) 35 | 36 | inline int stringRemoveDelimiter(char delimiter, const char *string) 37 | { 38 | int string_start = 0; 39 | 40 | while (string[string_start] == delimiter) { 41 | string_start++; 42 | } 43 | 44 | if (string_start >= static_cast(strlen(string) - 1)) { 45 | return 0; 46 | } 47 | 48 | return string_start; 49 | } 50 | 51 | inline bool checkCmdLineFlag(const int argc, const char **argv, const char *string_ref) 52 | { 53 | bool bFound = false; 54 | 55 | if (argc >= 1) { 56 | for (int i = 1; i < argc; i++) { 57 | int string_start = stringRemoveDelimiter('-', argv[i]); 58 | const char *string_argv = &argv[i][string_start]; 59 | 60 | const char *equal_pos = strchr(string_argv, '='); 61 | int argv_length = static_cast(equal_pos == 0 ? strlen(string_argv) : equal_pos - string_argv); 62 | 63 | int length = static_cast(strlen(string_ref)); 64 | 65 | if (length == argv_length && !STRNCASECMP(string_argv, string_ref, length)) { 66 | bFound = true; 67 | continue; 68 | } 69 | } 70 | } 71 | 72 | return bFound; 73 | } 74 | 75 | inline int getCmdLineArgumentInt(const int argc, const char **argv, const char *string_ref) 76 | { 77 | bool bFound = false; 78 | int value = -1; 79 | 80 | if (argc >= 1) { 81 | for (int i = 1; i < argc; i++) { 82 | int string_start = stringRemoveDelimiter('-', argv[i]); 83 | const char *string_argv = &argv[i][string_start]; 84 | int length = static_cast(strlen(string_ref)); 85 | 86 | if (!STRNCASECMP(string_argv, string_ref, length)) { 87 | if (length + 1 <= static_cast(strlen(string_argv))) { 88 | int auto_inc = (string_argv[length] == '=') ? 1 : 0; 89 | value = atoi(&string_argv[length + auto_inc]); 90 | } else { 91 | value = 0; 92 | } 93 | 94 | bFound = true; 95 | continue; 96 | } 97 | } 98 | } 99 | 100 | if (bFound) { 101 | return value; 102 | } else { 103 | return 0; 104 | } 105 | } 106 | 107 | inline void ConstantInit(float *data, int size, float val) 108 | { 109 | for (int i = 0; i < size; ++i) { 110 | data[i] = val; 111 | } 112 | } 113 | 114 | inline bool ResultCheck(float *h_C,int sizeC, int wA, const float valB) { 115 | printf("Checking computed result for correctness: "); 116 | bool correct = true; 117 | 118 | // test relative error by the formula 119 | // |_cpu - _gpu|/<|x|, |y|> < eps 120 | double eps = 1.e-6; // machine zero 121 | 122 | for (int i = 0; i < sizeC; i++) { 123 | double abs_err = fabs(h_C[i] - (wA* valB)); 124 | double dot_length = wA; 125 | double abs_val = fabs(h_C[i]); 126 | double rel_err = abs_err / abs_val / dot_length; 127 | 128 | if (rel_err > eps) { 129 | printf("Error! Matrix[%05d]=%.8f, ref=%.8f error term is > %E\n", i, h_C[i], wA * valB, eps); 130 | correct = false; 131 | } 132 | } 133 | 134 | printf("%s\n", correct ? 
"Result = PASS" : "Result = FAIL"); 135 | return correct; 136 | } 137 | 138 | int MatrixMul1DTest(int argc, char **argv, int threadSize, int iterNum, const dim3 &dimsA, const dim3 &dimsB, 139 | bool useShMem); 140 | 141 | int MatMul2DTest(int argc, char **argv, int thblockSize, int iterNum, const dim3 &dimsA, const dim3 &dimsB, 142 | bool useAnySize); 143 | 144 | int MatMulCublasTest(int argc, char **argv, int blockSize, int iterNum, const dim3 &dimsA, const dim3 &dimsB); 145 | -------------------------------------------------------------------------------- /matrix_multiply/matMul1DKernel.cu: -------------------------------------------------------------------------------- 1 | 2 | #include "matMul.h" 3 | 4 | __global__ void MatMulKernel1D(float *C, float *A, float *B, const int wh, const int wC, const int hC) 5 | { 6 | const int totalSize = wC * hC; 7 | int thID = threadIdx.x + blockIdx.x * blockDim.x; 8 | while (thID < totalSize) { 9 | int Cx = thID / wC; 10 | int Cy = thID % wC; 11 | float rst = 0.0; 12 | for (int i = 0; i < wh; i++) { 13 | rst += A[Cx * wh + i] * B[i * wC + Cy]; 14 | } 15 | C[Cx * wC + Cy] = rst; 16 | thID += gridDim.x * blockDim.x; 17 | } 18 | __syncthreads(); 19 | } 20 | 21 | template 22 | __global__ void MatMulKernel1DWithShMem(float *C, float *A, float *B, const int wA, const int wC, const int hC) 23 | { 24 | __shared__ float sRow[shWASize]; // shared wA 25 | int blockID = blockIdx.x; 26 | while (blockID < hC) { 27 | int thIdx = threadIdx.x; 28 | while (thIdx < wA) { 29 | sRow[thIdx] = A[blockID * wA + thIdx]; 30 | thIdx += blockDim.x; 31 | } 32 | __syncthreads(); 33 | 34 | thIdx = threadIdx.x; 35 | while (thIdx < wC) { // wB = wC; 36 | float sum = 0.0; 37 | for (int i = 0; i < wA; i++) { 38 | sum += sRow[i] * B[wC * i + thIdx]; 39 | } 40 | C[blockID * wC + thIdx] = sum; 41 | thIdx += blockDim.x; 42 | } 43 | blockID += gridDim.x; 44 | } 45 | } 46 | 47 | 48 | /* 49 | * Run a simple test of matrix multiplication with 1D blocks. 
50 | */ 51 | int MatrixMul1DTest(int argc, char **argv, int threadSize, int iterNum, const dim3 &dimsA, const dim3 &dimsB, 52 | bool useShMem) 53 | { 54 | // Allocate host memory for matrices A and B 55 | unsigned int size_A = dimsA.x * dimsA.y; 56 | unsigned int mem_size_A = sizeof(float) * size_A; 57 | float *h_A; 58 | checkCudaErrors(cudaMallocHost(&h_A, mem_size_A)); 59 | unsigned int size_B = dimsB.x * dimsB.y; 60 | unsigned int mem_size_B = sizeof(float) * size_B; 61 | float *h_B; 62 | checkCudaErrors(cudaMallocHost(&h_B, mem_size_B)); 63 | cudaStream_t stream; 64 | 65 | // Initialize host memory 66 | const float valB = 0.01f; 67 | ConstantInit(h_A, size_A, 1.0f); 68 | ConstantInit(h_B, size_B, valB); 69 | 70 | // Allocate device memory 71 | float *d_A, *d_B, *d_C; 72 | 73 | // Allocate host matrix C 74 | dim3 dimsC(dimsB.x, dimsA.y, 1); 75 | unsigned int mem_size_C = dimsC.x * dimsC.y * sizeof(float); 76 | float *h_C; 77 | checkCudaErrors(cudaMallocHost(&h_C, mem_size_C)); 78 | 79 | if (h_C == NULL) { 80 | fprintf(stderr, "Failed to allocate host matrix C!\n"); 81 | exit(EXIT_FAILURE); 82 | } 83 | 84 | checkCudaErrors(cudaMalloc(reinterpret_cast(&d_A), mem_size_A)); 85 | checkCudaErrors(cudaMalloc(reinterpret_cast(&d_B), mem_size_B)); 86 | checkCudaErrors(cudaMalloc(reinterpret_cast(&d_C), mem_size_C)); 87 | // Allocate CUDA events that we'll use for timing 88 | cudaEvent_t start, stop; 89 | checkCudaErrors(cudaEventCreate(&start)); 90 | checkCudaErrors(cudaEventCreate(&stop)); 91 | 92 | checkCudaErrors(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); 93 | 94 | // copy host memory to device 95 | checkCudaErrors(cudaMemcpyAsync(d_A, h_A, mem_size_A, cudaMemcpyHostToDevice, stream)); 96 | checkCudaErrors(cudaMemcpyAsync(d_B, h_B, mem_size_B, cudaMemcpyHostToDevice, stream)); 97 | 98 | // Setup execution parameters 99 | int grid = dimsC.x * dimsC.y / threadSize; 100 | // dim3 grid(4, 4); 101 | 102 | // Create and start timer 103 | printf("Computing result using MatrixMul1DTest Shared Mem: %d\n", useShMem); 104 | 105 | // select diff shared memory size in blocks; 106 | void (*MMKernel1DWithShMemExe)(float *C, float *A, float *B, const int wA, const int wC, const int hC); 107 | if (dimsA.x <= 256) { 108 | MMKernel1DWithShMemExe = MatMulKernel1DWithShMem<256>; 109 | } else if (dimsA.x <= 1024) { 110 | MMKernel1DWithShMemExe = MatMulKernel1DWithShMem<1024>; 111 | } else if (dimsA.x <= 2048) { 112 | MMKernel1DWithShMemExe = MatMulKernel1DWithShMem<2048>; 113 | } else if (dimsA.x <= 4096) { 114 | MMKernel1DWithShMemExe = MatMulKernel1DWithShMem<4096>; 115 | } else { 116 | // shared mem has limitation. Change the size according to your scenarios. 
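        // 8192 floats = 32 KB of static shared memory per block, close to the 48 KB default
        // limit on most architectures; wider rows would need tiling or dynamic shared memory.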
117 | MMKernel1DWithShMemExe = MatMulKernel1DWithShMem<8192>; 118 | } 119 | 120 | // Performs warmup operation using matrixMul CUDA kernel 121 | if (useShMem) { 122 | MMKernel1DWithShMemExe<<>>(d_C, d_A, d_B, dimsA.x, dimsC.x, dimsC.y); 123 | } else { 124 | MatMulKernel1D<<>>(d_C, d_A, d_B, dimsA.x, dimsC.x, dimsC.y); 125 | } 126 | printf("Warmup operation done\n"); 127 | checkCudaErrors(cudaStreamSynchronize(stream)); 128 | 129 | // Record the start event 130 | checkCudaErrors(cudaEventRecord(start, stream)); 131 | 132 | // Execute the kernel 133 | for (int j = 0; j < iterNum; j++) { 134 | if (useShMem) { 135 | MMKernel1DWithShMemExe<<>>(d_C, d_A, d_B, dimsA.x, dimsC.x, dimsC.y); 136 | } else { 137 | MatMulKernel1D<<>>(d_C, d_A, d_B, dimsA.x, dimsC.x, dimsC.y); 138 | } 139 | } 140 | 141 | // Record the stop event 142 | checkCudaErrors(cudaEventRecord(stop, stream)); 143 | 144 | // Wait for the stop event to complete 145 | checkCudaErrors(cudaEventSynchronize(stop)); 146 | 147 | float msecTotal = 0.0f; 148 | checkCudaErrors(cudaEventElapsedTime(&msecTotal, start, stop)); 149 | 150 | // Compute and print the performance 151 | float msecPerMatrixMul = msecTotal / iterNum; 152 | double flopsPerMatrixMul = 153 | 2.0 * static_cast(dimsA.x) * static_cast(dimsA.y) * static_cast(dimsB.x); 154 | double gigaFlops = (flopsPerMatrixMul * 1.0e-9f) / (msecPerMatrixMul / 1000.0f); 155 | printf("Performance= %.2f GFlop/s, Time= %.3f msec, Size= %.0f Ops," 156 | " WorkgroupSize= %u threads/block\n", 157 | gigaFlops, msecPerMatrixMul, flopsPerMatrixMul, threadSize); 158 | 159 | // Copy result from device to host 160 | checkCudaErrors(cudaMemcpyAsync(h_C, d_C, mem_size_C, cudaMemcpyDeviceToHost, stream)); 161 | checkCudaErrors(cudaStreamSynchronize(stream)); 162 | 163 | bool ret = ResultCheck(h_C, static_cast(dimsC.x * dimsC.y), dimsA.x, valB); 164 | 165 | // Clean up memory 166 | checkCudaErrors(cudaFreeHost(h_A)); 167 | checkCudaErrors(cudaFreeHost(h_B)); 168 | checkCudaErrors(cudaFreeHost(h_C)); 169 | checkCudaErrors(cudaFree(d_A)); 170 | checkCudaErrors(cudaFree(d_B)); 171 | checkCudaErrors(cudaFree(d_C)); 172 | checkCudaErrors(cudaEventDestroy(start)); 173 | checkCudaErrors(cudaEventDestroy(stop)); 174 | checkCudaErrors(cudaStreamDestroy(stream)); 175 | 176 | if (ret) { 177 | return EXIT_SUCCESS; 178 | } else { 179 | return EXIT_FAILURE; 180 | } 181 | } 182 | -------------------------------------------------------------------------------- /matrix_multiply/matMulCublasKernel.cu: -------------------------------------------------------------------------------- 1 | 2 | #include "matMul.h" 3 | #include 4 | 5 | /** 6 | * Run a simple test of matrix multiplication using CUBLAS Sgemm. 
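 * cuBLAS assumes column-major storage, so the row-major buffers are passed with the operand
 * order swapped: computing B*A in the column-major view yields C^T in column-major layout,
 * which is byte-for-byte the row-major C that ResultCheck expects (see the call-site note).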
7 | */ 8 | int MatMulCublasTest(int argc, char **argv, int blockSize, int iterNum, const dim3 &dimsA, const dim3 &dimsB) 9 | { 10 | // Allocate host memory for matrices A and B 11 | unsigned int size_A = dimsA.x * dimsA.y; 12 | unsigned int mem_size_A = sizeof(float) * size_A; 13 | float *h_A; 14 | checkCudaErrors(cudaMallocHost(&h_A, mem_size_A)); 15 | unsigned int size_B = dimsB.x * dimsB.y; 16 | unsigned int mem_size_B = sizeof(float) * size_B; 17 | float *h_B; 18 | checkCudaErrors(cudaMallocHost(&h_B, mem_size_B)); 19 | cudaStream_t stream; 20 | 21 | // Initialize host memory 22 | const float valB = 0.01f; 23 | ConstantInit(h_A, size_A, 1.0f); 24 | ConstantInit(h_B, size_B, valB); 25 | 26 | // Allocate device memory 27 | float *d_A, *d_B, *d_C; 28 | 29 | // Allocate host matrix C 30 | dim3 dimsC(dimsB.x, dimsA.y, 1); 31 | unsigned int mem_size_C = dimsC.x * dimsC.y * sizeof(float); 32 | float *h_C; 33 | checkCudaErrors(cudaMallocHost(&h_C, mem_size_C)); 34 | 35 | if (h_C == NULL) { 36 | fprintf(stderr, "Failed to allocate host matrix C!\n"); 37 | exit(EXIT_FAILURE); 38 | } 39 | 40 | checkCudaErrors(cudaMalloc(reinterpret_cast(&d_A), mem_size_A)); 41 | checkCudaErrors(cudaMalloc(reinterpret_cast(&d_B), mem_size_B)); 42 | checkCudaErrors(cudaMalloc(reinterpret_cast(&d_C), mem_size_C)); 43 | // Allocate CUDA events that we'll use for timing 44 | cudaEvent_t start, stop; 45 | checkCudaErrors(cudaEventCreate(&start)); 46 | checkCudaErrors(cudaEventCreate(&stop)); 47 | 48 | checkCudaErrors(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); 49 | 50 | // copy host memory to device 51 | checkCudaErrors(cudaMemcpyAsync(d_A, h_A, mem_size_A, cudaMemcpyHostToDevice, stream)); 52 | checkCudaErrors(cudaMemcpyAsync(d_B, h_B, mem_size_B, cudaMemcpyHostToDevice, stream)); 53 | 54 | const float alpha = 1.0f; 55 | const float beta = 0.0f; 56 | cublasHandle_t handle; 57 | checkCuBLASErrors(cublasCreate(&handle)); 58 | 59 | // Create and start timer 60 | printf("Computing result using CUBLAS Sgemmm Kernel. \n"); 61 | checkCuBLASErrors(cublasSgemm( 62 | handle, CUBLAS_OP_N, CUBLAS_OP_N, dimsB.x, dimsA.y, 63 | dimsA.x, &alpha, d_B, dimsB.x, d_A, 64 | dimsA.x, &beta, d_C, dimsB.x)); 65 | 66 | printf("Warmup operation done\n"); 67 | checkCudaErrors(cudaStreamSynchronize(stream)); 68 | 69 | // Record the start event 70 | checkCudaErrors(cudaEventRecord(start, stream)); 71 | 72 | // Execute the kernel 73 | for (int j = 0; j < iterNum; j++) { 74 | // note cublas is column primary! 
75 | // need to transpose the order 76 | checkCuBLASErrors(cublasSgemm( 77 | handle, CUBLAS_OP_N, CUBLAS_OP_N, dimsB.x, dimsA.y, 78 | dimsA.x, &alpha, d_B, dimsB.x, d_A, 79 | dimsA.x, &beta, d_C, dimsB.x)); 80 | } 81 | 82 | // Record the stop event 83 | checkCudaErrors(cudaEventRecord(stop, stream)); 84 | 85 | // Wait for the stop event to complete 86 | checkCudaErrors(cudaEventSynchronize(stop)); 87 | 88 | float msecTotal = 0.0f; 89 | checkCudaErrors(cudaEventElapsedTime(&msecTotal, start, stop)); 90 | 91 | // Compute and print the performance 92 | float msecPerMatrixMul = msecTotal / iterNum; 93 | double flopsPerMatrixMul = 94 | 2.0 * static_cast(dimsA.x) * static_cast(dimsA.y) * static_cast(dimsB.x); 95 | double gigaFlops = (flopsPerMatrixMul * 1.0e-9f) / (msecPerMatrixMul / 1000.0f); 96 | printf("Performance= %.2f GFlop/s, Time= %.3f msec, Size= %.0f Ops,", 97 | gigaFlops, msecPerMatrixMul, flopsPerMatrixMul); 98 | 99 | // Copy result from device to host 100 | checkCudaErrors(cudaMemcpyAsync(h_C, d_C, mem_size_C, cudaMemcpyDeviceToHost, stream)); 101 | checkCudaErrors(cudaStreamSynchronize(stream)); 102 | checkCuBLASErrors(cublasDestroy(handle)); 103 | 104 | bool ret = ResultCheck(h_C, static_cast(dimsC.x * dimsC.y), dimsA.x, valB); 105 | 106 | // Clean up memory 107 | checkCudaErrors(cudaFreeHost(h_A)); 108 | checkCudaErrors(cudaFreeHost(h_B)); 109 | checkCudaErrors(cudaFreeHost(h_C)); 110 | checkCudaErrors(cudaFree(d_A)); 111 | checkCudaErrors(cudaFree(d_B)); 112 | checkCudaErrors(cudaFree(d_C)); 113 | checkCudaErrors(cudaEventDestroy(start)); 114 | checkCudaErrors(cudaEventDestroy(stop)); 115 | checkCudaErrors(cudaStreamDestroy(stream)); 116 | 117 | if (ret) { 118 | return EXIT_SUCCESS; 119 | } else { 120 | return EXIT_FAILURE; 121 | } 122 | } 123 | -------------------------------------------------------------------------------- /matrix_multiply/testMatMul.cu: -------------------------------------------------------------------------------- 1 | /** 2 | * Test different version of matrix multiply. 3 | * A x B A[hA, wA] B[hB, wB] 4 | * e.g. ./matMul wA=1000 hA=312 wB=11 hB=1000 5 | * 6 | * This demo code might be stale with the development of CUDA. 7 | * To use the latest API operations, you could see NVIDIA guide: 8 | * https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html 9 | * https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY.html#group__CUDART__MEMORY 10 | * Author: kevin.xie 11 | * Email: kaiyuanxie@yeah.net 12 | */ 13 | 14 | #include "matMul.h" 15 | 16 | enum ALGO_TYPE { 17 | DEFAULT_MODEL, 18 | MatMul_1D_KERENL, 19 | MatMul_1D_KERNEL_WITH_SHARED_MEMORY, 20 | MatMul_2D_KERENEL_BLOCK_MULTIPLES_SIZE, 21 | MatMul_2D_KERNEL_ANY_SIZE, 22 | MatMul_CUBLAS_SGEMM_KERNEL, 23 | }; 24 | 25 | /** 26 | * Program main 27 | */ 28 | 29 | void checkResult(int ret) 30 | { 31 | if (ret != EXIT_SUCCESS) { 32 | checkCudaErrors(cudaProfilerStop()); 33 | exit(ret); 34 | } 35 | } 36 | 37 | int main(int argc, char **argv) 38 | { 39 | printf("[Matrix Multiply Test] - Starting...\n"); 40 | printf("\nNOTE: The CUDA Samples are not meant for performance " 41 | "measurements. 
Results may vary when GPU Boost is enabled.\n"); 42 | if (checkCmdLineFlag(argc, (const char **)argv, "help") || checkCmdLineFlag(argc, (const char **)argv, "?")) { 43 | printf("Usage -device=n (n >= 0 for deviceID)\n"); 44 | printf(" -wA=WidthA -hA=HeightA (Width x Height of Matrix A)\n"); 45 | printf(" -wB=WidthB -hB=HeightB (Width x Height of Matrix B)\n"); 46 | printf(" -iter=n Iteration numbers of algorithm. Default:500 \n"); 47 | printf(" -algo=[0|1|2|3|4|5] 0: Test all, 1: MatMul_1D_KERENL, 2:MatMul_1D_KERNEL_WITH_SHARED_MEMORY, " 48 | "3: MatMul_2D_KERENEL_BLOCK_MULTIPLES_SIZE, 4: MatMul_2D_KERNEL_ANY_SIZE, 5:MatMul_CUBLAS_SGEMM_KERNEL\n"); 49 | printf("Note: Outer matrix dimensions of A & B matrices" 50 | " must be equal.\n"); 51 | 52 | exit(EXIT_SUCCESS); 53 | } 54 | 55 | // int dev = 0; 56 | int blockSize = 32; 57 | int threadsPerBlock = blockSize * blockSize; 58 | 59 | // select algorithem: 60 | int algo = 0; 61 | int iterationNum = 500; 62 | 63 | // example case: 64 | dim3 dimsA(5 * 2 * blockSize, 5 * 2 * blockSize, 1); 65 | dim3 dimsB(5 * 4 * blockSize, 5 * 2 * blockSize, 1); 66 | 67 | // width of Matrix A 68 | if (checkCmdLineFlag(argc, (const char **)argv, "wA")) { 69 | dimsA.x = getCmdLineArgumentInt(argc, (const char **)argv, "wA"); 70 | } 71 | 72 | // height of Matrix A 73 | if (checkCmdLineFlag(argc, (const char **)argv, "hA")) { 74 | dimsA.y = getCmdLineArgumentInt(argc, (const char **)argv, "hA"); 75 | } 76 | 77 | // width of Matrix B 78 | if (checkCmdLineFlag(argc, (const char **)argv, "wB")) { 79 | dimsB.x = getCmdLineArgumentInt(argc, (const char **)argv, "wB"); 80 | } 81 | 82 | // height of Matrix B 83 | if (checkCmdLineFlag(argc, (const char **)argv, "hB")) { 84 | dimsB.y = getCmdLineArgumentInt(argc, (const char **)argv, "hB"); 85 | } 86 | 87 | if (checkCmdLineFlag(argc, (const char **)argv, "iter")) { 88 | iterationNum = getCmdLineArgumentInt(argc, (const char **)argv, "iter"); 89 | } 90 | 91 | if (checkCmdLineFlag(argc, (const char **)argv, "algo")) { 92 | algo = getCmdLineArgumentInt(argc, (const char **)argv, "algo"); 93 | } 94 | 95 | if (dimsA.x != dimsB.y) { 96 | printf("Error: outer matrix dimensions must be equal. 
(%d != %d)\n", dimsA.x, dimsB.y); 97 | exit(EXIT_FAILURE); 98 | } 99 | 100 | printf("MatrixA(%d,%d), MatrixB(%d,%d)\n", dimsA.x, dimsA.y, dimsB.x, dimsB.y); 101 | 102 | // int matrix_result = MatrixMul1DTest(argc, argv, 256, iterationNum, dimsA, dimsB, false); 103 | checkCudaErrors(cudaProfilerStart()); 104 | switch (algo) { 105 | case MatMul_1D_KERENL: 106 | checkResult(MatrixMul1DTest(argc, argv, threadsPerBlock, iterationNum, dimsA, dimsB, false)); 107 | break; 108 | case MatMul_1D_KERNEL_WITH_SHARED_MEMORY: 109 | checkResult(MatrixMul1DTest(argc, argv, threadsPerBlock, iterationNum, dimsA, dimsB, true)); 110 | break; 111 | case MatMul_2D_KERENEL_BLOCK_MULTIPLES_SIZE: 112 | if (dimsA.x % blockSize != 0) { 113 | printf("dim of wA must be divided by blockSize: %d\n", blockSize); 114 | exit(EXIT_FAILURE); 115 | } 116 | checkResult(MatMul2DTest(argc, argv, blockSize, iterationNum, dimsA, dimsB, false)); 117 | break; 118 | case MatMul_2D_KERNEL_ANY_SIZE: 119 | checkResult(MatMul2DTest(argc, argv, blockSize, iterationNum, dimsA, dimsB, true)); 120 | break; 121 | case MatMul_CUBLAS_SGEMM_KERNEL: 122 | checkResult(MatMulCublasTest(argc, argv, blockSize, iterationNum, dimsA, dimsB)); 123 | break; 124 | default: 125 | printf("========================= 1D blocks without shared memory =================\n"); 126 | checkResult(MatrixMul1DTest(argc, argv, threadsPerBlock, iterationNum, dimsA, dimsB, false)); 127 | printf("========================= 1D blocks with shared memory ===================\n"); 128 | checkResult(MatrixMul1DTest(argc, argv, threadsPerBlock, iterationNum, dimsA, dimsB, true)); 129 | if (dimsA.x % blockSize == 0) { 130 | printf("========================= 2D blocks with block multiples size =============\n"); 131 | checkResult(MatMul2DTest(argc, argv, blockSize, iterationNum, dimsA, dimsB, false)); 132 | } 133 | printf("========================= 2D blocks with any size ========================\n"); 134 | checkResult(MatMul2DTest(argc, argv, blockSize, iterationNum, dimsA, dimsB, true)); 135 | printf("========================= CUBLAS Sgemm kernel ========================\n"); 136 | checkResult(MatMulCublasTest(argc, argv, blockSize, iterationNum, dimsA, dimsB)); 137 | break; 138 | } 139 | 140 | checkCudaErrors(cudaProfilerStop()); 141 | exit(EXIT_SUCCESS); 142 | } 143 | -------------------------------------------------------------------------------- /memory_opt/Makefile: -------------------------------------------------------------------------------- 1 | 2 | 3 | CUDA_PATH ?= /usr/local/cuda 4 | 5 | 6 | # architecture 7 | HOST_ARCH := $(shell uname -m) 8 | TARGET_ARCH ?= $(HOST_ARCH) 9 | ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 sbsa ppc64le armv7l)) 10 | ifneq ($(TARGET_ARCH),$(HOST_ARCH)) 11 | ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 sbsa ppc64le)) 12 | TARGET_SIZE := 64 13 | else ifneq (,$(filter $(TARGET_ARCH),armv7l)) 14 | TARGET_SIZE := 32 15 | endif 16 | else 17 | TARGET_SIZE := $(shell getconf LONG_BIT) 18 | endif 19 | else 20 | $(error ERROR - unsupported value $(TARGET_ARCH) for TARGET_ARCH!) 21 | endif 22 | 23 | # sbsa and aarch64 systems look similar. Need to differentiate them at host level for now. 
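# (Detection below: if the CUDA toolkit ships a targets/sbsa-linux directory,
# e.g. $(CUDA_PATH)/targets/sbsa-linux, the host is treated as sbsa.)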
24 | ifeq ($(HOST_ARCH),aarch64) 25 | ifeq ($(CUDA_PATH)/targets/sbsa-linux,$(shell ls -1d $(CUDA_PATH)/targets/sbsa-linux 2>/dev/null)) 26 | HOST_ARCH := sbsa 27 | TARGET_ARCH := sbsa 28 | endif 29 | endif 30 | 31 | ifneq ($(TARGET_ARCH),$(HOST_ARCH)) 32 | ifeq (,$(filter $(HOST_ARCH)-$(TARGET_ARCH),aarch64-armv7l x86_64-armv7l x86_64-aarch64 x86_64-sbsa x86_64-ppc64le)) 33 | $(error ERROR - cross compiling from $(HOST_ARCH) to $(TARGET_ARCH) is not supported!) 34 | endif 35 | endif 36 | 37 | # When on native aarch64 system with userspace of 32-bit, change TARGET_ARCH to armv7l 38 | ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_SIZE),aarch64-aarch64-32) 39 | TARGET_ARCH = armv7l 40 | endif 41 | 42 | # operating system 43 | HOST_OS := $(shell uname -s 2>/dev/null | tr "[:upper:]" "[:lower:]") 44 | TARGET_OS ?= $(HOST_OS) 45 | ifeq (,$(filter $(TARGET_OS),linux darwin qnx android)) 46 | $(error ERROR - unsupported value $(TARGET_OS) for TARGET_OS!) 47 | endif 48 | 49 | # host compiler 50 | ifeq ($(TARGET_OS),darwin) 51 | ifeq ($(shell expr `xcodebuild -version | grep -i xcode | awk '{print $$2}' | cut -d'.' -f1` \>= 5),1) 52 | HOST_COMPILER ?= clang++ 53 | endif 54 | else ifneq ($(TARGET_ARCH),$(HOST_ARCH)) 55 | ifeq ($(HOST_ARCH)-$(TARGET_ARCH),x86_64-armv7l) 56 | ifeq ($(TARGET_OS),linux) 57 | HOST_COMPILER ?= arm-linux-gnueabihf-g++ 58 | else ifeq ($(TARGET_OS),qnx) 59 | ifeq ($(QNX_HOST),) 60 | $(error ERROR - QNX_HOST must be passed to the QNX host toolchain) 61 | endif 62 | ifeq ($(QNX_TARGET),) 63 | $(error ERROR - QNX_TARGET must be passed to the QNX target toolchain) 64 | endif 65 | export QNX_HOST 66 | export QNX_TARGET 67 | HOST_COMPILER ?= $(QNX_HOST)/usr/bin/arm-unknown-nto-qnx6.6.0eabi-g++ 68 | else ifeq ($(TARGET_OS),android) 69 | HOST_COMPILER ?= arm-linux-androideabi-g++ 70 | endif 71 | else ifeq ($(TARGET_ARCH),aarch64) 72 | ifeq ($(TARGET_OS), linux) 73 | HOST_COMPILER ?= aarch64-linux-gnu-g++ 74 | else ifeq ($(TARGET_OS),qnx) 75 | ifeq ($(QNX_HOST),) 76 | $(error ERROR - QNX_HOST must be passed to the QNX host toolchain) 77 | endif 78 | ifeq ($(QNX_TARGET),) 79 | $(error ERROR - QNX_TARGET must be passed to the QNX target toolchain) 80 | endif 81 | export QNX_HOST 82 | export QNX_TARGET 83 | HOST_COMPILER ?= $(QNX_HOST)/usr/bin/q++ 84 | else ifeq ($(TARGET_OS), android) 85 | HOST_COMPILER ?= aarch64-linux-android-clang++ 86 | endif 87 | else ifeq ($(TARGET_ARCH),sbsa) 88 | HOST_COMPILER ?= aarch64-linux-gnu-g++ 89 | else ifeq ($(TARGET_ARCH),ppc64le) 90 | HOST_COMPILER ?= powerpc64le-linux-gnu-g++ 91 | endif 92 | endif 93 | HOST_COMPILER ?= g++ 94 | NVCC := $(CUDA_PATH)/bin/nvcc -ccbin $(HOST_COMPILER) 95 | 96 | # internal flags 97 | NVCCFLAGS := -m${TARGET_SIZE} 98 | CCFLAGS := 99 | LDFLAGS := 100 | 101 | # build flags 102 | ifeq ($(TARGET_OS),darwin) 103 | LDFLAGS += -rpath $(CUDA_PATH)/lib 104 | CCFLAGS += -arch $(HOST_ARCH) 105 | else ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_OS),x86_64-armv7l-linux) 106 | LDFLAGS += --dynamic-linker=/lib/ld-linux-armhf.so.3 107 | CCFLAGS += -mfloat-abi=hard 108 | else ifeq ($(TARGET_OS),android) 109 | LDFLAGS += -pie 110 | CCFLAGS += -fpie -fpic -fexceptions 111 | endif 112 | 113 | ifneq ($(TARGET_ARCH),$(HOST_ARCH)) 114 | ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux) 115 | ifneq ($(TARGET_FS),) 116 | GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6) 117 | ifeq ($(GCCVERSIONLTEQ46),1) 118 | CCFLAGS += --sysroot=$(TARGET_FS) 119 | endif 120 | LDFLAGS += --sysroot=$(TARGET_FS) 121 | LDFLAGS += 
-rpath-link=$(TARGET_FS)/lib 122 | LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib 123 | LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/arm-linux-gnueabihf 124 | endif 125 | endif 126 | ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux) 127 | ifneq ($(TARGET_FS),) 128 | GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6) 129 | ifeq ($(GCCVERSIONLTEQ46),1) 130 | CCFLAGS += --sysroot=$(TARGET_FS) 131 | endif 132 | LDFLAGS += --sysroot=$(TARGET_FS) 133 | LDFLAGS += -rpath-link=$(TARGET_FS)/lib -L$(TARGET_FS)/lib 134 | LDFLAGS += -rpath-link=$(TARGET_FS)/lib/aarch64-linux-gnu -L$(TARGET_FS)/lib/aarch64-linux-gnu 135 | LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib -L$(TARGET_FS)/usr/lib 136 | LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/aarch64-linux-gnu -L$(TARGET_FS)/usr/lib/aarch64-linux-gnu 137 | LDFLAGS += --unresolved-symbols=ignore-in-shared-libs 138 | CCFLAGS += -isystem=$(TARGET_FS)/usr/include -I$(TARGET_FS)/usr/include -I$(TARGET_FS)/usr/include/libdrm 139 | CCFLAGS += -isystem=$(TARGET_FS)/usr/include/aarch64-linux-gnu -I$(TARGET_FS)/usr/include/aarch64-linux-gnu 140 | endif 141 | endif 142 | ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx) 143 | NVCCFLAGS += -D_QNX_SOURCE 144 | NVCCFLAGS += --qpp-config 8.3.0,gcc_ntoaarch64le 145 | CCFLAGS += -DWIN_INTERFACE_CUSTOM -I/usr/include/aarch64-qnx-gnu 146 | LDFLAGS += -lsocket 147 | LDFLAGS += -L/usr/lib/aarch64-qnx-gnu 148 | CCFLAGS += "-Wl\,-rpath-link\,/usr/lib/aarch64-qnx-gnu" 149 | ifdef TARGET_OVERRIDE 150 | LDFLAGS += -lslog2 151 | endif 152 | 153 | ifneq ($(TARGET_FS),) 154 | LDFLAGS += -L$(TARGET_FS)/usr/lib 155 | CCFLAGS += "-Wl\,-rpath-link\,$(TARGET_FS)/usr/lib" 156 | LDFLAGS += -L$(TARGET_FS)/usr/libnvidia 157 | CCFLAGS += "-Wl\,-rpath-link\,$(TARGET_FS)/usr/libnvidia" 158 | CCFLAGS += -I$(TARGET_FS)/../include 159 | endif 160 | endif 161 | endif 162 | 163 | ifdef TARGET_OVERRIDE # cuda toolkit targets override 164 | NVCCFLAGS += -target-dir $(TARGET_OVERRIDE) 165 | endif 166 | 167 | # Debug build flags 168 | ifeq ($(dbg),1) 169 | NVCCFLAGS += -g -G 170 | BUILD_TYPE := debug 171 | else 172 | BUILD_TYPE := release 173 | endif 174 | 175 | ALL_CCFLAGS := 176 | ALL_CCFLAGS += $(NVCCFLAGS) 177 | ALL_CCFLAGS += $(EXTRA_NVCCFLAGS) 178 | ALL_CCFLAGS += $(addprefix -Xcompiler ,$(CCFLAGS)) 179 | ALL_CCFLAGS += $(addprefix -Xcompiler ,$(EXTRA_CCFLAGS)) 180 | 181 | SAMPLE_ENABLED := 1 182 | 183 | ALL_LDFLAGS := 184 | ALL_LDFLAGS += $(ALL_CCFLAGS) 185 | ALL_LDFLAGS += $(addprefix -Xlinker ,$(LDFLAGS)) 186 | ALL_LDFLAGS += $(addprefix -Xlinker ,$(EXTRA_LDFLAGS)) 187 | 188 | # Common includes and paths for CUDA 189 | INCLUDES := -I./ 190 | LIBRARIES := 191 | 192 | ################################################################################ 193 | 194 | # Gencode arguments 195 | ifeq ($(TARGET_ARCH),$(filter $(TARGET_ARCH),armv7l aarch64 sbsa)) 196 | SMS ?= 53 61 70 72 75 80 86 87 197 | else 198 | SMS ?= 35 37 50 52 60 61 70 75 199 | endif 200 | 201 | ifeq ($(SMS),) 202 | $(info >>> WARNING - no SM architectures have been specified - waiving sample <<<) 203 | SAMPLE_ENABLED := 0 204 | endif 205 | 206 | ifeq ($(GENCODE_FLAGS),) 207 | # Generate SASS code for each SM architecture listed in $(SMS) 208 | $(foreach sm,$(SMS),$(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm))) 209 | 210 | # Generate PTX code from the highest SM architecture in $(SMS) to guarantee forward-compatibility 211 | HIGHEST_SM := $(lastword $(sort $(SMS))) 212 | ifneq ($(HIGHEST_SM),) 213 | GENCODE_FLAGS += -gencode 
arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM) 214 | endif 215 | endif 216 | 217 | ALL_CCFLAGS += --threads 0 --std=c++11 218 | LIBRARIES += -lcublas 219 | 220 | ifeq ($(SAMPLE_ENABLED),0) 221 | EXEC ?= @echo "[@]" 222 | endif 223 | 224 | ################################################################################ 225 | 226 | all: testHost2Device testDevice2Device testSharedMemory testZeroCopy 227 | 228 | check.deps: 229 | ifeq ($(SAMPLE_ENABLED),0) 230 | @echo "Sample will be waived due to the above missing dependencies" 231 | else 232 | @echo "Sample is ready - all dependencies have been met" 233 | endif 234 | 235 | testHost2Device: hostAndDeviceTrans.cu 236 | $(EXEC) $(NVCC) $(ALL_LDFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES) 237 | 238 | testDevice2Device: device2Device.cu 239 | $(EXEC) $(NVCC) $(ALL_LDFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES) 240 | 241 | testSharedMemory: sharedMemory.cu 242 | $(EXEC) $(NVCC) $(ALL_LDFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES) 243 | 244 | testZeroCopy: zeroCopy.cu 245 | $(EXEC) $(NVCC) $(ALL_LDFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES) 246 | 247 | clean: 248 | rm -f testHost2Device testDevice2Device testSharedMemory testZeroCopy 249 | 250 | clobber: clean 251 | -------------------------------------------------------------------------------- /memory_opt/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CalvinXKY/BasicCUDA/5bf47d64954274aa90bd0b9bb01c2de7462bf513/memory_opt/README.md -------------------------------------------------------------------------------- /memory_opt/device2Device.cu: -------------------------------------------------------------------------------- 1 | /* 2 | * Memory transfer between device and device example to help you understand the process. 3 | * 4 | * This demo code might be stale with the development of CUDA. 5 | * To use the latest API operations, you could see NVIDIA guide: 6 | * https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html 7 | * https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY.html#group__CUDART__MEMORY 8 | * 9 | * Author: kevin.xie 10 | * Email: kaiyuanxie@yeah.net 11 | */ 12 | 13 | #include 14 | #include 15 | #include 16 | 17 | #include "memoryOpt.h" 18 | #include "timer.h" 19 | 20 | /* 21 | * transfer data in device itself. 
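 * The reported bandwidth counts both the read and the write on the same GPU,
 * hence the 2x factor in the calculation below.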
22 | */ 23 | float deviceToItself(const unsigned int memSize, const unsigned int iterNum) 24 | { 25 | 26 | float elapsedTimeInMs = 0.0f; 27 | float bandwidthInGBs = 0.0f; 28 | unsigned char *devInData; 29 | unsigned char *devOutData; 30 | cudaEvent_t start, stop; 31 | checkCudaErrors(cudaEventCreate(&start)); 32 | checkCudaErrors(cudaEventCreate(&stop)); 33 | 34 | // allocate host memory 35 | unsigned char *hInData = (unsigned char *)malloc(memSize); 36 | 37 | if (hInData == 0) { 38 | fprintf(stderr, "Not enough memory avaialable on host to run test!\n"); 39 | exit(EXIT_FAILURE); 40 | } 41 | 42 | for (unsigned int i = 0; i < memSize / sizeof(unsigned char); i++) { 43 | hInData[i] = (unsigned char)(i & 0xff); 44 | } 45 | 46 | 47 | checkCudaErrors(cudaMalloc((void **)&devInData, memSize)); 48 | checkCudaErrors(cudaMalloc((void **)&devOutData, memSize)); 49 | checkCudaErrors(cudaMemcpy(devInData, hInData, memSize, cudaMemcpyHostToDevice)); 50 | checkCudaErrors(cudaEventRecord(start, 0)); 51 | 52 | for (unsigned int i = 0; i < iterNum; i++) { 53 | checkCudaErrors(cudaMemcpy(devOutData, devInData, memSize, cudaMemcpyDeviceToDevice)); 54 | } 55 | 56 | checkCudaErrors(cudaEventRecord(stop, 0)); 57 | checkCudaErrors(cudaDeviceSynchronize()); 58 | checkCudaErrors(cudaEventElapsedTime(&elapsedTimeInMs, start, stop)); 59 | 60 | // In and Out, mutilpy 2.0 factor. Note: use 1000(not 1024)unit. 61 | double time_s = elapsedTimeInMs / 1e3; 62 | bandwidthInGBs = (2.0f * memSize * (float)iterNum) / (double)1e9; 63 | bandwidthInGBs = bandwidthInGBs / time_s; 64 | 65 | free(hInData); 66 | checkCudaErrors(cudaEventDestroy(stop)); 67 | checkCudaErrors(cudaEventDestroy(start)); 68 | checkCudaErrors(cudaFree(devInData)); 69 | checkCudaErrors(cudaFree(devOutData)); 70 | return bandwidthInGBs; 71 | } 72 | 73 | /* 74 | * transfer data from one devcie to another without peer-to-peer opt. 75 | */ 76 | float deviceToDeviceWithoutP2P(const unsigned int memSize, const unsigned int iterNum, const unsigned int GPUA, 77 | const unsigned int GPUB) 78 | { 79 | 80 | float elapsedTimeInMs = 0.0f; 81 | float bandwidthInGBs = 0.0f; 82 | unsigned char *devInData; 83 | unsigned char *devOutData; 84 | cudaEvent_t start, stop; 85 | checkCudaErrors(cudaEventCreate(&start)); 86 | checkCudaErrors(cudaEventCreate(&stop)); 87 | 88 | // allocate host memory 89 | unsigned char *hInData = (unsigned char *)malloc(memSize); 90 | 91 | if (hInData == 0) { 92 | fprintf(stderr, "Not enough memory avaialable on host to run test!\n"); 93 | exit(EXIT_FAILURE); 94 | } 95 | 96 | for (unsigned int i = 0; i < memSize / sizeof(unsigned char); i++) { 97 | hInData[i] = (unsigned char)(i & 0xff); 98 | } 99 | 100 | cudaSetDevice(GPUA); 101 | checkCudaErrors(cudaMalloc((void **)&devInData, memSize)); 102 | cudaSetDevice(GPUB); 103 | checkCudaErrors(cudaMalloc((void **)&devOutData, memSize)); 104 | cudaSetDevice(GPUA); 105 | checkCudaErrors(cudaMemcpy(devInData, hInData, memSize, cudaMemcpyHostToDevice)); 106 | 107 | checkCudaErrors(cudaEventRecord(start, 0)); 108 | for (unsigned int i = 0; i < iterNum; i++) { 109 | checkCudaErrors(cudaMemcpy(devOutData, devInData, memSize, cudaMemcpyDeviceToDevice)); 110 | } 111 | 112 | checkCudaErrors(cudaEventRecord(stop, 0)); 113 | checkCudaErrors(cudaDeviceSynchronize()); 114 | checkCudaErrors(cudaEventElapsedTime(&elapsedTimeInMs, start, stop)); 115 | 116 | // In and Out. Note: use 1000(not 1024)unit. 
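    // Unlike deviceToItself, only memSize bytes cross the GPU-to-GPU path per
    // iteration, so no 2x factor is applied here.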
117 | double time_s = elapsedTimeInMs / 1e3; 118 | bandwidthInGBs = (memSize * (float)iterNum) / (double)1e9; 119 | bandwidthInGBs = bandwidthInGBs / time_s; 120 | 121 | free(hInData); 122 | checkCudaErrors(cudaEventDestroy(stop)); 123 | checkCudaErrors(cudaEventDestroy(start)); 124 | checkCudaErrors(cudaFree(devInData)); 125 | checkCudaErrors(cudaFree(devOutData)); 126 | return bandwidthInGBs; 127 | } 128 | 129 | /* 130 | * transfer data from one devcie to another with peer-to-peer opt. 131 | */ 132 | float deviceToDeviceWithP2P(const unsigned int memSize, const unsigned int iterNum, const unsigned int GPUA, 133 | const unsigned int GPUB) 134 | { 135 | 136 | float elapsedTimeInMs = 0.0f; 137 | float bandwidthInGBs = 0.0f; 138 | unsigned char *devInData; 139 | unsigned char *devOutData; 140 | 141 | cudaEvent_t start, stop; 142 | checkCudaErrors(cudaEventCreate(&start)); 143 | checkCudaErrors(cudaEventCreate(&stop)); 144 | 145 | // allocate host memory 146 | unsigned char *hInData = (unsigned char *)malloc(memSize); 147 | 148 | if (hInData == 0) { 149 | fprintf(stderr, "Not enough memory avaialable on host to run test!\n"); 150 | exit(EXIT_FAILURE); 151 | } 152 | 153 | for (unsigned int i = 0; i < memSize / sizeof(unsigned char); i++) { 154 | hInData[i] = (unsigned char)(i & 0xff); 155 | } 156 | checkCudaErrors(cudaSetDevice(GPUA)); 157 | 158 | // enable GPUA access GPUB 159 | checkCudaErrors(cudaDeviceEnablePeerAccess(GPUB, 0)); 160 | checkCudaErrors(cudaMalloc((void **)&devInData, memSize)); 161 | checkCudaErrors(cudaSetDevice(GPUB)); 162 | // enable GPUB access GPUA 163 | checkCudaErrors(cudaDeviceEnablePeerAccess(GPUA, 0)); 164 | checkCudaErrors(cudaMalloc((void **)&devOutData, memSize)); 165 | checkCudaErrors(cudaSetDevice(GPUA)); 166 | checkCudaErrors(cudaMemcpy(devInData, hInData, memSize, cudaMemcpyHostToDevice)); 167 | 168 | 169 | checkCudaErrors(cudaEventRecord(start, 0)); 170 | for (unsigned int i = 0; i < iterNum; i++) { 171 | checkCudaErrors(cudaMemcpy(devOutData, devInData, memSize, cudaMemcpyDeviceToDevice)); 172 | } 173 | checkCudaErrors(cudaEventRecord(stop, 0)); 174 | checkCudaErrors(cudaDeviceSynchronize()); 175 | checkCudaErrors(cudaEventElapsedTime(&elapsedTimeInMs, start, stop)); 176 | 177 | // In and Out. Note: use 1000(not 1024)unit. 178 | double time_s = elapsedTimeInMs / 1e3; 179 | bandwidthInGBs = (memSize * (float)iterNum) / (double)1e9; 180 | bandwidthInGBs = bandwidthInGBs / time_s; 181 | 182 | free(hInData); 183 | checkCudaErrors(cudaEventDestroy(stop)); 184 | checkCudaErrors(cudaEventDestroy(start)); 185 | checkCudaErrors(cudaFree(devInData)); 186 | checkCudaErrors(cudaFree(devOutData)); 187 | return bandwidthInGBs; 188 | } 189 | 190 | int main(int argc, char **argv) 191 | { 192 | printf("[Device to Device Memory Opt Demo:] - Starting...\n"); 193 | if (checkCmdLineFlag(argc, (const char **)argv, "help") || checkCmdLineFlag(argc, (const char **)argv, "?")) { 194 | printf("Usage -deviceA=n (n >= 0 for deviceID A. Default:0)\n"); 195 | printf(" -deviceB=n (n >= 0 for deviceID B. Default:1)\n"); 196 | printf(" -size=The size of memory for testing in bytes. Default: 20*1024*1024)\n"); 197 | printf(" -iter=n Iteration numbers of trans. 
Default:100 \n"); 198 | exit(EXIT_SUCCESS); 199 | } 200 | unsigned int memSize = 1024 * 1024 * 20; 201 | unsigned int iterNumbers = 100; 202 | unsigned int GPUA = 0; 203 | unsigned int GPUB = 1; 204 | 205 | if (checkCmdLineFlag(argc, (const char **)argv, "deviceA")) { 206 | GPUA = getCmdLineArgumentInt(argc, (const char **)argv, "deviceA"); 207 | } 208 | if (checkCmdLineFlag(argc, (const char **)argv, "deviceB")) { 209 | GPUB = getCmdLineArgumentInt(argc, (const char **)argv, "deviceB"); 210 | } 211 | if (checkCmdLineFlag(argc, (const char **)argv, "size")) { 212 | memSize = getCmdLineArgumentInt(argc, (const char **)argv, "size"); 213 | } 214 | 215 | if (checkCmdLineFlag(argc, (const char **)argv, "iter")) { 216 | iterNumbers = getCmdLineArgumentInt(argc, (const char **)argv, "iter"); 217 | } 218 | checkCudaErrors(cudaSetDevice(GPUA)); 219 | printf(">. Device to itself transfer. Bandwith: %f GB/s\n", deviceToItself(memSize, iterNumbers)); 220 | printf(">. Device to device transfer without p2p. Bandwith: %f GB/s\n", 221 | deviceToDeviceWithoutP2P(memSize, iterNumbers, GPUA, GPUB)); 222 | printf(">. Device to device transfer with p2p Bandwith: %f GB/s\n", 223 | deviceToDeviceWithP2P(memSize, iterNumbers, GPUA, GPUB)); 224 | 225 | exit(EXIT_SUCCESS); 226 | } -------------------------------------------------------------------------------- /memory_opt/memoryOpt.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | // System includes 3 | #include 4 | #include 5 | 6 | // CUDA runtime 7 | #include 8 | 9 | #if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) 10 | #define STRCASECMP _stricmp 11 | #define STRNCASECMP _strnicmp 12 | #else 13 | #define STRCASECMP strcasecmp 14 | #define STRNCASECMP strncasecmp 15 | #endif 16 | 17 | template void check(T result, char const *const func, const char *const file, int const line) 18 | { 19 | if (result) { 20 | fprintf(stderr, "CUDA error at %s:%d code=%d(%s) \"%s\" \n", file, line, static_cast(result), 21 | cudaGetErrorString(result), func); 22 | exit(EXIT_FAILURE); 23 | } 24 | } 25 | 26 | #define checkCudaErrors(val) check((val), #val, __FILE__, __LINE__) 27 | 28 | #define CACHE_CLEAR_SIZE (16 * (1e6)) // 16 M 29 | 30 | #define TIME_ELAPSE(func, elapsedTime, start, stop) \ 31 | cudaEventCreate(&start); \ 32 | cudaEventCreate(&stop); \ 33 | cudaEventRecord(start, 0); \ 34 | (func); \ 35 | cudaEventRecord(stop, 0); \ 36 | cudaEventSynchronize(stop); \ 37 | cudaEventElapsedTime(&elapsedTime, start, stop); \ 38 | cudaEventDestroy(start); \ 39 | cudaEventDestroy(stop); 40 | 41 | inline int stringRemoveDelimiter(char delimiter, const char *string) 42 | { 43 | int string_start = 0; 44 | 45 | while (string[string_start] == delimiter) { 46 | string_start++; 47 | } 48 | 49 | if (string_start >= static_cast(strlen(string) - 1)) { 50 | return 0; 51 | } 52 | 53 | return string_start; 54 | } 55 | 56 | inline bool checkCmdLineFlag(const int argc, const char **argv, const char *string_ref) 57 | { 58 | bool bFound = false; 59 | 60 | if (argc >= 1) { 61 | for (int i = 1; i < argc; i++) { 62 | int string_start = stringRemoveDelimiter('-', argv[i]); 63 | const char *string_argv = &argv[i][string_start]; 64 | 65 | const char *equal_pos = strchr(string_argv, '='); 66 | int argv_length = static_cast(equal_pos == 0 ? 
strlen(string_argv) : equal_pos - string_argv); 67 | 68 | int length = static_cast(strlen(string_ref)); 69 | 70 | if (length == argv_length && !STRNCASECMP(string_argv, string_ref, length)) { 71 | bFound = true; 72 | continue; 73 | } 74 | } 75 | } 76 | 77 | return bFound; 78 | } 79 | 80 | inline int getCmdLineArgumentInt(const int argc, const char **argv, const char *string_ref) 81 | { 82 | bool bFound = false; 83 | int value = -1; 84 | 85 | if (argc >= 1) { 86 | for (int i = 1; i < argc; i++) { 87 | int string_start = stringRemoveDelimiter('-', argv[i]); 88 | const char *string_argv = &argv[i][string_start]; 89 | int length = static_cast(strlen(string_ref)); 90 | 91 | if (!STRNCASECMP(string_argv, string_ref, length)) { 92 | if (length + 1 <= static_cast(strlen(string_argv))) { 93 | int auto_inc = (string_argv[length] == '=') ? 1 : 0; 94 | value = atoi(&string_argv[length + auto_inc]); 95 | } else { 96 | value = 0; 97 | } 98 | 99 | bFound = true; 100 | continue; 101 | } 102 | } 103 | } 104 | 105 | if (bFound) { 106 | return value; 107 | } else { 108 | return 0; 109 | } 110 | } 111 | -------------------------------------------------------------------------------- /memory_opt/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # GPU memory operation demo. 3 | # Author: kevin.xie 4 | # Email: kaiyuanxie@yeah.net 5 | 6 | 7 | set -e 8 | current_path=$(cd `dirname $0`; pwd) 9 | make 10 | echo "Run all demo:" 11 | 12 | if [ ! -f ${current_path}/testHost2Device ] 13 | then 14 | echo "testHost2Device exe file not found!" 15 | exit 1 16 | fi 17 | ./testHost2Device 18 | echo "[Next]" 19 | 20 | if [ ! -f ${current_path}/testDevice2Device ] 21 | then 22 | echo "testDevice2Device exe file not found!" 23 | exit 1 24 | fi 25 | ./testDevice2Device 26 | echo "[Next]" 27 | 28 | if [ ! -f ${current_path}/testSharedMemory ] 29 | then 30 | echo "testSharedMemory exe file not found!" 31 | exit 1 32 | fi 33 | ./testSharedMemory 34 | echo "[Next]" 35 | 36 | if [ ! -f ${current_path}/testZeroCopy ] 37 | then 38 | echo "testZeroCopy exe file not found!" 39 | exit 1 40 | fi 41 | ./testZeroCopy 42 | exit 0 43 | 44 | -------------------------------------------------------------------------------- /memory_opt/sharedMemory.cu: -------------------------------------------------------------------------------- 1 | /** 2 | * Array sum calculation with or without shared memory in CUDA kernel. 3 | * 4 | * This demo code might be stale with the development of CUDA. 5 | * To use the latest API operations, you could see NVIDIA guide: 6 | * https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html 7 | * https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY.html#group__CUDART__MEMORY 8 | * 9 | * Author: kevin.xie 10 | * Email: kaiyuanxie@yeah.net 11 | * */ 12 | 13 | #include 14 | 15 | #include "memoryOpt.h" 16 | #include "timer.h" 17 | 18 | #define THREAD_PER_BLOCK 256 19 | 20 | double sumArrayInBlockCPU(float *arrData, const unsigned int dataSize) 21 | { 22 | /* This function might help you understand the process of CUDA array sum. */ 23 | float *blockData = (float *)calloc(dataSize / THREAD_PER_BLOCK, sizeof(float)); 24 | int blockSize = dataSize / THREAD_PER_BLOCK; // get integer part 25 | int idxMax = blockSize * THREAD_PER_BLOCK; 26 | 27 | // Split the array into blocks and sum the blocks one by one. 
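    // Each (i, j) pair plays the role of one GPU thread: it strides through the
    // array in steps of blockSize * THREAD_PER_BLOCK, mirroring the grid-stride
    // loops in the kernels below.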
28 | for (int i = 0; i < blockSize; i++) { 29 | for (int j = 0; j < THREAD_PER_BLOCK; j++) { 30 | int idx = i * THREAD_PER_BLOCK + j; 31 | while (idx < dataSize) { 32 | blockData[i] += arrData[idx]; 33 | idx += idxMax; 34 | } 35 | } 36 | } 37 | 38 | double rst = 0.0; 39 | // sum the all blocks result; 40 | for (int i = 0; i < blockSize; ++i) { 41 | rst += blockData[i]; 42 | } 43 | return rst; 44 | } 45 | 46 | __device__ int countSHM = 0; 47 | __global__ void arraySumWithSHMKernel(float *arrData, const int dataSize) 48 | { 49 | __shared__ float shm[THREAD_PER_BLOCK]; 50 | int thIdx = threadIdx.x + blockIdx.x * blockDim.x; 51 | if (thIdx == 0) { 52 | countSHM = 0; 53 | __threadfence(); 54 | } 55 | float val = 0.0; 56 | while (thIdx < dataSize) { 57 | val += arrData[thIdx]; 58 | thIdx += blockDim.x * gridDim.x; 59 | } 60 | shm[threadIdx.x] = val; 61 | __syncthreads(); 62 | 63 | for (int i = THREAD_PER_BLOCK / 2; i >= 1; i /= 2) { 64 | if (threadIdx.x < i) 65 | shm[threadIdx.x] += shm[threadIdx.x + i]; 66 | __syncthreads(); 67 | } 68 | 69 | __syncthreads(); 70 | bool isLast = false; 71 | thIdx = threadIdx.x + blockIdx.x * blockDim.x; 72 | if (threadIdx.x == 0) { 73 | arrData[blockIdx.x] = shm[0]; 74 | __threadfence(); 75 | int value = atomicAdd(&countSHM, 1); 76 | isLast = (value == gridDim.x - 1); 77 | } 78 | isLast = __syncthreads_or(isLast); 79 | if (isLast) { 80 | shm[threadIdx.x] = threadIdx.x < gridDim.x ? arrData[threadIdx.x] : 0; 81 | __syncthreads(); 82 | for (int i = THREAD_PER_BLOCK / 2; i >= 1; i /= 2) { 83 | if (threadIdx.x < i) 84 | shm[threadIdx.x] += shm[threadIdx.x + i]; 85 | __syncthreads(); 86 | } 87 | __syncthreads(); 88 | if (threadIdx.x == 0) 89 | arrData[0] = shm[0]; 90 | } 91 | __syncthreads(); 92 | } 93 | 94 | __global__ void arraySumKernel(float *arrData, float *oData, const int dataSize) 95 | { 96 | // The function needed to run twice if dataSize > threads per block. 97 | 98 | int thIdx = threadIdx.x + blockIdx.x * blockDim.x; 99 | float val = 0.0; 100 | while (thIdx < dataSize) { 101 | val += arrData[thIdx]; 102 | thIdx += blockDim.x * gridDim.x; 103 | } 104 | thIdx = threadIdx.x + blockIdx.x * blockDim.x; 105 | arrData[thIdx] = val; 106 | __syncthreads(); 107 | 108 | // Reduce process: 109 | for (int i = THREAD_PER_BLOCK / 2; i >= 1; i /= 2) { 110 | if (threadIdx.x < i) 111 | arrData[thIdx] += arrData[thIdx + i]; 112 | __syncthreads(); 113 | } 114 | __syncthreads(); 115 | 116 | if (threadIdx.x == 0) { 117 | oData[blockIdx.x] = arrData[thIdx]; 118 | } 119 | } 120 | 121 | float sumArrayGPU(const unsigned int dataSize, unsigned int iterNumber, bool useSHM) 122 | { 123 | int memSize = sizeof(float) * dataSize; 124 | float *hInData = (float *)malloc(memSize); 125 | if (hInData == 0) { 126 | fprintf(stderr, "Not enough memory avaialable on host to run test!\n"); 127 | exit(EXIT_FAILURE); 128 | } 129 | 130 | // Get the correct result for verifying. 
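    // CPU reference sum; the GPU result is checked against it (1e-6 tolerance) below.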
131 | double sum = sumArrayInBlockCPU(hInData, dataSize); 132 | 133 | float *devInData, *devOutData; 134 | float devRst; 135 | float elapsedTimeInMs = 0.0f; 136 | if (!useSHM) { 137 | checkCudaErrors(cudaMalloc((void **)&devOutData, max(dataSize / THREAD_PER_BLOCK, THREAD_PER_BLOCK))); 138 | } 139 | checkCudaErrors(cudaMalloc((void **)&devInData, memSize)); 140 | checkCudaErrors(cudaMemcpy(devInData, hInData, memSize, cudaMemcpyHostToDevice)); 141 | 142 | cudaEvent_t start, stop; 143 | 144 | for (int i = 0; i < iterNumber; i++) { 145 | float onceTime = 0.0; 146 | checkCudaErrors(cudaMemcpy(devInData, hInData, memSize, cudaMemcpyHostToDevice)); 147 | if (useSHM) { 148 | TIME_ELAPSE((arraySumWithSHMKernel<<>>(devInData, dataSize)), 149 | onceTime, start, stop); 150 | } else { 151 | // Run twice to get the result. 152 | TIME_ELAPSE( 153 | (arraySumKernel<<>>(devInData, devOutData, dataSize)), 154 | onceTime, start, stop); 155 | elapsedTimeInMs += onceTime; 156 | TIME_ELAPSE((arraySumKernel<<<1, THREAD_PER_BLOCK>>>(devOutData, devOutData, dataSize / THREAD_PER_BLOCK)), 157 | onceTime, start, stop); 158 | } 159 | checkCudaErrors(cudaDeviceSynchronize()); 160 | elapsedTimeInMs += onceTime; 161 | } 162 | 163 | if (useSHM) { 164 | checkCudaErrors(cudaMemcpy(&devRst, devInData, sizeof(float), cudaMemcpyDeviceToHost)); 165 | } else { 166 | checkCudaErrors(cudaMemcpy(&devRst, devOutData, sizeof(float), cudaMemcpyDeviceToHost)); 167 | } 168 | 169 | if (fabs(devRst - sum) > 1.e-6) { 170 | printf("Result error! GPU: %f CPU: %f\n", devRst, sum); 171 | exit(EXIT_FAILURE); 172 | } 173 | free(hInData); 174 | checkCudaErrors(cudaFree(devInData)); 175 | if (!useSHM) { 176 | checkCudaErrors(cudaFree(devOutData)); 177 | } 178 | 179 | return elapsedTimeInMs / iterNumber; 180 | } 181 | 182 | int main(int argc, char **argv) 183 | { 184 | printf("[Shared Memory Application: Array Sum.] - Starting...\n"); 185 | if (checkCmdLineFlag(argc, (const char **)argv, "help") || checkCmdLineFlag(argc, (const char **)argv, "?")) { 186 | printf("Usage -device=n (n >= 0 for deviceID)\n"); 187 | printf(" -size=The size of numElements for testing in bytes. Default: 5000)\n"); 188 | printf(" -iter=n Iteration numbers of trans. Default:100 \n"); 189 | printf("Note: The size has a limitation. Consider float type range.)\n"); 190 | exit(EXIT_SUCCESS); 191 | } 192 | unsigned int numElements = 5000; 193 | unsigned int gpuID = 0; 194 | unsigned int iterNumber = 100; 195 | 196 | if (checkCmdLineFlag(argc, (const char **)argv, "device")) { 197 | gpuID = getCmdLineArgumentInt(argc, (const char **)argv, "device"); 198 | } 199 | if (checkCmdLineFlag(argc, (const char **)argv, "size")) { 200 | numElements = getCmdLineArgumentInt(argc, (const char **)argv, "size"); 201 | } 202 | if (numElements < 256 || numElements > 10000) { 203 | printf("The size of numElements is not allowed! Support range:256~10000.\n"); 204 | printf("You could modify the source code to extend the range.\n"); 205 | exit(EXIT_FAILURE); 206 | } 207 | if (checkCmdLineFlag(argc, (const char **)argv, "iter")) { 208 | iterNumber = getCmdLineArgumentInt(argc, (const char **)argv, "iter"); 209 | } 210 | 211 | checkCudaErrors(cudaSetDevice(gpuID)); 212 | printf("Sum array with shared memory. Elapsed time: %f ms \n", sumArrayGPU(numElements, iterNumber, true)); 213 | printf("Sum array without shared memory. 
Elapsed time: %f ms \n", sumArrayGPU(numElements, iterNumber, false)); 214 | 215 | exit(EXIT_SUCCESS); 216 | } -------------------------------------------------------------------------------- /memory_opt/zeroCopy.cu: -------------------------------------------------------------------------------- 1 | 2 | /* 3 | * zero copy using in vectorAdd case. 4 | * 5 | * This demo code might be stale with the development of CUDA. 6 | * To use the latest API operations, you could see NVIDIA guide: 7 | * https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html 8 | * https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY.html#group__CUDART__MEMORY 9 | * 10 | * Author: kevin.xie 11 | * Email: kaiyuanxie@yeah.net 12 | */ 13 | 14 | #include "memoryOpt.h" 15 | #include "timer.h" 16 | 17 | __global__ void vectorAdd(const float *A, const float *B, float *C, const int numElements) 18 | { 19 | int i = blockDim.x * blockIdx.x + threadIdx.x; 20 | if (i < numElements) { 21 | C[i] = A[i] + B[i] + 0.0f; 22 | } 23 | } 24 | 25 | float vectorAddViaGlobalMemory(const unsigned int numElements, const unsigned int iterNum) 26 | { 27 | 28 | StopWatchInterface *timer = NULL; 29 | float elapsedTimeInMs = 0.0f; 30 | float throughputInGBs = 0.0f; 31 | 32 | sdkCreateTimer(&timer); 33 | size_t memSize = numElements * sizeof(float); 34 | 35 | // Launch the Vector Add CUDA Kernel 36 | int threadsPerBlock = 256; 37 | int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; 38 | 39 | // Allocate the host input vector A, B, C 40 | float *h_A = (float *)malloc(memSize); 41 | float *h_B = (float *)malloc(memSize); 42 | float *h_C = (float *)malloc(memSize); 43 | 44 | // Verify that allocations succeeded 45 | if (h_A == NULL || h_B == NULL || h_C == NULL) { 46 | fprintf(stderr, "Failed to allocate host vectors!\n"); 47 | exit(EXIT_FAILURE); 48 | } 49 | 50 | // Initialize the host input vectors 51 | for (int i = 0; i < numElements; ++i) { 52 | h_A[i] = rand() / (float)RAND_MAX; 53 | h_B[i] = rand() / (float)RAND_MAX; 54 | } 55 | 56 | // Allocate the device input vector: 57 | float *d_A = NULL; 58 | float *d_B = NULL; 59 | float *d_C = NULL; 60 | checkCudaErrors(cudaMalloc((void **)&d_A, memSize)); 61 | checkCudaErrors(cudaMalloc((void **)&d_B, memSize)); 62 | checkCudaErrors(cudaMalloc((void **)&d_C, memSize)); 63 | 64 | for (unsigned int i = 0; i < iterNum; i++) { 65 | sdkStartTimer(&timer); 66 | checkCudaErrors(cudaMemcpy(d_A, h_A, memSize, cudaMemcpyHostToDevice)); 67 | checkCudaErrors(cudaMemcpy(d_B, h_B, memSize, cudaMemcpyHostToDevice)); 68 | vectorAdd<<>>(d_A, d_B, d_C, numElements); 69 | checkCudaErrors(cudaGetLastError()); 70 | // Copy the device result vector in device memory to the host result vector in host memory. 71 | checkCudaErrors(cudaMemcpy(h_C, d_C, memSize, cudaMemcpyDeviceToHost)); 72 | sdkStopTimer(&timer); 73 | elapsedTimeInMs += sdkGetTimerValue(&timer); 74 | sdkResetTimer(&timer); 75 | } 76 | 77 | // Verify that the result vector is correct 78 | for (int i = 0; i < numElements; ++i) { 79 | if (fabs(h_A[i] + h_B[i] - h_C[i]) > 1e-5) { 80 | fprintf(stderr, "Result verification failed at element %d!\n", i); 81 | exit(EXIT_FAILURE); 82 | } 83 | } 84 | 85 | // calculate throughput in GB/s. Note: use 1000(not 1024)unit. 
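    // throughput = memSize bytes per iteration * iterNum / 1e9 / seconds; the timed
    // region above includes the H2D copies of A/B, the kernel, and the D2H copy of C.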
86 | double time_s = elapsedTimeInMs / 1e3; 87 | throughputInGBs = (memSize * (float)iterNum) / (double)1e9; 88 | throughputInGBs = throughputInGBs / time_s; 89 | sdkDeleteTimer(&timer); 90 | 91 | // Free device global memory 92 | checkCudaErrors(cudaFree(d_A)); 93 | checkCudaErrors(cudaFree(d_B)); 94 | checkCudaErrors(cudaFree(d_C)); 95 | 96 | // Free host memory 97 | free(h_A); 98 | free(h_B); 99 | free(h_C); 100 | 101 | return throughputInGBs; 102 | } 103 | 104 | float vectorAddViaZeroCopy(const unsigned int numElements, const unsigned int iterNum) 105 | { 106 | 107 | StopWatchInterface *timer = NULL; 108 | float elapsedTimeInMs = 0.0f; 109 | float throughputInGBs = 0.0f; 110 | 111 | sdkCreateTimer(&timer); 112 | size_t memSize = numElements * sizeof(float); 113 | 114 | // Launch the Vector Add CUDA Kernel 115 | int threadsPerBlock = 256; 116 | int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; 117 | 118 | checkCudaErrors(cudaSetDeviceFlags(cudaDeviceMapHost)); 119 | // Allocate the host input vector A, B, C 120 | float *h_A = NULL; 121 | float *h_B = NULL; 122 | float *h_C = NULL; 123 | float *map_A, *map_B, *map_C; 124 | // Policy1: 125 | // checkCudaErrors(cudaMallocHost((void **)&h_A, memSize)); 126 | // checkCudaErrors(cudaMallocHost((void **)&h_B, memSize)); 127 | // checkCudaErrors(cudaMallocHost((void **)&h_C, memSize)); 128 | 129 | // Policy2: 130 | checkCudaErrors(cudaHostAlloc((void **)&h_A, memSize, cudaHostAllocMapped)); 131 | checkCudaErrors(cudaHostAlloc((void **)&h_B, memSize, cudaHostAllocMapped)); 132 | checkCudaErrors(cudaHostAlloc((void **)&h_C, memSize, cudaHostAllocMapped)); 133 | 134 | // Verify that allocations succeeded 135 | if (h_A == NULL || h_B == NULL || h_C == NULL) { 136 | fprintf(stderr, "Failed to allocate host vectors!\n"); 137 | exit(EXIT_FAILURE); 138 | } 139 | // Get the device pointers for the pinned CPU memory mapped into the GPU memory space. 140 | checkCudaErrors(cudaHostGetDevicePointer(&map_A, h_A, 0)); 141 | checkCudaErrors(cudaHostGetDevicePointer(&map_B, h_B, 0)); 142 | checkCudaErrors(cudaHostGetDevicePointer(&map_C, h_C, 0)); 143 | 144 | // Initialize the host input vectors 145 | for (int i = 0; i < numElements; ++i) { 146 | h_A[i] = rand() / (float)RAND_MAX; 147 | h_B[i] = rand() / (float)RAND_MAX; 148 | } 149 | 150 | // Copy the host input vectors A and B in host memory to the device input vectors in device memory 151 | for (unsigned int i = 0; i < iterNum; i++) { 152 | sdkStartTimer(&timer); 153 | vectorAdd<<>>(map_A, map_B, map_C, numElements); 154 | checkCudaErrors(cudaGetLastError()); 155 | // Copy the device result vector in device memory to the host result vector in host memory. 156 | sdkStopTimer(&timer); 157 | elapsedTimeInMs += sdkGetTimerValue(&timer); 158 | sdkResetTimer(&timer); 159 | } 160 | 161 | checkCudaErrors(cudaDeviceSynchronize()); 162 | // Verify that the result vector is correct 163 | for (int i = 0; i < numElements; ++i) { 164 | if (fabs(h_A[i] + h_B[i] - h_C[i]) > 1e-5) { 165 | fprintf(stderr, "Result verification failed at element %d!\n", i); 166 | exit(EXIT_FAILURE); 167 | } 168 | } 169 | 170 | // calculate throughput in GB/s. Note: use 1000(not 1024)unit. 
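    // Same metric as above, but no explicit copies are issued: the kernel reads and
    // writes the mapped pinned host buffers (map_A/B/C) directly.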
171 | double time_s = elapsedTimeInMs / 1e3; 172 | throughputInGBs = (memSize * (float)iterNum) / (double)1e9; 173 | throughputInGBs = throughputInGBs / time_s; 174 | sdkDeleteTimer(&timer); 175 | 176 | // Free host memory 177 | checkCudaErrors(cudaFreeHost(h_A)); 178 | checkCudaErrors(cudaFreeHost(h_B)); 179 | checkCudaErrors(cudaFreeHost(h_C)); 180 | 181 | return throughputInGBs; 182 | } 183 | 184 | int main(int argc, char **argv) 185 | { 186 | printf("[Zero Copy Opt Vector Add] - Starting...\n"); 187 | if (checkCmdLineFlag(argc, (const char **)argv, "help") || checkCmdLineFlag(argc, (const char **)argv, "?")) { 188 | printf("Usage -device=n (n >= 0 for deviceID)\n"); 189 | printf(" -size=The size of numElements for testing in bytes. Default: 5000000)\n"); 190 | printf(" -iter=n Iteration numbers of trans. Default:1 \n"); 191 | exit(EXIT_SUCCESS); 192 | } 193 | unsigned int numElements = 5000000; 194 | unsigned int iterNumbers = 1; 195 | unsigned int gpuID = 0; 196 | 197 | if (checkCmdLineFlag(argc, (const char **)argv, "device")) { 198 | gpuID = getCmdLineArgumentInt(argc, (const char **)argv, "device"); 199 | } 200 | if (checkCmdLineFlag(argc, (const char **)argv, "size")) { 201 | numElements = getCmdLineArgumentInt(argc, (const char **)argv, "size"); 202 | } 203 | 204 | if (checkCmdLineFlag(argc, (const char **)argv, "iter")) { 205 | iterNumbers = getCmdLineArgumentInt(argc, (const char **)argv, "iter"); 206 | } 207 | 208 | checkCudaErrors(cudaSetDevice(gpuID)); 209 | cudaDeviceProp prop; 210 | cudaGetDeviceProperties(&prop, gpuID); 211 | if (!prop.canMapHostMemory) 212 | exit(EXIT_FAILURE); 213 | printf(">. Data tranfer via global memory. VectorAdd throughput: %f GB/s\n", 214 | vectorAddViaGlobalMemory(numElements, iterNumbers)); 215 | printf(">. Data tranfer via zero copy. VectorAdd throughput: %f GB/s\n", 216 | vectorAddViaZeroCopy(numElements, iterNumbers)); 217 | 218 | exit(EXIT_SUCCESS); 219 | } -------------------------------------------------------------------------------- /nccl/README.md: -------------------------------------------------------------------------------- 1 | # NCCL C++ Examples 2 | 3 | | **Cases** | **Node require** | **Description** | 4 | |----------------------------|------------------|-----------------------------------------------------------| 5 | | one_device_per_thread | 1 | One Device(1 GPU) per Process or Thread | 6 | | multi_devices_per_thread | 1 | Multiple Devices(more than one GPU) per Process or Thread | 7 | | nonblocking_double_streams | 1 | One rank has two communicators. 
| 8 | | nccl_with_mpi | 1 | Run with Open MPI | 9 | | node_server/node_client | 2 | Using socket for init | 10 | 11 | 12 | ## Compile 13 | 14 | Clone this git lib to your local env, such as /home/xky/ 15 | 16 | Requirements: 17 | * CUDA 18 | * NVIDIA NCCL (optimized for NVLink) 19 | * Open-MPI (option) 20 | 21 | Recommend using docker images: 22 | 23 | ```shell 24 | docker pull nvcr.io/nvidia/pytorch:24.07-py3 25 | ``` 26 | 27 | If there is docker-ce, run docker: 28 | ```shell 29 | sudo docker run --net=host --gpus=all -it -e UID=root --ipc host --shm-size="32g" \ 30 | -v /home/xky/:/home/xky \ 31 | -u 0 \ 32 | --name=nccl2 nvcr.io/nvidia/pytorch:24.07-py3 bash 33 | ``` 34 | Others: 35 | ```shell 36 | docker run \ 37 | --runtime=nvidia \ 38 | --privileged \ 39 | --device /dev/nvidia0:/dev/nvidia0 \ 40 | --device /dev/nvidia1:/dev/nvidia1 \ 41 | --device /dev/nvidia2:/dev/nvidia2 \ 42 | --device /dev/nvidia3:/dev/nvidia3 \ 43 | --device /dev/nvidia4:/dev/nvidia4 \ 44 | --device /dev/nvidia5:/dev/nvidia5 \ 45 | --device /dev/nvidia6:/dev/nvidia6 \ 46 | --device /dev/nvidia7:/dev/nvidia7 \ 47 | --device /dev/nvidiactl:/dev/nvidiactl \ 48 | --device /dev/nvidia-uvm:/dev/nvidia-uvm \ 49 | --device /dev/nvidia-uvm-tools:/dev/nvidia-uvm-tools \ 50 | --device /dev/infiniband:/dev/infiniband \ 51 | -v /usr/local/bin/:/usr/local/bin/ \ 52 | -v /opt/cloud/cce/nvidia/:/usr/local/nvidia/ \ 53 | -v /home/xky/:/home/xky \ 54 | --ipc host \ 55 | --net host \ 56 | -it \ 57 | -u root \ 58 | --name nccl_env \ 59 | nvcr.io/nvidia/pytorch:24.07-py3 bash 60 | ``` 61 | 62 | 63 | Enter the git directory and run makefile 64 | ```shell 65 | cd /home/xky/BasicCUDA/nccl/ 66 | make 67 | ``` 68 | If there is MPI lib in env, could compile MPI case: 69 | ```shell 70 | make mpi 71 | ``` 72 | 73 | ## Run 74 | 75 | ### Single node 76 | 77 | ```shell 78 | ./multi_devices_per_thread 79 | ./one_devices_per_thread 80 | ./nonblocking_double_streams 81 | ``` 82 | 83 | Set DEBUG=1 would print some debug information. 84 | Could change ranks number by set '--nranks'. e.g: 85 | 86 | ```shell 87 | DEBUG=1 ./nonblocking_double_streams --nranks 8 88 | ``` 89 | 90 | MPI case run: 91 | ```shell 92 | mpirun -n 6 --allow-run-as-root ./nccl_with_mpi 93 | ``` 94 | 95 | ### Multi nodes 96 | 97 | Two nodes case: using socket connection for nccl init. 98 | 99 | Server run in one: 100 | ```shell 101 | ./node_server 102 | ``` 103 | 104 | Client run in another one, e.g. 
Server IP: 10.10.1.1 105 | ```shell 106 | ./node_client --hostname 10.10.1.1 107 | ``` 108 | 109 | Add some envs: 110 | ```shell 111 | # server: 112 | NCCL_DEBUG=INFO NCCL_NET_PLUGIN=none NCCL_IB_DISABLE=1 ./node_server --port 8066 --nranks 8 113 | # client: 114 | NCCL_DEBUG=INFO NCCL_NET_PLUGIN=none NCCL_IB_DISABLE=1 ./node_client --hostname 10.10.1.1 --port 8066 --nranks 8 115 | ``` 116 | 117 | -------------------------------------------------------------------------------- /nccl/alltoall.cu: -------------------------------------------------------------------------------- 1 | /* 2 | * Compile: nvcc -lnccl -ccbin g++ -std=c++11 -O3 -g alltoall.cu -o alltoall 3 | * Test: ./alltoall 4 | * Profiling: nvprof --csv -o profile_output.csv ./alltoall 5 | * Author: kevin.xie 6 | * Email: kaiyuanxie@yeah.net 7 | */ 8 | 9 | #include "comm.h" 10 | 11 | ncclUniqueId id; 12 | pthread_mutex_t mutex; 13 | 14 | void dataPrint(float *hostData, int size, int gpu_id, int my_nranks, const char *status) 15 | { 16 | pthread_mutex_lock(&mutex); 17 | printf("GPU:%d %s data: ", gpu_id, status); 18 | for (int i = 0; i < size; ++i) { 19 | printf("%.0f ", hostData[i]); 20 | } 21 | printf("\n"); 22 | pthread_mutex_unlock(&mutex); 23 | } 24 | 25 | ncclResult_t AlltoAll(const void *sendbuff, void *recvbuff, size_t count, ncclDataType_t type, ncclComm_t comm, 26 | cudaStream_t stream) 27 | { 28 | int nRanks; 29 | NCCLCHECK(ncclCommCount(comm, &nRanks)); 30 | size_t rankOffset = count * wordSize(type); 31 | 32 | #if NCCL_MAJOR < 2 || NCCL_MINOR < 7 33 | printf("NCCL 2.7 or later is needed for alltoall. This test was compiled with %d.%d.\n", NCCL_MAJOR, NCCL_MINOR); 34 | return ncclInternalError; 35 | #else 36 | NCCLCHECK(ncclGroupStart()); 37 | for (int r = 0; r < nRanks; r++) { 38 | NCCLCHECK(ncclSend(((char *)sendbuff) + r * rankOffset, count, type, r, comm, stream)); 39 | NCCLCHECK(ncclRecv(((char *)recvbuff) + r * rankOffset, count, type, r, comm, stream)); 40 | } 41 | NCCLCHECK(ncclGroupEnd()); 42 | return ncclSuccess; 43 | #endif 44 | } 45 | 46 | void *threadAlltoAll(void *arg) 47 | { 48 | int count = 1; 49 | int size = my_nranks * count; 50 | int gpu_id = *(int *)arg; 51 | cudaSetDevice(gpu_id); 52 | 53 | ncclComm_t comm; 54 | NCCLCHECK(ncclCommInitRank(&comm, my_nranks, id, gpu_id)); 55 | 56 | float *sendbuff; 57 | float *recvbuff; 58 | float *hostData; 59 | cudaStream_t s; 60 | 61 | hostData = (float *)malloc(size * sizeof(float)); 62 | for (int i = 0; i < size; ++i) { 63 | // hostData[i] = float(gpu_id) * my_nranks + i; 64 | hostData[i] = float(gpu_id); 65 | } 66 | dataPrint(hostData, size, gpu_id, my_nranks, "input"); 67 | 68 | CUDACHECK(cudaMalloc(&sendbuff, size * sizeof(float))); 69 | CUDACHECK(cudaMalloc(&recvbuff, size * sizeof(float))); 70 | cudaMemcpy(sendbuff, hostData, size * sizeof(float), cudaMemcpyHostToDevice); 71 | CUDACHECK(cudaStreamCreate(&s)); 72 | 73 | NCCLCHECK(AlltoAll((const void *)sendbuff, (void *)recvbuff, count, ncclFloat, comm, s)); 74 | // completing NCCL operation by synchronizing on the CUDA stream 75 | CUDACHECK(cudaStreamSynchronize(s)); 76 | cudaMemcpy(hostData, recvbuff, size * sizeof(float), cudaMemcpyDeviceToHost); 77 | dataPrint(hostData, size, gpu_id, my_nranks, "output"); 78 | ncclCommDestroy(comm); 79 | 80 | CUDACHECK(cudaFree(sendbuff)); 81 | CUDACHECK(cudaFree(recvbuff)); 82 | free(hostData); 83 | 84 | return NULL; 85 | } 86 | 87 | void *threadAlltoAllIter(void *arg) 88 | { 89 | int count = 2 * 1024 * 1024; 90 | int size = my_nranks * count; 91 | int gpu_id = *(int 
*)arg; 92 | cudaSetDevice(gpu_id); 93 | 94 | ncclComm_t comm; 95 | NCCLCHECK(ncclCommInitRank(&comm, my_nranks, id, gpu_id)); 96 | 97 | float *sendbuff; 98 | float *recvbuff; 99 | float *hostData; 100 | cudaStream_t s; 101 | 102 | hostData = (float *)malloc(size * sizeof(float)); 103 | for (int i = 0; i < size; ++i) { 104 | // hostData[i] = float(gpu_id) * my_nranks + i; 105 | hostData[i] = float(gpu_id); 106 | } 107 | CUDACHECK(cudaMalloc(&sendbuff, size * sizeof(float))); 108 | CUDACHECK(cudaMalloc(&recvbuff, size * sizeof(float))); 109 | cudaMemcpy(sendbuff, hostData, size * sizeof(float), cudaMemcpyHostToDevice); 110 | CUDACHECK(cudaStreamCreate(&s)); 111 | 112 | for (int i = 0; i < 4; ++i) { 113 | NCCLCHECK(AlltoAll((const void *)sendbuff, (void *)recvbuff, count, ncclFloat, comm, s)); 114 | // Sync stream to avoid data chaos. 115 | CUDACHECK(cudaStreamSynchronize(s)); 116 | } 117 | 118 | cudaMemcpy(hostData, recvbuff, size * sizeof(float), cudaMemcpyDeviceToHost); 119 | ncclCommDestroy(comm); 120 | 121 | CUDACHECK(cudaFree(sendbuff)); 122 | CUDACHECK(cudaFree(recvbuff)); 123 | free(hostData); 124 | return NULL; 125 | } 126 | 127 | void runAlltoAll(ops threadFunc) 128 | { 129 | pthread_t threads[8]; 130 | printf("====== AlltoAll case begin =====\n"); 131 | NCCLCHECK(ncclGetUniqueId(&id)); 132 | for (int i = 0; i < my_nranks; ++i) { 133 | int *id_pointer = &gpu_ids[i]; 134 | pthread_create(&threads[i], NULL, threadFunc, id_pointer); 135 | } 136 | 137 | for (int i = 0; i < my_nranks; ++i) { 138 | pthread_join(threads[i], NULL); 139 | } 140 | printf("====== AlltoAll case end =====\n\n"); 141 | } 142 | 143 | ncclResult_t AlltoAllSplit(const void *sendbuff, void *recvbuff, const size_t *sendSplitList, 144 | const size_t *recvSplitList, ncclDataType_t type, ncclComm_t comm, cudaStream_t stream) 145 | { 146 | int nRanks; 147 | NCCLCHECK(ncclCommCount(comm, &nRanks)); 148 | size_t sendOffset = 0; 149 | size_t recvOffset = 0; 150 | NCCLCHECK(ncclGroupStart()); 151 | for (int r = 0; r < nRanks; r++) { 152 | NCCLCHECK(ncclSend(((char *)sendbuff) + sendOffset, sendSplitList[r], type, r, comm, stream)); 153 | NCCLCHECK(ncclRecv(((char *)recvbuff) + recvOffset, recvSplitList[r], type, r, comm, stream)); 154 | sendOffset += wordSize(type) * sendSplitList[r]; 155 | recvOffset += wordSize(type) * recvSplitList[r]; 156 | } 157 | NCCLCHECK(ncclGroupEnd()); 158 | return ncclSuccess; 159 | } 160 | 161 | const int countTotal = 15; 162 | const size_t sendArray[4][4] = {{1, 2, 3, 4}, {4, 2, 3, 1}, {3, 2, 1, 4}, {2, 3, 4, 1}}; 163 | 164 | const size_t recvArray[4][4] = {{1, 4, 3, 2}, {2, 2, 2, 3}, {3, 3, 1, 4}, {4, 1, 4, 1}}; 165 | 166 | /* 167 | input data: 168 | GPU:0 : 0 0 0 0 0 0 0 0 0 0 169 | GPU:1 : 1 1 1 1 1 1 1 1 1 1 170 | GPU:2 : 2 2 2 2 2 2 2 2 2 2 171 | GPU:3 : 3 3 3 3 3 3 3 3 3 3 172 | 173 | split array: 174 | sendArray set to: 175 | {{1, 2, 3, 4}, 176 | {4, 2, 3, 1}, 177 | {3, 2, 1, 4}, 178 | {2, 3, 4, 1}}; 179 | recvArray is equals to transpose(sendArray): 180 | {{1, 4, 3, 2}, 181 | {2, 2, 2, 3}, 182 | {3, 3, 1, 4}, 183 | {4, 1, 4, 1}}; 184 | 185 | output data: 186 | GPU:0 : 0 1 1 1 1 2 2 2 3 3 187 | GPU:1 : 0 0 1 1 2 2 3 3 3 188 | GPU:2 : 0 0 0 1 1 1 2 3 3 3 3 189 | GPU:3 : 0 0 0 0 1 2 2 2 2 3 190 | 191 | 192 | */ 193 | void *threadAlltoAllSplit(void *arg) 194 | { 195 | int size = countTotal; 196 | int gpu_id = *(int *)arg; 197 | cudaSetDevice(gpu_id); 198 | 199 | ncclComm_t comm; 200 | NCCLCHECK(ncclCommInitRank(&comm, my_nranks, id, gpu_id)); 201 | 202 | float *sendbuff; 203 | float 
*recvbuff; 204 | float *hostData; 205 | cudaStream_t s; 206 | 207 | hostData = (float *)malloc(size * sizeof(float)); 208 | 209 | for (int i = 0; i < size; ++i) { 210 | hostData[i] = float(gpu_id); 211 | } 212 | int sendDataNum = 0; 213 | int recvDataNum = 0; 214 | for (int i = 0; i < 4; ++i) { 215 | sendDataNum += sendArray[gpu_id][i]; 216 | recvDataNum += recvArray[gpu_id][i]; 217 | } 218 | 219 | dataPrint(hostData, sendDataNum, gpu_id, my_nranks, "input"); 220 | 221 | CUDACHECK(cudaMalloc(&sendbuff, size * sizeof(float))); 222 | CUDACHECK(cudaMalloc(&recvbuff, size * sizeof(float))); 223 | cudaMemcpy(sendbuff, hostData, size * sizeof(float), cudaMemcpyHostToDevice); 224 | CUDACHECK(cudaStreamCreate(&s)); 225 | 226 | NCCLCHECK(AlltoAllSplit((const void *)sendbuff, (void *)recvbuff, sendArray[gpu_id], recvArray[gpu_id], ncclFloat, 227 | comm, s)); 228 | // completing NCCL operation by synchronizing on the CUDA stream 229 | CUDACHECK(cudaStreamSynchronize(s)); 230 | cudaMemcpy(hostData, recvbuff, size * sizeof(float), cudaMemcpyDeviceToHost); 231 | dataPrint(hostData, recvDataNum, gpu_id, my_nranks, "output"); 232 | ncclCommDestroy(comm); 233 | CUDACHECK(cudaFree(sendbuff)); 234 | CUDACHECK(cudaFree(recvbuff)); 235 | free(hostData); 236 | return NULL; 237 | } 238 | 239 | void runAlltoAllSplit() 240 | { 241 | pthread_t threads[4]; 242 | NCCLCHECK(ncclGetUniqueId(&id)); 243 | if (my_nranks < 4) { 244 | printf("AlltoAllSplit demo requires nranks>=4, but got %d.\n", my_nranks); 245 | exit(-1); 246 | } 247 | // only support 4 ranks demo. 248 | my_nranks = 4; 249 | printf("====== AlltoAllSplit case begin =====\n"); 250 | for (int i = 0; i < my_nranks; ++i) { 251 | int *id_pointer = &gpu_ids[i]; 252 | pthread_create(&threads[i], NULL, threadAlltoAllSplit, id_pointer); 253 | } 254 | for (int i = 0; i < my_nranks; ++i) { 255 | pthread_join(threads[i], NULL); 256 | } 257 | printf("====== AlltoAllSplit case end =====\n\n"); 258 | } 259 | 260 | int main(int argc, char *argv[]) 261 | { 262 | env_init(argc, argv); 263 | runAlltoAll(threadAlltoAll); 264 | runAlltoAll(threadAlltoAllIter); 265 | runAlltoAllSplit(); 266 | printf("Finished successfully.\n"); 267 | return 0; 268 | } 269 | -------------------------------------------------------------------------------- /nccl/comm.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | 14 | #include "cuda_runtime.h" 15 | #include "nccl.h" 16 | 17 | #define CUDACHECK(cmd) \ 18 | do { \ 19 | cudaError_t err = cmd; \ 20 | if (err != cudaSuccess) { \ 21 | printf("Failed: Cuda error %s:%d '%s'\n", __FILE__, __LINE__, cudaGetErrorString(err)); \ 22 | exit(EXIT_FAILURE); \ 23 | } \ 24 | } while (0) 25 | 26 | #define NCCLCHECK(cmd) \ 27 | do { \ 28 | ncclResult_t res = cmd; \ 29 | if (res != ncclSuccess) { \ 30 | printf("Failed, NCCL error %s:%d '%s'\n", __FILE__, __LINE__, ncclGetErrorString(res)); \ 31 | exit(EXIT_FAILURE); \ 32 | } \ 33 | } while (0) 34 | 35 | #define DEFAULT_DEVICES_NUM 8 36 | int gpu_ids[8] = {0, 1, 2, 3, 4, 5, 6, 7}; 37 | char *if_debug = nullptr; 38 | std::string server_hostname = "127.0.0.1"; 39 | int server_port = 8099; 40 | int my_nranks = 6; 41 | 42 | typedef void *(*ops)(void *); 43 | 44 | #define DEBUG_PRINT(info) \ 45 | if (if_debug && strcasecmp(if_debug, "0") != 0) { \ 46 | printf("DEUBG INFO: %s\n", info); \ 47 | } 48 | 49 | const std::string help_info = 
"Usage: --nranks The number of ranks/GPU\n\ 50 | --hostname Server IP address.\n\ 51 | --port To specify a port. Default: 8099 \n\ 52 | E.g. ./run --nranks 4 --port 8096\n"; 53 | 54 | void env_init(int argc, char* argv[]) 55 | { 56 | if_debug = getenv("DEBUG"); 57 | std::map options; 58 | const std::set allow_options{"--nranks", "--hostname", "--port"}; 59 | 60 | for (int i = 1; i < argc; ++i) { 61 | std::string arg = argv[i]; 62 | 63 | if (arg.substr(0, 2) == "--") { 64 | std::string value; 65 | if (i + 1 < argc && argv[i + 1][0] != '-') { 66 | value = argv[i + 1]; 67 | i++; 68 | } 69 | options[arg] = value; 70 | } else { 71 | std::cout << "Unknown option: " << arg << std::endl; 72 | std::cout << help_info << std::endl; 73 | exit(-1); 74 | } 75 | } 76 | 77 | for (const auto &opt : options) { 78 | if (allow_options.find(opt.first) == allow_options.end()) { 79 | std::cout << "Unknown option: " << opt.first << std::endl << help_info; 80 | exit(-1); 81 | } 82 | } 83 | 84 | if (options.find("--nranks") != options.end()) { 85 | std::cout << "Local rank size: " << options["--nranks"] << std::endl; 86 | my_nranks = std::stoi(options["--nranks"]); 87 | } 88 | if (options.find("--hostname") != options.end()) { 89 | std::cout << "The hostname: " << options["--hostname"] << std::endl; 90 | server_hostname = options["--hostname"]; 91 | } 92 | if (options.find("--port") != options.end()) { 93 | std::cout << "The hostport: " << options["--port"] << std::endl; 94 | server_port = std::stoi(options["--port"]); 95 | } 96 | } 97 | 98 | static size_t wordSize(ncclDataType_t type) { 99 | switch(type) { 100 | case ncclChar: 101 | #if NCCL_MAJOR >= 2 102 | //case ncclInt8: 103 | case ncclUint8: 104 | #endif 105 | return 1; 106 | case ncclHalf: 107 | #if defined(__CUDA_BF16_TYPES_EXIST__) 108 | case ncclBfloat16: 109 | #endif 110 | //case ncclFloat16: 111 | return 2; 112 | case ncclInt: 113 | case ncclFloat: 114 | #if NCCL_MAJOR >= 2 115 | //case ncclInt32: 116 | case ncclUint32: 117 | //case ncclFloat32: 118 | #endif 119 | return 4; 120 | case ncclInt64: 121 | case ncclUint64: 122 | case ncclDouble: 123 | //case ncclFloat64: 124 | return 8; 125 | default: return 0; 126 | } 127 | } 128 | -------------------------------------------------------------------------------- /nccl/multi_devices_per_thread.cu: -------------------------------------------------------------------------------- 1 | /* 2 | * Ref: https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/examples.html 3 | * Compile: nvcc -lnccl -ccbin g++ -std=c++11 -O3 -g multi_devices_per_thread.cu -o multi_devices_per_thread 4 | */ 5 | 6 | #include "comm.h" 7 | 8 | int main(int argc, char *argv[]) 9 | { 10 | ncclComm_t comms[DEFAULT_DEVICES_NUM]; 11 | int nDev = DEFAULT_DEVICES_NUM; 12 | int size = 1024 * 1024; 13 | 14 | // allocating and initializing device buffers 15 | float **sendbuff = (float **)malloc(nDev * sizeof(float *)); 16 | float **recvbuff = (float **)malloc(nDev * sizeof(float *)); 17 | cudaStream_t *s = (cudaStream_t *)malloc(sizeof(cudaStream_t) * nDev); 18 | 19 | for (int i = 0; i < nDev; ++i) { 20 | CUDACHECK(cudaSetDevice(i)); 21 | CUDACHECK(cudaMalloc((void **)sendbuff + i, size * sizeof(float))); 22 | CUDACHECK(cudaMalloc((void **)recvbuff + i, size * sizeof(float))); 23 | CUDACHECK(cudaMemset(sendbuff[i], 1, size * sizeof(float))); 24 | CUDACHECK(cudaMemset(recvbuff[i], 0, size * sizeof(float))); 25 | CUDACHECK(cudaStreamCreate(s + i)); 26 | } 27 | 28 | // initializing NCCL 29 | NCCLCHECK(ncclCommInitAll(comms, nDev, gpu_ids)); 30 | 31 | 
// calling NCCL communication API. Group API is required when using 32 | // multiple devices per thread 33 | NCCLCHECK(ncclGroupStart()); 34 | for (int i = 0; i < nDev; ++i) 35 | NCCLCHECK( 36 | ncclAllReduce((const void *)sendbuff[i], (void *)recvbuff[i], size, ncclFloat, ncclSum, comms[i], s[i])); 37 | NCCLCHECK(ncclGroupEnd()); 38 | 39 | // synchronizing on CUDA streams to wait for completion of NCCL operation 40 | for (int i = 0; i < nDev; ++i) { 41 | CUDACHECK(cudaSetDevice(i)); 42 | CUDACHECK(cudaStreamSynchronize(s[i])); 43 | } 44 | 45 | // free device buffers 46 | for (int i = 0; i < nDev; ++i) { 47 | CUDACHECK(cudaSetDevice(i)); 48 | CUDACHECK(cudaFree(sendbuff[i])); 49 | CUDACHECK(cudaFree(recvbuff[i])); 50 | } 51 | 52 | // finalizing NCCL 53 | for (int i = 0; i < nDev; ++i) 54 | ncclCommDestroy(comms[i]); 55 | 56 | printf("Success \n"); 57 | return 0; 58 | } -------------------------------------------------------------------------------- /nccl/nccl_with_mpi.cu: -------------------------------------------------------------------------------- 1 | /* 2 | * Use MPI to connect nccl. 3 | * Source code: https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/examples.html 4 | * Compile: nvcc -lmpi -lnccl -I/usr/local/mpi/include/ -L/usr/local/mpi/lib/ -ccbin g++ -std=c++11 -O3 -g nccl_with_mpi.cu -o test 5 | * Test: mpirun -n 6 --allow-run-as-root ./nccl_with_mpi 6 | */ 7 | 8 | #include "comm.h" 9 | #include "mpi.h" 10 | 11 | #define MPICHECK(cmd) \ 12 | do { \ 13 | int e = cmd; \ 14 | if (e != MPI_SUCCESS) { \ 15 | printf("Failed: MPI error %s:%d '%d'\n", __FILE__, __LINE__, e); \ 16 | exit(EXIT_FAILURE); \ 17 | } \ 18 | } while (0) 19 | 20 | static uint64_t getHostHash(const char *string) 21 | { 22 | // Based on DJB2a, result = result * 33 ^ char 23 | uint64_t result = 5381; 24 | for (int c = 0; string[c] != '\0'; c++) { 25 | result = ((result << 5) + result) ^ string[c]; 26 | } 27 | return result; 28 | } 29 | 30 | static void getHostName(char *hostname, int maxlen) 31 | { 32 | gethostname(hostname, maxlen); 33 | for (int i = 0; i < maxlen; i++) { 34 | if (hostname[i] == '.') { 35 | hostname[i] = '\0'; 36 | return; 37 | } 38 | } 39 | } 40 | 41 | int main(int argc, char *argv[]) 42 | { 43 | int size = 32 * 1024 * 1024; 44 | 45 | int myRank, nRanks, localRank = 0; 46 | 47 | // initializing MPI 48 | MPICHECK(MPI_Init(&argc, &argv)); 49 | MPICHECK(MPI_Comm_rank(MPI_COMM_WORLD, &myRank)); 50 | MPICHECK(MPI_Comm_size(MPI_COMM_WORLD, &nRanks)); 51 | 52 | // calculating localRank based on hostname which is used in selecting a GPU 53 | uint64_t hostHashs[nRanks]; 54 | char hostname[1024]; 55 | getHostName(hostname, 1024); 56 | hostHashs[myRank] = getHostHash(hostname); 57 | MPICHECK(MPI_Allgather(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL, hostHashs, sizeof(uint64_t), MPI_BYTE, MPI_COMM_WORLD)); 58 | for (int p = 0; p < nRanks; p++) { 59 | if (p == myRank) 60 | break; 61 | if (hostHashs[p] == hostHashs[myRank]) 62 | localRank++; 63 | } 64 | 65 | ncclUniqueId id; 66 | ncclComm_t comm; 67 | float *sendbuff, *recvbuff; 68 | cudaStream_t s; 69 | 70 | // get NCCL unique ID at rank 0 and broadcast it to all others 71 | if (myRank == 0) 72 | ncclGetUniqueId(&id); 73 | MPICHECK(MPI_Bcast((void *)&id, sizeof(id), MPI_BYTE, 0, MPI_COMM_WORLD)); 74 | 75 | // picking a GPU based on localRank, allocate device buffers 76 | CUDACHECK(cudaSetDevice(localRank)); 77 | CUDACHECK(cudaMalloc(&sendbuff, size * sizeof(float))); 78 | CUDACHECK(cudaMalloc(&recvbuff, size * sizeof(float))); 79 | 
CUDACHECK(cudaStreamCreate(&s)); 80 | 81 | // initializing NCCL 82 | NCCLCHECK(ncclCommInitRank(&comm, nRanks, id, myRank)); 83 | 84 | // communicating using NCCL 85 | NCCLCHECK(ncclAllReduce((const void *)sendbuff, (void *)recvbuff, size, ncclFloat, ncclSum, comm, s)); 86 | 87 | // completing NCCL operation by synchronizing on the CUDA stream 88 | CUDACHECK(cudaStreamSynchronize(s)); 89 | 90 | // free device buffers 91 | CUDACHECK(cudaFree(sendbuff)); 92 | CUDACHECK(cudaFree(recvbuff)); 93 | 94 | // finalizing NCCL 95 | ncclCommDestroy(comm); 96 | 97 | // finalizing MPI 98 | MPICHECK(MPI_Finalize()); 99 | 100 | printf("[MPI Rank %d] Success \n", myRank); 101 | return 0; 102 | } -------------------------------------------------------------------------------- /nccl/node_client.cu: -------------------------------------------------------------------------------- 1 | /* 2 | * Implement two nodes communication via socket init. 3 | * Compile: nvcc -lnccl -ccbin g++ -std=c++11 -O3 -g node_client.cu -o node_client 4 | * Author: kevin.xie 5 | * Email: kaiyuanxie@yeah.net 6 | */ 7 | 8 | #include 9 | #include 10 | #include 11 | 12 | #include "comm.h" 13 | 14 | ncclUniqueId id; 15 | 16 | void *thread_function(void *arg) 17 | { 18 | int size = 32 * 1024; 19 | int gpu_id = *(int *)arg; 20 | cudaSetDevice(gpu_id); 21 | 22 | ncclComm_t comm; 23 | if (if_debug) 24 | std::cout << "Received from server: " << id.internal << std::endl; // debug 25 | 26 | NCCLCHECK(ncclCommInitRank(&comm, my_nranks*2, id, gpu_id + my_nranks)); 27 | 28 | float *sendbuff; 29 | float *recvbuff; 30 | float *hostData; 31 | cudaStream_t s; 32 | 33 | hostData = (float *)malloc(size * sizeof(float)); 34 | for (int i = 0; i < size; ++i) { 35 | hostData[i] = float(gpu_id); 36 | } 37 | 38 | CUDACHECK(cudaMalloc(&sendbuff, size * sizeof(float))); 39 | CUDACHECK(cudaMalloc(&recvbuff, size * sizeof(float))); 40 | cudaMemcpy(sendbuff, hostData, size * sizeof(float), cudaMemcpyHostToDevice); 41 | CUDACHECK(cudaStreamCreate(&s)); 42 | 43 | NCCLCHECK(ncclAllReduce((const void *)sendbuff, (void *)recvbuff, size, ncclFloat, ncclSum, comm, s)); 44 | DEBUG_PRINT("============ncclAllReduce == end=====.\n"); 45 | NCCLCHECK(ncclBroadcast((const void *)sendbuff, (void *)recvbuff, size, ncclFloat, 0, comm, s)); 46 | DEBUG_PRINT("============ncclBroadcast == end=====.\n"); 47 | 48 | // completing NCCL operation by synchronizing on the CUDA stream 49 | CUDACHECK(cudaStreamSynchronize(s)); 50 | cudaMemcpy(hostData, recvbuff, size * sizeof(float), cudaMemcpyDeviceToHost); 51 | printf("GPU:%d data: %f.\n", gpu_id, hostData[1]); 52 | 53 | ncclCommDestroy(comm); 54 | 55 | CUDACHECK(cudaFree(sendbuff)); 56 | CUDACHECK(cudaFree(recvbuff)); 57 | free(hostData); 58 | 59 | return NULL; 60 | } 61 | 62 | int main(int argc, char *argv[]) 63 | { 64 | env_init(argc, argv); 65 | int sock; 66 | struct sockaddr_in server_addr; 67 | const char *message = "Hello, server!"; 68 | 69 | sock = socket(AF_INET, SOCK_STREAM, 0); 70 | if (sock < 0) { 71 | std::cerr << "Cannot create socket" << std::endl; 72 | return 1; 73 | } 74 | 75 | server_addr.sin_family = AF_INET; 76 | server_addr.sin_port = htons(server_port); 77 | inet_pton(AF_INET, server_hostname.c_str(), &server_addr.sin_addr); 78 | 79 | if (connect(sock, (struct sockaddr *)&server_addr, sizeof(server_addr)) < 0) { 80 | std::cerr << "Cannot connect to the server" << std::endl; 81 | return 1; 82 | } 83 | 84 | std::cout << "Connected to the server" << std::endl; 85 | 86 | if (send(sock, message, strlen(message), 0) < 0) { 87 
| std::cerr << "Cannot send message" << std::endl; 88 | return 1; 89 | } 90 | 91 | ssize_t recv_size = recv(sock, id.internal, 128, 0); 92 | if (recv_size > 0 && if_debug) { 93 | std::cout << "Received from server: " << id.internal << std::endl; 94 | } 95 | close(sock); 96 | 97 | pthread_t threads[8]; 98 | for (int i = 0; i < my_nranks; ++i) { 99 | int *id_pointer = &gpu_ids[i]; 100 | pthread_create(&threads[i], NULL, thread_function, id_pointer); 101 | } 102 | 103 | for (int i = 0; i < my_nranks; ++i) { 104 | pthread_join(threads[i], NULL); 105 | } 106 | 107 | printf("All threads finished successfully.\n"); 108 | 109 | return 0; 110 | } -------------------------------------------------------------------------------- /nccl/node_server.cu: -------------------------------------------------------------------------------- 1 | 2 | /* 3 | * Implement two nodes communication via socket init. 4 | * Compile: nvcc -lnccl -ccbin g++ -std=c++11 -O3 -g node_server.cu -o node_server 5 | * Author: kevin.xie 6 | * Email: kaiyuanxie@yeah.net 7 | */ 8 | 9 | #include 10 | #include 11 | #include 12 | 13 | #include "comm.h" 14 | 15 | ncclUniqueId id; 16 | 17 | void *thread_function(void *arg) 18 | { 19 | int size = 32 * 1024; 20 | int gpu_id = *(int *)arg; 21 | cudaSetDevice(gpu_id); 22 | 23 | ncclComm_t comm; 24 | NCCLCHECK(ncclCommInitRank(&comm, my_nranks*2, id, gpu_id)); 25 | DEBUG_PRINT("============ncclCommInitRank: init end.=============\n"); // debug 26 | 27 | float *sendbuff; 28 | float *recvbuff; 29 | float *hostData; 30 | cudaStream_t s; 31 | 32 | hostData = (float *)malloc(size * sizeof(float)); 33 | for (int i = 0; i < size; ++i) { 34 | hostData[i] = float(gpu_id); 35 | } 36 | 37 | CUDACHECK(cudaMalloc(&sendbuff, size * sizeof(float))); 38 | CUDACHECK(cudaMalloc(&recvbuff, size * sizeof(float))); 39 | cudaMemcpy(sendbuff, hostData, size * sizeof(float), cudaMemcpyHostToDevice); 40 | CUDACHECK(cudaStreamCreate(&s)); 41 | 42 | NCCLCHECK(ncclAllReduce((const void *)sendbuff, (void *)recvbuff, size, ncclFloat, ncclSum, comm, s)); 43 | DEBUG_PRINT("============ncclAllReduce ===== end =====.\n"); 44 | 45 | NCCLCHECK(ncclBroadcast((const void *)recvbuff, (void *)recvbuff, size, ncclFloat, 0, comm, s)); 46 | DEBUG_PRINT("============ncclBroadcast ===== end =====.\n"); 47 | 48 | // completing NCCL operation by synchronizing on the CUDA stream 49 | CUDACHECK(cudaStreamSynchronize(s)); 50 | cudaMemcpy(hostData, recvbuff, size * sizeof(float), cudaMemcpyDeviceToHost); 51 | printf("GPU:%d data: %f.\n", gpu_id, hostData[1]); 52 | 53 | CUDACHECK(cudaStreamSynchronize(s)); 54 | ncclCommDestroy(comm); 55 | 56 | CUDACHECK(cudaFree(sendbuff)); 57 | CUDACHECK(cudaFree(recvbuff)); 58 | free(hostData); 59 | 60 | return NULL; 61 | } 62 | 63 | int main(int argc, char *argv[]) 64 | { 65 | env_init(argc, argv); 66 | int server_socket, client_socket; 67 | struct sockaddr_in server_addr, client_addr; 68 | socklen_t client_len = sizeof(client_addr); 69 | char buffer[1024]; 70 | 71 | server_socket = socket(AF_INET, SOCK_STREAM, 0); 72 | if (server_socket < 0) { 73 | std::cerr << "Cannot create socket" << std::endl; 74 | return 1; 75 | } 76 | 77 | int opt = 1; 78 | setsockopt(server_socket, SOL_SOCKET, SO_REUSEADDR, &opt, sizeof(opt)); 79 | 80 | server_addr.sin_family = AF_INET; 81 | server_addr.sin_addr.s_addr = INADDR_ANY; 82 | server_addr.sin_port = htons(server_port); 83 | 84 | if (bind(server_socket, (struct sockaddr *)&server_addr, sizeof(server_addr)) < 0) { 85 | std::cerr << "Cannot bind" << std::endl; 86 | return 1; 87 
| } 88 | 89 | if (listen(server_socket, 5) < 0) { 90 | std::cerr << "Cannot listen" << std::endl; 91 | return 1; 92 | } 93 | 94 | std::cout << "Server is listening on port " << server_port << std::endl; 95 | 96 | client_socket = accept(server_socket, (struct sockaddr *)&client_addr, &client_len); 97 | if (client_socket < 0) { 98 | std::cerr << "Cannot accept connection" << std::endl; 99 | return 1; 100 | } 101 | 102 | std::cout << "Accepted connection from " << inet_ntoa(client_addr.sin_addr) << std::endl; 103 | ssize_t recv_size = recv(client_socket, buffer, sizeof(buffer), 0); 104 | if (recv_size > 0) { 105 | buffer[recv_size] = '\0'; 106 | std::cout << "Received message: " << buffer << std::endl; 107 | } 108 | 109 | pthread_t threads[8]; 110 | 111 | NCCLCHECK(ncclGetUniqueId(&id)); 112 | if (if_debug) 113 | std::cout << "=================ncclGetUniqueId================" << buffer << std::endl; // debug 114 | 115 | if (send(client_socket, id.internal, 128, 0) < 0) { 116 | std::cerr << "Cannot send message to the client" << std::endl; 117 | } 118 | 119 | close(client_socket); 120 | close(server_socket); 121 | 122 | for (int i = 0; i < my_nranks; ++i) { 123 | int *id_pointer = &gpu_ids[i]; 124 | pthread_create(&threads[i], NULL, thread_function, id_pointer); 125 | } 126 | 127 | for (int i = 0; i < my_nranks; ++i) { 128 | pthread_join(threads[i], NULL); 129 | } 130 | 131 | printf("Server finished successfully.\n"); 132 | return 0; 133 | } -------------------------------------------------------------------------------- /nccl/nonblocking_double_streams.cu: -------------------------------------------------------------------------------- 1 | /* 2 | * Implement an non-blocking example of overlapping communication 3 | * Compile: nvcc -lnccl -ccbin g++ -std=c++11 -O3 -g nonblocking_double_streams.cu -o nonblocking_double_streams 4 | * Author: kevin.xie 5 | * Email: kaiyuanxie@yeah.net 6 | */ 7 | 8 | #include 9 | 10 | #include "comm.h" 11 | 12 | void *allReduceOps(void *args); 13 | 14 | struct ThreadArgs { 15 | int gpu_id; 16 | int global_size; 17 | ncclUniqueId *id; 18 | int uuid = -1; 19 | ThreadArgs(int gpu_id, int global_size, ncclUniqueId *id) 20 | : gpu_id(gpu_id) 21 | , id(id) 22 | , global_size(global_size) {}; 23 | ThreadArgs(int gpu_id, int global_size, ncclUniqueId *id, int uuid) 24 | : gpu_id(gpu_id) 25 | , id(id) 26 | , global_size(global_size) 27 | , uuid(uuid) {}; 28 | }; 29 | 30 | bool cmpID(ncclUniqueId *id1, ncclUniqueId *id2) 31 | { 32 | if (memcmp(id1->internal, id2->internal, sizeof(id2->internal)) == 0) { 33 | printf("id1:%p is same with id2:%p\n", id1, id2); 34 | return false; 35 | } else { 36 | for (int i = 0; i < 128; i++) { 37 | char id1_ch = (id1->internal)[i]; 38 | char id2_ch = (id2->internal)[i]; 39 | if (id1_ch != id2_ch) 40 | printf("Id diff internal idx_%d: id1:%c id2:%c\n", i, id1_ch, id2_ch); 41 | } 42 | return true; 43 | } 44 | } 45 | 46 | void printTimestamp(int gpu_id, int uuid, const char *s) 47 | { 48 | struct timeval now; 49 | struct tm timeinfo; 50 | if (gettimeofday(&now, NULL) == -1) { 51 | perror("gettimeofday"); 52 | } 53 | localtime_r(&(now.tv_sec), &timeinfo); 54 | char time_string[80]; 55 | strftime(time_string, sizeof(time_string), "%Y-%m-%d %H:%M:%S", &timeinfo); 56 | 57 | char time_string_with_ms[100]; 58 | snprintf(time_string_with_ms, sizeof(time_string_with_ms), "%s.%03ld", time_string, (long)now.tv_usec / 1000); 59 | printf("Group: %d GPU idx: %d. 
The %s time: %s\n", uuid, gpu_id, s, time_string_with_ms); 60 | } 61 | 62 | class CommExec { 63 | int gpu_nums; 64 | cudaStream_t s; 65 | pthread_t threads[8]; 66 | bool end_flag = true; 67 | 68 | public: 69 | ncclUniqueId *id_ref; 70 | CommExec(int gpu_nums) 71 | : gpu_nums(gpu_nums) 72 | { 73 | } 74 | 75 | void launch(ncclUniqueId &id, ops func) 76 | { 77 | NCCLCHECK(ncclGetUniqueId(&id)); 78 | id_ref = &id; 79 | for (int i = 0; i < gpu_nums; i++) { 80 | ThreadArgs *args = new ThreadArgs(i, gpu_nums, &id); 81 | pthread_create(&threads[i], NULL, func, (void *)args); 82 | } 83 | end_flag = false; 84 | } 85 | 86 | void launch(ncclUniqueId &id, int uuid, ops func) 87 | { 88 | NCCLCHECK(ncclGetUniqueId(&id)); 89 | id_ref = &id; 90 | for (int i = 0; i < gpu_nums; i++) { 91 | ThreadArgs *args = new ThreadArgs(i, gpu_nums, &id, uuid); 92 | pthread_create(&threads[i], NULL, func, (void *)args); 93 | } 94 | end_flag = false; 95 | } 96 | 97 | ncclUniqueId *get() 98 | { 99 | return id_ref; 100 | } 101 | 102 | void wait() 103 | { 104 | 105 | for (int i = 0; i < gpu_nums; ++i) { 106 | pthread_join(threads[i], NULL); 107 | } 108 | end_flag = true; 109 | } 110 | 111 | ~CommExec() 112 | { 113 | if (!end_flag) 114 | wait(); 115 | } 116 | }; 117 | 118 | void *allReduceOps(void *args) 119 | { 120 | size_t size = 2e9; 121 | ThreadArgs *threadArgs = (struct ThreadArgs *)args; 122 | int gpu_id = threadArgs->gpu_id; 123 | cudaSetDevice(gpu_id); 124 | ncclComm_t comm; 125 | ncclResult_t state; 126 | ncclConfig_t config = NCCL_CONFIG_INITIALIZER; 127 | config.blocking = 0; 128 | 129 | ncclCommInitRankConfig(&comm, threadArgs->global_size, *(threadArgs->id), gpu_id, &config); 130 | do { 131 | NCCLCHECK(ncclCommGetAsyncError(comm, &state)); 132 | } while (state == ncclInProgress); 133 | 134 | float *sendbuff; 135 | float *recvbuff; 136 | float *hostData; 137 | cudaStream_t s; 138 | 139 | hostData = (float *)malloc(size * sizeof(float)); 140 | for (int i = 0; i < 20; ++i) { 141 | hostData[i] = float(gpu_id); 142 | } 143 | 144 | CUDACHECK(cudaMalloc(&sendbuff, size * sizeof(float))); 145 | CUDACHECK(cudaMalloc(&recvbuff, size * sizeof(float))); 146 | cudaMemcpy(sendbuff, hostData, size * sizeof(float), cudaMemcpyHostToDevice); 147 | 148 | CUDACHECK(cudaStreamCreate(&s)); 149 | CUDACHECK(cudaDeviceSynchronize()); 150 | if (if_debug) 151 | printTimestamp(gpu_id, threadArgs->uuid, "start"); 152 | NCCLCHECK(ncclAllReduce((const void *)sendbuff, (void *)recvbuff, size, ncclFloat, ncclSum, comm, s)); 153 | // In non-blocking mode, the elapsed time has no reference. 
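    // Note (hedged, per the NCCL non-blocking docs): with config.blocking = 0 the ncclAllReduce()
    // above may return while the collective is still ncclInProgress, so a host-side timestamp taken
    // here only reflects the enqueue attempt, not the communication itself; strictly, completion
    // would be confirmed by polling ncclCommGetAsyncError() until it stops returning ncclInProgress
    // before relying on the cudaStreamSynchronize(s) further down.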
154 | if (if_debug) 155 | printTimestamp(gpu_id, threadArgs->uuid, "first iter end"); 156 | 157 | for (int i = 0; i < 50; ++i) 158 | NCCLCHECK(ncclAllReduce((const void *)sendbuff, (void *)recvbuff, size, ncclFloat, ncclSum, comm, s)); 159 | 160 | // completing NCCL operation by synchronizing on the CUDA stream 161 | CUDACHECK(cudaStreamSynchronize(s)); 162 | if (if_debug) 163 | printTimestamp(gpu_id, threadArgs->uuid, "end"); 164 | 165 | cudaMemcpy(hostData, recvbuff, size * sizeof(float), cudaMemcpyDeviceToHost); 166 | printf("GPU:%d data: %f.\n", gpu_id, hostData[1]); 167 | 168 | ncclCommDestroy(comm); 169 | CUDACHECK(cudaFree(sendbuff)); 170 | CUDACHECK(cudaFree(recvbuff)); 171 | free(hostData); 172 | return NULL; 173 | } 174 | 175 | int main(int argc, char *argv[]) 176 | { 177 | env_init(argc, argv); 178 | ncclUniqueId id1; 179 | ncclUniqueId id2; 180 | CommExec commexec1(my_nranks); 181 | CommExec commexec2(my_nranks); 182 | commexec1.launch(id1, 1, allReduceOps); 183 | commexec2.launch(id2, 2, allReduceOps); 184 | if (if_debug) 185 | cmpID(commexec1.get(), commexec2.get()); 186 | commexec2.wait(); 187 | commexec1.wait(); 188 | printf("All streams finished successfully.\n"); 189 | return 0; 190 | } 191 | -------------------------------------------------------------------------------- /nccl/one_device_per_thread.cu: -------------------------------------------------------------------------------- 1 | /* 2 | * Ref: https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/examples.html 3 | * Compile: nvcc -lnccl -ccbin g++ -std=c++11 -O3 -g one_devices_per_thread.cu.cu -o one_devices_per_thread 4 | */ 5 | 6 | #include "comm.h" 7 | 8 | ncclUniqueId id; 9 | 10 | void *thread_function(void *arg) 11 | { 12 | int size = 32 * 1024 * 1024; 13 | int gpu_id = *(int *)arg; 14 | cudaSetDevice(gpu_id); 15 | 16 | ncclComm_t comm; 17 | NCCLCHECK(ncclCommInitRank(&comm, my_nranks, id, gpu_id)); 18 | 19 | float *sendbuff; 20 | float *recvbuff; 21 | float *hostData; 22 | cudaStream_t s; 23 | 24 | hostData = (float *)malloc(size * sizeof(float)); 25 | for (int i = 0; i < size; ++i) { 26 | hostData[i] = float(gpu_id); 27 | } 28 | 29 | CUDACHECK(cudaMalloc(&sendbuff, size * sizeof(float))); 30 | CUDACHECK(cudaMalloc(&recvbuff, size * sizeof(float))); 31 | cudaMemcpy(sendbuff, hostData, size * sizeof(float), cudaMemcpyHostToDevice); 32 | CUDACHECK(cudaStreamCreate(&s)); 33 | 34 | NCCLCHECK(ncclAllReduce((const void *)sendbuff, (void *)recvbuff, size, ncclFloat, ncclSum, comm, s)); 35 | cudaMemcpy(hostData, recvbuff, size * sizeof(float), cudaMemcpyDeviceToHost); 36 | printf("GPU:%d data: %f.\n", gpu_id, hostData[1]); 37 | 38 | // completing NCCL operation by synchronizing on the CUDA stream 39 | CUDACHECK(cudaStreamSynchronize(s)); 40 | ncclCommDestroy(comm); 41 | 42 | CUDACHECK(cudaFree(sendbuff)); 43 | CUDACHECK(cudaFree(recvbuff)); 44 | free(hostData); 45 | 46 | return NULL; 47 | } 48 | 49 | int main(int argc, char *argv[]) 50 | { 51 | env_init(argc, argv); 52 | pthread_t threads[8]; 53 | NCCLCHECK(ncclGetUniqueId(&id)); 54 | for (int i = 0; i < my_nranks; ++i) { 55 | int *id_pointer = &gpu_ids[i]; 56 | pthread_create(&threads[i], NULL, thread_function, id_pointer); 57 | } 58 | 59 | for (int i = 0; i < my_nranks; ++i) { 60 | pthread_join(threads[i], NULL); 61 | } 62 | 63 | printf("Finished successfully.\n"); 64 | return 0; 65 | } -------------------------------------------------------------------------------- /pytorch/torch1.13_mem_rationale/README.md: 
-------------------------------------------------------------------------------- 1 | # PyTorch Memory Cuda Allocator Test 2 | *Objective of this Submodule*: Compiling and executing the cudaCachingAllocator derived from the PyTorch source code presents a noteworthy challenge, especially when attempting to test individual segments of the c10 cuda components. 3 | The primary aim of this submodule is to distill and streamline the source code to ensure it can be effortlessly executed within various testing frameworks. 4 | 5 | ## Compile & Run 6 | 7 | Compile: 8 | ``` 9 | make 10 | ``` 11 | You will get an "allocator_test" executable; then run it: 12 | 13 | ``` 14 | ./allocator_test 15 | ``` 16 | 17 | To add a GPU architecture, change SMS in the Makefile (L255): 18 | 19 | ```shell 20 | SMS ?= 35 37 50 52 60 61 70 75 80 86 90 21 | ``` 22 | e.g. SMS=80 supports A100/A800 23 | 24 | SMS=90 supports H100/H800 25 | 26 | 27 | Note: Your CUDA nvcc version might report an "unsupported gpu architecture 'compute_35'" error. If so, delete 35 from SMS. 28 | -------------------------------------------------------------------------------- /pytorch/torch1.13_mem_rationale/TestAllocator.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * The cuda caching allocator tests 3 | * Author: kevin.xie 4 | * Email: kaiyuanxie@yeah.net 5 | * */ 6 | 7 | #include "CUDACachingAllocator.h" 8 | 9 | int main() 10 | { 11 | testDeviceCachingAllocator(); 12 | testDeviceCachingAllocatorE2E(); 13 | testDeviceCachingAllocatorSmallManagement(); 14 | testDeviceCachingAllocatorFragment(); 15 | return 0; 16 | } 17 | -------------------------------------------------------------------------------- /pytorch/torch_ext/README.md: -------------------------------------------------------------------------------- 1 | # PyTorch Extension Custom C++/CUDA 2 | 3 | This module helps you learn how to bind or replace a C++/CUDA function in PyTorch Python code. 4 | 5 | ## Case 1: easy_jit 6 | 7 | Requirements: 8 | 1. PyTorch (> 1.8 is better) 9 | 2. Ninja 10 | 3. CUDA 11 | 12 | 13 | Run: 14 | ``` 15 | $ cd easy_jit 16 | $ python run.py 17 | ``` 18 | 19 | Common issues: 20 | 1. RuntimeError: Ninja is required to load C++ extensions 21 | 22 | Solution: pip install Ninja 23 | 24 | This example implements a custom C++ function that prints a tensor array. 25 | The code is fewer than 20 lines; it is the first step to help you 26 | understand the process. The key elements are as follows: 27 | 28 | 1. pybind11: binds custom code to Python. 29 | 2. #include <torch/extension.h>: includes PyTorch-defined funcs/params/kernels, e.g. torch::Tensor 30 | 3. from torch.utils.cpp_extension import load: calls the Ninja JIT to compile the code and import it into Python. 31 | 32 | ## Case 2: easy_setup 33 | 34 | Run: 35 | ``` 36 | $ cd easy_setup 37 | $ python setup.py install 38 | $ python run.py 39 | ``` 40 | The setup method does not recompile the code every time; it installs the extension as a 41 | python module. 42 | 43 | ## Case 3: sum_array 44 | 45 | Run: 46 | ``` 47 | $ cd sum_array 48 | $ python run.py 49 | ``` 50 | 51 | This example shows how to use a CUDA kernel to implement a custom sum of a tensor array. 52 | You might find the custom one runs faster than torch.sum(). 53 | 54 | The result looks like: 55 | 56 | ``` 57 | ... 58 | Loading extension module sum_array... 
59 | tensor(24969.7930, device='cuda:0') 60 | tensor(24969.7930, device='cuda:0') 61 | The torch original sum func test: 62 | Elapsed time: 0.07710027694702148 63 | The custom define sum func test: 64 | Elapsed time: 0.06388998031616211 65 | ``` 66 | 67 | ## Case 4: lltm_demo 68 | 69 | Run the custom lltm with JIT: 70 | ``` 71 | $ cd lltm_demo 72 | $ python run_custom_lltm.py 73 | ``` 74 | 75 | Run the PyTorch API baseline: 76 | ``` 77 | $ python run_baseline.py 78 | ``` 79 | Result e.g.: 80 | ``` 81 | ... 82 | Custom lltm_cuda result: 83 | Forward: min:0.130 ms avg:0.134 ms | Backward min: 0.240 ms avg: 0.246 ms 84 | ... 85 | PyTorch baseline result: 86 | Forward: min:0.121 ms avg:0.142 ms | Backward min: 0.427 ms avg: 0.483 ms 87 | ``` 88 | 89 | ### Chinese Doc 90 | [PyTorch Custom CUDA/C++](https://zhuanlan.zhihu.com/p/579395211) 91 | -------------------------------------------------------------------------------- /pytorch/torch_ext/binding_examples/README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | ## Step One (Basic examples) 4 | 5 | We connect C++ code to Python through a library: pybind11. Before you start, make sure it is installed correctly. 6 | 7 | The simplest way: 8 | ```shell 9 | pip install pybind11 10 | ``` 11 | ### Case 1: How to call a function. 12 | Related files in the "basics" folder: 13 | * "functions.cc": your C++ function implementation. 14 | * "function_call.py": a demo that calls the C++ functions. 15 | 16 | pybind11 provides a way to connect C++ with Python: 17 | 18 | The binding operations are all in a header file; it does not rely on any other lib. 19 | The syntax is as follows: 20 | 21 | Include the header file and reference the namespace: 22 | ```c++ 23 | #include <pybind11/pybind11.h> 24 | namespace py = pybind11; 25 | ``` 26 | Create the glue func/cls: 27 | ```c++ 28 | PYBIND11_MODULE(functions, m) { 29 | m.doc() = "pybind11 example plugin"; // optional module docstring. Could be printed by python help(). 30 | m.def("add", &add, "A function that adds two numbers"); 31 | } 32 | ``` 33 | Parameter explanation: 34 | * PYBIND11_MODULE() macro: Creates functions and classes for Python to call. 35 | * functions: The module name created for importing in the Python env. 36 | * m: A variable of type py::module_ which is the main interface for creating bindings. 37 | * .doc : Defines the module docstring. 38 | * .def : Defines a Python func for calling. 39 | * parameter-"add": The function name in Python; it can be changed to any other name. 40 | * parameter-&add: The C++ pointer to the target function. 41 | * parameter-"...": A description of this function. 42 | 43 | Running the snippet: 44 | 1. Use C++ to compile a Python lib: 45 | functions.so. 46 | ```shell 47 | # Compiler: g++/gcc. 48 | g++ -O3 -Wall -shared -std=c++11 -fPIC $(python3 -m pybind11 --includes) functions.cc -o functions.so 49 | ``` 50 | 51 | 2. Run "function_call.py"; you'll see the info below: 52 | 53 | ``` 54 | Add called, input numbers: i=3 j=4 55 | 7 56 | ``` 57 | 58 | **Note:** 59 | The pybind11 Python pkg provides the binding headers. You can see where they live with: 60 | ```shell 61 | echo $(python3 -m pybind11 --includes) 62 | ``` 63 | The console output looks like: 64 | ``` 65 | -I/home/kaiyuan/anaconda3/envs/py3.9/include/python3.9 -I/home/kaiyuan/anaconda3/envs/py3.9/lib/python3.9/site-packages/pybind11/include 66 | ``` 67 | 68 | ### Case 2: How to call objects. 69 | 70 | This example shows basic C++ class calling and how to deal with overloading and inheritance. 71 | 72 | Related files in the "basics" folder: 73 | * "classes.cc": your C++ classes implementation. 
74 | * "classes_call.py": a demo that calls the C++ objects. 75 | 76 | 1. Compile to get the .so: 77 | ```shell 78 | g++ -O3 -Wall -shared -std=c++11 -fPIC $(python3 -m pybind11 --includes) classes.cc -o classes.so 79 | ``` 80 | 81 | 2. Run "classes_call.py"; you'll see the info below: 82 | 83 | ``` 84 | The profile of this shape: 85 | Description:Basic example of c++ class 86 | Num:1 87 | Area:10.000000 88 | The profile of this shape: 89 | Description:Basic example of c++ class 90 | Num:2 91 | Area:20.000000 92 | 100.0 93 | ``` 94 | 95 | ## Advanced Practice 96 | 97 | 98 | ## Building 99 | 100 | Both Python setuptools and CMake are available; which one to use depends on your needs. 101 | 102 | There are two good examples on the pybind GitHub: 103 | 104 | * python example: https://github.com/pybind/python_example 105 | * cmake example: https://github.com/pybind/cmake_example 106 | 107 | -------------------------------------------------------------------------------- /pytorch/torch_ext/binding_examples/basics/classes.cc: -------------------------------------------------------------------------------- 1 | 2 | #include <pybind11/pybind11.h> 3 | #include <string> 4 | 5 | namespace py = pybind11; 6 | 7 | class Shape { 8 | int num; 9 | float area; 10 | 11 | public: 12 | std::string profile; 13 | Shape(int num, float area, const std::string& profile) : num(num), area(area), profile(profile) {} 14 | void setProperty(int num_) { num = num_; } 15 | void setProperty(float area_) { area = area_; } // function overload 16 | float getArea() const { return area; } 17 | std::string getProfile() const { return "Description:" + profile + "\nNum:" + std::to_string(num) + "\nArea:" + std::to_string(area); } 18 | }; 19 | 20 | class Rectangle : public Shape { 21 | int length; 22 | int width; 23 | public: 24 | Rectangle(int num, int length, int width, const std::string& profile) 25 | : Shape(num, length * width, profile), length(length), width(width) {} 26 | void resetSize(int length, int width) { 27 | this->length = length; 28 | this->width = width; 29 | } 30 | }; 31 | 32 | PYBIND11_MODULE(classes, m) { 33 | py::class_<Shape>(m, "Shape") 34 | .def(py::init<int, float, const std::string&>()) 35 | .def("setProperty", static_cast<void (Shape::*)(int)>(&Shape::setProperty), "Set the shape property of num.") 36 | .def("setProperty", static_cast<void (Shape::*)(float)>(&Shape::setProperty), "Set the shape property of area.") 37 | .def("__repr__", [](Shape& shape) { return "The profile of this shape:\n" + shape.getProfile(); }); 38 | py::class_<Rectangle, Shape>(m, "Rectangle") 39 | .def(py::init<int, int, int, const std::string&>()) 40 | .def("getArea", static_cast<float (Rectangle::*)() const>(&Rectangle::getArea), "Get the area.") 41 | .def("resetSize", static_cast<void (Rectangle::*)(int, int)>(&Rectangle::resetSize), "Reset size of rectangle.") 42 | .def_readwrite("profile", &Rectangle::profile); 43 | } 44 | -------------------------------------------------------------------------------- /pytorch/torch_ext/binding_examples/basics/classes_call.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | # The c++ .so pkg is needed before importing. 4 | import classes 5 | 6 | """ 7 | The snippet shows how to invoke a bound class in python, as below: 8 | """ 9 | 10 | # Create obj from c++ class. 11 | shape = classes.Shape(1, 10, "Basic example of c++ class") 12 | print(shape) 13 | 14 | # Change the area by invoking the "setProperty" func with a 'float' type input. 15 | shape.setProperty(20.0) 16 | 17 | # "setProperty" func has been overloaded, so it can also change the num. 
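# Note (added for clarity): pybind11 resolves overloads by trying the registered
# signatures in order, first without implicit argument conversions and then again
# allowing them, so the int argument below picks the int overload of setProperty.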
18 | shape.setProperty(2) 19 | 20 | # Then we check the info: 21 | print(shape) 22 | 23 | # It could raise error while call a function not defined in "PYBIND11_MODULE" 24 | # shape.getProfile() 25 | 26 | """ 27 | Inheritance example: 28 | """ 29 | 30 | rect = classes.Rectangle(1, 10, 10, "Inheritance example.") 31 | print(rect.getArea()) 32 | 33 | # Profile attribution could be read&write: 34 | rect.profile = "The profile description has been changed!" 35 | print(rect) 36 | -------------------------------------------------------------------------------- /pytorch/torch_ext/binding_examples/basics/function_call.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | import functions 4 | 5 | print(functions.add(3, 4)) 6 | 7 | # if you want to know info created by pybind using help(): 8 | # help(functions.add) 9 | -------------------------------------------------------------------------------- /pytorch/torch_ext/binding_examples/basics/functions.cc: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | int add(int i = 2, int j = 3) { 5 | std::cout << "Add called, input numbers: i=" << i << " j=" << j << std::endl; 6 | return i + j; 7 | } 8 | 9 | PYBIND11_MODULE(functions, m) { 10 | m.doc() = "pybind11 example plugin"; // optional module docstring. Could be printed by python help(). 11 | m.def("add", &add, "A function that adds two numbers"); 12 | } 13 | 14 | -------------------------------------------------------------------------------- /pytorch/torch_ext/binding_examples/bind_practices/classes_lib.cc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CalvinXKY/BasicCUDA/5bf47d64954274aa90bd0b9bb01c2de7462bf513/pytorch/torch_ext/binding_examples/bind_practices/classes_lib.cc -------------------------------------------------------------------------------- /pytorch/torch_ext/binding_examples/bind_practices/classes_lib.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CalvinXKY/BasicCUDA/5bf47d64954274aa90bd0b9bb01c2de7462bf513/pytorch/torch_ext/binding_examples/bind_practices/classes_lib.h -------------------------------------------------------------------------------- /pytorch/torch_ext/binding_examples/bind_practices/classes_lib_bind.cc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CalvinXKY/BasicCUDA/5bf47d64954274aa90bd0b9bb01c2de7462bf513/pytorch/torch_ext/binding_examples/bind_practices/classes_lib_bind.cc -------------------------------------------------------------------------------- /pytorch/torch_ext/binding_examples/bind_practices/classes_practice.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CalvinXKY/BasicCUDA/5bf47d64954274aa90bd0b9bb01c2de7462bf513/pytorch/torch_ext/binding_examples/bind_practices/classes_practice.py -------------------------------------------------------------------------------- /pytorch/torch_ext/binding_examples/bind_practices/functions_lib.cc: -------------------------------------------------------------------------------- 1 | 2 | #include 3 | #include 4 | 5 | namespace py = pybind11; 6 | 7 | /** 8 | * Function source code 9 | * 10 | * **/ 11 | 12 | int addTwoNum(int a, int b){ 13 | return a + b; 14 | } 15 | 16 | /** 17 | * overload functions 18 | * **/ 19 | 20 | void printInfo(int digis) 
{ 21 | std::cout << "Your input is integer:" << std::to_string(digis) << std::endl; 22 | } 23 | 24 | void printInfo(float digis) { 25 | std::cout << "Your input is string:" << std::to_string(digis) << std::endl; 26 | } 27 | 28 | /** 29 | * inplace case: 30 | **/ 31 | 32 | void inplaceAdd(int& src, int increment) { 33 | src += increment; 34 | } 35 | 36 | struct Data{ 37 | int num=0; 38 | }; 39 | 40 | void inplaceAddV2(Data& data, int increment) { 41 | data.num += increment; 42 | } 43 | 44 | void setDataPtr100(Data* data) { 45 | data->num = 100; 46 | } 47 | 48 | /** 49 | * global variable: 50 | **/ 51 | int worldCount = 9; 52 | 53 | /** 54 | * template function 55 | * **/ 56 | template 57 | T multiply(const T& a, const T& b) { 58 | return a * b; 59 | } 60 | 61 | /** 62 | * Allow/Prohibiting None arguments 63 | * **/ 64 | void showDataNum(Data* data) { 65 | if (data) { 66 | std::cout << "The data.num:" << data->num << std::endl; 67 | return; 68 | } 69 | std::cout << "No data input" << std::endl; 70 | } 71 | 72 | /** 73 | * recall function 74 | * **/ 75 | 76 | typedef int (*FUN)(int); 77 | 78 | int addOne(int a){ 79 | a += 1; 80 | return a; 81 | } 82 | 83 | void recallFunc(FUN f) { 84 | int a = 10; 85 | a = f(a); 86 | } 87 | 88 | 89 | PYBIND11_MODULE(functions, m) { 90 | m.def("add_two_num", &addTwoNum, "Input int a and int b,return a + b"); 91 | m.def("add_two_num_with_default", &addTwoNum, "default a=1, b=2", py::arg("a")=1, py::arg("b")=2); 92 | m.def("printInfo", static_cast(&printInfo), "Overload examples", py::arg("digis")); 93 | m.def("inplace_add", &inplaceAdd, "Expect: input(&a, b), a += b, but it does not work."); 94 | m.def("inplace_add_use_struct", &inplaceAddV2, "data.num += b"); 95 | py::class_(m, "Data") 96 | .def(py::init<>()) 97 | .def_readwrite("num", &Data::num); 98 | m.def("set_data_ptr_100", &setDataPtr100, "data->num= 100"); 99 | m.attr("worldCount")=worldCount; 100 | m.def("multiply", &multiply); 101 | m.def("multiply", &multiply); 102 | m.def("multiply_float", &multiply, py::arg("a").noconvert(), py::arg("b").noconvert()); 103 | m.def("show_data_num", &showDataNum, py::arg("data").none(false)); 104 | m.def("show_data_num_allow_none", &showDataNum, py::arg("data").none(true)); 105 | } 106 | 107 | -------------------------------------------------------------------------------- /pytorch/torch_ext/binding_examples/bind_practices/functions_lib.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CalvinXKY/BasicCUDA/5bf47d64954274aa90bd0b9bb01c2de7462bf513/pytorch/torch_ext/binding_examples/bind_practices/functions_lib.h -------------------------------------------------------------------------------- /pytorch/torch_ext/binding_examples/bind_practices/functions_lib_bind.cc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CalvinXKY/BasicCUDA/5bf47d64954274aa90bd0b9bb01c2de7462bf513/pytorch/torch_ext/binding_examples/bind_practices/functions_lib_bind.cc -------------------------------------------------------------------------------- /pytorch/torch_ext/binding_examples/bind_practices/functions_practice.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | import functions 4 | 5 | def divide_print(info): 6 | print("\n", "-"*40, "\n", "-"*5, info, "\n", "-"*40) 7 | 8 | # Inference will not work 9 | divide_print(" Add case") 10 | print("4+3=", functions.add_two_num(4, 3)) 11 | print("With default args, the 
result is:", functions.add_two_num_with_default()) 12 | 13 | # Functions overload, the last one works: 14 | divide_print(" overlaod case") 15 | functions.printInfo(10) 16 | 17 | 18 | # Certain basic Python types (like str, int, bool, float, etc.) are immutable. See "Limitations involving reference arguments" 19 | # Inference will not work 20 | divide_print(" Inplace case") 21 | num = 10 22 | print("Before inplace opt, num: ", num) 23 | functions.inplace_add(num, 10) 24 | print("After inplace opt, num: ", num) 25 | 26 | # Sturct data type is OK while using inplace operation 27 | divide_print("Inplace case (corrected):") 28 | data = functions.Data() 29 | data.num = 10 30 | print("Before inplace opt, data.num: ", data.num) 31 | functions.inplace_add_use_struct(data, 4) 32 | print("After inplace opt, data.num: ",data.num) 33 | 34 | # data pointer 35 | divide_print("Function with struct ptr variable in c++ called in python") 36 | data.num = 0 37 | functions.set_data_ptr_100(data) 38 | print(data.num) 39 | 40 | # Call a variable: 41 | # Sturct data type is OK while using inplace operation 42 | divide_print("Global variable:") 43 | print("Print the variable:",functions.worldCount) 44 | functions.worldCount = 2 45 | print("Change the variable:",functions.worldCount) 46 | 47 | # Template: 48 | divide_print("Template: multiply(T, T)") 49 | print("int * int:", functions.multiply(2, 3)) 50 | print("float * float:", functions.multiply(2.0, 3.0)) 51 | 52 | # Explicit args, no convert: 53 | print("float * float (not allow convert):") 54 | 55 | try: 56 | functions.multiply_float(1,3) 57 | except TypeError as e: 58 | print("TypeError Case: \n", e) 59 | 60 | # Allow/Prohibiting None arguments 61 | divide_print("Allow/Prohibiting None arguments") 62 | functions.show_data_num(data) 63 | try: 64 | # Run with None, will raise an error: 65 | functions.show_data_num(None) 66 | except TypeError as e: 67 | print("TypeError Case: \n", e) 68 | functions.show_data_num_allow_none(None) # That's ok. 
69 | 70 | # Recall function 71 | 72 | -------------------------------------------------------------------------------- /pytorch/torch_ext/easy_jit/demo.cu: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | void printArray(torch::Tensor input) { 4 | int *ptr = (int *)input.data_ptr(); 5 | for(int i=0; i < input.numel(); i++) { 6 | printf("%d\n", ptr[i]); 7 | } 8 | } 9 | 10 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 11 | m.def("print_array", &printArray, ""); 12 | } -------------------------------------------------------------------------------- /pytorch/torch_ext/easy_jit/run.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.utils.cpp_extension import load 3 | 4 | ext_module = load(name="demo", sources=["demo.cu"], verbose=True) 5 | print("Module directory: ", ext_module.__file__) 6 | ext_module.print_array(torch.tensor([4, 3, 2, 1], dtype=torch.int)) 7 | -------------------------------------------------------------------------------- /pytorch/torch_ext/easy_load/run_inline_v1.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.utils.cpp_extension import load_inline 3 | 4 | cpp_src = """ 5 | #include 6 | 7 | void printArray(torch::Tensor input) { 8 | int *ptr = (int *)input.data_ptr(); 9 | for(int i=0; i < input.numel(); i++) { 10 | printf("%d\\n", ptr[i]); 11 | } 12 | } 13 | 14 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 15 | m.def("print_array", &printArray, ""); 16 | } 17 | """ 18 | 19 | ext_module = load_inline(name="print_array", cpp_sources=cpp_src, verbose=True) 20 | ext_module.print_array(torch.tensor([4, 3, 2, 1], dtype=torch.int)) -------------------------------------------------------------------------------- /pytorch/torch_ext/easy_load/run_inline_v2.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.utils.cpp_extension import load_inline 3 | 4 | cpp_src = """ 5 | #include 6 | 7 | void printArray(torch::Tensor input) { 8 | int *ptr = (int *)input.data_ptr(); 9 | for(int i=0; i < input.numel(); i++) { 10 | printf("%d\\n", ptr[i]); 11 | } 12 | } 13 | 14 | static auto registry = torch::RegisterOperators("new_ops::print_array", &printArray); 15 | """ 16 | 17 | load_inline(name="print_array", cpp_sources=cpp_src, is_python_module=False, verbose=True) 18 | torch.ops.new_ops.print_array(torch.tensor([4, 3, 2, 1], dtype=torch.int)) -------------------------------------------------------------------------------- /pytorch/torch_ext/easy_load/run_inline_v3.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.utils.cpp_extension import load_inline 3 | 4 | cpp_src = """ 5 | #include 6 | 7 | void printArray(torch::Tensor input) { 8 | int *ptr = (int *)input.data_ptr(); 9 | for(int i = 0; i < input.numel(); i++) { 10 | printf("%d\\n", ptr[i]); 11 | } 12 | } 13 | 14 | void printReverseArray(torch::Tensor input) { 15 | int *ptr = (int *)input.data_ptr(); 16 | for(int i = input.numel()-1; i >= 0; --i) { 17 | printf("%d\\n", ptr[i]); 18 | } 19 | } 20 | 21 | static auto registry = torch::RegisterOperators("new_ops::print_array", &printArray) 22 | .op("new_ops::print_reverse_array", &printReverseArray); 23 | """ 24 | 25 | load_inline(name="print_array", cpp_sources=cpp_src, is_python_module=False, verbose=True) 26 | torch.ops.new_ops.print_array(torch.tensor([4, 3, 2, 1], 
dtype=torch.int)) 27 | print("Reverse:") 28 | torch.ops.new_ops.print_reverse_array(torch.tensor([4, 3, 2, 1], dtype=torch.int)) 29 | -------------------------------------------------------------------------------- /pytorch/torch_ext/easy_setup/my_extension.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | void printArray(torch::Tensor input) { 4 | int *ptr = (int *)input.data_ptr(); 5 | for(int i=0; i < input.numel(); i++) { 6 | printf("%d\n", ptr[i]); 7 | } 8 | } 9 | 10 | PYBIND11_MODULE(my_extension, m) { 11 | m.def("print_array", &printArray, ""); 12 | } 13 | -------------------------------------------------------------------------------- /pytorch/torch_ext/easy_setup/run.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import my_extension 3 | 4 | 5 | my_extension.print_array(torch.tensor([4, 3, 2, 1], dtype=torch.int)) 6 | -------------------------------------------------------------------------------- /pytorch/torch_ext/easy_setup/setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | from torch.utils.cpp_extension import BuildExtension, CUDAExtension 3 | 4 | setup(name='my_extension', 5 | ext_modules=[CUDAExtension('my_extension', ['my_extension.cpp']),], 6 | cmdclass={'build_ext': BuildExtension}) 7 | -------------------------------------------------------------------------------- /pytorch/torch_ext/lltm_demo/lltm_cuda.cpp: -------------------------------------------------------------------------------- 1 | /* torch cuda custom example */ 2 | #include 3 | 4 | #include 5 | 6 | // CUDA forward declarations 7 | 8 | std::vector lltm_cuda_forward( 9 | torch::Tensor input, 10 | torch::Tensor weights, 11 | torch::Tensor bias, 12 | torch::Tensor old_h, 13 | torch::Tensor old_cell); 14 | 15 | std::vector lltm_cuda_backward( 16 | torch::Tensor grad_h, 17 | torch::Tensor grad_cell, 18 | torch::Tensor new_cell, 19 | torch::Tensor input_gate, 20 | torch::Tensor output_gate, 21 | torch::Tensor candidate_cell, 22 | torch::Tensor X, 23 | torch::Tensor gate_weights, 24 | torch::Tensor weights); 25 | 26 | // C++ interface 27 | 28 | // NOTE: AT_ASSERT has become AT_CHECK on master after 0.4. 
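// Note: on recent PyTorch releases these checks would typically be written with
// TORCH_CHECK(x.is_cuda(), #x " must be a CUDA tensor"), since x.type().is_cuda() is
// deprecated; the AT_ASSERTM form below follows the original extension tutorial code.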
29 | #define CHECK_CUDA(x) AT_ASSERTM(x.type().is_cuda(), #x " must be a CUDA tensor") 30 | #define CHECK_CONTIGUOUS(x) AT_ASSERTM(x.is_contiguous(), #x " must be contiguous") 31 | #define CHECK_INPUT(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x) 32 | 33 | std::vector lltm_forward( 34 | torch::Tensor input, 35 | torch::Tensor weights, 36 | torch::Tensor bias, 37 | torch::Tensor old_h, 38 | torch::Tensor old_cell) { 39 | CHECK_INPUT(input); 40 | CHECK_INPUT(weights); 41 | CHECK_INPUT(bias); 42 | CHECK_INPUT(old_h); 43 | CHECK_INPUT(old_cell); 44 | 45 | return lltm_cuda_forward(input, weights, bias, old_h, old_cell); 46 | } 47 | 48 | std::vector lltm_backward( 49 | torch::Tensor grad_h, 50 | torch::Tensor grad_cell, 51 | torch::Tensor new_cell, 52 | torch::Tensor input_gate, 53 | torch::Tensor output_gate, 54 | torch::Tensor candidate_cell, 55 | torch::Tensor X, 56 | torch::Tensor gate_weights, 57 | torch::Tensor weights) { 58 | CHECK_INPUT(grad_h); 59 | CHECK_INPUT(grad_cell); 60 | CHECK_INPUT(input_gate); 61 | CHECK_INPUT(output_gate); 62 | CHECK_INPUT(candidate_cell); 63 | CHECK_INPUT(X); 64 | CHECK_INPUT(gate_weights); 65 | CHECK_INPUT(weights); 66 | 67 | return lltm_cuda_backward( 68 | grad_h, 69 | grad_cell, 70 | new_cell, 71 | input_gate, 72 | output_gate, 73 | candidate_cell, 74 | X, 75 | gate_weights, 76 | weights); 77 | } 78 | 79 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 80 | m.def("forward", &lltm_forward, "LLTM forward (CUDA)"); 81 | m.def("backward", &lltm_backward, "LLTM backward (CUDA)"); 82 | } 83 | -------------------------------------------------------------------------------- /pytorch/torch_ext/lltm_demo/lltm_cuda_kernel.cu: -------------------------------------------------------------------------------- 1 | /* torch cuda custom example */ 2 | 3 | #include 4 | 5 | #include 6 | #include 7 | 8 | #include 9 | 10 | namespace { 11 | template 12 | __device__ __forceinline__ scalar_t sigmoid(scalar_t z) { 13 | return 1.0 / (1.0 + exp(-z)); 14 | } 15 | 16 | template 17 | __device__ __forceinline__ scalar_t d_sigmoid(scalar_t z) { 18 | const auto s = sigmoid(z); 19 | return (1.0 - s) * s; 20 | } 21 | 22 | template 23 | __device__ __forceinline__ scalar_t d_tanh(scalar_t z) { 24 | const auto t = tanh(z); 25 | return 1 - (t * t); 26 | } 27 | 28 | template 29 | __device__ __forceinline__ scalar_t elu(scalar_t z, scalar_t alpha = 1.0) { 30 | return fmaxf(0.0, z) + fminf(0.0, alpha * (exp(z) - 1.0)); 31 | } 32 | 33 | template 34 | __device__ __forceinline__ scalar_t d_elu(scalar_t z, scalar_t alpha = 1.0) { 35 | const auto e = exp(z); 36 | const auto d_relu = z < 0.0 ? 0.0 : 1.0; 37 | return d_relu + (((alpha * (e - 1.0)) < 0.0) ? 
(alpha * e) : 0.0); 38 | } 39 | 40 | template 41 | __global__ void lltm_cuda_forward_kernel( 42 | const torch::PackedTensorAccessor gates, 43 | const torch::PackedTensorAccessor old_cell, 44 | torch::PackedTensorAccessor new_h, 45 | torch::PackedTensorAccessor new_cell, 46 | torch::PackedTensorAccessor input_gate, 47 | torch::PackedTensorAccessor output_gate, 48 | torch::PackedTensorAccessor candidate_cell) { 49 | //batch index 50 | const int n = blockIdx.y; 51 | // column index 52 | const int c = blockIdx.x * blockDim.x + threadIdx.x; 53 | if (c < gates.size(2)){ 54 | input_gate[n][c] = sigmoid(gates[n][0][c]); 55 | output_gate[n][c] = sigmoid(gates[n][1][c]); 56 | candidate_cell[n][c] = elu(gates[n][2][c]); 57 | new_cell[n][c] = 58 | old_cell[n][c] + candidate_cell[n][c] * input_gate[n][c]; 59 | new_h[n][c] = tanh(new_cell[n][c]) * output_gate[n][c]; 60 | } 61 | } 62 | 63 | template 64 | __global__ void lltm_cuda_backward_kernel( 65 | torch::PackedTensorAccessor d_old_cell, 66 | torch::PackedTensorAccessor d_gates, 67 | const torch::PackedTensorAccessor grad_h, 68 | const torch::PackedTensorAccessor grad_cell, 69 | const torch::PackedTensorAccessor new_cell, 70 | const torch::PackedTensorAccessor input_gate, 71 | const torch::PackedTensorAccessor output_gate, 72 | const torch::PackedTensorAccessor candidate_cell, 73 | const torch::PackedTensorAccessor gate_weights) { 74 | //batch index 75 | const int n = blockIdx.y; 76 | // column index 77 | const int c = blockIdx.x * blockDim.x + threadIdx.x; 78 | if (c < d_gates.size(2)){ 79 | const auto d_output_gate = tanh(new_cell[n][c]) * grad_h[n][c]; 80 | const auto d_tanh_new_cell = output_gate[n][c] * grad_h[n][c]; 81 | const auto d_new_cell = 82 | d_tanh(new_cell[n][c]) * d_tanh_new_cell + grad_cell[n][c]; 83 | 84 | 85 | d_old_cell[n][c] = d_new_cell; 86 | const auto d_candidate_cell = input_gate[n][c] * d_new_cell; 87 | const auto d_input_gate = candidate_cell[n][c] * d_new_cell; 88 | 89 | d_gates[n][0][c] = 90 | d_input_gate * d_sigmoid(gate_weights[n][0][c]); 91 | d_gates[n][1][c] = 92 | d_output_gate * d_sigmoid(gate_weights[n][1][c]); 93 | d_gates[n][2][c] = 94 | d_candidate_cell * d_elu(gate_weights[n][2][c]); 95 | } 96 | } 97 | } // namespace 98 | 99 | std::vector lltm_cuda_forward( 100 | torch::Tensor input, 101 | torch::Tensor weights, 102 | torch::Tensor bias, 103 | torch::Tensor old_h, 104 | torch::Tensor old_cell) { 105 | auto X = torch::cat({old_h, input}, /*dim=*/1); 106 | auto gate_weights = torch::addmm(bias, X, weights.transpose(0, 1)); 107 | 108 | const auto batch_size = old_cell.size(0); 109 | const auto state_size = old_cell.size(1); 110 | 111 | auto gates = gate_weights.reshape({batch_size, 3, state_size}); 112 | auto new_h = torch::zeros_like(old_cell); 113 | auto new_cell = torch::zeros_like(old_cell); 114 | auto input_gate = torch::zeros_like(old_cell); 115 | auto output_gate = torch::zeros_like(old_cell); 116 | auto candidate_cell = torch::zeros_like(old_cell); 117 | 118 | const int threads = 1024; 119 | const dim3 blocks((state_size + threads - 1) / threads, batch_size); 120 | 121 | AT_DISPATCH_FLOATING_TYPES(gates.type(), "lltm_forward_cuda", ([&] { 122 | lltm_cuda_forward_kernel<<>>( 123 | gates.packed_accessor(), 124 | old_cell.packed_accessor(), 125 | new_h.packed_accessor(), 126 | new_cell.packed_accessor(), 127 | input_gate.packed_accessor(), 128 | output_gate.packed_accessor(), 129 | candidate_cell.packed_accessor()); 130 | })); 131 | 132 | return {new_h, new_cell, input_gate, output_gate, candidate_cell, X, 
gates}; 133 | } 134 | 135 | std::vector lltm_cuda_backward( 136 | torch::Tensor grad_h, 137 | torch::Tensor grad_cell, 138 | torch::Tensor new_cell, 139 | torch::Tensor input_gate, 140 | torch::Tensor output_gate, 141 | torch::Tensor candidate_cell, 142 | torch::Tensor X, 143 | torch::Tensor gates, 144 | torch::Tensor weights) { 145 | auto d_old_cell = torch::zeros_like(new_cell); 146 | auto d_gates = torch::zeros_like(gates); 147 | 148 | const auto batch_size = new_cell.size(0); 149 | const auto state_size = new_cell.size(1); 150 | 151 | const int threads = 1024; 152 | const dim3 blocks((state_size + threads - 1) / threads, batch_size); 153 | 154 | AT_DISPATCH_FLOATING_TYPES(X.type(), "lltm_forward_cuda", ([&] { 155 | lltm_cuda_backward_kernel<<>>( 156 | d_old_cell.packed_accessor(), 157 | d_gates.packed_accessor(), 158 | grad_h.packed_accessor(), 159 | grad_cell.packed_accessor(), 160 | new_cell.packed_accessor(), 161 | input_gate.packed_accessor(), 162 | output_gate.packed_accessor(), 163 | candidate_cell.packed_accessor(), 164 | gates.packed_accessor()); 165 | })); 166 | 167 | auto d_gate_weights = d_gates.flatten(1, 2); 168 | auto d_weights = d_gate_weights.t().mm(X); 169 | auto d_bias = d_gate_weights.sum(/*dim=*/0, /*keepdim=*/true); 170 | 171 | auto d_X = d_gate_weights.mm(weights); 172 | auto d_old_h = d_X.slice(/*dim=*/1, 0, state_size); 173 | auto d_input = d_X.slice(/*dim=*/1, state_size); 174 | 175 | return {d_old_h, d_input, d_weights, d_bias, d_old_cell, d_gates}; 176 | } -------------------------------------------------------------------------------- /pytorch/torch_ext/lltm_demo/run_baseline.py: -------------------------------------------------------------------------------- 1 | # torch cuda custom example # 2 | 3 | import math 4 | import time 5 | 6 | from torch import nn 7 | from torch.autograd import Function 8 | import torch 9 | import torch.nn.functional as F 10 | 11 | 12 | def d_sigmoid(z): 13 | s = torch.sigmoid(z) 14 | return (1 - s) * s 15 | 16 | 17 | def d_tanh(z): 18 | t = torch.tanh(z) 19 | return 1 - (t * t) 20 | 21 | 22 | def d_elu(z, alpha=1.0): 23 | e = z.exp() 24 | mask = (alpha * (e - 1)) < 0 25 | return (z > 0).type_as(z) + mask.type_as(z) * (alpha * e) 26 | 27 | 28 | class LLTMFunction(Function): 29 | @staticmethod 30 | def forward(ctx, input, weights, bias, old_h, old_cell): 31 | X = torch.cat([old_h, input], dim=1) 32 | 33 | gate_weights = F.linear(X, weights, bias) 34 | gates = gate_weights.chunk(3, dim=1) 35 | 36 | input_gate = torch.sigmoid(gates[0]) 37 | output_gate = torch.sigmoid(gates[1]) 38 | candidate_cell = F.elu(gates[2]) 39 | 40 | new_cell = old_cell + candidate_cell * input_gate 41 | new_h = torch.tanh(new_cell) * output_gate 42 | 43 | ctx.save_for_backward(X, weights, input_gate, output_gate, old_cell, 44 | new_cell, candidate_cell, gate_weights) 45 | 46 | return new_h, new_cell 47 | 48 | @staticmethod 49 | def backward(ctx, grad_h, grad_cell): 50 | X, weights, input_gate, output_gate, old_cell = ctx.saved_variables[:5] 51 | new_cell, candidate_cell, gate_weights = ctx.saved_variables[5:] 52 | 53 | d_input = d_weights = d_bias = d_old_h = d_old_cell = None 54 | 55 | d_output_gate = torch.tanh(new_cell) * grad_h 56 | d_tanh_new_cell = output_gate * grad_h 57 | d_new_cell = d_tanh(new_cell) * d_tanh_new_cell + grad_cell 58 | 59 | d_old_cell = d_new_cell 60 | d_candidate_cell = input_gate * d_new_cell 61 | d_input_gate = candidate_cell * d_new_cell 62 | 63 | gates = gate_weights.chunk(3, dim=1) 64 | d_input_gate *= d_sigmoid(gates[0]) 65 | 
d_output_gate *= d_sigmoid(gates[1]) 66 | d_candidate_cell *= d_elu(gates[2]) 67 | 68 | d_gates = torch.cat( 69 | [d_input_gate, d_output_gate, d_candidate_cell], dim=1) 70 | 71 | if ctx.needs_input_grad[1]: 72 | d_weights = d_gates.t().mm(X) 73 | if ctx.needs_input_grad[2]: 74 | d_bias = d_gates.sum(dim=0, keepdim=True) 75 | if ctx.needs_input_grad[3] or ctx.needs_input_grad[4]: 76 | d_X = d_gates.mm(weights) 77 | state_size = grad_h.shape[1] 78 | d_old_h, d_input = d_X[:, :state_size], d_X[:, state_size:] 79 | 80 | return d_input, d_weights, d_bias, d_old_h, d_old_cell 81 | 82 | 83 | class LLTM(nn.Module): 84 | def __init__(self, input_features, state_size): 85 | super(LLTM, self).__init__() 86 | self.input_features = input_features 87 | self.state_size = state_size 88 | self.weights = nn.Parameter( 89 | torch.Tensor(3 * state_size, input_features + state_size)) 90 | self.bias = nn.Parameter(torch.Tensor(1, 3 * state_size)) 91 | self.reset_parameters() 92 | 93 | def reset_parameters(self): 94 | stdv = 1.0 / math.sqrt(self.state_size) 95 | for weight in self.parameters(): 96 | weight.data.uniform_(-stdv, +stdv) 97 | 98 | def forward(self, input, state): 99 | return LLTMFunction.apply(input, self.weights, self.bias, *state) 100 | 101 | 102 | if __name__ == "__main__": 103 | torch.manual_seed(42) 104 | device = torch.device("cuda") 105 | dtype = torch.float32 106 | kwargs = {'dtype': dtype, 107 | 'device': device, 108 | 'requires_grad': True} 109 | batch_size = 32 110 | features = 32 111 | state_size = 256 112 | iter_nums = 100 113 | 114 | X = torch.randn(batch_size, features, **kwargs) 115 | h = torch.randn(batch_size, state_size, **kwargs) 116 | C = torch.randn(batch_size, state_size, **kwargs) 117 | rnn = LLTM(features, state_size).to(device, dtype) 118 | # Force CUDA initialization 119 | new_h, new_C = rnn(X, (h, C)) 120 | (new_h.sum() + new_C.sum()).backward() 121 | 122 | forward_min = math.inf 123 | forward_time = 0 124 | backward_min = math.inf 125 | backward_time = 0 126 | 127 | for _ in range(iter_nums): 128 | rnn.zero_grad() 129 | start = time.time() 130 | new_h, new_C = rnn(X, (h, C)) 131 | elapsed = time.time() - start 132 | forward_min = min(forward_min, elapsed) 133 | forward_time += elapsed 134 | 135 | start = time.time() 136 | (new_h.sum() + new_C.sum()).backward() 137 | elapsed = time.time() - start 138 | backward_min = min(backward_min, elapsed) 139 | backward_time += elapsed 140 | 141 | forward_min *= 1000 142 | backward_min *= 1000 143 | forward_average = forward_time / iter_nums * 1000 144 | backward_average = backward_time / iter_nums * 1000 145 | 146 | print("PyTorch baseline result:") 147 | print('Forward: min:{0:.3f} ms avg:{1:.3f} ms | Backward min: {2:.3f} ' 148 | 'ms avg: {3:.3f} ms'.format(forward_min, forward_average, 149 | backward_min, backward_average,)) 150 | -------------------------------------------------------------------------------- /pytorch/torch_ext/lltm_demo/run_custom_lltm.py: -------------------------------------------------------------------------------- 1 | # torch cuda custom example # 2 | 3 | import math 4 | from torch import nn 5 | from torch.autograd import Function 6 | import torch 7 | import time 8 | try: 9 | import lltm_cuda 10 | except ImportError as e: 11 | print("lltm_cuda.so is not found! 
Use JIT compiling....") 12 | from torch.utils.cpp_extension import load 13 | lltm_cuda = load( 14 | 'lltm_cuda', ['lltm_cuda.cpp', 'lltm_cuda_kernel.cu']) # verbose=True 15 | print("lltm_cuda dir:", lltm_cuda.__file__) 16 | 17 | 18 | class LLTMFunction(Function): 19 | @staticmethod 20 | def forward(ctx, input, weights, bias, old_h, old_cell): 21 | outputs = lltm_cuda.forward(input, weights, bias, old_h, old_cell) 22 | new_h, new_cell = outputs[:2] 23 | variables = outputs[1:] + [weights] 24 | ctx.save_for_backward(*variables) 25 | 26 | return new_h, new_cell 27 | 28 | @staticmethod 29 | def backward(ctx, grad_h, grad_cell): 30 | outputs = lltm_cuda.backward( 31 | grad_h.contiguous(), grad_cell.contiguous(), *ctx.saved_tensors) 32 | d_old_h, d_input, d_weights, d_bias, d_old_cell, d_gates = outputs 33 | return d_input, d_weights, d_bias, d_old_h, d_old_cell 34 | 35 | 36 | class LLTM(nn.Module): 37 | def __init__(self, input_features, state_size): 38 | super(LLTM, self).__init__() 39 | self.input_features = input_features 40 | self.state_size = state_size 41 | self.weights = nn.Parameter( 42 | torch.Tensor(3 * state_size, input_features + state_size)) 43 | self.bias = nn.Parameter(torch.Tensor(1, 3 * state_size)) 44 | self.reset_parameters() 45 | 46 | def reset_parameters(self): 47 | stdv = 1.0 / math.sqrt(self.state_size) 48 | for weight in self.parameters(): 49 | weight.data.uniform_(-stdv, +stdv) 50 | 51 | def forward(self, input, state): 52 | return LLTMFunction.apply(input, self.weights, self.bias, *state) 53 | 54 | 55 | if __name__ == "__main__": 56 | torch.manual_seed(42) 57 | device = torch.device("cuda") 58 | dtype = torch.float32 59 | kwargs = {'dtype': dtype, 60 | 'device': device, 61 | 'requires_grad': True} 62 | batch_size = 32 63 | features = 32 64 | state_size = 256 65 | iter_nums = 100 66 | 67 | X = torch.randn(batch_size, features, **kwargs) 68 | h = torch.randn(batch_size, state_size, **kwargs) 69 | C = torch.randn(batch_size, state_size, **kwargs) 70 | rnn = LLTM(features, state_size).to(device, dtype) 71 | # Force CUDA initialization 72 | new_h, new_C = rnn(X, (h, C)) 73 | (new_h.sum() + new_C.sum()).backward() 74 | 75 | forward_min = math.inf 76 | forward_time = 0 77 | backward_min = math.inf 78 | backward_time = 0 79 | 80 | for _ in range(iter_nums): 81 | rnn.zero_grad() 82 | start = time.time() 83 | new_h, new_C = rnn(X, (h, C)) 84 | elapsed = time.time() - start 85 | forward_min = min(forward_min, elapsed) 86 | forward_time += elapsed 87 | 88 | start = time.time() 89 | (new_h.sum() + new_C.sum()).backward() 90 | elapsed = time.time() - start 91 | backward_min = min(backward_min, elapsed) 92 | backward_time += elapsed 93 | 94 | forward_min *= 1000 95 | backward_min *= 1000 96 | forward_average = forward_time / iter_nums * 1000 97 | backward_average = backward_time / iter_nums * 1000 98 | 99 | print("Custom lltm_cuda result: ") 100 | print('Forward: min:{0:.3f} ms avg:{1:.3f} ms | Backward min: {2:.3f} ' 101 | 'ms avg: {3:.3f} ms'.format(forward_min, forward_average, 102 | backward_min, backward_average,)) 103 | -------------------------------------------------------------------------------- /pytorch/torch_ext/lltm_demo/setup.py: -------------------------------------------------------------------------------- 1 | # torch cuda custom example # 2 | 3 | from setuptools import setup 4 | from torch.utils.cpp_extension import BuildExtension, CUDAExtension 5 | 6 | setup( 7 | name='lltm_cuda', 8 | ext_modules=[ 9 | CUDAExtension('lltm_cuda', [ 10 | 'lltm_cuda.cpp', 11 | 
'lltm_cuda_kernel.cu', 12 | ]), 13 | ], 14 | cmdclass={ 15 | 'build_ext': BuildExtension 16 | }) -------------------------------------------------------------------------------- /pytorch/torch_ext/sum_array/glueCode.cpp: -------------------------------------------------------------------------------- 1 | #include "sumArray.h" 2 | #include 3 | 4 | 5 | torch::Tensor torchSumArray(torch::Tensor input) { 6 | int dataSize = input.numel(); 7 | float* devInData = (float *)input.data_ptr(); 8 | arraySumCUDA(devInData, dataSize); 9 | return input[0]; 10 | } 11 | 12 | PYBIND11_MODULE(sum_array, m) { 13 | m.def("sum_array", &torchSumArray, ""); 14 | } -------------------------------------------------------------------------------- /pytorch/torch_ext/sum_array/run.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import time 3 | from torch.utils.cpp_extension import load 4 | 5 | ext_module = load(name="sum_array", 6 | extra_include_paths=["./"] , 7 | sources=["sumArray.cu", "glueCode.cpp"], 8 | verbose=True) 9 | 10 | 11 | def iter_test(func): 12 | delta_t = 0 13 | for _ in range(10000): 14 | _tensor = torch.rand(50000, dtype=torch.float, device='cuda') 15 | t1 = time.time() 16 | func(_tensor) 17 | t2 = time.time() 18 | delta_t += t2-t1 19 | print(" Elapsed time:", delta_t) 20 | return delta_t 21 | 22 | 23 | if __name__ == "__main__": 24 | # warm up: 25 | in_tensor = torch.rand(50000, dtype=torch.float, device='cuda') 26 | print(torch.sum(in_tensor)) 27 | print(ext_module.sum_array(in_tensor.clone())) 28 | 29 | # time test: 30 | print("The torch original sum func test:") 31 | iter_test(torch.sum) 32 | print("The custom define sum func test:") 33 | iter_test(ext_module.sum_array) 34 | -------------------------------------------------------------------------------- /pytorch/torch_ext/sum_array/sumArray.cu: -------------------------------------------------------------------------------- 1 | /** 2 | * PyTorch extension cuda example: sum array. 3 | * Author: kevin.xie 4 | * Email: kaiyuanxie@yeah.net 5 | * */ 6 | #include 7 | #include "sumArray.h" 8 | 9 | 10 | __device__ int countSHM = 0; 11 | __global__ void arraySumWithSHMKernel(float *arrData, const int dataSize) 12 | { 13 | __shared__ float shm[THREAD_PER_BLOCK]; 14 | int thIdx = threadIdx.x + blockIdx.x * blockDim.x; 15 | if (thIdx == 0) { 16 | countSHM = 0; 17 | __threadfence(); 18 | } 19 | float val = 0.0; 20 | while (thIdx < dataSize) { 21 | val += arrData[thIdx]; 22 | thIdx += blockDim.x * gridDim.x; 23 | } 24 | shm[threadIdx.x] = val; 25 | __syncthreads(); 26 | 27 | for (int i = THREAD_PER_BLOCK / 2; i >= 1; i /= 2) { 28 | if (threadIdx.x < i) 29 | shm[threadIdx.x] += shm[threadIdx.x + i]; 30 | __syncthreads(); 31 | } 32 | 33 | __syncthreads(); 34 | bool isLast = false; 35 | thIdx = threadIdx.x + blockIdx.x * blockDim.x; 36 | if (threadIdx.x == 0) { 37 | arrData[blockIdx.x] = shm[0]; 38 | __threadfence(); 39 | int value = atomicAdd(&countSHM, 1); 40 | isLast = (value == gridDim.x - 1); 41 | } 42 | isLast = __syncthreads_or(isLast); 43 | if (isLast) { 44 | shm[threadIdx.x] = threadIdx.x < gridDim.x ? 
arrData[threadIdx.x] : 0; 45 | __syncthreads(); 46 | for (int i = THREAD_PER_BLOCK / 2; i >= 1; i /= 2) { 47 | if (threadIdx.x < i) 48 | shm[threadIdx.x] += shm[threadIdx.x + i]; 49 | __syncthreads(); 50 | } 51 | __syncthreads(); 52 | if (threadIdx.x == 0) 53 | arrData[0] = shm[0]; 54 | } 55 | __syncthreads(); 56 | } 57 | 58 | void arraySumCUDA(float *arrData, const int dataSize) { 59 | int grid = max(dataSize / THREAD_PER_BLOCK, 1); 60 | arraySumWithSHMKernel<<>>(arrData, dataSize); 61 | } 62 | 63 | -------------------------------------------------------------------------------- /pytorch/torch_ext/sum_array/sumArray.h: -------------------------------------------------------------------------------- 1 | /** 2 | * PyTorch extension cuda example: sum array. 3 | * Author: kevin.xie 4 | * Email: kaiyuanxie@yeah.net 5 | * */ 6 | 7 | #pragma once 8 | 9 | // CUDA runtime 10 | #include 11 | #define THREAD_PER_BLOCK 256 12 | 13 | 14 | void arraySumCUDA(float *, const int); -------------------------------------------------------------------------------- /pytorch/torch_mem_snapshot/README.md: -------------------------------------------------------------------------------- 1 | # PyTorch memory snapshot examples 2 | 3 | ## API ref: 4 | 5 | * https://pytorch.org/docs/main/torch_cuda_memory.html#understanding-cuda-memory-usage 6 | * https://pytorch.org/docs/main/profiler.html 7 | 8 | ## To visualize the graph 9 | 10 | drag the pickle file to: “https://pytorch.org/memory_viz” -------------------------------------------------------------------------------- /pytorch/torch_mem_snapshot/block_fragment.py: -------------------------------------------------------------------------------- 1 | # Author: kevin.xie zhihu@kaiyuan 2 | 3 | import torch 4 | from datetime import datetime 5 | 6 | 7 | def segment_example(device="cuda:0"): 8 | tensor1 = torch.randn(size=(10,1024, 1024, 512), device=device) 9 | tensor1.to("cpu") 10 | # free tensor1 ,the segment will be freed as well. 11 | del tensor1 12 | torch.cuda.empty_cache() 13 | # create a new segment and a new block for tensor2 14 | tensor2 = torch.rand(size=(1, 1024, 512), device=device) 15 | tensor3 = torch.rand(size=(12, 1024, 512), device=device) 16 | 17 | 18 | def run(): 19 | # Start recording memory snapshot history 20 | torch.cuda.memory._record_memory_history(max_entries=100000) 21 | 22 | # example running: 23 | segment_example() 24 | 25 | timestamp = datetime.now().strftime('%Y_%m_%d_%H_%M_%S') 26 | file_name = f"visual_mem_{timestamp}.pickle" 27 | # save record: 28 | torch.cuda.memory._dump_snapshot(file_name) 29 | 30 | # Stop recording memory snapshot history: 31 | torch.cuda.memory._record_memory_history(enabled=None) 32 | 33 | 34 | if __name__ == "__main__": 35 | run() 36 | -------------------------------------------------------------------------------- /pytorch/torch_mem_snapshot/segment.py: -------------------------------------------------------------------------------- 1 | # Author: kevin.xie zhihu@kaiyuan 2 | 3 | import torch 4 | from datetime import datetime 5 | 6 | 7 | def segment_example(device="cuda:0"): 8 | tensor1 = torch.randn(size=(10,1024, 1024, 512), device=device) 9 | tensor1.to("cpu") 10 | # free tensor1 ,the segment will be freed as well. 
11 | del tensor1 12 | torch.cuda.empty_cache() 13 | # create a new segment and a new block for tensor2 14 | tensor2 = torch.rand(size=(1, 1024, 512), device=device) 15 | 16 | tensor_group = [] 17 | for _ in range(10): 18 | tensor_group.append(torch.rand(size=(1024, 1024, 512), device=device)) 19 | 20 | 21 | def run(): 22 | # Start recording memory snapshot history 23 | torch.cuda.memory._record_memory_history(max_entries=100000) 24 | 25 | # example running: 26 | segment_example() 27 | 28 | timestamp = datetime.now().strftime('%Y_%m_%d_%H_%M_%S') 29 | file_name = f"visual_mem_{timestamp}.pickle" 30 | # save record: 31 | torch.cuda.memory._dump_snapshot(file_name) 32 | 33 | # Stop recording memory snapshot history: 34 | torch.cuda.memory._record_memory_history(enabled=None) 35 | 36 | 37 | if __name__ == "__main__": 38 | run() 39 | 40 | -------------------------------------------------------------------------------- /pytorch/torch_mem_snapshot/transformer_profile.py: -------------------------------------------------------------------------------- 1 | # Author: kevin.xie zhihu@kaiyuan 2 | 3 | import torch 4 | from torch import nn 5 | from datetime import datetime 6 | from torch.autograd.profiler import record_function 7 | 8 | 9 | def trace_handler(prof: torch.profiler.profile): 10 | # Prefix for file names. 11 | timestamp = datetime.now().strftime('%Y_%m_%d_%H_%M_%S') 12 | file_name = f"visual_mem_{timestamp}.pickle" 13 | 14 | # Construct the trace file. 15 | prof.export_chrome_trace(f"{file_name}.json.gz") 16 | 17 | # Construct the memory timeline file. 18 | prof.export_memory_timeline(f"{file_name}.html", device="cuda:0") 19 | 20 | 21 | def train(num_iter=5, device="cuda:0"): 22 | model = nn.Transformer(d_model=512, nhead=2, num_encoder_layers=2, num_decoder_layers=2).to(device=device) 23 | x = torch.randn(size=(1, 1024, 512), device=device) 24 | tgt = torch.rand(size=(1, 1024, 512), device=device) 25 | model.train() 26 | labels = torch.rand_like(model(x, tgt)) 27 | criterion = torch.nn.CrossEntropyLoss() 28 | optimizer = torch.optim.Adam(model.parameters()) 29 | with torch.profiler.profile( 30 | activities=[torch.profiler.ProfilerActivity.CPU, torch.profiler.ProfilerActivity.CUDA], 31 | schedule=torch.profiler.schedule(wait=0, warmup=0, active=6, repeat=1), 32 | record_shapes=True, 33 | profile_memory=True, 34 | with_stack=True, 35 | on_trace_ready=trace_handler, 36 | ) as prof: 37 | for _ in range(num_iter): 38 | prof.step() 39 | with record_function("## forward ##"): 40 | y = model(x, tgt) 41 | 42 | with record_function("## backward ##"): 43 | loss = criterion(y, labels) 44 | loss.backward() 45 | print(loss.item()) 46 | 47 | with record_function("## optimizer ##"): 48 | optimizer.step() 49 | optimizer.zero_grad(set_to_none=True) 50 | 51 | 52 | if __name__ == "__main__": 53 | # warm-up: 54 | train(1) 55 | # run: 56 | train(3) 57 | 58 | -------------------------------------------------------------------------------- /pytorch/torch_mem_snapshot/transformer_snapshot.py: -------------------------------------------------------------------------------- 1 | # Author: kevin.xie zhihu@kaiyuan 2 | 3 | import torch 4 | from torch import nn 5 | from datetime import datetime 6 | 7 | 8 | def train(num_iter=5, device="cuda:0"): 9 | model = nn.Transformer(d_model=512, nhead=2, num_encoder_layers=2, num_decoder_layers=2).to(device=device) 10 | x = torch.randn(size=(1, 1024, 512), device=device) 11 | tgt = torch.rand(size=(1, 1024, 512), device=device) 12 | model.train() 13 | labels = 
torch.rand_like(model(x, tgt)) 14 | criterion = torch.nn.CrossEntropyLoss() 15 | optimizer = torch.optim.Adam(model.parameters()) 16 | for _ in range(num_iter): 17 | y = model(x, tgt) 18 | loss = criterion(y, labels) 19 | loss.backward() 20 | print(loss.item()) 21 | optimizer.step() 22 | optimizer.zero_grad(set_to_none=True) 23 | 24 | 25 | def run(): 26 | # Start recording memory snapshot history 27 | torch.cuda.memory._record_memory_history(max_entries=100000) 28 | 29 | # training running: 30 | train() 31 | 32 | timestamp = datetime.now().strftime('%Y_%m_%d_%H_%M_%S') 33 | file_name = f"visual_mem_{timestamp}.pickle" 34 | # save record: 35 | torch.cuda.memory._dump_snapshot(file_name) 36 | 37 | # Stop recording memory snapshot history: 38 | torch.cuda.memory._record_memory_history(enabled=None) 39 | 40 | 41 | if __name__ == "__main__": 42 | run() 43 | 44 | -------------------------------------------------------------------------------- /transformer/fused_softmax/README.md: -------------------------------------------------------------------------------- 1 | 2 | # Fused softmax 3 | 4 | ## Formula: 5 | 6 | The fused ops include: 7 | * op1: softmax 8 | * op2: scale 9 | * op3: mask 10 | 11 | **1 Softmax** 12 | 13 | * forward: y_i = e^{x_i - max(X)} / \sum_{j=1}^{n} e^{x_j - max(X)} 14 | * backward: dx_i = y_i * dy_i - y_i * \sum_{j=1}^{n} (y_j * dy_j) 15 | 16 | **2 Scale** 17 | 18 | output = input * scale 19 | 20 | **3 Mask** 21 | ```textmate 22 | if(mask[i] == 1) 23 | then 24 | val[i] = -VAL 25 | else 26 | do_something 27 | ``` 28 | 29 | input data shape: 30 | 31 | [batches, attn_heads, query_seq_len, key_seq_len] 32 | 33 | 34 | 35 | ## Requirements 36 | 37 | pytorch>=2.0 38 | 39 | cuda>=11.3 40 | 41 | hardware: GPU arch >= Volta 42 | 43 | ## compile 44 | 45 | ``` 46 | python setup.py build 47 | ``` 48 | 49 | ## Running 50 | 51 | ### Function invocation: 52 | ```python 53 | import transformer_softmax_lib 54 | # ... 
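# Illustrative note (not part of the extension API): following the formulas above, the fused call
# behaves roughly like the unfused PyTorch ops below, where mask == 1 marks masked-out positions
# and -10000.0 stands in for the large negative constant -VAL:
#   probs = torch.softmax((input_data * scale_factor).masked_fill(mask.bool(), -10000.0), dim=-1)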
55 | transformer_softmax_lib.scaled_masked_softmax_forward(input_data, mask, scale_factor) 56 | ``` 57 | 58 | ### A test example: 59 | 60 | Note: make sure the .so file is in your working directory: 61 | 62 | ```python 63 | import torch 64 | import transformer_softmax_lib 65 | from torch.autograd import Function 66 | 67 | class FusedSoftmax(Function): 68 | @staticmethod 69 | def forward(ctx, src, mask, scale_factor): 70 | 71 | output = transformer_softmax_lib.scaled_masked_softmax_forward(src, mask, scale_factor[0]) 72 | ctx.save_for_backward(output, scale_factor) 73 | return output 74 | 75 | @staticmethod 76 | def backward(ctx, grad_output): 77 | src, scale_factor = ctx.saved_tensors 78 | grad_in = transformer_softmax_lib.scaled_masked_softmax_backward(grad_output, src, scale_factor[0]) 79 | return grad_in, None, None # one gradient per forward input (src, mask, scale_factor) 80 | 81 | data_input = torch.randn([1,8,1024,1024], dtype=torch.float16, device='cuda', requires_grad=True) 82 | data_input_check = data_input.clone().detach() 83 | data_input_check.requires_grad_(True) 84 | factor = torch.tensor([1.0], requires_grad=False) 85 | mask = torch.zeros([1,1,1024,1024], dtype=torch.float16, device='cuda', requires_grad=False) 86 | check = torch.softmax(data_input_check, dim=-1) 87 | out_put = FusedSoftmax.apply(data_input, mask, factor) 88 | 89 | # forward check: 90 | print(torch.allclose(check, out_put, atol=1e-05, rtol=1e-05 )) # fp16 91 | 92 | # backward check: 93 | with torch.autocast(device_type='cuda', dtype=torch.float16, enabled=False): 94 | y=out_put.sum().backward() 95 | y_check=check.sum().backward() 96 | print(torch.allclose(data_input.grad, data_input_check.grad, atol=1e-05, rtol=1e-05 )) 97 | ``` 98 | -------------------------------------------------------------------------------- /transformer/fused_softmax/scaled_masked_softmax.cu: -------------------------------------------------------------------------------- 1 | /* coding=utf-8 2 | * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include 23 | #include "scaled_masked_softmax.h" 24 | #include "utils.h" 25 | 26 | namespace fused_softmax { 27 | namespace scaled_masked_softmax { 28 | 29 | int get_batch_per_block_cuda(int query_seq_len, int key_seq_len, int batches, int attn_heads){ 30 | return get_batch_per_block(query_seq_len, key_seq_len, batches, attn_heads); 31 | } 32 | 33 | 34 | torch::Tensor fwd_cuda( 35 | torch::Tensor const& input, 36 | torch::Tensor const& mask, 37 | float scale_factor) 38 | { 39 | // input is a 4d tensor with dimensions [batches, attn_heads, seq_len, seq_len] 40 | const int batches = input.size(0); 41 | const int pad_batches = mask.size(0); 42 | const int attn_heads = input.size(1); 43 | const int query_seq_len = input.size(2); 44 | const int key_seq_len = input.size(3); 45 | TORCH_INTERNAL_ASSERT(key_seq_len <= 8192); 46 | TORCH_INTERNAL_ASSERT(query_seq_len > 1); 47 | TORCH_INTERNAL_ASSERT(pad_batches == 1 || pad_batches == batches); 48 | TORCH_INTERNAL_ASSERT(mask.size(1) == 1); 49 | TORCH_INTERNAL_ASSERT(mask.size(2) == query_seq_len); 50 | TORCH_INTERNAL_ASSERT(mask.size(3) == key_seq_len); 51 | 52 | // Output 53 | auto act_options = input.options().requires_grad(false); 54 | torch::Tensor softmax_results = 55 | torch::empty({batches, attn_heads, query_seq_len, key_seq_len}, act_options); 56 | 57 | // Softmax Intermediate Result Ptr 58 | void* input_ptr = static_cast(input.data_ptr()); 59 | void* mask_ptr = static_cast(mask.data_ptr()); 60 | void* softmax_results_ptr = static_cast(softmax_results.data_ptr()); 61 | 62 | DISPATCH_HALF_AND_BFLOAT( 63 | input.scalar_type(), 64 | "dispatch_scaled_masked_softmax_forward", 65 | dispatch_scaled_masked_softmax_forward( 66 | reinterpret_cast(softmax_results_ptr), 67 | reinterpret_cast(input_ptr), 68 | reinterpret_cast(mask_ptr), 69 | scale_factor, 70 | query_seq_len, 71 | key_seq_len, 72 | batches, 73 | attn_heads, 74 | pad_batches 75 | ); 76 | ); 77 | return softmax_results; 78 | } 79 | 80 | torch::Tensor bwd_cuda( 81 | torch::Tensor const& output_grads_, 82 | torch::Tensor const& softmax_results_, 83 | float scale_factor) { 84 | 85 | auto output_grads = output_grads_.contiguous(); 86 | auto softmax_results = softmax_results_.contiguous(); 87 | 88 | //output grads is a 4d tensor with dimensions [batches, attn_heads, seq_len, seq_len] 89 | const int batches = output_grads.size(0); 90 | const int attn_heads = output_grads.size(1); 91 | const int query_seq_len = output_grads.size(2); 92 | const int key_seq_len = output_grads.size(3); 93 | 94 | auto act_options = output_grads.options().requires_grad(false); 95 | torch::Tensor input_grads = 96 | torch::empty({batches, attn_heads, query_seq_len, key_seq_len}, act_options); 97 | void* input_grads_ptr = static_cast(input_grads.data_ptr()); 98 | void* output_grads_ptr = static_cast(output_grads.data_ptr()); 99 | 100 | //Softmax Grad 101 | DISPATCH_HALF_AND_BFLOAT( 102 | output_grads_.scalar_type(), 103 | "dispatch_scaled_masked_softmax_backward", 104 | dispatch_scaled_masked_softmax_backward( 105 | reinterpret_cast(input_grads_ptr), 106 | reinterpret_cast(output_grads_ptr), 107 | reinterpret_cast(softmax_results.data_ptr()), 108 | scale_factor, 109 | query_seq_len, 110 | key_seq_len, 111 | batches, 112 | attn_heads 113 | ); 114 | ); 115 | return input_grads; 116 | } 117 | } 118 | } 119 | -------------------------------------------------------------------------------- /transformer/fused_softmax/setup.py: 
-------------------------------------------------------------------------------- 1 | # Modified from https://github.com/NVIDIA/apex/tree/master/csrc/megatron 2 | # create a baseline 3 | import os 4 | import subprocess 5 | 6 | from setuptools import setup 7 | from torch.utils.cpp_extension import BuildExtension, CUDAExtension, CUDA_HOME 8 | 9 | # print("enter setup") 10 | 11 | def get_cuda_bare_metal_version(cuda_dir): 12 | raw_output = subprocess.check_output([cuda_dir + "/bin/nvcc", "-V"], universal_newlines=True) 13 | output = raw_output.split() 14 | release_idx = output.index("release") + 1 15 | release = output[release_idx].split(".") 16 | bare_metal_major = release[0] 17 | bare_metal_minor = release[1][0] 18 | 19 | return raw_output, bare_metal_major, bare_metal_minor 20 | 21 | 22 | def append_nvcc_threads(nvcc_extra_args): 23 | _, bare_metal_major, bare_metal_minor = get_cuda_bare_metal_version(CUDA_HOME) 24 | if int(bare_metal_major) >= 11 and int(bare_metal_minor) >= 2: 25 | return nvcc_extra_args + ["--threads", "4"] 26 | return nvcc_extra_args 27 | 28 | 29 | cc_flag = [] 30 | # Support Volta: 31 | cc_flag.append("-gencode") 32 | cc_flag.append("arch=compute_70,code=sm_70") 33 | # Support Ampere: 34 | cc_flag.append("-gencode") 35 | cc_flag.append("arch=compute_80,code=sm_80") 36 | # Support Hopper: 37 | # cc_flag.append("-gencode") 38 | # cc_flag.append("arch=compute_90,code=sm_90") 39 | 40 | setup( 41 | name='transformer_softmax_lib', 42 | ext_modules=[ 43 | CUDAExtension( 44 | name='transformer_softmax_lib', 45 | sources=['torch_interface.cpp', 'scaled_masked_softmax.cu', ], 46 | extra_compile_args={ 47 | 'cxx': ['-O3',], 48 | 'nvcc': append_nvcc_threads(['-O3', '--use_fast_math'] + cc_flag) 49 | } 50 | ) 51 | ], 52 | cmdclass={ 53 | 'build_ext': BuildExtension 54 | }) -------------------------------------------------------------------------------- /transformer/fused_softmax/torch_interface.cpp: -------------------------------------------------------------------------------- 1 | /* coding=utf-8 2 | * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | #include 18 | #include 19 | #include 20 | 21 | 22 | namespace fused_softmax { 23 | namespace scaled_masked_softmax { 24 | 25 | torch::Tensor fwd_cuda( 26 | torch::Tensor const& input, 27 | torch::Tensor const& mask, 28 | float scale_factor); 29 | 30 | torch::Tensor bwd_cuda( 31 | torch::Tensor const& output_grads, 32 | torch::Tensor const& softmax_results, 33 | float scale_factor); 34 | 35 | int get_batch_per_block_cuda( 36 | int query_seq_len, 37 | int key_seq_len, 38 | int batches, 39 | int attn_heads); 40 | 41 | torch::Tensor fwd( 42 | torch::Tensor const& input, 43 | torch::Tensor const& mask, 44 | float scale_factor) { 45 | AT_ASSERTM(input.dim() == 4, "expected 4D tensor"); 46 | AT_ASSERTM((input.scalar_type() == at::ScalarType::Half) || 47 | (input.scalar_type() == at::ScalarType::BFloat16), 48 | "Only fp16 and bf16 are supported"); 49 | AT_ASSERTM(mask.dim() == 4, "expected 4D tensor"); 50 | 51 | return fwd_cuda(input, mask, scale_factor); 52 | } 53 | 54 | torch::Tensor bwd( 55 | torch::Tensor const& output_grads, 56 | torch::Tensor const& softmax_results, 57 | float scale_factor) { 58 | 59 | AT_ASSERTM(output_grads.dim() == 4, "expected 3D tensor"); 60 | AT_ASSERTM(softmax_results.dim() == 4, "expected 3D tensor"); 61 | 62 | AT_ASSERTM((output_grads.scalar_type() == at::ScalarType::Half) || 63 | (output_grads.scalar_type() == at::ScalarType::BFloat16), 64 | "Only fp16 and bf16 are supported"); 65 | AT_ASSERTM((softmax_results.scalar_type() == at::ScalarType::Half) || 66 | (softmax_results.scalar_type() == at::ScalarType::BFloat16), 67 | "Only fp16 and bf16 are supported"); 68 | 69 | return bwd_cuda(output_grads, softmax_results, scale_factor); 70 | } 71 | 72 | int get_batch_per_block( 73 | int query_seq_len, 74 | int key_seq_len, 75 | int batches, 76 | int attn_heads) { 77 | return get_batch_per_block_cuda(query_seq_len, key_seq_len, batches, attn_heads); 78 | } 79 | 80 | } // end namespace scaled_masked_softmax 81 | } // end namespace fused_softmax 82 | 83 | 84 | 85 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 86 | m.def("scaled_masked_softmax_forward", 87 | &fused_softmax::scaled_masked_softmax::fwd, 88 | "Self Multihead Attention scaled, time masked softmax -- Forward."); 89 | 90 | m.def("scaled_masked_softmax_backward", 91 | &fused_softmax::scaled_masked_softmax::bwd, 92 | "Self Multihead Attention scaled, time masked softmax -- Backward."); 93 | 94 | m.def("scaled_masked_softmax_get_batch_per_block", 95 | &fused_softmax::scaled_masked_softmax::get_batch_per_block, 96 | "Return Batch per block size." 97 | ); 98 | } -------------------------------------------------------------------------------- /transformer/fused_softmax/utils.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | #include 12 | // #define warpSize 32 13 | 14 | // ELEMENTS_PER_LDG = 4, using float2 copy 4 half data. 
half2 copy 4 uint8_t 15 | template 16 | __device__ __inline__ void copy_vector(Datatype *dst, const Datatype *src); 17 | 18 | template <> 19 | __device__ __inline__ void copy_vector(c10::BFloat16 *dst, const c10::BFloat16 *src) { *dst = *src; } 20 | 21 | template <> 22 | __device__ __inline__ void copy_vector(c10::BFloat16 *dst, const c10::BFloat16 *src) { *((float2*) dst) = *((float2*) src); } 23 | 24 | template <> 25 | __device__ __inline__ void copy_vector(c10::Half *dst, const c10::Half *src) { *dst = *src; } 26 | 27 | template <> 28 | __device__ __inline__ void copy_vector(c10::Half *dst, const c10::Half *src) { *((float2*) dst) = *((float2*) src); } 29 | 30 | template <> 31 | __device__ __inline__ void copy_vector(uint8_t *dst, const uint8_t *src) { *dst = *src; } 32 | 33 | template <> 34 | __device__ __inline__ void copy_vector(uint8_t *dst, const uint8_t *src) {*((half2*) dst) = *((half2*) src); } 35 | 36 | int log2_ceil(int value) { 37 | int log2_value = 0; 38 | while ((1 << log2_value) < value) ++log2_value; 39 | return log2_value; 40 | } 41 | 42 | template 43 | struct Add { 44 | __device__ __forceinline__ T operator()(T a, T b) const { 45 | return a + b; 46 | } 47 | }; 48 | 49 | template 50 | struct Max { 51 | __device__ __forceinline__ T operator()(T a, T b) const { 52 | return a < b ? b : a; 53 | } 54 | }; 55 | 56 | template 57 | __device__ __forceinline__ T WARP_SHFL_XOR_NATIVE(T value, int laneMask, int width = warpSize, unsigned int mask = 0xffffffff) 58 | { 59 | #if CUDA_VERSION >= 9000 60 | return __shfl_xor_sync(mask, value, laneMask, width); 61 | #else 62 | return __shfl_xor(value, laneMask, width); 63 | #endif 64 | } 65 | 66 | template class ReduceOp> 67 | __device__ __forceinline__ void warp_reduce(acc_t* sum) { 68 | ReduceOp r; 69 | #pragma unroll 70 | for (int offset = WARP_SIZE / 2; offset > 0; offset /= 2) { 71 | #pragma unroll 72 | for (int i = 0; i < WARP_BATCH; ++i) { 73 | acc_t b = WARP_SHFL_XOR_NATIVE(sum[i], offset, WARP_SIZE); 74 | sum[i] = r(sum[i], b); 75 | } 76 | } 77 | } 78 | 79 | // using: DISPATCH_HALF_AND_BFLOAT(TYPE, NAME, function(parameters...)) 80 | #define DISPATCH_HALF_AND_BFLOAT(TYPE, NAME, ...) \ 81 | switch(TYPE) \ 82 | { \ 83 | case at::ScalarType::Half: \ 84 | { \ 85 | using scalar_t = at::Half; \ 86 | __VA_ARGS__; \ 87 | break; \ 88 | } \ 89 | case at::ScalarType::BFloat16: \ 90 | { \ 91 | using scalar_t = at::BFloat16; \ 92 | __VA_ARGS__; \ 93 | break; \ 94 | } \ 95 | default: \ 96 | AT_ERROR(#NAME, " not implemented for '", toString(TYPE), "'"); \ 97 | } 98 | -------------------------------------------------------------------------------- /transformer/fused_softmax/warp_example/README.md: -------------------------------------------------------------------------------- 1 | 2 | An example shows how to use __shfl_xor_sync to get summary. 3 | 4 | Compile: 5 | ```bash 6 | nvcc -lcuda warp_reduce.cu -o test 7 | ``` 8 | 9 | Run: 10 | ``` 11 | ./test 12 | ``` -------------------------------------------------------------------------------- /transformer/fused_softmax/warp_example/warp_reduce.cu: -------------------------------------------------------------------------------- 1 | /** 2 | * warp reduce example. 
3 | * Author: kevin.xie 4 | * Email: kaiyuanxie@yeah.net 5 | * */ 6 | 7 | #include 8 | #include 9 | #include 10 | 11 | 12 | template void check(T result, char const *const func, const char *const file, int const line) 13 | { 14 | if (result) { 15 | fprintf(stderr, "CUDA error at %s:%d code=%d(%s) \"%s\" \n", file, line, static_cast(result), 16 | cudaGetErrorString(result), func); 17 | exit(EXIT_FAILURE); 18 | } 19 | } 20 | #define checkCudaErrors(val) check((val), #val, __FILE__, __LINE__) 21 | 22 | template 23 | struct Add { 24 | __device__ __forceinline__ T operator()(T a, T b) const { 25 | return a + b; 26 | } 27 | }; 28 | 29 | template 30 | struct Max { 31 | __device__ __forceinline__ T operator()(T a, T b) const { 32 | return a < b ? b : a; 33 | } 34 | }; 35 | 36 | template 37 | __device__ __forceinline__ T WARP_SHFL_XOR_NATIVE(T value, int laneMask, int width = warpSize, unsigned int mask = 0xffffffff) 38 | { 39 | #if CUDA_VERSION >= 9000 40 | return __shfl_xor_sync(mask, value, laneMask, width); 41 | #else 42 | return __shfl_xor_sync(mask, value, laneMask, width); 43 | //return __shfl_xor(value, laneMask, width); 44 | #endif 45 | } 46 | 47 | template class ReduceOp> 48 | __device__ __forceinline__ void warp_reduce(acc_t* sum) { 49 | ReduceOp r; 50 | #pragma unroll 51 | for (int offset = WARP_SIZE / 2; offset > 0; offset /= 2) { 52 | #pragma unroll 53 | for (int i = 0; i < WARP_BATCH; ++i) { 54 | acc_t b = WARP_SHFL_XOR_NATIVE(sum[i], offset, WARP_SIZE); 55 | sum[i] = r(sum[i], b); 56 | } 57 | } 58 | } 59 | 60 | #define WARP_BATCH 1 61 | 62 | template 63 | __global__ void launcher(data_t* src, int nums) { 64 | data_t tmp[WARP_BATCH] = {0}; 65 | int localIdx= threadIdx.x; 66 | while (localIdx < nums) { 67 | tmp[0] += src[localIdx]; 68 | localIdx += gridDim.x * blockDim.x; 69 | } 70 | 71 | warp_reduce(tmp); 72 | src[threadIdx.x] = tmp[0]; 73 | } 74 | 75 | int main() { 76 | unsigned int total_size = 100; 77 | float* input_data = (float*) malloc(sizeof(float) * total_size); 78 | float* device_ptr; 79 | checkCudaErrors(cudaMalloc((void**)&device_ptr, sizeof(float) *total_size)); 80 | for (int i =0; i < 90; ++i) { 81 | input_data[i] = i * 2; 82 | } 83 | checkCudaErrors(cudaMemcpy(device_ptr, input_data, total_size * sizeof(float), cudaMemcpyHostToDevice)); 84 | launcher<<<1, 32>>>(device_ptr, total_size); 85 | checkCudaErrors(cudaMemcpy(input_data, device_ptr, total_size * sizeof(float), cudaMemcpyDeviceToHost)); 86 | printf("Print all data:\n"); 87 | for (int i = 0;i < 10; ++i) { 88 | for (int k = 0; k < 10 ;++k) { 89 | printf("%f " ,input_data[i*10 + k]); 90 | } 91 | printf("\n"); 92 | } 93 | checkCudaErrors(cudaFree (device_ptr)); 94 | free(input_data); 95 | return 0; 96 | } --------------------------------------------------------------------------------
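
A closing note on warp_reduce.cu: with WARP_BATCH = 1 and a 32-lane warp, the `warp_reduce` loop above is a butterfly reduction. Offsets 16, 8, 4, 2, 1 pair lane i with lane i ^ offset, so after five `__shfl_xor_sync` steps every lane of the warp holds the full sum, which is why all 32 lanes can write the same result. The stripped-down sketch below shows the same pattern without the templates; it is not part of the repository, and the kernel name and launch shape are illustrative only.

```
// Minimal single-warp sum (sketch; assumes exactly 32 active lanes).
__global__ void warpSumSketch(float *data)
{
    float v = data[threadIdx.x];                    // one element per lane
    for (int offset = 16; offset > 0; offset /= 2) {
        // lane i adds the partial sum held by lane i ^ offset
        v += __shfl_xor_sync(0xffffffff, v, offset);
    }
    data[threadIdx.x] = v;                          // every lane now holds the total
}
// launch: warpSumSketch<<<1, 32>>>(devicePtr);
```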