├── Chapter09
│   ├── solution.cu
│   ├── Unified_normal.cu
│   ├── Unified_prefetch.cu
│   └── README.md
├── Chapter07
│   ├── local_memory.cu
│   ├── global_memory.cu
│   ├── constant_memory.cu
│   └── README.md
├── Chapter12
│   ├── data_hazard.cu
│   └── README.md
├── Chapter14
│   ├── shared.cu
│   └── README.md
├── Chapter13
│   ├── README.md
│   └── atomic.cu
├── Chapter10
│   ├── pinned_memory.cu
│   ├── solution.cu
│   └── README.md
├── Chapter11
│   ├── streamV2.cu
│   ├── streaming.cu
│   └── README.md
├── Chapter04
│   ├── solution.cu
│   └── README.md
├── Chapter08
│   └── README.md
├── README.md
├── Chapter01
│   └── README.md
├── Chapter02
│   └── README.md
├── Chapter06
│   └── README.md
├── Chapter03
│   └── README.md
└── Chapter05
    └── README.md
/Chapter09/solution.cu:
--------------------------------------------------------------------------------
1 | #include <stdio.h>
2 |
3 | __managed__ int y=2;
4 | __global__ void kernel() {
5 | printf("%d\n", y);
6 | }
7 | int main() {
8 | kernel<<< 1, 1 >>>();
9 |
10 | // On GPUs with compute capability < 6.0, the host must not touch managed memory while the kernel may still be running, so synchronize first
11 | cudaDeviceSynchronize();
12 | y = 20;
13 | printf("%d\n", y);
14 | return 0;
15 | }
16 |
17 |
--------------------------------------------------------------------------------
/Chapter07/local_memory.cu:
--------------------------------------------------------------------------------
1 | #include <stdio.h>
2 | #include <cuda_runtime.h>
3 |
4 | __global__ void kernel() {
5 | int temp = 0;
6 | temp = threadIdx.x;
7 |
8 | printf("blockId %d ThreadIdx %d = %d\n",blockIdx.x,threadIdx.x,temp);
9 |
10 | }
11 |
12 | int main() {
13 | kernel<<<5,5>>>();
14 | cudaDeviceSynchronize();
15 |
16 | return 0;
17 | }
18 |
19 |
20 |
--------------------------------------------------------------------------------
/Chapter12/data_hazard.cu:
--------------------------------------------------------------------------------
1 | #include <stdio.h>
2 | #include <cuda_runtime.h>
3 |
4 | #define ARRAY_SIZE 4
5 |
6 |
7 | __global__ void sum(int *d_array)
8 | {
9 | int id = blockIdx.x * blockDim.x + threadIdx.x;
10 |
11 | for (int stride = 1; stride < 4; stride *= 2)
12 | {
13 | __syncthreads();
14 |
15 | if (threadIdx.x % (2 * stride) == 0)
16 | {
17 | d_array[id] += d_array[id + stride];
18 | }
19 | }
20 | printf("blockIdx.x=%d --> %d\n", blockIdx.x, d_array[id]);
21 | }
22 |
23 | int main()
24 | {
25 | int h_array[4] = {1, 2, 3, 4};
26 | int *d_array;
27 |
28 | cudaMalloc((void **)&d_array, sizeof(int) * ARRAY_SIZE);
29 | cudaMemcpy(d_array, h_array, sizeof(int) * ARRAY_SIZE, cudaMemcpyHostToDevice);
30 |
31 | sum<<<1, 4>>>(d_array);
32 |
33 | cudaFree(d_array);
34 |
35 | return 0;
36 | }
37 |
38 |
--------------------------------------------------------------------------------
/Chapter09/Unified_normal.cu:
--------------------------------------------------------------------------------
1 |
2 |
3 | #include <cassert>
4 | #include <cstdlib>
5 | #include <iostream>
6 |
7 | using std::cout;
8 |
9 |
10 | __global__ void vectorAdd(int *a, int *b, int *c, int N)
11 | {
12 |
13 | int tid = (blockDim.x * blockIdx.x) + threadIdx.x;
14 |
15 | if (tid < N)
16 | {
17 | c[tid] = a[tid] + b[tid];
18 | }
19 | }
20 |
21 | int main()
22 | {
23 |
24 | const int N = 1 << 16;
25 | size_t bytes = N * sizeof(int);
26 |
27 | int *a, *b, *c;
28 |
29 | cudaMallocManaged(&a, bytes);
30 | cudaMallocManaged(&b, bytes);
31 | cudaMallocManaged(&c, bytes);
32 |
33 |
34 |
35 |
36 | for (int i = 0; i < N; i++)
37 | {
38 | a[i] = rand() % 100;
39 | b[i] = rand() % 100;
40 | }
41 |
42 | int BLOCK_SIZE = 1 << 10;
43 |
44 | int GRID_SIZE = (N + BLOCK_SIZE - 1) / BLOCK_SIZE;
45 |
46 | vectorAdd<<<GRID_SIZE, BLOCK_SIZE>>>(a, b, c, N);
47 |
48 | cudaDeviceSynchronize();
49 |
50 | for (int i = 0; i < N; i++)
51 | {
52 | assert(c[i] == a[i] + b[i]);
53 | }
54 |
55 |
56 | cudaFree(a);
57 | cudaFree(b);
58 | cudaFree(c);
59 |
60 | cout << "COMPLETED SUCCESSFULLY!\n";
61 |
62 | return 0;
63 | }
64 |
--------------------------------------------------------------------------------
/Chapter14/shared.cu:
--------------------------------------------------------------------------------
1 | #include <stdio.h>
2 |
3 | __global__ void staticReverse(int *d, int n)
4 | {
5 | __shared__ int s[64];
6 | int t = threadIdx.x;
7 | int tr = n-t-1;
8 | s[t] = d[t];
9 | __syncthreads();
10 | d[t] = s[tr];
11 | }
12 |
13 | __global__ void dynamicReverse(int *d, int n)
14 | {
15 | extern __shared__ int s[];
16 | int t = threadIdx.x;
17 | int tr = n-t-1;
18 | s[t] = d[t];
19 | __syncthreads();
20 | d[t] = s[tr];
21 | }
22 |
23 | int main(void)
24 | {
25 | const int n = 64;
26 | int a[n], r[n], d[n];
27 |
28 | for (int i = 0; i < n; i++) {
29 | a[i] = i;
30 | r[i] = n-i-1;
31 | d[i] = 0;
32 | }
33 |
34 | int *d_d;
35 | cudaMalloc(&d_d, n * sizeof(int));
36 |
37 | // run version with static shared memory
38 | cudaMemcpy(d_d, a, n*sizeof(int), cudaMemcpyHostToDevice);
39 | staticReverse<<<1,n>>>(d_d, n);
40 | cudaMemcpy(d, d_d, n*sizeof(int), cudaMemcpyDeviceToHost);
41 | for (int i = 0; i < n; i++)
42 | if (d[i] != r[i]) printf("Error: d[%d]!=r[%d] (%d, %d)\n", i, i, d[i], r[i]);
43 |
44 | // run dynamic shared memory version
45 | cudaMemcpy(d_d, a, n*sizeof(int), cudaMemcpyHostToDevice);
46 | dynamicReverse<<<1,n,n*sizeof(int)>>>(d_d, n);
47 | cudaMemcpy(d, d_d, n * sizeof(int), cudaMemcpyDeviceToHost);
48 | for (int i = 0; i < n; i++)
49 | if (d[i] != r[i]) printf("Error: d[%d]!=r[%d] (%d, %d)\n", i, i, d[i], r[i]);
50 | }
51 |
--------------------------------------------------------------------------------
/Chapter07/global_memory.cu:
--------------------------------------------------------------------------------
1 | #include <stdio.h>
2 | #include <stdlib.h>
3 |
4 | // Size of the vector
5 | #define N 100
6 |
7 | // CUDA kernel to add two vectors
8 | __global__ void vectorAdd(int *a, int *b, int *c) {
9 | int tid = blockIdx.x * blockDim.x + threadIdx.x;
10 | if (tid < N) {
11 | c[tid] = a[tid] + b[tid];
12 | }
13 | }
14 |
15 | int main() {
16 | int *h_a, *h_b, *h_c; // Host vectors
17 | int *d_a, *d_b, *d_c; // Device vectors
18 |
19 | // Initialize host vectors
20 | h_a = (int *)malloc(N * sizeof(int));
21 | h_b = (int *)malloc(N * sizeof(int));
22 | h_c = (int *)malloc(N * sizeof(int));
23 |
24 | // Initialize host vectors with random values
25 | for (int i = 0; i < N; i++) {
26 | h_a[i] = rand() % 10;
27 | h_b[i] = rand() % 10;
28 | }
29 |
30 | // Allocate device memory for vectors
31 | cudaMalloc((void **)&d_a, N * sizeof(int));
32 | cudaMalloc((void **)&d_b, N * sizeof(int));
33 | cudaMalloc((void **)&d_c, N * sizeof(int));
34 |
35 | // Copy data from CPU to GPU
36 | cudaMemcpy(d_a, h_a, N * sizeof(int), cudaMemcpyHostToDevice);
37 | cudaMemcpy(d_b, h_b, N * sizeof(int), cudaMemcpyHostToDevice);
38 |
39 | // Call the CUDA kernel to perform vector addition
40 | vectorAdd<<<2, 50>>>(d_a, d_b, d_c);
41 |
42 | // Copy the result from GPU to CPU
43 | cudaMemcpy(h_c, d_c, N * sizeof(int), cudaMemcpyDeviceToHost);
44 |
45 | // Print the result
46 | for (int i = 0; i < N; i++) {
47 | printf("h_a[%d] %d + h_b[%d] %d = %d\n", i, h_a[i], i, h_b[i], h_c[i]);
48 | }
49 |
50 | // Free memory
51 | free(h_a);
52 | free(h_b);
53 | free(h_c);
54 | cudaFree(d_a);
55 | cudaFree(d_b);
56 | cudaFree(d_c);
57 |
58 | return 0;
59 | }
60 |
--------------------------------------------------------------------------------
/Chapter07/constant_memory.cu:
--------------------------------------------------------------------------------
1 | #include <stdio.h>
2 |
3 | __constant__ int constantData[2]; // Declaration of Constant memory array
4 |
5 | __global__ void kernel(int *d_x, int *d_y, int N) {
6 | int idx = blockIdx.x * blockDim.x + threadIdx.x;
7 |
8 | if (idx < N) {
9 | int x = d_x[idx];
10 | int a = constantData[0]; // Retrieve the value 3 from Constant memory
11 | int b = constantData[1]; // Retrieve the value 5 from Constant memory
12 | d_y[idx] = a * x + b;
13 | }
14 | }
15 |
16 | int main() {
17 | const int N = 10; // Number of array elements
18 | int h_x[N]; // Input array on the host
19 | int h_y[N]; // Result array on the host
20 | int *d_x, *d_y; // Arrays on the device
21 |
22 | // Initialize data on the host
23 | for (int i = 0; i < N; i++) {
24 | h_x[i] = i;
25 | }
26 |
27 | // Allocate memory for arrays on the GPU
28 | cudaMalloc((void**)&d_x, N * sizeof(int));
29 | cudaMalloc((void**)&d_y, N * sizeof(int));
30 |
31 | // Copy data from host to device
32 | cudaMemcpy(d_x, h_x, N * sizeof(int), cudaMemcpyHostToDevice);
33 |
34 | // Copy the values 3 and 5 into Constant memory
35 | int constantValues[2] = {3, 5};
36 | cudaMemcpyToSymbol(constantData, constantValues, 2 * sizeof(int));
37 |
38 | // Launch the kernel with 1 block and N threads
39 | kernel<<<1, N>>>(d_x, d_y, N);
40 | cudaDeviceSynchronize();
41 |
42 | // Copy the results from the device to the host
43 | cudaMemcpy(h_y, d_y, N * sizeof(int), cudaMemcpyDeviceToHost);
44 |
45 | // Print the results
46 | for (int i = 0; i < N; i++) {
47 | printf("3(x= %d) + 5 => y = %d\n", h_x[i], h_y[i]);
48 | }
49 |
50 | // Free memory on the device
51 | cudaFree(d_x);
52 | cudaFree(d_y);
53 |
54 | return 0;
55 | }
56 |
--------------------------------------------------------------------------------
/Chapter13/README.md:
--------------------------------------------------------------------------------
1 |
2 |
3 | In this article, I will introduce to you a very useful built-in function in CUDA. It's important to read previous articles about [Data Hazard](https://github.com/CisMine/Parallel-Computing-Cuda-C/tree/main/Chapter12) and [Synchronization - Asynchronization](https://github.com/CisMine/Parallel-Computing-Cuda-C/tree/main/Chapter08) before diving into this one.
4 |
5 |
6 |
7 |
Atomic Function
8 |
9 |
10 | These functions are quite simple to use. NVIDIA provides them to avoid data hazards or, in other words, to keep threads from stepping on each other while they update the same value.
11 |
12 | For example, consider a simple code snippet:
13 |
14 | ```
15 | for (int i = 0; i < N; i++)
16 |     sum += array[i];
17 | ```
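18 |
19 | If many GPU threads each do `sum += array[i]` on the same `sum` at the same time, their read-modify-write steps interleave and the total comes out wrong. Below is a minimal sketch of the safe version using atomicAdd (the kernel and variable names are just illustrative):
20 |
21 | ```
22 | __global__ void sumKernel(int *result, const int *array, int N)
23 | {
24 |     int tid = blockIdx.x * blockDim.x + threadIdx.x;
25 |     if (tid < N)
26 |     {
27 |         // atomicAdd performs the read-modify-write as one indivisible operation,
28 |         // so no other thread can slip in between and corrupt the running total
29 |         atomicAdd(result, array[tid]);
30 |     }
31 | }
32 | ```
33 |
34 | atomicSub, atomicMax and atomicMin are used in exactly the same way; atomic.cu in this chapter runs all four.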
--------------------------------------------------------------------------------
/Chapter10/pinned_memory.cu:
--------------------------------------------------------------------------------
1 | #include <iostream>
2 | #include <cassert>
3 |
4 |
5 | using std::cout;
6 | using std::endl;
7 |
8 |
9 | __global__ void vectorAdd(int *a, int *b, int *c, int N)
10 | {
11 | int tid = (blockIdx.x * blockDim.x) + threadIdx.x;
12 |
13 | if (tid < N)
14 | {
15 | c[tid] = a[tid] + b[tid];
16 | }
17 | }
18 |
19 | void verify_result(int *a, int *b, int *c, int N)
20 | {
21 | for (int i = 0; i < N; i++)
22 | {
23 | assert(c[i] == a[i] + b[i]);
24 | }
25 | }
26 |
27 | int main()
28 | {
29 | constexpr int N = 100;
30 | size_t bytes = sizeof(int) * N;
31 |
32 | // Vectors for holding the host-side (CPU-side) data
33 | int *h_a, *h_b, *h_c;
34 |
35 | // Allocate pinned memory
36 | cudaMallocHost(&h_a, bytes);
37 | cudaMallocHost(&h_b, bytes);
38 | cudaMallocHost(&h_c, bytes);
39 |
40 | for (int i = 0; i < N; i++)
41 | {
42 | h_a[i] = rand() % 100;
43 | h_b[i] = rand() % 100;
44 | }
45 |
46 | // Allocate memory on the device
47 | int *d_a, *d_b, *d_c;
48 | cudaMalloc(&d_a, bytes);
49 | cudaMalloc(&d_b, bytes);
50 | cudaMalloc(&d_c, bytes);
51 |
52 |
53 |
54 | // Copy data from the host to the device (CPU -> GPU)
55 | cudaMemcpy(d_a, h_a, bytes, cudaMemcpyHostToDevice);
56 | cudaMemcpy(d_b, h_b, bytes, cudaMemcpyHostToDevice);
57 |
58 |
59 |
60 |
61 | int NUM_THREADS = 1 << 10;
62 | int NUM_BLOCKS = (N + NUM_THREADS - 1) / NUM_THREADS;
63 |
64 |
65 |
66 | // Execute the kernel
67 | vectorAdd<<<NUM_BLOCKS, NUM_THREADS>>>(d_a, d_b, d_c, N);
68 |
69 |
70 |
71 |
72 |
73 | // Copy data from device to host (GPU -> CPU)
74 | cudaMemcpy(h_c, d_c, bytes, cudaMemcpyDeviceToHost);
75 |
76 |
77 |
78 |
79 | // Check result for errors
80 | verify_result(h_a, h_b, h_c, N);
81 |
82 | // Free pinned memory
83 | cudaFreeHost(h_a);
84 | cudaFreeHost(h_b);
85 | cudaFreeHost(h_c);
86 |
87 | // Free memory on device
88 | cudaFree(d_a);
89 | cudaFree(d_b);
90 | cudaFree(d_c);
91 |
92 | cout << "COMPLETED SUCCESSFULLY\n";
93 |
94 | return 0;
95 | }
96 |
--------------------------------------------------------------------------------
/Chapter09/Unified_prefetch.cu:
--------------------------------------------------------------------------------
1 | // Unified memory with prefetching
2 | // Question: is unified memory asynchronous?
3 |
4 | #include <cassert>
5 | #include <cstdlib>
6 | #include <iostream>
7 |
8 | using std::cout;
9 |
10 |
11 | __global__ void vectorAdd(int *a, int *b, int *c, int N)
12 | {
13 |
14 | int tid = (blockDim.x * blockIdx.x) + threadIdx.x;
15 |
16 | if (tid < N)
17 | {
18 | c[tid] = a[tid] + b[tid];
19 | }
20 | }
21 |
22 | int main()
23 | {
24 | const int N = 1 << 16;
25 | size_t bytes = N * sizeof(int);
26 |
27 | int *a, *b, *c;
28 |
29 | cudaMallocManaged(&a, bytes);
30 | cudaMallocManaged(&b, bytes);
31 | cudaMallocManaged(&c, bytes);
32 |
33 | // Get the device ID for prefetching calls
34 | int id = 0; cudaGetDevice(&id);
35 |
36 | // Set some hints about the data and do some prefetching
37 | cudaMemAdvise(a, bytes, cudaMemAdviseSetPreferredLocation, cudaCpuDeviceId);
38 | cudaMemAdvise(b, bytes, cudaMemAdviseSetPreferredLocation, cudaCpuDeviceId);
39 | cudaMemPrefetchAsync(c, bytes, id);
40 |
41 | // Initialize vectors
42 | for (int i = 0; i < N; i++)
43 | {
44 | a[i] = rand() % 100;
45 | b[i] = rand() % 100;
46 | }
47 |
48 | // Pre-fetch 'a' and 'b' arrays to the specified device (GPU)
49 | cudaMemAdvise(a, bytes, cudaMemAdviseSetReadMostly, id);
50 | cudaMemAdvise(b, bytes, cudaMemAdviseSetReadMostly, id);
51 | cudaMemPrefetchAsync(a, bytes, id);
52 | cudaMemPrefetchAsync(b, bytes, id);
53 |
54 |
55 | int BLOCK_SIZE = 1 << 10;
56 |
57 | int GRID_SIZE = (N + BLOCK_SIZE - 1) / BLOCK_SIZE;
58 |
59 | vectorAdd<<<GRID_SIZE, BLOCK_SIZE>>>(a, b, c, N);
60 |
61 |
62 | cudaDeviceSynchronize();
63 |
64 |
65 | // Prefetch to the host (CPU)
66 | cudaMemPrefetchAsync(a, bytes, cudaCpuDeviceId);
67 | cudaMemPrefetchAsync(b, bytes, cudaCpuDeviceId);
68 | cudaMemPrefetchAsync(c, bytes, cudaCpuDeviceId);
69 |
70 | // Verify the result on the CPU
71 | for (int i = 0; i < N; i++)
72 | {
73 | assert(c[i] == a[i] + b[i]);
74 | }
75 |
76 | // Free unified memory (same as memory allocated with cudaMalloc)
77 | cudaFree(a);
78 | cudaFree(b);
79 | cudaFree(c);
80 |
81 | cout << "COMPLETED SUCCESSFULLY!\n";
82 |
83 | return 0;
84 | }
85 |
--------------------------------------------------------------------------------
/Chapter10/solution.cu:
--------------------------------------------------------------------------------
1 | #include <stdio.h>
2 | #include <stdlib.h>
3 |
4 | #define SIZE (16 * 1024 * 1024)
5 |
6 |
7 |
8 | void pageableMemoryTest() {
9 | float *h_data, *d_data;
10 | h_data = (float *)malloc(SIZE * sizeof(float));
11 | cudaMalloc((void **)&d_data, SIZE * sizeof(float));
12 |
13 | cudaEvent_t start, stop;
14 | cudaEventCreate(&start);
15 | cudaEventCreate(&stop);
16 |
17 | // Host to Device
18 | cudaEventRecord(start);
19 | cudaMemcpy(d_data, h_data, SIZE * sizeof(float), cudaMemcpyHostToDevice);
20 | cudaEventRecord(stop);
21 | cudaEventSynchronize(stop);
22 | float milliseconds = 0;
23 | cudaEventElapsedTime(&milliseconds, start, stop);
24 | printf("Pageable - Host to Device: %f ms\n", milliseconds);
25 |
26 | // Device to Host
27 | cudaEventRecord(start);
28 | cudaMemcpy(h_data, d_data, SIZE * sizeof(float), cudaMemcpyDeviceToHost);
29 | cudaEventRecord(stop);
30 | cudaEventSynchronize(stop);
31 | cudaEventElapsedTime(&milliseconds, start, stop);
32 | printf("Pageable - Device to Host: %f ms\n", milliseconds);
33 |
34 | free(h_data);
35 | cudaFree(d_data);
36 | }
37 |
38 | void pinnedMemoryTest() {
39 | float *h_data, *d_data;
40 | cudaMallocHost((void **)&h_data, SIZE * sizeof(float));
41 | cudaMalloc((void **)&d_data, SIZE * sizeof(float));
42 |
43 | cudaEvent_t start, stop;
44 | cudaEventCreate(&start);
45 | cudaEventCreate(&stop);
46 |
47 | // Host to Device
48 | cudaEventRecord(start);
49 | cudaMemcpy(d_data, h_data, SIZE * sizeof(float), cudaMemcpyHostToDevice);
50 | cudaEventRecord(stop);
51 | cudaEventSynchronize(stop);
52 | float milliseconds = 0;
53 | cudaEventElapsedTime(&milliseconds, start, stop);
54 | printf("Pinned - Host to Device: %f ms\n", milliseconds);
55 |
56 | // Device to Host
57 | cudaEventRecord(start);
58 | cudaMemcpy(h_data, d_data, SIZE * sizeof(float), cudaMemcpyDeviceToHost);
59 | cudaEventRecord(stop);
60 | cudaEventSynchronize(stop);
61 | cudaEventElapsedTime(&milliseconds, start, stop);
62 | printf("Pinned - Device to Host: %f ms\n", milliseconds);
63 |
64 | cudaFreeHost(h_data);
65 | cudaFree(d_data);
66 | }
67 |
68 | int main() {
69 | printf("Running pageable memory test...\n");
70 | pageableMemoryTest();
71 |
72 | printf("\nRunning pinned memory test...\n");
73 | pinnedMemoryTest();
74 |
75 | return 0;
76 | }
77 |
78 |
--------------------------------------------------------------------------------
/Chapter11/streamV2.cu:
--------------------------------------------------------------------------------
1 | #include <stdio.h>
2 |
3 | // Convenience function for checking CUDA runtime API results
4 | // can be wrapped around any runtime API call. No-op in release builds.
5 | inline cudaError_t checkCuda(cudaError_t result)
6 | {
7 | #if defined(DEBUG) || defined(_DEBUG)
8 | if (result != cudaSuccess)
9 | {
10 | fprintf(stderr, "CUDA Runtime Error: %s\n", cudaGetErrorString(result));
11 | assert(result == cudaSuccess);
12 | }
13 | #endif
14 | return result;
15 | }
16 |
17 | __global__ void kernel(float *a, int offset)
18 | {
19 | int i = offset + threadIdx.x + blockIdx.x * blockDim.x;
20 | float x = (float)i;
21 | float s = sinf(x);
22 | float c = cosf(x);
23 | a[i] = a[i] + sqrtf(s * s + c * c);
24 | }
25 |
26 | float maxError(float *a, int n)
27 | {
28 | float maxE = 0;
29 | for (int i = 0; i < n; i++)
30 | {
31 | float error = fabs(a[i] - 1.0f);
32 | if (error > maxE)
33 | maxE = error;
34 | }
35 | return maxE;
36 | }
37 |
38 | int main()
39 | {
40 | const int blockSize = 256, nStreams = 4; // 256 divides n / nStreams evenly, so every element is covered
41 | const int n = 256 * 256;
42 | const int streamSize = n / nStreams;
43 | const int streamBytes = streamSize * sizeof(float);
44 | const int bytes = n * sizeof(float);
45 | float ms;
46 |
47 | // Host array
48 | float *a;
49 | cudaMallocHost((void **)&a, bytes);
50 | memset(a, 0, bytes);
51 |
52 | // Device pointer array
53 | float **d_a = (float **)malloc(nStreams * sizeof(float *));
54 | cudaStream_t stream[nStreams];
55 |
56 | cudaEvent_t startEvent, stopEvent;
57 | checkCuda(cudaEventCreate(&startEvent));
58 | checkCuda(cudaEventCreate(&stopEvent));
59 | checkCuda(cudaEventRecord(startEvent, 0));
60 | for (int i = 0; i < nStreams; ++i)
61 | {
62 | cudaStreamCreate(&stream[i]);
63 | int offset = i * streamSize;
64 |
65 | checkCuda(cudaMallocAsync((void **)&d_a[i], streamBytes, stream[i])); // Allocate memory on the device asynchronously
66 |
67 | checkCuda(cudaMemcpyAsync(d_a[i], &a[offset], streamBytes, cudaMemcpyHostToDevice, stream[i]));
68 |
69 | kernel<<<streamSize / blockSize, blockSize, 0, stream[i]>>>(d_a[i], 0); // each per-stream chunk is indexed from 0
70 |
71 | checkCuda(cudaMemcpyAsync(&a[offset], d_a[i], streamBytes, cudaMemcpyDeviceToHost, stream[i]));
72 | }
73 |
74 | checkCuda(cudaEventRecord(stopEvent, 0));
75 | checkCuda(cudaEventSynchronize(stopEvent));
76 | checkCuda(cudaEventElapsedTime(&ms, startEvent, stopEvent));
77 | printf("Time for sequential transfer and execute (ms): %f\n", ms);
78 | printf(" max error: %e\n", maxError(a, n));
79 |
80 | // Synchronize and clean up
81 | checkCuda(cudaEventDestroy(startEvent));
82 | checkCuda(cudaEventDestroy(stopEvent));
83 | for (int i = 0; i < nStreams; ++i)
84 | checkCuda(cudaStreamDestroy(stream[i]));
85 |
86 | for (int i = 0; i < nStreams; ++i) cudaFree(d_a[i]); // free each per-stream device buffer
87 | free(d_a);
88 | cudaFreeHost(a);
89 | return 0;
90 | }
--------------------------------------------------------------------------------
/Chapter14/README.md:
--------------------------------------------------------------------------------
1 |
2 |
3 | In this article, I will introduce you to how to use shared memory on the GPU using CUDA. Before reading this article, please take a look at the [Memory Types in GPU](https://github.com/CisMine/Parallel-Computing-Cuda-C/tree/main/Chapter06)
4 |
5 |
6 |
Shared memory
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 | Shared memory is the fastest memory (only after the register file) in the GPU, and the scope of access to shared memory is the threads within the same block.
16 |
17 | Whenever copying data from global to shared memory, we must use __syncthreads() to synchronize the threads within the same block to **avoid race conditions**. This is because while **threads in a block run logically in parallel, not all threads can execute physically at the same time.**
18 |
19 | You can refer to these two articles to understand better:
20 | - [Data Hazard](https://github.com/CisMine/Parallel-Computing-Cuda-C/tree/main/Chapter12)
21 |
22 | - [Synchronization - Asynchronization](https://github.com/CisMine/Parallel-Computing-Cuda-C/tree/main/Chapter08)
23 |
24 |
25 |
26 |
Here is the process of data going from global to shared memory
27 |
28 |
29 |
30 |
31 |
32 |
33 |
34 | `
35 | In this article, I will focus on the concept and usage of shared memory. In future articles, I will discuss techniques to improve and optimize the use of shared memory
36 | `
37 |
38 |
39 |
Code
40 |
41 |
42 | We are already familiar with the concepts of static and dynamic memory, and in shared memory, we also have these concepts.
43 |
44 | ```
45 | __global__ void staticReverse(int *data, int n)
46 | {
47 | __shared__ int s[64];
48 | int t = threadIdx.x;
49 | int tr = n-t-1;
50 | s[t] = data[t];
51 | __syncthreads();
52 | data[t] = s[tr];
53 | }
54 | ```
55 |
56 | ```
57 | __global__ void dynamicReverse(int *data, int n)
58 | {
59 | extern __shared__ int s[];
60 | int t = threadIdx.x;
61 | int tr = n-t-1;
62 | s[t] = data[t];
63 | __syncthreads();
64 | data[t] = s[tr];
65 | }
66 | ```
67 |
68 | In this problem, we have two steps:
69 |
70 | - Copy data from global to shared memory in ascending order.
71 | - Copy data from shared memory back to global memory in descending order.
72 |
73 | ```
74 | staticReverse<<<1,n>>>(d_data, n);
75 | dynamicReverse<<<1,n,n*sizeof(int)>>>(d_data, n);
76 | ```
77 |
78 | In <<<a, b, c, d>>>:
79 |
80 | - a: the number of blocks
81 | - b: the number of threads per block
82 | - c: the size of shared memory
83 | - d: the stream in which the kernel is launched (optional; see the sketch below)
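
For completeness, here is a minimal sketch of a launch that fills in all four fields (the stream `s` is created here purely for illustration):

```
cudaStream_t s;
cudaStreamCreate(&s);

// 1 block, n threads, n*sizeof(int) bytes of dynamic shared memory, launched in stream s
dynamicReverse<<<1, n, n * sizeof(int), s>>>(d_data, n);

cudaStreamDestroy(s);
```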
84 |
85 |
86 |
87 |
88 |
89 |
--------------------------------------------------------------------------------
/Chapter13/atomic.cu:
--------------------------------------------------------------------------------
1 | #include <stdio.h>
2 |
3 | #define NUM_THREADS 10
4 | #define NUM_BLOCKS 2
5 | #define ARRAY_SIZE 20
6 |
7 | __global__ void AtomicAdd(int *result, int *array_add)
8 | {
9 | int tid = threadIdx.x + blockDim.x * blockIdx.x;
10 |
11 | atomicAdd(result, array_add[tid]);
12 |
13 | // if (threadIdx.x == 0)
14 | // {
15 | // atomicAdd(result, array_add[tid]);
16 | // }
17 | }
18 |
19 | __global__ void AtomicSub(int *result, int *array_sub)
20 | {
21 | int tid = threadIdx.x + blockDim.x * blockIdx.x;
22 |
23 | atomicSub(result, array_sub[tid]);
24 |
25 | // if (threadIdx.x == 0)
26 | // {
27 | // atomicSub(result, array_sub[tid]);
28 | // }
29 | }
30 |
31 | __global__ void AtomicMax(int *result, int *array_max)
32 | {
33 | int tid = threadIdx.x + blockDim.x * blockIdx.x;
34 |
35 | atomicMax(result, array_max[tid]);
36 |
37 | // if (threadIdx.x == 0)
38 | // {
39 | // atomicMax(result, array_max[tid]);
40 | // }
41 | }
42 |
43 | __global__ void AtomicMin(int *result, int *array_min)
44 | {
45 | int tid = threadIdx.x + blockDim.x * blockIdx.x;
46 |
47 | atomicMin(result, array_min[tid]);
48 |
49 | // if (threadIdx.x == 0)
50 | // {
51 | // atomicMin(result, array_min[tid]);
52 | // }
53 | }
54 |
55 | int main()
56 | {
57 | int *h_data = (int *)malloc(ARRAY_SIZE * sizeof(int));
58 | int *d_data;
59 | cudaMalloc((void **)&d_data, ARRAY_SIZE * sizeof(int));
60 |
61 | for (int i = 0; i < ARRAY_SIZE; i++)
62 | {
63 | h_data[i] = i;
64 | }
65 |
66 | //------------ atomicAdd-------------
67 | int *d_result_add;
68 | cudaMalloc((void **)&d_result_add, sizeof(int));
69 | int h_result_add = 0;
70 | cudaMemset(d_result_add, 0, sizeof(int)); // start the device-side result at 0
71 | //------------ atomicSub-------------
72 | int *d_result_sub;
73 | cudaMalloc((void **)&d_result_sub, sizeof(int));
74 | int h_result_sub = 0;
75 | cudaMemset(d_result_sub, 0, sizeof(int));
76 | //------------ atomicMax-------------
77 | int *d_result_max;
78 | cudaMalloc((void **)&d_result_max, sizeof(int));
79 | int h_result_max = 0;
80 | cudaMemset(d_result_max, 0, sizeof(int));
81 | //------------ atomicMin-------------
82 | int *d_result_min;
83 | cudaMalloc((void **)&d_result_min, sizeof(int));
84 | int h_result_min = 0;
85 | cudaMemset(d_result_min, 0, sizeof(int));
86 | cudaMemcpy(d_data, h_data, ARRAY_SIZE * sizeof(int), cudaMemcpyHostToDevice);
87 |
88 | AtomicAdd<<<NUM_BLOCKS, NUM_THREADS>>>(d_result_add, d_data);
89 | AtomicSub<<<NUM_BLOCKS, NUM_THREADS>>>(d_result_sub, d_data);
90 | AtomicMax<<<NUM_BLOCKS, NUM_THREADS>>>(d_result_max, d_data);
91 | AtomicMin<<<NUM_BLOCKS, NUM_THREADS>>>(d_result_min, d_data);
92 |
93 | cudaMemcpy(&h_result_add, d_result_add, sizeof(int), cudaMemcpyDeviceToHost);
94 | cudaMemcpy(&h_result_sub, d_result_sub, sizeof(int), cudaMemcpyDeviceToHost);
95 | cudaMemcpy(&h_result_max, d_result_max, sizeof(int), cudaMemcpyDeviceToHost);
96 | cudaMemcpy(&h_result_min, d_result_min, sizeof(int), cudaMemcpyDeviceToHost);
97 |
98 | printf("Atomic Add Result: %d\n", h_result_add);
99 | printf("Atomic Sub Result: %d\n", h_result_sub);
100 | printf("Atomic Max Result: %d\n", h_result_max);
101 | printf("Atomic Min Result: %d\n", h_result_min);
102 |
103 | free(h_data);
104 |
105 | cudaFree(d_result_add);
106 | cudaFree(d_result_sub);
107 | cudaFree(d_result_max);
108 | cudaFree(d_result_min);
109 | cudaFree(d_data);
110 |
111 | return 0;
112 | }
113 |
--------------------------------------------------------------------------------
/Chapter04/solution.cu:
--------------------------------------------------------------------------------
1 | // host call global
2 | // global call device
3 |
4 | #include <stdio.h>
5 |
6 | __device__ void Device1()
7 | {
8 | printf("Device1\n");
9 | }
10 |
11 | __device__ void Device2()
12 | {
13 | printf("Device2");
14 | }
15 |
16 | __global__ void kernel()
17 | {
18 | Device1();
19 | Device2();
20 | }
21 |
22 | void sub_Function_in_Host()
23 | {
24 | kernel<<<1, 1>>>();
25 | cudaDeviceSynchronize();
26 | }
27 |
28 | int main()
29 | {
30 | sub_Function_in_Host();
31 | return 0;
32 | }
33 |
34 | // Device1
35 | // Device2
36 |
37 | //-----------------------------------------------------------------------
38 | // host call device
39 | #include <stdio.h>
40 |
41 | __device__ void Device1()
42 | {
43 | printf("Device1\n");
44 | }
45 |
46 | __device__ void Device2()
47 | {
48 | printf("Device2");
49 | }
50 |
51 | void sub_Function_in_Host()
52 | {
53 | Device1();
54 | }
55 |
56 | int main()
57 | {
58 | sub_Function_in_Host();
59 | Device2();
60 | cudaDeviceSynchronize();
61 | return 0;
62 | }
63 |
64 | // error: calling a __device__ function("Device1") from a __host__ function("sub_Function_in_Host") is not allowed
65 | // error: calling a __device__ function("Device2") from a __host__ function("main") is not allowed
66 |
67 | //---------------------------------------------------------------------
68 | // device call host
69 | #include <stdio.h>
70 |
71 | void sub_Function_in_Host()
72 | {
73 | printf("host function");
74 | }
75 |
76 | __device__ void Device1()
77 | {
78 | sub_Function_in_Host();
79 | }
80 |
81 | int main()
82 | {
83 | Device1();
84 | cudaDeviceSynchronize();
85 | return 0;
86 | }
87 |
88 | // error: calling a __host__ function("sub_Function_in_Host") from a __device__ function("Device1") is not allowed
89 | // error: identifier "sub_Function_in_Host" is undefined in device code
90 |
91 | //------------------------------------------------------------
92 | // global call host
93 |
94 | #include <stdio.h>
95 |
96 | void sub_Function_in_Host()
97 | {
98 | printf("host function");
99 | }
100 |
101 | __global__ void kernel()
102 | {
103 | sub_Function_in_Host();
104 | }
105 |
106 | int main()
107 | {
108 | kernel<<<1, 1>>>();
109 | cudaDeviceSynchronize();
110 | return 0;
111 | }
112 |
113 | // error: calling a __host__ function("sub_Function_in_Host") from a __global__ function("kernel") is not allowed
114 | // error: identifier "sub_Function_in_Host" is undefined in device code
115 |
116 | //-----------------------------------------------------------------
117 | // device call global
118 |
119 | #include <stdio.h>
120 |
121 | __global__ void kernel()
122 | {
123 | printf("kernel function");
124 | }
125 |
126 | __device__ void Device1()
127 | {
128 | kernel<<<1, 1>>>();
129 | }
130 |
131 | int main()
132 | {
133 | Device1();
134 | cudaDeviceSynchronize();
135 | return 0;
136 | }
137 |
138 | // error: calling a __global__ function("kernel") from a __device__ function("Device1") is only allowed on the compute_35 architecture or above
139 |
140 | // -----------------------------------------------------------
141 | #include <stdio.h>
142 |
143 | __global__ void kernel1()
144 | {
145 | printf("kernel1\n");
146 | }
147 |
148 | __global__ void kernel2()
149 | {
150 | printf("kernel2\n");
151 | }
152 |
153 | int main()
154 | {
155 | kernel1<<<1, 1>>>();
156 | printf("CPU here\n");
157 | kernel2<<<1, 1>>>();
158 | cudaDeviceSynchronize();
159 | printf("CPU also here\n");
160 | return 0;
161 | }
162 |
163 | // CPU here
164 | // kernel1
165 | // kernel2
166 | // CPU also here
167 |
--------------------------------------------------------------------------------
/Chapter12/README.md:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 | When we talk about parallelism, we often encounter the phenomenon of data hazard, a bug that can be quite headache-inducing to fix because it is a logical error. However, now we have tools like NVIDIA Compute Sanitizer which make fixing this bug somewhat easier. In this article, I will explain what a data hazard is and illustrate it.
6 |
7 | It would be better if you read the article on [Synchronization - Asynchronization](https://github.com/CisMine/Parallel-Computing-Cuda-C/tree/main/Chapter08) before reading this one.
8 |
9 |
10 |
Data Hazard
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 | When multiple threads read and write the same value, their accesses can conflict; this phenomenon is called a data hazard.
19 |
20 | When discussing data hazards, we encounter two issues:
21 |
22 | - Data Race: This usually relates to "write after read" or "read after write", but it mainly focuses on simultaneous access (reading and/or writing) to a stored variable without synchronization. This can lead to a situation where one thread overwrites data that another thread is reading or preparing to write, leading to a conflict in data value.
23 | - Race Condition: This concept is broader and not limited to data access. A race condition occurs when the final result of a system depends on the unpredictable ordering or timing of events, so the behavior becomes undefined.
24 |
25 | **In summary, just remember: when coding in CUDA, be mindful of the phenomenon where multiple threads access the same value for processing.**
26 |
27 |
28 |
29 |
Illustration
30 |
31 |
32 |
33 | ```
34 | #include <stdio.h>
35 | #include <cuda_runtime.h>
36 |
37 | #define ARRAY_SIZE 4
38 |
39 |
40 | __global__ void sum(int *d_array)
41 | {
42 | int id = blockIdx.x * blockDim.x + threadIdx.x;
43 |
44 | for (int stride = 1; stride < 4; stride *= 2)
45 | {
46 | // __syncthreads(); -----> barrier
47 |
48 | if (threadIdx.x % (2 * stride) == 0)
49 | {
50 | d_array[id] += d_array[id + stride];
51 | }
52 | }
53 | printf("blockIdx.x=%d --> %d\n", blockIdx.x, d_array[id]);
54 | }
55 |
56 | int main()
57 | {
58 | int h_array[4] = {1, 2, 3, 4};
59 | int *d_array;
60 |
61 | cudaMalloc((void **)&d_array, sizeof(int) * ARRAY_SIZE);
62 | cudaMemcpy(d_array, h_array, sizeof(int) * ARRAY_SIZE, cudaMemcpyHostToDevice);
63 |
64 | sum<<<1, 4>>>(d_array);
65 |
66 | cudaFree(d_array);
67 |
68 | return 0;
69 | }
70 | ```
71 |
72 | This is an illustration of the principle of how the code operates.
73 |
74 |
75 |
76 |
77 |
78 | Here, we do not synchronize the threads, leading to a data race (in step 1: before 3+4 is completed, it moves to step 2, so it's 3 + 3 = 6 instead of 3 + 7 = 10).
79 |
80 |
81 |
82 |
83 |
84 | To solve this problem, we just need to place a barrier to make the threads wait for each other until the slowest threads have finished, using the command **__syncthreads().**
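
For reference, the corrected loop (this is what data_hazard.cu in this chapter does) simply enables the barrier inside the loop:

```
for (int stride = 1; stride < 4; stride *= 2)
{
    __syncthreads();   // every thread waits here, so all writes from the previous step are visible

    if (threadIdx.x % (2 * stride) == 0)
    {
        d_array[id] += d_array[id + stride];
    }
}
```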
85 |
86 |
87 |
88 |
89 |
90 | And the output after adding syncthreads.
91 |
92 |
93 |
94 |
95 |
96 |
97 |
98 |
--------------------------------------------------------------------------------
/Chapter08/README.md:
--------------------------------------------------------------------------------
1 | To answer the question in [Chapter07](https://github.com/CisMine/Parallel-Computing-Cuda-C/tree/main/Chapter07), we need to go through two concepts: synchronization and asynchronization.
2 |
3 |
4 |
5 |
6 |
Synchronization and Asynchronization
7 |
8 |
9 | Before explaining these two concepts, let's go through an example to help you visualize better.
10 |
11 | Example:
12 | In a school: we have N assignments, K classes, and each class has J students (N > K * J or N = K * J), and the task is to distribute these N assignments among the students. Here, I will use specific numbers to make it easier for you to understand.
13 |
14 | So, the problem becomes: 2048 assignments, 32 classes, 32 students per class. Our task is to distribute these 2048 assignments for students to solve. In this case, we will assign 64 assignments to each class (32 * 64 = 2048), meaning each student will handle 2 assignments.
15 |
16 | Because **each student will have a different assignment-solving speed**, the **completion speed of each class** will be different. But an interesting point here is the **synchronization among students within the same class**, meaning when they complete one assignment, they will wait until the last person (the slowest) finishes that assignment before starting the next one.
17 |
18 |
19 |
20 |
21 |
22 |
23 |
24 |
25 | Here, **the students are like threads, the classes are like blocks, and N assignments are the data**. Similar to the example mentioned, the **blocks will complete their work at different speeds** (not sequentially), and the **threads within the same block will also have different completion speeds**, but they will be synchronized, meaning they will only move on to the next task when the slowest thread finishes.
26 |
27 | The reason for synchronization is the warp mechanism, which takes 32 tasks for 32 threads at a time (I explained this in the [Warp section in Chapter03](https://github.com/CisMine/Parallel-Computing-Cuda-C/tree/main/Chapter03)).
28 |
29 | To avoid a situation where waiting for one person affects everyone, we have the concept of **Latency Hiding**.
30 |
31 |
32 |
33 |
34 |
35 |
36 |
37 | As shown in the image, when one warp is being executed but takes too long (possibly because it is waiting on some slow threads), another warp is automatically scheduled in its place (meaning there are 32 new assignments for 32 students to work on).
38 |
39 | Thanks to this, the threads are always busy, or in other words, we should **always keep the threads busy**.
40 |
41 | Because of the synchronization mechanism, we encounter a phenomenon called **thread divergence** (this phenomenon significantly affects our performance).
42 |
43 | **Thread divergence** occurs when threads perform different calculations or encounter branching conditions. Due to the synchronization mechanism, they have to wait for each other.
44 |
45 |
46 |
47 |
48 |
49 |
50 |
51 | In other words, instead of processing Path A and Path B in parallel, here it has to be done sequentially. Threads that satisfy Path A work first, while those satisfying Path B have to wait until Path A is finished.
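
A minimal sketch of a kernel that triggers thread divergence (the kernel name and the work done in each branch are just illustrative):

```
__global__ void divergentKernel(int *data)
{
    int tid = blockIdx.x * blockDim.x + threadIdx.x;

    if (tid % 2 == 0)
        data[tid] *= 2;   // Path A: taken by the even threads of a warp
    else
        data[tid] += 1;   // Path B: the odd threads wait until Path A is done
}
```

Within a single warp, the two branches run one after the other, so at any moment half of the threads sit idle.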
52 |
53 | ### Summary:
54 |
55 |
56 |
57 |
58 |
59 |
60 | Threads within the same block synchronize but only when moving to the next task within the same job; they are still asynchronous (similar to the assignment-solving speed of the students in my example). Blocks (or threads from different blocks) are asynchronous.
61 |
62 | Through this article, you probably have a better understanding of why we get such output.
63 | 
64 |
65 |
--------------------------------------------------------------------------------
/Chapter11/streaming.cu:
--------------------------------------------------------------------------------
1 | #include <stdio.h>
2 | #include <assert.h>
3 | #include <math.h>
4 | #include <string.h>
5 |
6 | // Convenience function for checking CUDA runtime API results
7 | // can be wrapped around any runtime API call. No-op in release builds.
8 | inline cudaError_t checkCuda(cudaError_t result)
9 | {
10 | #if defined(DEBUG) || defined(_DEBUG)
11 | if (result != cudaSuccess)
12 | {
13 | fprintf(stderr, "CUDA Runtime Error: %s\n", cudaGetErrorString(result));
14 | assert(result == cudaSuccess);
15 | }
16 | #endif
17 | return result;
18 | }
19 |
20 | __global__ void kernel(float *a, int offset)
21 | {
22 | int i = offset + threadIdx.x + blockIdx.x * blockDim.x;
23 | float x = (float)i;
24 | float s = sinf(x);
25 | float c = cosf(x);
26 | a[i] = a[i] + sqrtf(s * s + c * c);
27 | }
28 |
29 | float maxError(float *a, int n)
30 | {
31 | float maxE = 0;
32 | for (int i = 0; i < n; i++)
33 | {
34 | float error = fabs(a[i] - 1.0f);
35 | if (error > maxE)
36 | maxE = error;
37 | }
38 | return maxE;
39 | }
40 |
41 | int main(int argc, char **argv)
42 | {
43 | const int blockSize = 256, nStreams = 4;
44 | const int n = 4 * 1024 * blockSize * nStreams;
45 | const int streamSize = n / nStreams;
46 | const int streamBytes = streamSize * sizeof(float);
47 | const int bytes = n * sizeof(float);
48 |
49 | // Allocate pinned host memory and device memory
50 | float *a, *d_a;
51 | checkCuda(cudaMallocHost((void **)&a, bytes)); // Host pinned
52 | checkCuda(cudaMalloc((void **)&d_a, bytes)); // Device
53 |
54 | float ms; // Elapsed time in milliseconds
55 |
56 | // Create events and streams
57 | cudaStream_t stream[nStreams];
58 |
59 | cudaEvent_t startEvent, stopEvent;
60 | checkCuda(cudaEventCreate(&startEvent));
61 | checkCuda(cudaEventCreate(&stopEvent));
62 |
63 | for (int i = 0; i < nStreams; ++i)
64 | checkCuda(cudaStreamCreate(&stream[i]));
65 |
66 | // Baseline case - sequential transfer and execute
67 | memset(a, 0, bytes);
68 | checkCuda(cudaEventRecord(startEvent, 0));
69 | checkCuda(cudaMemcpy(d_a, a, bytes, cudaMemcpyHostToDevice));
70 | kernel<<<n / blockSize, blockSize>>>(d_a, 0);
71 | checkCuda(cudaMemcpy(a, d_a, bytes, cudaMemcpyDeviceToHost));
72 | checkCuda(cudaEventRecord(stopEvent, 0));
73 | checkCuda(cudaEventSynchronize(stopEvent));
74 | checkCuda(cudaEventElapsedTime(&ms, startEvent, stopEvent));
75 | printf("Time for sequential transfer and execute (ms): %f\n", ms);
76 | printf(" max error: %e\n", maxError(a, n));
77 |
78 | // Asynchronous version 1: loop over {copy, kernel, copy}
79 | memset(a, 0, bytes);
80 | checkCuda(cudaEventRecord(startEvent, 0));
81 | for (int i = 0; i < nStreams; ++i)
82 | {
83 | int offset = i * streamSize;
84 | checkCuda(cudaMemcpyAsync(&d_a[offset], &a[offset],
85 | streamBytes, cudaMemcpyHostToDevice,
86 | stream[i]));
87 |
88 | kernel<<<streamSize / blockSize, blockSize, 0, stream[i]>>>(d_a, offset);
89 |
90 | checkCuda(cudaMemcpyAsync(&a[offset], &d_a[offset],
91 | streamBytes, cudaMemcpyDeviceToHost,
92 | stream[i]));
93 | }
94 |
95 | checkCuda(cudaEventRecord(stopEvent, 0));
96 | checkCuda(cudaEventSynchronize(stopEvent));
97 | checkCuda(cudaEventElapsedTime(&ms, startEvent, stopEvent));
98 | printf("Time for asynchronous V1 transfer and execute (ms): %f\n", ms);
99 | printf(" max error: %e\n", maxError(a, n));
100 |
101 | // Asynchronous version 2:
102 | // Loop over copy, loop over kernel, loop over copy memset(a, 0, bytes);
103 | memset(a, 0, bytes);
104 | checkCuda(cudaEventRecord(startEvent, 0));
105 | for (int i = 0; i < nStreams; ++i)
106 | {
107 | int offset = i * streamSize;
108 | checkCuda(cudaMemcpyAsync(&d_a[offset], &a[offset],
109 | streamBytes, cudaMemcpyHostToDevice,
110 | stream[i]));
111 | }
112 |
113 | for (int i = 0; i < nStreams; ++i)
114 | {
115 | int offset = i * streamSize;
116 | kernel<<<streamSize / blockSize, blockSize, 0, stream[i]>>>(d_a, offset);
117 | }
118 |
119 | for (int i = 0; i < nStreams; ++i)
120 | {
121 | int offset = i * streamSize;
122 | checkCuda(cudaMemcpyAsync(&a[offset], &d_a[offset],
123 | streamBytes, cudaMemcpyDeviceToHost,
124 | stream[i]));
125 | }
126 |
127 | checkCuda(cudaEventRecord(stopEvent, 0));
128 | checkCuda(cudaEventSynchronize(stopEvent));
129 | checkCuda(cudaEventElapsedTime(&ms, startEvent, stopEvent));
130 | printf("Time for asynchronous V2 transfer and execute (ms): %f\n", ms);
131 | printf(" max error: %e\n", maxError(a, n));
132 |
133 | // Cleanup
134 | checkCuda(cudaEventDestroy(startEvent));
135 | checkCuda(cudaEventDestroy(stopEvent));
136 | for (int i = 0; i < nStreams; ++i)
137 | checkCuda(cudaStreamDestroy(stream[i]));
138 | cudaFree(d_a);
139 | cudaFreeHost(a);
140 |
141 | return 0;
142 | }
143 |
144 |
--------------------------------------------------------------------------------
/Chapter10/README.md:
--------------------------------------------------------------------------------
1 | In this article, I will discuss the concept of pinned memory - please note that it is closely related to the next article (streaming), so it is worth getting comfortable with this material first.
2 |
3 |
4 |
Pinned memory
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 | Before explaining what pinned memory is, I will go over how a computer handles memory when we code, for a clearer understanding - and rest assured, I will explain it in a simple, easy-to-understand way, so you don't need any hardware background.
13 |
14 |
15 |
16 |
How computers work
17 |
18 |
19 | When it comes to memory, we always have two perspectives: Physical memory and virtual memory (or logical memory).
20 |
21 | - Physical memory: the memory directly installed on the CPU and RAM sticks, connected directly. These are the memory cells located on the motherboard.
22 | - Virtual memory: an abstraction (making it easier for programmers to work with memory) managed by the operating system and drivers. The OS creates this logical address space and maps logical addresses to physical addresses in RAM (and, when RAM runs short, to space on disk).
23 |
24 | The space of virtual memory is much larger than that of physical memory
25 |
26 |
27 |
28 |
29 |
30 |
31 | Initially, when we allocate memory for the CPU, it is placed on RAM - Main memory (physical) or pageable memory (logical). Copying data from the CPU to the GPU when the data is on pageable memory can cause a significant problem known as swapping.
32 |
33 | When data is stored in pageable memory, it might not always be ready for quick access because the system can move this data to the hard disk (magnetic disk) to free up RAM for other tasks. This process is called 'swapping'.
34 |
35 | When data needs to be transferred from the CPU to the GPU and that data is on the hard disk (due to swapping), you encounter a 'missing data' issue. This happens because the GPU requires quick access to the data, but the data is not readily available in RAM.
36 |
37 | ===> Therefore, CUDA has implemented a mechanism before copying from CPU to GPU to push all the data that needs to be copied to pinned memory (simply understood as pinning the necessary data so it cannot be moved down to the hard disk) and only after pushing everything to pinned memory does it start copying from the host to the device.
38 |
39 | cudaMemcpy therefore involves copying twice (pageable memory ==> pinned memory ==> device memory).
40 |
41 | Instead of requiring two copies, NVIDIA provides a function (cudaMallocHost) that lets us place the data in pinned memory from the start, so only one copy (host to device) is needed.
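
A minimal sketch of the difference on the allocation side (the size and names are just illustrative):

```
size_t bytes = 1 << 20;

// Pageable host memory: cudaMemcpy has to stage it through a pinned buffer first
float *h_pageable = (float *)malloc(bytes);

// Pinned host memory: the transfer goes straight to the device in a single copy
float *h_pinned;
cudaMallocHost((void **)&h_pinned, bytes);

free(h_pageable);
cudaFreeHost(h_pinned);
```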
42 |
43 |
44 |
45 |
46 |
47 |
48 |
49 |
50 |
Code
51 |
52 |
53 | ```
54 | #include <iostream>
55 | #include <cassert>
56 |
57 |
58 | using std::cout;
59 | using std::endl;
60 |
61 |
62 | __global__ void vectorAdd(int *a, int *b, int *c, int N)
63 | {
64 | int tid = (blockIdx.x * blockDim.x) + threadIdx.x;
65 |
66 | if (tid < N)
67 | {
68 | c[tid] = a[tid] + b[tid];
69 | }
70 | }
71 |
72 | void verify_result(int *a, int *b, int *c, int N)
73 | {
74 | for (int i = 0; i < N; i++)
75 | {
76 | assert(c[i] == a[i] + b[i]);
77 | }
78 | }
79 |
80 | int main()
81 | {
82 | constexpr int N = 100;
83 | size_t bytes = sizeof(int) * N;
84 |
85 | // Vectors for holding the host-side (CPU-side) data
86 | int *h_a, *h_b, *h_c;
87 |
88 | // Allocate pinned memory
89 | cudaMallocHost(&h_a, bytes);
90 | cudaMallocHost(&h_b, bytes);
91 | cudaMallocHost(&h_c, bytes);
92 |
93 | for (int i = 0; i < N; i++)
94 | {
95 | h_a[i] = rand() % 100;
96 | h_b[i] = rand() % 100;
97 | }
98 |
99 | // Allocate memory on the device
100 | int *d_a, *d_b, *d_c;
101 | cudaMalloc(&d_a, bytes);
102 | cudaMalloc(&d_b, bytes);
103 | cudaMalloc(&d_c, bytes);
104 |
105 |
106 |
107 | // Copy data from the host to the device (CPU -> GPU)
108 | cudaMemcpy(d_a, h_a, bytes, cudaMemcpyHostToDevice);
109 | cudaMemcpy(d_b, h_b, bytes, cudaMemcpyHostToDevice);
110 |
111 |
112 |
113 |
114 | int NUM_THREADS = 1 << 10;
115 | int NUM_BLOCKS = (N + NUM_THREADS - 1) / NUM_THREADS;
116 |
117 |
118 |
119 | // Execute the kernel
120 | vectorAdd<<<NUM_BLOCKS, NUM_THREADS>>>(d_a, d_b, d_c, N);
121 |
122 |
123 |
124 |
125 |
126 | // Copy data from device to host (GPU -> CPU)
127 | cudaMemcpy(h_c, d_c, bytes, cudaMemcpyDeviceToHost);
128 |
129 |
130 |
131 |
132 | // Check result for errors
133 | verify_result(h_a, h_b, h_c, N);
134 |
135 | // Free pinned memory
136 | cudaFreeHost(h_a);
137 | cudaFreeHost(h_b);
138 | cudaFreeHost(h_c);
139 |
140 | // Free memory on device
141 | cudaFree(d_a);
142 | cudaFree(d_b);
143 | cudaFree(d_c);
144 |
145 | cout << "COMPLETED SUCCESSFULLY\n";
146 |
147 | return 0;
148 | }
149 |
150 | ```
151 |
152 | The only difference is that the host-side data is allocated with cudaMallocHost (and released with cudaFreeHost).
153 |
154 |
155 |
Exercise
156 |
157 |
158 | Write a script to compare the time taken to copy data from H2D - D2H between pageable memory and pinned memory
159 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
2 |
3 |
Parallel Computing Using Cuda-C
4 |
5 |
6 | This repository contains code examples and resources for parallel computing using CUDA-C. CUDA-C is a parallel computing platform and programming model developed by NVIDIA, specifically designed for creating GPU-accelerated applications.
7 |
8 | The goal of this repository is to provide beginners with a starting point to understand parallel computing concepts and how to utilize CUDA-C to leverage the power of GPUs for accelerating computationally intensive tasks. Whether you are a student, researcher, or developer interested in parallel computing, this repository aims to provide a practical guide and code examples to get you started.
9 |
10 |
11 |
12 |
Introduction to CUDA-C
13 |
14 |
15 | CUDA-C is an extension of the C programming language that allows developers to write code that can be executed on NVIDIA GPUs. It provides a set of language extensions, libraries, and tools that enable developers to harness the power of parallel processing on GPUs.
16 |
17 | CUDA-C allows you to write parallel code using the CUDA programming model, which includes defining kernels (functions that execute on the GPU) and managing data transfers between the CPU and GPU. By writing CUDA-C code, you can achieve significant speedups for computationally intensive tasks compared to running the same code on the CPU alone.
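
As a small taste of what that looks like, here is a minimal sketch (the kernel name and sizes are just illustrative; later chapters walk through each part in detail):

```
#include <stdio.h>

// A kernel: a function that runs on the GPU, one instance per thread
__global__ void addOne(int *data, int n)
{
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid < n)
        data[tid] += 1;
}

int main()
{
    const int n = 256;
    int h_data[n];
    for (int i = 0; i < n; i++) h_data[i] = i;

    // Manage the data transfers between the CPU (host) and the GPU (device)
    int *d_data;
    cudaMalloc(&d_data, n * sizeof(int));
    cudaMemcpy(d_data, h_data, n * sizeof(int), cudaMemcpyHostToDevice);

    addOne<<<1, n>>>(d_data, n);   // launch the kernel with 1 block of n threads

    cudaMemcpy(h_data, d_data, n * sizeof(int), cudaMemcpyDeviceToHost);
    cudaFree(d_data);

    printf("h_data[10] = %d\n", h_data[10]);   // prints 11
    return 0;
}
```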
18 |
19 |
20 |
Why we need Cuda-C
21 |
22 |
23 | 
24 |
25 |
26 |
27 | With the exponential growth of data and increasing demands from users, CPUs alone are no longer sufficient for efficient processing. GPUs offer parallel processing capabilities, making them well-suited for handling large-scale computations. CUDA-C, developed by NVIDIA, enables developers to leverage GPUs for accelerated processing, resulting in faster and more efficient data processing.
28 |
29 |
30 |
31 |
32 |
Getting Started
33 |
34 |
35 | ### If your computer has GPU
36 | Follow these steps from NVIDIA to install the [Cuda Toolkit](https://developer.nvidia.com/cuda-downloads)
37 |
38 | - If you are using Linux, I advise you to watch [this video](https://www.youtube.com/watch?v=wxNQQP9U1Bc)
39 |
40 | - If you are using Windows, this is [your video](https://www.youtube.com/watch?v=cuCWbztXk4Y&t=49s)
41 |
42 | ### If your computer doesn't have GPU
43 | - Don't worry; I'll demonstrate how to set up and use Google Colab to code [in here](https://medium.com/@giahuy04/the-easiest-way-to-run-cuda-c-in-google-colab-831efbc33d7a)
44 |
45 |
46 |
47 |
Table of Contents
48 |
49 |
50 | - [Chapter01: Demystifying CPUs and GPUs: What You Need to Know](https://github.com/CisMine/Parallel-Computing-Cuda-C/tree/main/Chapter01)
51 |
52 | - [Chapter02: How the way a computer works](https://github.com/CisMine/Parallel-Computing-Cuda-C/tree/main/Chapter02)
53 |
54 | - [Chapter03: Terminology in parallel programming](https://github.com/CisMine/Parallel-Computing-Cuda-C/tree/main/Chapter03)
55 | - [Chapter04: Hello world Cuda-C](https://github.com/CisMine/Parallel-Computing-Cuda-C/tree/main/Chapter04)
56 | - [Chapter05: The operational mechanism of CPU-GPU](https://github.com/CisMine/Parallel-Computing-Cuda-C/tree/main/Chapter05)
57 | - [Chapter06: Memory Types in GPU](https://github.com/CisMine/Parallel-Computing-Cuda-C/tree/main/Chapter06)
58 | - [Chapter07: Using GPU memory](https://github.com/CisMine/Parallel-Computing-Cuda-C/tree/main/Chapter07)
59 | - [Chapter08: Synchronization and Asynchronization](https://github.com/CisMine/Parallel-Computing-Cuda-C/tree/main/Chapter08)
60 | - [Chapter09: Unified memory](https://github.com/CisMine/Parallel-Computing-Cuda-C/tree/main/Chapter09)
61 | - [Chapter10: Pinned memory](https://github.com/CisMine/Parallel-Computing-Cuda-C/tree/main/Chapter10)
62 | - [Chapter11: Streaming](https://github.com/CisMine/Parallel-Computing-Cuda-C/tree/main/Chapter11)
63 | - [Chapter12: Data Hazard](https://github.com/CisMine/Parallel-Computing-Cuda-C/tree/main/Chapter12)
64 | - [Chapter13: Atomic Function](https://github.com/CisMine/Parallel-Computing-Cuda-C/tree/main/Chapter13)
65 | - [Chapter14: Shared memory](https://github.com/CisMine/Parallel-Computing-Cuda-C/tree/main/Chapter14)
66 |
67 |
68 |
69 |
70 |
Resources
71 |
72 |
73 | In addition to the code examples, this repository provides a curated list of resources, including books, tutorials, online courses, and research papers, to further enhance your understanding of parallel computing and CUDA-C programming. These resources will help you delve deeper into the subject and explore advanced topics and techniques.
74 |
75 | - [NVIDIA Practices_Guide 2023](https://docs.nvidia.com/cuda/pdf/CUDA_C_Best_Practices_Guide.pdf)
76 | - [NVIDIA Programming_Guide 2023](https://docs.nvidia.com/cuda/pdf/CUDA_C_Programming_Guide.pdf)
77 | - [GPU Programming 2021 in youtube](https://www.youtube.com/watch?v=wFlejBXX9Gk&list=PL3xCBlatwrsXCGW4SfEoLzKiMSUCE7S_X)
78 | - [Cuda Programming 2023 in youtube](https://www.youtube.com/watch?v=cvo3gnInQ7M&list=PL1ysOEBe5977vlocXuRt6KBCYu_sdu1Ru)
79 | - [Programming Massively 2022 in youtube](https://www.youtube.com/playlist?list=PLRRuQYjFhpmubuwx-w8X964ofVkW1T8O4)
80 | - [Cuda training series 2022-2023 in youtube](https://www.youtube.com/playlist?list=PL6RdenZrxrw-zNX7uuGppWETdxt_JxdMj)
81 | - [Programming Heterogeneous Computing Systems with GPUs 2023 in youtube](https://www.youtube.com/playlist?list=PL5Q2soXY2Zi-qSKahS4ofaEwYl7_qp9mw)
82 | - [Cuda Thread Indexing cheatsheet](https://cs.calvin.edu/courses/cs/374/CUDA/CUDA-Thread-Indexing-Cheatsheet.pdf)
83 |
84 |
85 |
--------------------------------------------------------------------------------
/Chapter11/README.md:
--------------------------------------------------------------------------------
1 |
2 | In this article, I will guide you through a technique to optimize a program in CUDA C. This technique is relatively simple, but it will be even more beneficial if you have read the articles on [Pinned memory](https://github.com/CisMine/Parallel-Computing-Cuda-C/tree/main/Chapter10) and [Async-Sync](https://github.com/CisMine/Parallel-Computing-Cuda-C/tree/main/Chapter08).
3 |
4 |
5 |
6 |
Streaming
7 |
8 |
9 |
10 | As I mentioned, the CPU and GPU are two separate components, and as a result, the execution of code on the CPU and GPU occurs independently of each other, without any mutual interference. We will leverage this characteristic to further optimize our program in a parallel manner.
11 |
12 | 
13 |
14 | When it comes to CUDA-C code, we need to focus on two concepts: compute bound and memory bound (which can be understood simply as two issues: spending too much time on computation or on memory load/store operations). The Streaming technique will help us address the memory-bound aspect.
15 |
16 |
17 | I will delve deeper into the concepts of compute bound and memory bound in the [NVIDIA-Tools series](https://github.com/CisMine/Guide-NVIDIA-Tools) , so if you're interested, you can read more about them there. Here, I'll concentrate on the code.
18 |
19 |
20 | As I mentioned, in order to run code on the GPU, we have to copy data from the CPU to the GPU, which can be quite time-consuming (because if we use cudaMemcpy, we have to wait for the entire copy to complete before proceeding to the next step). Instead of waiting for the entire copy, we can break it down into smaller parts (batches) to optimize this process (similar to what's shown in the diagram).
21 |
22 | **There are two main components that always appear when we talk about Streaming: Pinned memory and Stream branches.**
23 |
24 | - Pinned memory: asynchronous copies (cudaMemcpyAsync) require the host buffer to be pinned, and pinned transfers are fast because no extra staging copy is needed. Since we copy the data in small chunks, the amount of pinned memory in flight at any moment stays small.
25 |
26 | - Stream branches (CUDA streams): independent queues of work (copies and kernels) that the GPU can overlap with one another, so different portions of the same data are processed in parallel. Think of each stream branch as a manager responsible for dividing tasks among threads. If you don't create streams explicitly, everything goes into the default stream.
27 |
28 |
29 |
30 |
31 |
32 |
33 |
Code
34 |
35 |
36 |
37 | As I mentioned earlier, a large chunk of data will be divided and processed. The splitting and processing will be done in parallel rather than sequentially, thanks to the stream branch mechanism.
38 |
39 | The first step when using streaming is to create stream branches by:
40 |
41 | ```
42 | cudaStream_t stream[nStreams];   // then create each one with cudaStreamCreate(&stream[i])
43 | ```
44 |
45 | The rest is quite similar, with just a slight difference in:
46 |
47 | ```
48 | cudaMemcpyAsync(&d_a[offset], &a[offset], streamBytes, cudaMemcpyHostToDevice, stream[i]);
49 | kernel<<<streamSize / blockSize, blockSize, 0, stream[i]>>>(d_a, offset);
50 | ```
51 |
52 | As I mentioned, because we're copying only a portion, we need to determine an index, also known as an offset, to maintain the correct copying across different branches.
53 |
54 | In this context, the third launch parameter, 0, is the amount of dynamic shared memory, which you don't need to be concerned about at this point.
55 |
56 | **Here, there are two methods for using streaming, and you can choose the method that suits your computer.**
57 |
58 | ###
59 |
Asynchronous version 1: loop over {copy, kernel, copy}
60 |
61 |
62 | 
63 |
64 | ```
65 | for (int i = 0; i < nStreams; ++i) {
66 | int offset = i * streamSize;
67 | cudaMemcpyAsync(&d_a[offset], &a[offset], streamBytes, cudaMemcpyHostToDevice, stream[i]);
68 | kernel<<<streamSize / blockSize, blockSize, 0, stream[i]>>>(d_a, offset);
69 | cudaMemcpyAsync(&a[offset], &d_a[offset], streamBytes, cudaMemcpyDeviceToHost, stream[i]);
70 | }
71 | ```
72 |
73 | ###
74 |
Asynchronous version 2: loop over copy, loop over kernel, loop over copy
75 |
76 |
77 | 
78 |
79 | ```
80 | for (int i = 0; i < nStreams; ++i) {
81 | int offset = i * streamSize;
82 | cudaMemcpyAsync(&d_a[offset], &a[offset],
83 | streamBytes, cudaMemcpyHostToDevice, stream[i]);
84 | }
85 |
86 | for (int i = 0; i < nStreams; ++i) {
87 | int offset = i * streamSize;
88 | kernel<<<streamSize / blockSize, blockSize, 0, stream[i]>>>(d_a, offset);
89 | }
90 |
91 | for (int i = 0; i < nStreams; ++i) {
92 | int offset = i * streamSize;
93 | cudaMemcpyAsync(&a[offset], &d_a[offset],
94 | streamBytes, cudaMemcpyDeviceToHost, stream[i]);
95 | }
96 | ```
97 |
98 |
99 |
Exercise
100 |
101 |
102 | - Code to compare the execution time of the two methods
103 | - How to determine how many branches to divide into
104 |
105 | Hint: it is not always better to divide the work into many stream branches. As I mentioned, stream branches are like managers: if there are only a few tasks, hiring many managers is wasteful when one would be enough.
106 |
--------------------------------------------------------------------------------
/Chapter01/README.md:
--------------------------------------------------------------------------------
1 |
2 |
3 |
# Demystifying CPUs and GPUs: What You Need to Know
4 |
5 |
6 | It’s possible that the terms CPUs and GPUs are familiar to some people, while others may have only heard of them without fully understanding their purpose. In this post, I will do my best to provide `a clear and simple explanation` for those who are unfamiliar with these terms.
7 |
8 |
9 |
10 |
11 |
## CPUs
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
### What are CPUs?
20 |
21 |
22 |
23 | CPUs or Central Processing Units are the primary components that perform most of the processing tasks in a computer system. They are also known as the “brain” of a computer.
24 |
25 | The CPU is responsible for executing instructions that are stored in the computer’s memory, performing arithmetic and logical operations, and managing data flows between different parts of the computer. The performance of a CPU is usually measured by its clock speed, which is the number of cycles it can execute per second, and the number of cores, which refers to the number of independent processing units within the CPU.
26 |
27 |
28 |
### Development and the problems
29 |
30 |
31 |
32 |
33 |
34 |
35 |
36 |
37 |
38 |
39 |
40 | The general trend of these graphs shows a significant increase, but there appears to be a pause or drop from 2004 to 2005, particularly in frequency. `So what happened at that time?`
41 |
42 | ### Before I go any further, let me just say that this article will provide “a clear and simple” explanation for those who are unfamiliar with these terms. As a result, I won’t go into great detail about what actually occurred, but if you’re interested, check out this [link](http://www.edwardbosworth.com/My5155_Slides/Chapter01/ThePowerWall.htm)
43 |
44 | The two major issues that CPUs faced at the time were the `Power wall` and `Memory wall`.
45 |
46 | To put it simply, more power means quicker processing. However, more powerful CPUs require a higher voltage to maintain stability at such speeds, so we couldn’t keep clocking processors any faster.
47 |
48 | Another problem was the latency when accessing memory: if a computer has a powerful CPU but poor memory access, processing still takes a long time, which is why the efficiency of many workloads is determined by memory access times. **And that is how the parallel era began.**
49 |
50 |
51 |
52 |
53 |
## GPUs
54 |
55 |
56 | 
57 |
58 |
59 |
60 |
### What are GPUs?
61 |
62 |
63 | GPUs (Graphics Processing Units) are specialized hardware components designed to accelerate the processing of graphics and parallel computing tasks. They were originally developed for use in computer gaming and graphical applications, but are now widely used in scientific and engineering applications as well.
64 |
65 |
66 | 
67 |
68 | These graphs demonstrate how much more quickly GPUs calculate than CPUs, so which features allow GPUs to perform at that level?
69 |
70 |
71 |
### Parallel processing
72 |
73 |
74 | Let me give an example of what parallel processing is:
75 | Assume that a teacher gives a class 10 questions to answer. The simplest solution is to have the best student in the class complete all 10 questions. However, if we can find and train the remaining 9 students to be as good as the best student, we can speed up the process of answering the questions by 10 times, and that’s how GPUs work.
76 |
77 | In conclusion, saving time by breaking up large tasks into smaller ones that can be handled simultaneously is the best benefit of parallel processing. However, there are some limitations to task division, such as when a teacher assigns a class N questions but we are unable to find N students to prepare for and complete those tasks.
78 |
79 | ### Parallel calculation techniques
80 | Parallel calculation techniques are a major evolution of sequential computing. They refer to the methods and algorithms used to divide a large computational task into smaller subtasks that can be executed simultaneously on multiple processors or computers.
81 |
82 | Parallel computing allows for the use of multiple processing units to work together to solve complex problems faster and more efficiently than traditional sequential computing. Parallel computation can be achieved through various approaches such as shared-memory, distributed-memory, and hybrid models.
83 |
84 |
85 |
86 |
## Conclusion
87 |
88 |
89 | You now understand how to use a parallel program to get around the CPUs’ limitations.
90 |
91 | My next post will discuss parallel programming, the languages we use, and a deeper explanation of how graphics processing units (GPUs) operate.
92 |
93 | Last but not least, thank you for reading this far. Please don’t hesitate to star the repo if you enjoyed it.
94 |
95 |
96 |
97 |
## References
98 |
99 |
100 | https://www.cs.princeton.edu/~dpw/courses/cos326-12/lec/15-parallel-intro.pdf
101 | http://www.edwardbosworth.com/My5155_Slides/Chapter01/ThePowerWall.htm
102 |
103 |
--------------------------------------------------------------------------------
/Chapter02/README.md:
--------------------------------------------------------------------------------
1 |
2 |
# How a computer works
3 |
4 |
5 | In this article, I will briefly discuss how computers work in terms of retrieving and processing data using an extremely intuitive and easy-to-understand example.
6 | And please note that this example will be referred to quite often in parallel programming lessons, so I hope you read it carefully.
7 |
8 |
9 |
## Read-Write data
10 |
11 |
12 | Example: Assuming you have 1024 cookies and there are 32 children waiting in line to receive cookies, there are two ways you can distribute the cookies.
13 |
14 |
15 |
16 |
17 |
18 | ## Method 1: In this method, you distribute the cookies in batches of 32. Here’s how it works:
19 | - The first child in line receives a batch of 32 cookies.
20 | - The second child in line also receives a batch of 32 cookies.
21 | - This process continues, and each child in line receives a batch of 32 cookies until you reach the 32nd child.
22 | - The 32nd child receives the last batch of 32 cookies.
23 |
24 | ## Method 2: In this method, you distribute the cookies one at a time and have the child return to the end of the line after receiving a cookie. Here’s how it works:
25 |
26 | - The first child in line receives one cookie and then goes to the back of the line.
27 | - The second child in line receives one cookie and also goes to the back of the line.
28 | - This process continues, with each child receiving one cookie and then joining the end of the line, until you have gone through all 32 children in the line.
29 | - After going through the entire line once, you repeat the process for a total of 32 iterations.
30 | - During the 32nd iteration, the last cookie (the 1,024th cookie) is given to the 32nd child in line.
31 |
32 |
33 | **In the given scenario, it may seem obvious that Method 1 is faster than Method 2. However, when it comes to computers, Method 2 is actually faster. Now, let’s analyze the reasons why computers would choose Method 2.**
34 |
35 | ## Let’s analyze the example
36 |
37 | In this analogy, the cookies represent the data that needs to be processed, and the children represent the individuals processing that data.
38 |
39 | There are two important points about how computers handle data that are often overlooked:
40 |
41 | - Sequential Processing: After a computer finishes processing one set of data, it moves on to the next set of data. It cannot simultaneously process multiple sets of data at the same time. This sequential processing is due to the single-threaded nature of most traditional computing systems.
42 | - Data Transfer and Processing: When data is read by the computer for processing, it is typically copied to a designated area in memory before actual processing occurs. The data cannot be processed directly at the location where it is read. This copying of data to memory allows for efficient processing and manipulation of the data without altering the original source.
43 |
44 | **Both these aspects contribute to the overall functioning of a computer in handling and processing data.**
45 |
46 | ## Analyze the 1st method
47 | In the given example, when the first child comes up to receive a cookie, he has to take it to a different place before he can sit down and eat it. After finishing the first cookie, he returns to receive the second cookie and repeats the process. This means that each child must consume their cookies one at a time, rather than eating all 32 cookies simultaneously.
48 |
49 | This process repeats 32 times, and during this time, the remaining 31 children have to wait for their turn. You can imagine that the 32nd child has to wait for a certain duration until it’s his turn to receive and eat his cookies.
50 |
51 | Another drawback of Method 1 is that the computer needs to perform an additional calculation to determine how many cookies to distribute to each child. In this case, the calculation is not very complex, but it can pose a problem when the number of cookies is not evenly divisible by the number of children.
52 |
53 | For example, let’s say we have 1055 cookies (1024 + 31). In this case, it would be ideal if each child processed 33 cookies (32 + 1), while the remaining child processes 32 cookies. However, due to the maximum limit of 32 cookies that each child can handle (this relates to the warp size), we encounter a situation where we have extra cookies that still need to be processed.
54 |
55 | As a result, we would have a loop where the first child would have to process the remaining 31 odd cookies alone.
56 |
57 | ## Analyze the 2nd method
58 | In Method 2, when the first child comes up to receive one cookie and then returns to the waiting line, he has time to consume the first cookie while waiting for his turn to receive the second cookie. This means that the first child is processing the first cookie while the other children can proceed with reading their respective data (cookies) and start processing them while waiting in line.
59 |
60 | By doing so, we have addressed the issue of the remaining 31 children not having to wait for the first child to finish processing his cookie. The second child, for example, can start “reading” his data (the cookie) and begin processing it while waiting in line. This allows for a more parallel processing approach, where the children can process their data concurrently rather than having to wait for each other.
61 |
62 | Another advantage of Method 2 is that we don’t need to perform additional calculations. In the case of having 1055 cookies (1024 + 31), Method 2 allows us to distribute the 31 odd cookies evenly among the 31 remaining children. Each child can handle one extra cookie along with their designated batch. This eliminates the need for the first child to process all 31 odd cookies alone, as they are distributed evenly among the other children, enabling a more efficient distribution of workload.
63 |
64 | **Through the analysis, we can see that Method 2 has cleverly handled the data in a nearly parallel manner, while Method 1 is sequential.**
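
To connect the analogy to actual GPU code, here is one possible way to write the two distribution patterns as CUDA kernels. This is my own illustrative sketch (not part of the original example): 32 threads handle 1024 elements, and the only difference between the two kernels is the indexing.

```
#define N_CHILDREN 32        // threads ("children")
#define N_COOKIES  1024      // elements ("cookies")

// Method 1: each thread takes one contiguous batch of 32 elements.
// Neighbouring threads end up touching elements that are far apart in memory.
__global__ void method1(int *data)
{
    int tid = threadIdx.x;                                  // 0..31
    for (int j = 0; j < N_COOKIES / N_CHILDREN; j++)
        data[tid * (N_COOKIES / N_CHILDREN) + j] += 1;
}

// Method 2: in each round, the 32 threads take 32 consecutive elements.
// Neighbouring threads touch neighbouring elements, which is the pattern the GPU prefers.
__global__ void method2(int *data)
{
    int tid = threadIdx.x;                                  // 0..31
    for (int j = 0; j < N_COOKIES / N_CHILDREN; j++)
        data[j * N_CHILDREN + tid] += 1;
}
```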
65 |
--------------------------------------------------------------------------------
/Chapter06/README.md:
--------------------------------------------------------------------------------
1 |
2 | One of the fascinating aspects of coding with CUDA is that we can **freely choose which memory to use** (meaning that when initializing a value or variable, we can specify it to be stored in a particular memory location) rather than letting the computer decide which memory to utilize. Thanks to this capability, we can fully leverage different memory types to optimize our program.
3 |
4 | In this article, I will introduce the various types of memory within the GPU, along with the specific purposes of each. It's important to note that this article is closely related to [Chapter03](https://github.com/CisMine/Parallel-Computing-Cuda-C/tree/main/Chapter03). Therefore, if you haven't read Chapter03 yet, I recommend doing so before proceeding with this article.
5 |
6 | `Please note that in this article, I will focus purely on the theoretical aspects. The practical usage details will be covered in a subsequent article.`
7 |
8 |
9 |
## Memory Types in GPU
10 |
11 |
12 | Before delving into the various memory types within a GPU, it's **important to understand that when we talk about memory**, we generally categorize it into two main types: **physical memory and logical memory**.
13 |
14 | **Physical Memory:** This refers to the actual hardware memory in a computer. It includes components such as RAM modules and storage devices like hard drives (HDD/SSD). Physical memory is where data and programs are stored directly and can be accessed quickly by the processor.
15 |
16 | **Logical Memory (Virtual Memory):** This is the address space that the operating system and programs can access. Logical memory doesn't necessarily have a direct one-to-one correspondence with physical memory. The operating system typically manages the mapping between logical addresses and physical addresses. This management helps allocate and manage memory for programs running on the system.
17 |
18 | `You can understand it in a simple way: when we code, we interact with logical memory, and once the code is finished, the data located in logical memory will be mapped to physical memory (meaning the computer will operate in physical memory).`
19 |
20 | Now that we have a foundational understanding of memory, let's explore the specific memory types within a GPU and their purposes.
21 |
22 |
23 |
### Logical view
24 |
25 |
26 | 
27 |
28 | As mentioned, **Blocks and Threads are logical concepts**, and due to the **SIMT mechanism**, it's important to understand how **Threads and Blocks are distributed and managed within the logical memory of the GPU**.
29 |
30 | Here, we have a familiar concept known as **scope**, which plays a crucial role in understanding how resources like Threads and Blocks are allocated and managed within the logical memory of the GPU.
31 |
32 | - **Local Memory:** **Each Thread can use its own local memory**, where it can store temporary variables. This has the **smallest scope** and is dedicated to **each individual Thread.**
33 |
34 | - **Shared Memory: Threads within the same Block** can share data through shared memory. This allows Threads within the same Block to communicate and access data faster compared to accessing global memory.
35 |
36 | - **Global Memory:** This is the **largest memory in the GPU** and can be accessed by **all Threads across all Blocks**. However, accessing global memory is typically slower than other memory types, so optimization is necessary to avoid performance degradation.
37 |
38 | - **Texture Memory and Constant Memory:** These are **special memory types in the GPU** optimized for accessing specific data types such as **textures or constant values. All Threads across all Blocks** can access these memory types
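
As a quick illustration of these scopes in code (a sketch of my own, not tied to any example in this repo; it assumes the kernel is launched with 256 threads per block):

```
__constant__ float coef[16];            // constant memory: readable by all threads, filled from the host

__global__ void scopes(float *g_data)   // g_data points to global memory: visible to all threads
{
    int local_val = threadIdx.x;        // local variable: private to each individual thread

    __shared__ float tile[256];         // shared memory: shared by the threads of one block
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    tile[threadIdx.x] = g_data[i];
    __syncthreads();

    g_data[i] = tile[threadIdx.x] + local_val + coef[0];
}
```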
39 |
40 |
41 |
42 |
### Physical view
43 |
44 |
45 | 
46 |
47 |
48 | It's quite analogous to Blocks and Threads, but in this context, we're talking about **Streaming Multiprocessors (SMs) and Streaming Processors (SPs)**. Each **SM possesses its own dedicated shared, cache, constant, and register memory**. However, **multiple SMs share the same global memory**.
49 |
50 | In this arrangement, SMs can manage their own local resources efficiently, such as shared memory for intra-block communication, and each SM's processing elements (SPs) can work independently on their assigned tasks. The sharing of global memory allows for coordination and data exchange between different SMs, enabling them to work together on larger computational tasks.
51 |
52 | **Next, we will examine the data access speeds of these memory types.**
53 |
54 |
55 |
56 |
## Bandwidth of memory
57 |
58 |
59 | 
60 |
61 |
62 |
63 |
64 |
### PCIe
65 |
66 |
67 | As I mentioned before, the **CPU (host)** and **GPU (device)** are two separate components, each with its own memory, and direct access between them is not possible. Data must be copied back and forth through the **PCIe (Peripheral Component Interconnect Express - bus)**, a commonly known interface.
68 |
69 | One of the key factors in deciding whether to move data from the CPU to the GPU for computation is the PCIe, which, as shown in the diagram, has the **slowest data transfer speed.**
70 |
71 | To address the challenge of copying a large amount of data from the CPU to the GPU, NVIDIA has introduced three methods:
72 |
73 | - Unified memory
74 | - Pinned memory
75 | - Streaming ( hidden latency )
76 |
77 | These methods will be discussed in more detail in upcoming articles. However, if you're curious, you can explore NVIDIA's articles: [How to Optimize Data Transfers in CUDA C/C++](https://developer.nvidia.com/blog/how-optimize-data-transfers-cuda-cc/)
78 |
79 |
80 |
### Global memory
81 |
82 |
83 | **Global memory (also known as device memory)** is the **largest memory within the GPU** and, due to its size, it has a relatively **slower access speed**, ranking just behind the PCIe in terms of access latency.
84 |
85 | `To overcome this challenge, in the upcoming article, I will introduce a parallel programming technique that helps improve the access speed of global memory. This technique is surprisingly simple and effective, and I will guide you through its implementation.`
86 |
87 | Global memory in the GPU is analogous to RAM in the CPU. When we initialize a value in the GPU without specifying its storage location, it is automatically stored in the global memory.
88 |
89 | **From this point, it's evident that the primary purpose of global memory is to store large amounts of data.**
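
For example (a small sketch of my own), both of the following end up in global memory:

```
__device__ int d_counter;                             // file-scope device variable: stored in global memory

int main()
{
    int *d_data;
    cudaMalloc((void **)&d_data, 1024 * sizeof(int)); // explicit allocation in global (device) memory
    cudaMemset(d_data, 0, 1024 * sizeof(int));
    cudaFree(d_data);
    return 0;
}
```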
90 |
91 |
92 |
### Shared/Cache memory
93 |
94 |
95 | **Shared memory and cache** are memory types with **fast access speeds**, but they have a **smaller capacity** compared to global memory.
96 |
97 | `Due to their high-speed nature, managing data in shared memory or cache can be more challenging than in global memory. One significant issue that heavily impacts access speed is called "bank conflict." We'll discuss bank conflict in more detail in upcoming articles.`
98 |
99 | **Because shared and cache memory offer fast access speeds, we often use them to store data during computations. The typical approach involves first copying all the data from the CPU to the GPU and storing it in global memory. Then, we break down the data into smaller portions (chunks) and push them into shared memory for computation. Once the computation is complete, the results are pushed back to global memory.**
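
A stripped-down sketch of that global -> shared -> global pattern (illustrative only; the real payoff comes when each element in shared memory is reused many times, e.g. in reductions or matrix multiplication). It assumes one block of `TILE` threads per chunk:

```
#define TILE 256

__global__ void tiled(float *g_in, float *g_out)
{
    __shared__ float tile[TILE];

    int i = blockIdx.x * blockDim.x + threadIdx.x;

    tile[threadIdx.x] = g_in[i];          // 1) copy a chunk from global memory into shared memory
    __syncthreads();                      //    wait until the whole chunk is loaded

    float v = tile[threadIdx.x] * 2.0f;   // 2) compute on the fast shared copy

    g_out[i] = v;                         // 3) push the result back to global memory
}
```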
100 |
101 |
102 |
103 |
### Texture Memory and Constant Memory
104 |
105 |
106 | As mentioned earlier, **Texture Memory and Constant Memory are special memory types in the GPU** optimized for accessing **specific data types such as images (textures) or constant values**. The access speed of these two memory types is quite fast, comparable to shared memory.
107 |
108 | **Therefore, the purpose of using Texture Memory and Constant Memory is to optimize data access and reduce the computation load on shared memory. Instead of pushing all the data into shared memory, we can allocate a portion of the data to Texture Memory and Constant Memory. This allocation strategy helps enhance memory performance by leveraging the optimized access capabilities of Texture Memory and Constant Memory.**
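
For instance (a sketch with made-up names), a small read-only lookup table can live in constant memory instead of taking up shared memory; `cudaMemcpyToSymbol` is the standard way to fill it from the host:

```
__constant__ float weights[8];          // cached, read-only on the device

__global__ void apply(float *data)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    data[i] *= weights[i % 8];          // every thread reads the same small table
}

// On the host, before launching the kernel:
//   float h_weights[8] = {1, 2, 3, 4, 5, 6, 7, 8};
//   cudaMemcpyToSymbol(weights, h_weights, sizeof(h_weights));
```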
109 |
110 |
111 |
112 |
113 |
## An interesting note
114 |
115 |
116 | One small note is that the bandwidth values depicted in the diagram are illustrative, and for specific details about the bandwidth of each memory type on different devices, **NVIDIA has developed two extremely useful and convenient tools for optimization and debugging: Nsight Systems and Nsight Compute**. In the upcoming articles, I will explain how to use these two tools and their functionalities. Rest assured that even if your computer doesn't have a GPU, you can still benefit from these tools. Here are some examples of how you can analyze your program using these tools:
117 |
118 |
119 |
### Nsight Systems
120 |
121 |
122 | 
123 |
124 |
125 |
### Nsight Compute
126 |
127 |
128 | 
129 |
130 |
131 |
## Exercises
132 |
133 |
134 | - In the diagram, why are shared memory and L1 cache combined into a single memory instead of being separate memories?
135 |
136 | - Why does the access scope of L1 involve Threads within the same block, while L2 involves all Threads across the device?
137 |
--------------------------------------------------------------------------------
/Chapter03/README.md:
--------------------------------------------------------------------------------
1 |
2 |
# Terminology in parallel programming
3 |
4 |
5 |
6 | In this article, I'll provide an easy example to clarify a few parallel programming terms that are frequently used.
7 |
8 |
9 |
## PHYSICAL and LOGICAL
10 |
11 |
12 | Before explaining the concepts of “PHYSICAL” and “LOGICAL,” let’s go through an example to provide a general and easy-to-understand overview.
13 |
14 | **Example:** A school has multiple classrooms, and each classroom contains several students (the number of classrooms and students per classroom varies depending on different schools, considering various factors, but money is one of the most important). Next, we have a mountain of tasks (the number of tasks is unknown — I will explain this clearly in the following section) that needs to be distributed among the students for processing, and we must adhere to the following **RULES:**
15 |
16 | - Each classroom can handle a **maximum of 1024 tasks.**
17 | - At any given time, within a classroom, **(32 * the number of warps) tasks** will be executed (I will explain what a warp is in the following section, and the number of warps will depend on the computer architecture). Therefore, if we have 5 classrooms, there will be (32 * the number of warps * 5) tasks executed. For N classrooms, there will be **(32 * the number of warps * N) tasks** executed.
18 |
19 | **To summarize, “PHYSICAL” can be understood as what is observable and has a fixed quantity; in this example, it refers to the students. On the other hand, “LOGICAL” refers to what cannot be directly observed but can be imagined or conceptualized, with an unspecified quantity; in this case, it represents the tasks.**
20 |
21 |
22 |
## Physical corresponds to SM and SP
23 |
24 |
25 |
26 |
27 |
28 |
29 |
30 | In the given picture, we can see that there are 16 SMs, and each SM contains 32 cores. So, what are SM and cores?
31 |
32 | **Streaming Processors**(SPs or cores). The SPs are the main processing units on the GPU and are capable of executing computations concurrently on multiple data elements. You can think of SPs as individual students (1 student being 1 SP). **The more SPs (students) we have, the greater the number of tasks that can be processed concurrently.**
33 |
34 | **Streaming Multiprocessor**(SM or multiprocessor ) is a collection or grouping of SPs. It can be understood as a class or classroom that accommodates multiple SPs. The SM acts as a higher-level unit that manages and coordinates the execution of tasks across the SPs within it.
35 |
36 | The number of SMs and SPs may vary depending on the specific GPU architecture of each computer, and the count is typically fixed for a given GPU model.
37 |
38 |
39 |
40 |
41 |
42 |
43 |
44 |
45 |
## Logical corresponds to thread, block and grid
46 |
47 |
48 |
49 |
50 |
51 |
52 |
53 | In the given picture, we can see that there are 6 Blocks, and each Block contains 12 threads. So, what are Blocks and Threads?
54 |
55 | In simple terms:
56 |
57 | **Thread** can be understood as a unit of work, where each thread represents an individual task or job to be executed. So, one thread corresponds to one task.
58 |
59 | **Block** refers to a collection or group of threads. It represents a batch or a set of related tasks that are executed together. However, there is a maximum limit on the number of threads in a block, which is typically 1024 threads. This limitation is imposed by the computer’s architecture and applies to most GPUs.
60 |
61 | To summarize, a thread represents a single task or job, while a block is a grouping of threads that collectively perform a set of related tasks.
62 |
63 |
64 |
65 |
66 | **Grid** refers to a collection or set of blocks. It represents a higher-level grouping that encompasses multiple blocks. Each block consists of multiple threads, and multiple blocks together form a grid.
67 |
68 | **It is important to focus on understanding threads and blocks, as they are fundamental units of parallel execution.**
69 |
70 | **In this context, “tasks” refer to data. In each problem or task, we will have a different quantity of data to process. This is why I mentioned that the number of tasks is unspecified or unknown in advance, as it can vary depending on the specific problem or scenario.**
71 |
72 | The numbers (0,0) and (0,1) serve as indices to determine the position of a block and a thread, similar to a matrix. For example, a[1][2]. However, the indexing mechanism here has a slight difference, which I will explain in more detail in subsequent discussions.
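
In code, these two indices show up as the built-in variables `blockIdx` and `threadIdx`, and they are usually combined into one global position. A sketch of the standard pattern (launching 2 blocks of 4 threads purely as an example):

```
#include <stdio.h>

__global__ void whoAmI()
{
    // blockIdx.x  = which block ("classroom") this thread belongs to
    // threadIdx.x = this thread's position inside its block
    int global_id = blockIdx.x * blockDim.x + threadIdx.x;
    printf("block %d, thread %d -> global index %d\n",
           blockIdx.x, threadIdx.x, global_id);
}

// In main: whoAmI<<<2, 4>>>();  global_id then runs from 0 to 7.
```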
73 |
74 | You might wonder why we divide the threads into separate blocks instead of having one large block for simplicity. If we did that, we would **violate RULE 1, which states that each classroom can handle a maximum of 1024 tasks.** Hence, we need to divide the threads (i.e., the number of tasks) into smaller blocks.
75 |
76 | One significant advantage of dividing threads into blocks is related to **RULE 2:** if we have 1024 threads, we only need one block. However, at any given time, it can only process (32 * 1) tasks (assuming the number of warps is 1). Thus, we have to wait for it to finish processing the first 32 tasks before moving on to the next 32 tasks, and this process continues sequentially.
77 |
78 | If we divide the threads into 32 blocks, with each block containing 32 threads (32 * 32 = 1024), then at any given time, it can process all 1024 threads without waiting for sequential execution.
79 |
80 |
81 | **It is similar to the analogy of eating cakes, where instead of sequentially processing 32 cakes (threads) at a time, we can process all 1024 cakes (threads) in parallel, resulting in faster overall processing that I mentioned in** [chapter02](https://github.com/CisMine/Parallel-Computing-Cuda-C/tree/main/Chapter02)
82 |
83 |
84 |
85 |
## Summary
86 |
87 |
88 | SM(s) are the classrooms, 1 SP is a student, 1 Thread is a task, and Block is a collection of tasks. You can imagine a block as a box containing the tasks that need to be processed. Each SM processes a certain number of blocks (depending on the data distribution) — 1 SP can handle more than 1 thread (1 student can perform multiple tasks).
89 |
90 | Now, we encounter the question of how to distribute the tasks (blocks) among the classrooms (SMs) since SMs and Blocks are two separate concepts (physical and logical) that cannot directly interact. We need an intermediary called **WARP** to handle this distribution. So, what is a warp, and what is the significance of the number 32?
91 |
92 |
93 |
94 |
## WARP: both physical and logical
95 |
96 |
97 | **A warp** refers to a group of threads that are executed together in parallel. In most GPU architectures, a warp typically consists of 32 threads. The GPU processes instructions in a SIMD (Single Instruction, Multiple Data) fashion, where a single instruction is executed on multiple data elements simultaneously within a warp. This means that all 32 threads within a warp execute the same instruction but operate on different data.
98 |
99 | The number 32 is significant because it represents the size of a warp in most GPU architectures. It determines how many threads are processed together in a parallel execution unit. By having multiple warps executing in parallel, the GPU can achieve high throughput and efficient utilization of its processing resources.
100 |
101 | Returning to the school example, the warp represents the leaders or class monitors (the number of warps or leaders depends on the specific computer architecture, such as Tesla, Fermi, etc., which I will explain in a separate discussion). The leaders (warps) have two tasks:
102 | - They go and fetch the blocks to bring back to their group for processing. In this case, the blocks are already assigned to each classroom (SM).
103 | - After bringing the blocks to their group, the warps distribute the blocks among the individual students for processing. The warp handles the second step, while the first step will be explained in a subsequent discussion.
104 |
105 | Once the warp brings the blocks to their group, they also have the additional responsibility of dividing the tasks among the members within the group. Each division can handle a maximum of 32 tasks at a time. After the completion of these 32 tasks, the next set of 32 tasks is distributed. In other words, at any given time, a warp distributes 32 threads, and each classroom can have multiple warps (leaders), resulting in the number of tasks at a given time being (32 * the number of warps) tasks.
106 |
107 | The reason why a warp can only distribute a maximum of 32 threads is that the warp size is fixed by the GPU architecture, and it is 32 on essentially all NVIDIA GPUs.
108 |
109 | **To summarize:**
110 | - Warp (Physical): Represents the leaders who lead the students in their group. In other words, warps control the SPs in task processing, where each SP is assigned a specific task.
111 | - Warp (Logical): Represents the control over the number of threads (tasks).
112 |
113 | One important note is that while we use the analogy of warps as group leaders, they are not counted as individual members within the classroom. For example, if a classroom has 50 students and 5 leaders, the number of SPs (students) is still 50, not 55.
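
In CUDA code you never declare a warp; the hardware groups threads into warps implicitly. You can still observe the grouping, for example by computing which warp, and which lane inside that warp, a thread falls into (a small sketch):

```
#include <stdio.h>

__global__ void warpInfo()
{
    int lane   = threadIdx.x % warpSize;   // position inside the warp (0..31)
    int warpId = threadIdx.x / warpSize;   // which warp of this block the thread belongs to
    if (lane == 0)
        printf("block %d: warp %d starts at thread %d\n",
               blockIdx.x, warpId, threadIdx.x);
}
```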
114 |
115 |
116 |
117 |
118 |
119 |
--------------------------------------------------------------------------------
/Chapter05/README.md:
--------------------------------------------------------------------------------
1 | In this article, we will delve a bit deeper into the operational mechanisms of the CPU and GPU. Through this exploration, we will be able to address the final question that I mentioned in [chapter04](https://github.com/CisMine/Parallel-Computing-Cuda-C/tree/main/Chapter04)
2 |
3 |
4 |
5 |
## The operational mechanism of CPU-GPU
6 |
7 |
8 | Back in the past, I once had a seemingly silly question: which is more important between the CPU and GPU, or when buying a computer, should we prioritize the CPU or the GPU? The answer would depend on our intended usage to determine which one should take precedence, as **CPUs and GPUs are designed for different purposes**
9 |
10 |
11 |
### Approaches to processor design
12 |
13 |
14 | 
15 |
16 | CPUs and GPUs are designed with two different objectives, so they cannot be directly compared. So, what are these two objectives?
17 |
18 |
19 |
20 |
21 |
### CPU: Latency-Oriented Design
22 |
23 |
24 | This is a design approach aimed at reducing latency or response time in **processing complex tasks, but with a small number of tasks**. Why focus on reducing latency or response time? Because the CPU is designed with a **small number of high-quality, powerful, and efficient cores**. These cores can be considered multitasking (for example: executing application tasks, managing resources, controlling the system, processing information, etc.)
25 |
26 | ==> The CPU is used for processing complex tasks, so the design goal is to reduce latency or response time when handling those tasks.
27 |
28 | As shown in the diagram, we can see that the Control Unit and Cache occupy a significant portion of the area:
29 | - Large Control Unit: Optimizes the control of complex logic operations (AND, OR, XOR, etc.).
30 | - Large Cache: Helps reduce data access time.
31 |
32 | **Therefore, the CPU tends to prioritize using Cache and control.**
33 |
34 |
35 |
### CPU: Hide short latency
36 |
37 |
38 | One of the methods to reduce the latency or response time of the CPU is **modest multithreading**. The operation of modest multithreading is as follows:
39 |
40 | - When the CPU executes a task, that task is divided into K smaller tasks.
41 | - The CPU generates a few shadow threads.
42 | - These shadow threads handle a small portion of the K tasks, while the remaining part is handled by the main threads.
43 | - When the main threads encounter issues leading to latency (e.g., waiting for data read, waiting for data transfer), the main threads switch to processing a small portion of the task that the shadow threads are handling. This continues until the latency is resolved (i.e., when the data is read or data transfer is completed), at which point the main threads switch back to their original task.
44 |
45 | **This mechanism is referred to as hide short latency.**
46 |
47 |
48 |
49 |
### GPU: Throughput-Oriented Design
50 |
51 |
52 | This is a design approach aimed at increasing the capability to **process a large number of simple tasks quickly and efficiently**. So, why focus on processing multiple tasks simultaneously in a short period? Because GPUs (Graphics Processing Units) are designed with a **high number of cores**, even though these cores might be of **lower quality** compared to CPU cores. Therefore, the goal of GPUs is to handle a large volume of simple tasks
53 |
54 | **Therefore, GPUs excel at processing numerous simple tasks concurrently, necessitating the need to enhance their capability to handle a substantial workload in a short time.**
55 |
56 | As shown in the diagram, a significant portion of the area is occupied by compute units:
57 |
58 | - Multiple cores: Resulting in faster computations, which demands a steady supply of data. To meet this requirement, GPUs are designed with an architecture that enhances bandwidth, making data transfer faster, and they are equipped with memory whose bandwidth is significantly higher than that of CPUs.
59 |
60 | **Hence, GPUs are tailored for processing extensive data and parallel computing.**
61 |
62 |
63 |
### GPU: Hide very high latency
64 |
65 |
66 | One of the methods to enhance the capability of processing a large number of tasks quickly within a short period on a GPU is by employing a **massive number of threads.**
67 |
68 | ==> In simpler terms, increasing the number of tasks being executed simultaneously at a given point effectively reduces the overall time required to complete all tasks.
69 |
70 |
71 |
72 |
### Overall
73 |
74 |
75 | CPU processes complex tasks but in smaller quantities.
76 |
77 | GPU processes numerous tasks, but each task is much simpler.
78 |
79 | CPU is like a high-performance sports car with a tremendously powerful engine, while the GPU is like an extra-long bus designed to transport passengers. If only a small number of passengers need transportation, the CPU will be faster, but when the number of passengers is high, the GPU becomes an excellent choice.
80 |
81 | Multithreading: Dividing a large task into smaller subtasks and assigning multiple threads to handle them ==> hides short latency.
82 |
83 | Massive threads: Creating an enormous number of threads to execute many tasks at once ==> hides very high latency.
84 |
85 | 
86 |
87 |
88 |
## An interesting note about GPUs
89 |
90 |
91 | Every time we mention **GPUs**, we refer to **graphics cards**. So, why is that name used? In the past, as the gaming industry, in particular, and fields related to graphics processing, in general, gained more recognition, the demand increased significantly. However, the quality wasn't progressing at the same rate. The reason was quite simple: for instance, a sharp and beautiful image might be around 1600x900 in resolution (this is just an example, but it helps illustrate the point). Imagine the CPU processing 1,440,000 (1600x900) pixels at once, and the task is to handle videos, meaning frames, which require processing many images with such pixel counts.
92 |
93 | As analyzed earlier, using the **CPU to process each pixel individually would be overly resource-intensive** (to put it simply, using a for loop for each pixel, where each iteration is a simple task). This wouldn't fully leverage the CPU's capabilities; it's akin to using a sports car to transport passengers. **That's when the concept of a Graphics card was invented**, designed solely for the purpose of pixel processing. Over time, this development led to increasingly superior GPU computing power, prompting the application of GPUs in various fields related to computation. This broader use is why the term **GPGPU (General-Purpose Computing on Graphics Processing Units) came into existence.**
94 |
95 |
96 |
97 |
98 |
## Analyzing the last question in chapter04
99 |
100 |
101 | Before addressing the question, let's go over two concepts: **SIMD and SIMT**.
102 |
103 | **CPU: SIMD (Single Instruction, Multiple Data)** is a computer architecture often used in CPUs with the goal of **maximizing the processing of multiple data per instruction.**
104 |
105 | **GPU: SIMT (Single Instruction, Multiple Threads)** is a computer architecture **developed by NVIDIA** and utilized in GPUs with the aim of utilizing as **many threads as possible for each instruction.**
106 |
107 | Both SIMD and SIMT are parallel processing architectures used in computing devices like CPUs and GPUs.
108 |
109 | `At first glance, SIMD and SIMT might seem similar, but they are two distinct architectures with differences in certain aspects. The reason why CPUs are SIMD-based and GPUs are SIMT-based can be attributed to these differences.`
110 |
111 | As I mentioned:
112 |
113 | - CPU: Due to processing complex tasks, the mechanism of SIMD divides the complex task into sub-tasks and then processes these sub-tasks in parallel.
114 |
115 | - GPU: As it handles numerous simple tasks, the SIMT mechanism processes tasks in parallel.
116 |
117 | For example, let's consider the problem of adding two vectors (each containing N elements).
118 |
119 | SIMD: In this case, the task is to **perform vector addition**, and SIMD divides the vector addition problem into **N sub-problems**, which involve adding individual elements. Consequently, these **N sub-problems are executed in parallel.**
120 |
121 | SIMT: From the SIMT perspective, each vector containing N elements becomes **N-independent tasks**. The objective is to perform N-independent addition tasks (not N sub-problems), and then **coders divide these tasks into threads for parallel execution.**
122 |
123 | - SIMD: The computer automatically divides a large problem (e.g., adding two vectors) into N sub-problems.
124 |
125 | - SIMT: We are the ones who divide the threads to handle these N tasks.
126 |
127 |
128 | ## Analyzing the Question
129 |
130 | Our task is to **print "hello world" 10 times**, and since it's **SIMT, we will manually allocate threads** for this printing operation. Here, I distribute threads in two ways: **<<<1,10>>>** and **<<<2,5>>>**. Since there are **only 10 threads, these two methods are equivalent.** However, if the task were to **print "hello world" 64 times** and is represented as **<<<1,64>>>** and **<<<2,32>>>**, there would be **a difference** (because at each moment within a block, only one warp of 32 threads is executed, under the earlier simplification). Therefore, for **<<<1,64>>>**, it would take 2 time units to complete the printing of "hello world" 64 times, whereas for **<<<2,32>>>**, it would only take 1 time unit to complete.
131 |
132 | I provided a more detailed explanation in the [chapter03](https://github.com/CisMine/Parallel-Computing-Cuda-C/tree/main/Chapter03) section under the RULES part
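
As a sketch, the two launch configurations discussed above look like this (the kernel is just the hello-world example from chapter04):

```
#include <stdio.h>

__global__ void hello()
{
    printf("hello world\n");
}

int main()
{
    hello<<<1, 64>>>();   // one block of 64 threads: its two warps take turns (under the simplified rule above)
    hello<<<2, 32>>>();   // two blocks of 32 threads: the two warps can run on different SMs at the same time
    cudaDeviceSynchronize();
    return 0;
}
```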
133 |
134 | **In summary, through this article, you now have a clearer understanding of both CPUs and GPUs. Due to the SIMT mechanism, it's crucial to intelligently distribute threads, making it essential to determine the appropriate number of threads per block.**
135 |
--------------------------------------------------------------------------------
/Chapter09/README.md:
--------------------------------------------------------------------------------
1 |
2 | In this article, I will introduce Unified Memory. It can be said that Unified Memory was a major breakthrough of the CUDA 6.0 era.
3 |
4 |
5 |
## Unified memory
6 |
7 |
8 | Unified Memory is a special type of memory (located on the CPU) that can be directly accessed by both the CPU and GPU without the need to copy data back and forth between two separate memory types.
9 |
10 | This is why Unified Memory is said to be based on the zero-copy principle.
11 |
12 |
13 |
14 |
15 |
16 | As I have mentioned, when talking about memory, there are always two concepts: Physical memory and Virtual memory. Unified memory has different perspectives in these two concepts:
17 |
18 | - Virtual memory (developer view): From this perspective, Unified Memory is a unified memory between CPU and GPU (where both CPU and GPU can directly access it).
19 |
20 | - Physical memory (computer view): As I have mentioned, the CPU and GPU have separate memories and cannot directly access each other (only through PCI). Here, Unified Memory is located on the CPU, but thanks to the zero-copy mechanism, we perceive Unified Memory as a unified memory of CPU and GPU.
21 |
22 | Zero Copy: This is a data transfer optimization method, where data is AUTOMATICALLY transferred directly from the memory of one device (e.g., CPU) to another device (e.g., GPU) without going through an intermediate step (like a buffer). This significantly reduces the time and resources needed for data copying, thereby improving performance.
23 |
24 |
25 |
26 |
27 |
28 |
29 |
## Summary
30 |
31 |
32 | Unified Memory is a special type of memory that, when used, eliminates the need for us to worry about the process of copying from host to device (h2d) or device to host (d2h), as these tasks are handled automatically by the computer. This makes memory management easier for us. However, due to its automatic nature, it is not optimized and can lead to issues known as page faults.
33 |
34 |
35 |
36 |
37 |
38 |
39 |
40 |
## Page faults
41 |
42 |
43 |
44 |
45 |
46 |
47 |
48 | Page faults occur when the CPU or GPU requests access to certain data in its memory, but that data has not yet been loaded from Unified Memory.
49 |
50 | In simple terms, Unified Memory can be understood as an intermediary memory between the CPU and GPU. When there's a data change in Unified Memory, this change is simultaneously reflected in both CPU and GPU (based on a mapping mechanism). However, we cannot predict when this data will be mapped back to the CPU and GPU, leading to page faults (the requested data is not found).
51 |
52 | When a page fault occurs, the computer implements the Memory Management Unit (MMU) mechanism: the device sends a page fault request to the MMU to check whether the data exists or not, and if it does, it will be loaded.
53 |
54 | Thus, each time a page fault occurs, a significant amount of time is consumed for the MMU to locate the data.
55 |
56 | It's important to note that page faults only occur when using the zero-copy mechanism in general and Unified Memory in particular. Conventional methods like cudaMemcpy do not experience page faults because, in these cases, we specify that data should be completely copied before processing, similar to following a step-by-step sequence.
57 |
58 |
59 |
## Code
60 |
61 |
62 | ```
63 | #include <cassert>
64 | #include <cstdlib>
65 | #include <iostream>
66 |
67 | using std::cout;
68 |
69 |
70 | __global__ void vectorAdd(int *a, int *b, int *c, int N)
71 | {
72 |
73 | int tid = (blockDim.x * blockIdx.x) + threadIdx.x;
74 |
75 | if (tid < N)
76 | {
77 | c[tid] = a[tid] + b[tid];
78 | }
79 | }
80 |
81 | int main()
82 | {
83 |
84 | const int N = 1 << 16;
85 | size_t bytes = N * sizeof(int);
86 |
87 | int *a, *b, *c;
88 |
89 | cudaMallocManaged(&a, bytes);
90 | cudaMallocManaged(&b, bytes);
91 | cudaMallocManaged(&c, bytes);
92 |
93 |
94 |
95 |
96 | for (int i = 0; i < N; i++)
97 | {
98 | a[i] = rand() % 100;
99 | b[i] = rand() % 100;
100 | }
101 |
102 | int BLOCK_SIZE = 1 << 10;
103 |
104 | int GRID_SIZE = (N + BLOCK_SIZE - 1) / BLOCK_SIZE;
105 |
106 | vectorAdd<<<GRID_SIZE, BLOCK_SIZE>>>(a, b, c, N);
107 |
108 | cudaDeviceSynchronize();
109 |
110 | for (int i = 0; i < N; i++)
111 | {
112 | assert(c[i] == a[i] + b[i]);
113 | }
114 |
115 |
116 | cudaFree(a);
117 | cudaFree(b);
118 | cudaFree(c);
119 |
120 | cout << "COMPLETED SUCCESSFULLY!\n";
121 |
122 | return 0;
123 | }
124 | ```
125 |
126 | This is a simple code example for adding two vectors using Unified Memory. As you can see, the process of copying from host to device (h2d) and device to host (d2h) has been omitted. Instead, the integers a, b, c are stored in Unified Memory using cudaMallocManaged. As mentioned earlier, we need cudaDeviceSynchronize() to synchronize the CPU and GPU after the zero-copy process. However, this code will experience page faults.
127 |
128 | To check for and address page faults, you can follow these steps:
129 |
130 | $nvcc <filename>.cu
131 |
132 | $./a.out
133 |
134 | $nsys nvprof ./a.out (Please note that to run this command, you need to have Nsight Systems installed. I have written a guide on how to install it in a [separate article](https://github.com/CisMine/Guide-NVIDIA-Tools).)
135 |
136 | 
137 |
138 | It's clear that in the profiling output, there are 18 instances of device-to-host (d2h) copies and 46 instances of host-to-device (h2d) copies, which is a significant number. This indicates that page faults have occurred due to the zero-copy mechanism used in Unified Memory.
139 |
140 |
141 |
Fix
142 |
143 |
144 | ```
145 | #include <cassert>
146 | #include <cstdlib>
147 | #include <iostream>
148 |
149 | using std::cout;
150 |
151 |
152 | __global__ void vectorAdd(int *a, int *b, int *c, int N)
153 | {
154 |
155 | int tid = (blockDim.x * blockIdx.x) + threadIdx.x;
156 |
157 | if (tid < N)
158 | {
159 | c[tid] = a[tid] + b[tid];
160 | }
161 | }
162 |
163 | int main()
164 | {
165 | const int N = 1 << 16;
166 | size_t bytes = N * sizeof(int);
167 |
168 | int *a, *b, *c;
169 |
170 | cudaMallocManaged(&a, bytes);
171 | cudaMallocManaged(&b, bytes);
172 | cudaMallocManaged(&c, bytes);
173 |
174 | // Get the device ID for prefetching calls
175 | int id; cudaGetDevice(&id);
176 |
177 | // Set some hints about the data and do some prefetching
178 | cudaMemAdvise(a, bytes, cudaMemAdviseSetPreferredLocation, cudaCpuDeviceId);
179 | cudaMemAdvise(b, bytes, cudaMemAdviseSetPreferredLocation, cudaCpuDeviceId);
180 | cudaMemPrefetchAsync(c, bytes, id);
181 |
182 | // Initialize vectors
183 | for (int i = 0; i < N; i++)
184 | {
185 | a[i] = rand() % 100;
186 | b[i] = rand() % 100;
187 | }
188 |
189 | // Pre-fetch 'a' and 'b' arrays to the specified device (GPU)
190 | cudaMemAdvise(a, bytes, cudaMemAdviseSetReadMostly, id);
191 | cudaMemAdvise(b, bytes, cudaMemAdviseSetReadMostly, id);
192 | cudaMemPrefetchAsync(a, bytes, id);
193 | cudaMemPrefetchAsync(b, bytes, id);
194 |
195 |
196 | int BLOCK_SIZE = 1 << 10;
197 |
198 | int GRID_SIZE = (N + BLOCK_SIZE - 1) / BLOCK_SIZE;
199 |
200 | vectorAdd<<<GRID_SIZE, BLOCK_SIZE>>>(a, b, c, N);
201 |
202 |
203 | cudaDeviceSynchronize();
204 |
205 |
206 | // Prefetch to the host (CPU)
207 | cudaMemPrefetchAsync(a, bytes, cudaCpuDeviceId);
208 | cudaMemPrefetchAsync(b, bytes, cudaCpuDeviceId);
209 | cudaMemPrefetchAsync(c, bytes, cudaCpuDeviceId);
210 |
211 | // Verify the result on the CPU
212 | for (int i = 0; i < N; i++)
213 | {
214 | assert(c[i] == a[i] + b[i]);
215 | }
216 |
217 | // Free unified memory (same as memory allocated with cudaMalloc)
218 | cudaFree(a);
219 | cudaFree(b);
220 | cudaFree(c);
221 |
222 | cout << "COMPLETED SUCCESSFULLY!\n";
223 |
224 | return 0;
225 | }
226 | ```
227 | Run it again to see the result:
228 |
229 | $nvcc <filename>.cu
230 |
231 | $./a.out
232 |
233 | $nsys nvprof ./a.out
234 |
235 | 
236 |
237 |
238 |
239 |
## Explain
240 |
241 |
242 | There are some special functions that can be used to optimize memory management in CUDA, particularly when using Unified Memory. These functions include:
243 |
244 | - cudaMemAdvise: This function provides hints on how to manage memory on the CPU or GPU. The hints offered by cudaMemAdvise include:
245 |
246 | - cudaMemAdviseSetReadMostly: Suggests that the memory region will be read frequently.
247 | - cudaMemAdviseUnsetReadMostly: Indicates that the previous read-mostly advice no longer applies.
248 | - cudaMemAdviseSetPreferredLocation: Suggests that the memory region should be located on a specific GPU device.
249 | - cudaMemAdviseUnsetPreferredLocation: Indicates that the previous preferred location advice no longer applies.
250 | - cudaMemAdviseSetAccessedBy: Suggests that the memory region will be accessed by one or more GPU devices.
251 | - cudaMemAdviseUnsetAccessedBy: Indicates that the previous accessed-by advice no longer applies.
252 |
253 | - cudaMemPrefetchAsync: This function is used to prefetch data from a memory region on the host or device to another region on the device or host. It allows explicit control of the data prefetching process to optimize performance and efficient data access on the GPU.
254 |
255 |
256 |
257 |
## Exercise
258 |
259 |
260 | Write a simple program to demonstrate that Unified Memory can be accessed by both the GPU and the CPU.
261 |
--------------------------------------------------------------------------------
/Chapter04/README.md:
--------------------------------------------------------------------------------
1 |
2 |
# Hello world CUDA C
3 |
4 |
5 | Parallel programming on the GPU means that we transfer data from the CPU to the GPU for processing/computation by using the Cuda C/C++ language.
6 |
7 | Most of you may have two questions at this point:
8 |
9 | - What is Cuda?
10 | - How can we transfer data from the CPU to the GPU and utilize GPU cores?
11 |
12 | In this chapter, you will learn about all these concepts and how to implement a simple "Hello World" code in CUDA C.
13 |
14 | #### One small note is that if you are not familiar with GPUs (their functioning or components), don't worry because this chapter will not require that knowledge. Rest assured that I will create a separate chapter to explain GPUs so that readers can acquire the necessary knowledge.
15 |
16 |
17 |
## What is CUDA?
18 |
19 |
20 | CUDA (Compute Unified Device Architecture) is a parallel computing platform developed by NVIDIA. It allows programmers to utilize the GPU (Graphics Processing Unit - GPU cores) for performing computational tasks using programming languages such as C and C++.
21 |
22 | ### How does CUDA work?
23 |
24 |
25 |
26 |
27 |
28 | When we finish coding and save the file, we often add a file extension at the end. For example:
29 |
30 | - Python uses .py
31 | - C uses .c
32 | - C++ uses .cpp
33 | - Similarly, for CUDA C/C++, the file extension is .cu.
34 |
35 | As the name suggests, CUDA C/C++ code is a combination of C (or C++) and CUDA, so we need a compiler that can compile both C/C++ binaries and CUDA binaries. To address this, NVIDIA has developed NVCC (NVIDIA CUDA Compiler), which can handle both types of code and compile them appropriately.
36 |
37 |
38 | ### NVCC (NVIDIA CUDA Compiler) is a compiler specifically designed to compile CUDA C/C++ code. It plays a crucial role in the compilation process, as it performs several important tasks to generate executable code for NVIDIA GPUs. Here is an overview of how NVCC works:
39 |
40 | - Code Analysis: NVCC analyzes the source code to determine the portions written in CUDA C/C++ and identifies the device (GPU) and host (CPU) code sections.
41 |
42 | - Separation of Host and Device Code: NVCC separates the host code, which runs on the CPU, from the device code, which will be executed on the GPU. It ensures that the host and device sections are handled appropriately during the compilation process.
43 |
44 | - Compilation and Optimization: NVCC compiles the host code using a standard CPU compiler, such as GCC or MSVC, while it compiles the device code using the CUDA compiler provided by NVIDIA. The device code is optimized specifically for NVIDIA GPUs, taking advantage of their architecture and capabilities.
45 |
46 | - GPU-specific Code Generation: NVCC generates GPU-specific machine code (PTX - Parallel Thread Execution) that represents the device code. This code is not directly executable on the GPU but serves as an intermediate representation.
47 |
48 | - PTX Translation and Optimization: NVCC translates the PTX code into GPU-specific machine code (SASS - Scalable Assembly) using the NVIDIA GPU driver. It performs additional optimizations tailored to the target GPU architecture.
49 |
50 | - Linking and Final Binary Generation: NVCC combines the compiled host code and the translated GPU machine code, performs linking, and generates a final executable binary file that can be executed on the target GPU.
51 |
52 | ### By providing a unified compilation process for both host and device code, NVCC simplifies the development of CUDA applications and enables efficient utilization of GPU resources.
53 |
54 |
55 |
56 |
## How can we transfer data from the CPU to the GPU and utilize GPU cores?
57 |
58 |
59 |
60 | **In summary, you can envision the process as follows:** First, we write code in C or C++ to fetch data and store it in CPU memory. Then, from the CPU, we call a kernel (a function that runs on the GPU, written in CUDA) to **copy** the data from CPU memory to GPU memory for computation. After the computation is completed, we **copy** the results back from the GPU to the CPU to print the output.
61 |
62 | #### One small note is that from now on, I will refer to it as CUDA C instead of CUDA C/C++. As mentioned earlier, we initially write code in C or C++ to fetch data and store it in CPU memory. Here, I will choose to code in C because it shares similar syntax with CUDA, making it easier to read the code.
63 |
64 | ### Why copy:
65 | The reason for using the term "copy" is that the CPU and GPU have separate memory spaces (**I will dedicate a separate chapter to discuss this in more detail**). They cannot directly access each other's memory, so data transfer between the CPU and GPU needs to occur through the PCI (bus).
66 |
67 | ### Let's run the initial lines of code together and analyze them.
68 |
69 | ```sh
70 | #include <stdio.h>
71 |
72 |
73 | __global__ void kernel()
74 | {
75 |
76 | printf("hello world");
77 | }
78 |
79 | int main()
80 | {
81 | kernel<<<1,1>>>();
82 | cudaDeviceSynchronize();
83 |
84 | return 0;
85 | }
86 | ```
87 |
88 | As I have explained how CUDA works, you can save the file with the **.cu** extension, and then compile it using two command lines (when compiling, open the terminal and navigate to the directory where you saved the code):
89 | - nvcc <filename>.cu
90 | - ./a.out
91 |
92 | 
93 |
94 | # Code analysis
95 |
96 |
97 |
98 |
99 |
100 | Here, we have two new concepts: **Host**, which refers to the **CPU**, and **Device**, which refers to the **GPU**.
101 | - **__ _host_ __**: It represents a normal function that is **called and executed on the CPU**. In other words, when you create a function without any additional specifications, it will be executed on the CPU.
102 |
103 | ```sh
104 | int add(int x, int y) __host__ int add(int x, int y)
105 | { {
106 | return x + y ; return x + y ;
107 | } }
108 | ```
109 | For example, in the two code snippets mentioned above, they are the same. If the execution target of a function is not specified, it defaults to the CPU (i.e., the host). This is especially evident when we create the main function: int main().
110 |
111 | - **__ _global_ __ void:** It represents a function that is **called by the host (CPU)** and **executed by the device (GPU)**. This type of function is often referred to as a **kernel function.**
112 |
113 | **Kernel function: It executes instructions on the GPU. The CPU launches the kernel using a special syntax (as explained earlier with NVCC) to inform the GPU about the number of threads to be used.**
114 |
115 | `I will provide a clear explanation of the meaning of this statement below, and please note that global void always goes together, meaning it does not have a return value. The reason is that the CPU and GPU are separate components that cannot communicate directly with each other. Therefore, the results cannot be returned to the CPU like regular functions. Instead, data needs to be copied back and forth between them through the PCI bus.`
116 |
117 |
118 | ```sh
119 | int add(int x, int y) __global__ void kernelAdd(int a, int x, int y)
120 | { {
121 |
122 | return x + y ; a = x + y ;
123 | } }
124 | ```
125 |
126 | Here, we have two functions: **add** and **kernelAdd.**
127 |
128 | 1) add: It is called and executed on the CPU, meaning the calculation x + y will be performed by a CPU core.
129 |
130 | 2) kernelAdd: It is called by the CPU but executed on the GPU, meaning the calculation x + y will be performed by a GPU core.
131 |
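To see the consequence in practice, namely that a kernel hands results back through memory rather than through a return value, here is a minimal sketch (the names d_result and h_result are my own):

```sh
#include <stdio.h>

__global__ void kernelAdd(int *result, int x, int y)
{
    *result = x + y;                                    // the kernel writes its result into GPU memory
}

int main()
{
    int h_result = 0;
    int *d_result;

    cudaMalloc(&d_result, sizeof(int));                 // allocate GPU memory for the result
    kernelAdd<<<1, 1>>>(d_result, 2, 3);
    cudaMemcpy(&h_result, d_result, sizeof(int),
               cudaMemcpyDeviceToHost);                 // copy the result back to the CPU
    cudaFree(d_result);

    printf("2 + 3 = %d\n", h_result);                   // prints 5
    return 0;
}
```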
132 |
133 | - **__ _device_ __:** It represents a function that is **called by the device (GPU) and executed on the device.** In simple terms, **__ _global_ __ void** can be thought of as the main function on the GPU, while a **__ _device_ __** function is a helper function. These helper functions are created and called from within kernels, which is why device functions are both called and executed on the GPU.
134 |
135 |
136 | ```sh
137 |
138 | __device__ void PrintHello()
139 | {
140 | printf("hello");
141 | }
142 |
143 | __global__ void kernel()
144 | {
145 | PrintHello();
146 | }
147 |
148 | ```
149 |
150 | # Back to our code
151 |
152 |
153 | ```sh
154 | #include <stdio.h>
155 |
156 |
157 | __global__ void kernel()
158 | {
159 |
160 | printf("hello world");
161 | }
162 |
163 | int main()
164 | {
165 | kernel<<<1,1>>>();
166 | cudaDeviceSynchronize();
167 |
168 | return 0;
169 | }
170 | ```
171 |
172 |
173 | So here, we create a kernel function to print "hello world" (which is executed by GPU cores), and we call this kernel function in the main function (CPU). There are two things we need to explain:
174 | - <<<1,1>>>: The **first "1"** represents the **number of blocks**, and the **second "1"** represents the **number of threads within a block.** I explained blocks and threads earlier, but here it is a bit different from the theoretical explanation. Before, I said threads represent the number of tasks; in this context, you can think of threads as GPU cores. **We are specifying how many "students" (GPU cores, or SPs) will perform the task of printing "hello world"** ==> **with <<<1,1>>>**, there is **one "class" (block)** with **one "student" (thread)** printing "hello world". In general, **<<<N,N>>>** means there are **N "classes" with N "students" each** printing "hello world" ==> "hello world" is printed N * N times.
175 |
176 | **Therefore, we can specify how many GPU cores (or threads) are used for execution. This leads to the statement:**
177 |
178 | **Kernel function: It executes instructions on the GPU. The CPU launches the kernel with special syntax (as explained earlier with NVCC) to inform the GPU about the number of threads to be used.**
179 |
180 | - Since the CPU and GPU are separate components with different processing speeds, we need synchronization between the two components. Hence, NVIDIA introduced **cudaDeviceSynchronize()**, which is a synchronization function. It ensures that all preceding computational tasks on the GPU are completed before the program proceeds to execute subsequent tasks on the CPU.
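
A minimal sketch of what this synchronization means in practice (the printed messages are just examples): a kernel launch is asynchronous, so without cudaDeviceSynchronize() the program may reach return 0 and exit before the GPU has finished printing.

```sh
#include <stdio.h>

__global__ void kernel()
{
    printf("printed by the GPU\n");
}

int main()
{
    kernel<<<1, 1>>>();              // asynchronous: the CPU does not wait here
    printf("printed by the CPU\n");  // usually appears before the GPU's line

    cudaDeviceSynchronize();         // block the CPU until all GPU work is done
    return 0;                        // without the line above, the GPU's output may never appear
}
```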
181 |
182 | # Exercises
183 | 1) You can try creating **__ _device_ __ functions** and calling them from **__ _global_ __ void**, and then call **__ _global_ __ void** from a **__ _host_ __** function and call that **__ _host_ __** function in the main function. Additionally, you can experiment with changing the order of the function calls to observe their impact. For example, try calling **__ _global_ __ void** inside **__ _device_ __**.
184 |
185 | ```sh
186 | #include <stdio.h>
187 |
188 | __device__ void Device1()
189 | {
190 | //
191 | }
192 |
193 | __device__ void Device2()
194 | {
195 | //
196 | }
197 |
198 | __global__ void kernel()
199 | {
200 | Device1();
201 | Device2();
202 | }
203 |
204 | void sub_Function_in_Host()
205 | {
206 | kernel<<<1, 1>>>();
207 | cudaDeviceSynchronize();
208 | }
209 |
210 | int main()
211 | {
212 | sub_Function_in_Host();
213 | return 0;
214 | }
215 | ```
216 |
217 | 2) You can try running some lines of code within the main function **before** and **after** the call to cudaDeviceSynchronize() and observe the output. See if it matches the theoretical expectations we discussed earlier.
218 |
219 | 3) In theory, as mentioned earlier, <<<1,1>>> indicates that there is one "class" with one "student" performing the task of printing "hello world". If we change it to:
220 |
221 | <<<1,10>>>: Now, there will be 10 "students" in one "class" performing the task of printing "hello world" concurrently.
222 |
223 | <<<2,5>>>: In this case, there will be 2 "classes" with 5 "students" in each class, performing the task of printing "hello world" concurrently.
224 |
225 | Both approaches will output "hello world" ten times, but what is the difference?
226 |
227 | Here are two hints:
228 | - It is related to SIMT, so ask yourself: what is SIMT?
229 | - I mentioned it in [chapter03](https://github.com/CisMine/Parallel-Computing-Cuda-C/blob/main/Chapter03/README.md)
230 |
--------------------------------------------------------------------------------
/Chapter07/README.md:
--------------------------------------------------------------------------------
1 | In [Chapter06](https://github.com/CisMine/Parallel-Computing-Cuda-C/tree/main/Chapter06), I introduced the different types of memory within the GPU (their functions, speeds, and the thread access scope). In this lesson, I will guide you on how to use them using the CUDA-C language.
2 |
3 |
4 |
# Using GPU memory
5 |
6 |
7 | Before diving into the code, I will answer the two questions that I mentioned in Chapter06, which are:
8 | - In the diagram, why are shared memory and L1 cache combined into a single memory instead of being separate memories?
9 |
10 | - Why does the access scope of L1 involve Threads within the same block, while L2 involves all Threads across the device?
11 |
12 | 
13 |
14 | 
15 |
16 | If our perspective is the **Physical view**, then **shared memory and L1 are certainly two separate memories**. My question is asked from the **Logical view**, where they are treated as **one**. The reason is that, as researchers found, keeping them as distinct memory regions would be **challenging to manage and very resource-intensive**.
17 |
18 | ```
19 | If you are thinking right now:
20 |
21 | - Difficult to manage?? If anything, merging them could lead to confusion between the memories, and separating them would be easier to manage. That claim seems incorrect.
22 |
23 | - Resource-intensive?? Whether they are merged or separated, we only have that much memory in total, so where is the waste of resources??
24 | ```
25 |
26 | If you have thoughts like that, then let me introduce the concept of **prefetching** (extremely effective for optimizing data access performance, and applied to caches).
27 |
28 | **Prefetching** is the process of loading data from main memory into the cache (or another intermediate memory) **before it is needed**, in order to optimize data access performance.
29 |
30 | Example:
31 |
32 | ```
33 | int a[100], b[100], c[100];
34 | for(int i = 0; i < 100; i++) {
35 | c[i] = a[i] + b[i];
36 | }
37 |
38 | ```
39 |
40 | In the code snippet above, we add two arrays, 'a' and 'b', and store the result in array 'c'. **In the typical access pattern**, the program iterates through the elements one by one, and on each access it follows a pointer into main memory to fetch data from 'a' and 'b'. This can lead to waiting time on every access to main memory.
41 |
42 | **The prefetching mechanism** is an intelligent way to reduce this waiting time. Instead of waiting until each element is individually accessed, it anticipates the **upcoming data accesses** the program is likely to make **based on previous access patterns** and loads that data into the cache ahead of time, reducing waiting times and improving overall program performance.
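
On the CPU side, compilers even expose prefetching as a manual hint. A minimal sketch using GCC/Clang's __builtin_prefetch (purely illustrative; in a simple loop like this the hardware prefetcher usually does the job on its own):

```
for (int i = 0; i < 100; i++) {
    if (i + 16 < 100) {
        __builtin_prefetch(&a[i + 16]);  // hint: this element will be needed soon
        __builtin_prefetch(&b[i + 16]);
    }
    c[i] = a[i] + b[i];
}
```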
43 |
44 | `This is why the cache is also referred to as temporary memory. When we push data into the cache, it is only used temporarily until that data is actually accessed. At that point, the data in the cache is fetched for processing, and the cache memory is replaced with the next set of data.`
45 |
46 | Returning to our original issue of **'Difficult to manage - Resource-intensive'**:
47 |
48 | - Difficult to manage: If shared memory and L1 were separate, we would need an additional mapping step to determine which data will be accessed next. When they are combined within a single memory mechanism, data can be efficiently shared between shared memory and L1 cache.
49 |
50 | - Resource-intensive: According to researchers, if we kept them as two separate memories, then in most code either shared memory or the cache would not be fully utilized (there would be leftover space). By combining them, we can allocate exactly as much shared memory as we need and give the rest to the cache, which is more efficient.
51 |
52 | `For these reasons, we only need to use shared memory without needing to touch the cache (and in reality, we cannot directly manipulate the cache as NVIDIA does not provide libraries for direct cache operations).`
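
That said, CUDA does let you hint at how the combined on-chip memory should be split for a particular kernel; a minimal sketch using cudaFuncSetCacheConfig (myKernel is a placeholder name, and on newer GPUs the driver is free to treat this purely as a hint):

```sh
__global__ void myKernel() { /* ... */ }

int main()
{
    // Ask the driver to favour shared memory over L1 cache for this kernel.
    // Other options: cudaFuncCachePreferL1, cudaFuncCachePreferEqual, cudaFuncCachePreferNone.
    cudaFuncSetCacheConfig(myKernel, cudaFuncCachePreferShared);

    myKernel<<<1, 1>>>();
    cudaDeviceSynchronize();
    return 0;
}
```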
53 |
54 | As for why the access scope of L1 is limited to the threads within a block while L2 covers all threads on the device: **global memory also requires a prefetch mechanism**, which forces the cache to be split into two parts: **one serving shared memory (L1) and one serving global memory (L2).**
55 |
56 |
57 |
58 |
59 |
# Now, let's move on to the code
60 |
61 |
62 |
63 |
# Global memory
64 |
65 |
66 | We will code the addition of two vectors (each with 100 elements) using global memory - the largest and slowest memory in the GPU.
67 |
68 | ```
69 | h_: represents values on the host.
70 |
71 | d_: represents values on the device.
72 |
73 | The symbols h_ and d_ are commonly used in CUDA guides and documents, so I will use them here to make them familiar to everyone.
74 | ```
75 |
76 | ```sh
77 | #include <stdio.h>
78 | #include <stdlib.h>
79 |
80 | // Size of the vector
81 | #define N 100
82 |
83 | // CUDA kernel to add two vectors
84 | __global__ void vectorAdd(int *a, int *b, int *c) {
85 | int tid = blockIdx.x * blockDim.x + threadIdx.x;
86 | if (tid < N) {
87 | c[tid] = a[tid] + b[tid];
88 | }
89 | }
90 |
91 | int main() {
92 | int *h_a, *h_b, *h_c; // Host vectors
93 | int *d_a, *d_b, *d_c; // Device vectors
94 |
95 | // Initialize host vectors
96 | h_a = (int *)malloc(N * sizeof(int));
97 | h_b = (int *)malloc(N * sizeof(int));
98 | h_c = (int *)malloc(N * sizeof(int));
99 |
100 | // Initialize host vectors with random values
101 | for (int i = 0; i < N; i++) {
102 | h_a[i] = rand() % 10;
103 | h_b[i] = rand() % 10;
104 | }
105 |
106 | // Allocate device memory for vectors
107 | cudaMalloc((void **)&d_a, N * sizeof(int));
108 | cudaMalloc((void **)&d_b, N * sizeof(int));
109 | cudaMalloc((void **)&d_c, N * sizeof(int));
110 |
111 | // Copy data from CPU to GPU
112 | cudaMemcpy(d_a, h_a, N * sizeof(int), cudaMemcpyHostToDevice);
113 | cudaMemcpy(d_b, h_b, N * sizeof(int), cudaMemcpyHostToDevice);
114 |
115 | // Call the CUDA kernel to perform vector addition
116 | vectorAdd<<<2, 50>>>(d_a, d_b, d_c);
117 |
118 | // Copy the result from GPU to CPU
119 | cudaMemcpy(h_c, d_c, N * sizeof(int), cudaMemcpyDeviceToHost);
120 |
121 | // Print the result
122 | for (int i = 0; i < N; i++) {
123 | printf("h_a[%d] %d + h_b[%d] %d = %d\n", i, h_a[i], i, h_b[i], h_c[i]);
124 | }
125 |
126 | // Free memory
127 | free(h_a);
128 | free(h_b);
129 | free(h_c);
130 | cudaFree(d_a);
131 | cudaFree(d_b);
132 | cudaFree(d_c);
133 |
134 | return 0;
135 | }
136 | ```
137 |
138 |
# Analyze the kernel
139 |
140 |
141 | ```sh
142 | __global__ void vectorAdd(int *a, int *b, int *c) {
143 | int tid = blockIdx.x * blockDim.x + threadIdx.x;
144 | if (tid < N) {
145 | c[tid] = a[tid] + b[tid];
146 | }
147 | }
148 | ```
149 | This kernel is quite similar to regular C code, with a few differences:
150 | - int tid = blockIdx.x * blockDim.x + threadIdx.x;
151 | - if (tid < N)
152 |
153 | In regular C code, to add two vectors we would loop over the elements and add them one by one. In CUDA C, all the elements are processed at once, one thread per element; to do this, each thread needs to determine the index (the position of the element it handles). That is what **int tid** is for.
154 |
155 | 
156 |
157 | 
158 |
159 | The two illustrations should make the idea clear. Here, **'M' is the number of threads in one block, also known as 'blockDim'.** For example, with blockDim = 50, the thread with threadIdx.x = 3 in block 1 gets tid = 1 * 50 + 3 = 53.
160 |
161 | Next is the **'if (tid < N)'** statement, which acts as a guard deciding which threads take part in adding the two vectors. In this code we only need 100 threads for the 100 elements, but a block can hold up to 1024 threads, so if more threads are launched than there are elements and the 'if' guard is omitted, the extra threads would read and write past the end of the arrays.
162 |
163 | `In fact, specifying 'if' is not always necessary because in 'vectorAdd<<<2, 50>>>' we've already determined that only 100 threads will be used. However, as the code becomes more complex, including the 'if' condition as a habit is recommended for better code management.`
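
For sizes that do not divide evenly into blocks, the usual habit is to round the number of blocks up and let the 'if (tid < N)' guard discard the extra threads; a minimal sketch (BLOCK_SIZE = 256 is just an example value):

```
int BLOCK_SIZE = 256;
int GRID_SIZE = (N + BLOCK_SIZE - 1) / BLOCK_SIZE;    // round up: for N = 100 this gives 1 block

vectorAdd<<<GRID_SIZE, BLOCK_SIZE>>>(d_a, d_b, d_c);  // threads with tid >= N simply do nothing
```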
164 |
165 |
166 |
# Initialization of values on the GPU
167 |
168 |
169 | ```
170 | cudaMalloc((void **)&d_a, N * sizeof(int));
171 | cudaMalloc((void **)&d_b, N * sizeof(int));
172 | cudaMalloc((void **)&d_c, N * sizeof(int));
173 | ```
174 | The cudaMalloc function is similar to malloc in C, but it is used to dynamically allocate memory on the GPU.
175 |
176 | A small note is that when you read various guides or documentation, you might see the following syntax:
177 |
178 | ```
179 | cudaMalloc(&d_a, N * sizeof(int));
180 | cudaMalloc(&d_b, N * sizeof(int));
181 | cudaMalloc(&d_c, N * sizeof(int));
182 | ```
183 |
184 | Both code snippets are equivalent. The version with the (void **) cast is an older style, while the latter is more modern and commonly used.
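
A related habit worth picking up early: every CUDA runtime call, including cudaMalloc and cudaMemcpy, returns a cudaError_t that you can check; a minimal sketch:

```
cudaError_t err = cudaMalloc(&d_a, N * sizeof(int));
if (err != cudaSuccess) {
    printf("cudaMalloc failed: %s\n", cudaGetErrorString(err));   // human-readable error message
    return 1;
}
```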
185 |
186 |
187 |
188 |
# Data Transfer
189 |
190 |
191 | Here I use the term "transfer data" to mean copying data from the Host to the Device and vice versa (later abbreviated as H2D and D2H).
192 |
193 | ```
194 | cudaMemcpy(d_a, h_a, N * sizeof(int), cudaMemcpyHostToDevice);
195 | cudaMemcpy(d_b, h_b, N * sizeof(int), cudaMemcpyHostToDevice);
196 |
197 | vectorAdd<<<2, 50>>>(d_a, d_b, d_c);
198 |
199 | cudaMemcpy(h_c, d_c, N * sizeof(int), cudaMemcpyDeviceToHost);
200 | ```
201 |
202 | The cudaMemcpy function is used to transfer data between the host (CPU) and the device (GPU). It takes four parameters:
203 | - first parameter: the destination (where the data is pasted - think of Ctrl+V)
204 | - second parameter: the source (what is copied - think of Ctrl+C)
205 | - third and fourth parameters: the number of bytes to copy, and the direction of the transfer (H2D for Host to Device or D2H for Device to Host)
206 |
207 | **An important note is that when you use cudaMemcpy, data is automatically copied to/from the global memory.**
208 |
209 |
210 |
# Local memory and registers
211 |
212 |
213 | Local memory and register files are two distinct memory types for each thread. Register files are the fastest memory available for each thread. When kernel variables cannot fit in register files, they will use local memory.
214 |
215 | In other words, each thread has its own set of register files and local memory. When register files are exhausted, data is spilled into local memory. This concept is known as **register spilling.**
216 |
217 | ```sh
218 | #include <stdio.h>
219 | #include <cuda_runtime.h>
220 |
221 | __global__ void kernel() {
222 | int temp = 0;
223 | temp = threadIdx.x;
224 |
225 | printf("blockId %d ThreadIdx %d = %d\n", blockIdx.x, threadIdx.x, temp);
226 | }
227 |
228 | int main() {
229 | kernel<<<5, 5>>>();
230 | cudaDeviceSynchronize();
231 |
232 | return 0;
233 | }
234 | ```
235 | In this code, each thread initializes its own temp variable with threadIdx.x. **While conventional (sequential) thinking might suggest that temp ends up holding the last value of threadIdx.x**, CUDA works differently: **each thread executes independently and has its own copy of temp, so for every thread, temp holds that thread's own threadIdx.x.**
236 |
237 | If you revisit the global memory example:
238 | - int tid = blockIdx.x * blockDim.x + threadIdx.x;
239 |
240 | The **int tid** variable is an example of such a per-thread variable: in practice it normally lives in a register, and it would only spill to local memory if the kernel ran out of registers.
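
If you want to see spilling happen, a per-thread array is the easiest way to provoke it; a minimal sketch (whether it actually spills depends on the GPU and the compiler):

```sh
__global__ void spillExample(int *out)
{
    int big[256];                    // a large per-thread array rarely fits in registers,
                                     // so the compiler usually places it in local memory
    for (int i = 0; i < 256; i++)
        big[i] = i * threadIdx.x;

    int sum = 0;                     // a small scalar like this normally lives in a register
    for (int i = 0; i < 256; i++)
        sum += big[i];

    out[threadIdx.x] = sum;
}
```

Compiling with nvcc -Xptxas -v prints the register count per thread and reports any spill loads/stores, so you can check where your variables ended up.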
241 |
242 |
243 |
# Constant Memory
244 |
245 |
246 | Constant memory is read-only and is used to store constant values. The example code demonstrates how to use constant memory for a simple equation, y = 3x + 5, with x as a vector and 3 and 5 as constants stored in constant memory.
247 |
248 | ```sh
249 | #include <stdio.h>
250 |
251 | __constant__ int constantData[2]; // Declaration of Constant memory array
252 |
253 | __global__ void kernel(int *d_x, int *d_y, int N) {
254 | int idx = blockIdx.x * blockDim.x + threadIdx.x;
255 |
256 | if (idx < N) {
257 | int x = d_x[idx];
258 | int a = constantData[0]; // Retrieve the value 3 from Constant memory
259 | int b = constantData[1]; // Retrieve the value 5 from Constant memory
260 | d_y[idx] = a * x + b;
261 | }
262 | }
263 |
264 | int main() {
265 | const int N = 10; // Number of array elements
266 | int h_x[N]; // Input array on the host
267 | int h_y[N]; // Result array on the host
268 | int *d_x, *d_y; // Arrays on the device
269 |
270 | // Initialize data on the host
271 | for (int i = 0; i < N; i++) {
272 | h_x[i] = i;
273 | }
274 |
275 | // Allocate memory for arrays on the GPU
276 | cudaMalloc((void**)&d_x, N * sizeof(int));
277 | cudaMalloc((void**)&d_y, N * sizeof(int));
278 |
279 | // Copy data from host to device
280 | cudaMemcpy(d_x, h_x, N * sizeof(int), cudaMemcpyHostToDevice);
281 |
282 | // Copy the values 3 and 5 into Constant memory
283 | int constantValues[2] = {3, 5};
284 | cudaMemcpyToSymbol(constantData, constantValues, 2 * sizeof(int));
285 |
286 | // Launch the kernel with 1 block and N threads
287 | kernel<<<1, N>>>(d_x, d_y, N);
288 | cudaDeviceSynchronize();
289 |
290 | // Copy the results from the device to the host
291 | cudaMemcpy(h_y, d_y, N * sizeof(int), cudaMemcpyDeviceToHost);
292 |
293 | // Print the results
294 | for (int i = 0; i < N; i++) {
295 | printf("3(x= %d) + 5 => y = %d\n", h_x[i], h_y[i]);
296 | }
297 |
298 | // Free memory on the device
299 | cudaFree(d_x);
300 | cudaFree(d_y);
301 |
302 | return 0;
303 | }
304 | ```
305 |
306 |
# Initialization of values on the GPU
307 |
308 |
309 | Unlike global memory, constant memory does not require cudaMalloc. Instead, you declare that you are using constant memory with the **__ _constant_ __** qualifier:
310 | - `__constant__ int constantData[2];`
311 |
312 |
313 |
# Transfer data
314 |
315 |
316 | `cudaMemcpyToSymbol(constantData, constantValues, 2 * sizeof(int))`
317 |
318 | Data transfer to constant memory is done using cudaMemcpyToSymbol.
319 |
320 | A small note: In the code, the direction of data transfer is not explicitly specified because it defaults to H2D (Host to Device).
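
For completeness, copying from a __constant__ symbol back to the host is done with cudaMemcpyFromSymbol, whose default direction is D2H; a minimal sketch:

```
int readBack[2];
cudaMemcpyFromSymbol(readBack, constantData, 2 * sizeof(int));  // D2H by default
```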
321 |
322 |
323 |
# Summary
324 |
325 |
326 | In summary, this tutorial covers the use of global memory, local memory and registers, and constant memory in CUDA programming. Shared memory will be discussed separately in another tutorial, and texture memory is no longer a significant concern on modern NVIDIA GPUs.
327 |
328 |
329 |
# Exercises
330 |
331 |
332 | When you run the code in the example of local memory and registers:
333 |
334 | ```sh
335 | #include <stdio.h>
336 | #include <cuda_runtime.h>
337 |
338 | __global__ void kernel() {
339 | int temp = 0;
340 | temp = threadIdx.x;
341 |
342 | printf("blockId %d ThreadIdx %d = %d\n", blockIdx.x, threadIdx.x, temp);
343 |
344 | }
345 |
346 | int main() {
347 | kernel<<<5, 5>>>();
348 | cudaDeviceSynchronize();
349 |
350 | return 0;
351 | }
352 | ```
353 | Why does the output not follow the order of blockId but appear mixed up as shown in the figure?
354 |
355 | 
356 |
357 |
358 |
359 |
360 |
--------------------------------------------------------------------------------