├── .gitignore ├── CUDA ├── AOSandSOA │ ├── AoS.cu │ └── SoA.cu ├── BFS │ └── bfs.cu ├── ConstantMemory │ └── constantStencil.cu ├── Instruction │ ├── atomic-ordering.cu │ ├── floating-point-accuracy.cu │ ├── floating-point-perf.cu │ ├── fmad.cu │ ├── intrinsic-standard-comp.cu │ └── my-atomic-add.cu ├── MatrixTranspose │ └── transpose.cu ├── NVIDIA_Online_Training │ ├── Fundamentals_of_CUDA_C_C++ │ │ ├── 01_hello-gpu.cu │ │ ├── 02_first-parallel.cu │ │ ├── 03_thread-and-block-idx.cu │ │ ├── 04_single-block-loop.cu │ │ ├── 05_multiple-block-loop.cu │ │ ├── 06_double-elements.cu │ │ ├── 07_grid-stride-double.cu │ │ ├── 08_add-error-handling.cu │ │ ├── 09_vector-add.cu │ │ ├── 10_matrix-multiply-2d.cu │ │ ├── 11_get-device-properties.cu │ │ ├── 12_page-faults.cu │ │ ├── 13_print-numbers.cu │ │ ├── 14_n-body.cu │ │ ├── 15_vector-add-manual-alloc.cu │ │ └── 16_vector-add-overlap-xfer.cu │ └── Scaling_Workloads_Across_Multiple_GPUs_with_CUDA_C++ │ │ └── mgpu_with_streams.cu ├── SharedMemory │ ├── reduceInteger.cu │ ├── smemRectangular.cu │ ├── smemSquare.cu │ └── transposeRectangular.cu ├── StreamsAndEvents │ ├── asyncAPI.cu │ ├── simpleCallback.cu │ ├── simpleHyperQBreadth.cu │ ├── simpleHyperQDependece.cu │ ├── simpleHyperQDepth.cu │ ├── simpleMultiAddBreadth.cu │ └── simpleMultiAddDepth.cu ├── UnifiedMemory │ ├── matrixAddWithUnifiedMemory.cu │ └── matrixAddWithoutUnifiedMemory.cu ├── WarpShuffle │ └── simpleShuffle.cu ├── bezierCurves │ ├── bezierCurves.cuh │ ├── bezierCurves1.cu │ └── bezierCurves2.cu ├── common │ ├── common.h │ └── common_string.h ├── convolution1D │ └── conv1D.cu ├── convolution2D │ └── conv2D.cu ├── deviceQuery │ ├── deviceQuery.cu │ └── simpleDeviceQuery.cu ├── histogram │ └── histogram.cu ├── imageProcessing │ ├── convertColorToGrey.cu │ ├── imageBlur.cu │ └── lena.jpg ├── matrixAdd │ ├── matrixAdd.cu │ └── matrixAdd2.cu ├── matrixMul │ └── matrixMul.cu ├── matrixMulTiling │ └── matrixMulTiling.cu ├── mergeSort │ └── mergeOperation.cu ├── 
prefixSum │ └── prefixSum.cu ├── reduction │ ├── nestedReduce.cu │ ├── reduceInteger.cu │ ├── reduction.cpp │ ├── reduction.h │ └── reductionKernel.cu ├── simpleDivergence │ └── simpleDivergence.cu ├── sparseMatrixVectorMul │ └── SpMV.cu └── vectorAdd │ ├── sumArrayZerocopy.cu │ └── vectorAdd.cu ├── OpenMP ├── 00_omp_hello.c ├── 01_omp_hello_errchk.c ├── 02_omp_trap1.c ├── 03_omp_trap2.c ├── 04_omp_trap3.c ├── 05_omp_trap4.c ├── 06_omp_fibo.c ├── 07_omp_pi.c ├── 08_omp_odd_even1.c ├── 09_omp_odd_even2.c ├── 10_omp_sin_sum.c ├── 11_omp_msg.c ├── 12_omp_mat_vec_mul.c ├── 13_omp_private.c ├── 14_omp_mat_mul.c └── queue │ ├── queue.c │ └── queue.h ├── cblas_mat_mul.c ├── cublas_mat_mul.cu ├── cuda_mat_mul.cu ├── mkl_mat_mul.c ├── mpi ├── 00_mpi_hello.c ├── 01_serial_trap.c ├── 02_mpi_trap1.c ├── 03_mpi_output.c ├── 04_mpi_trap2.c ├── 05_mpi_trap3.c ├── 06_mpi_trap4.c ├── 07_mpi_vec_add.c ├── 08_mpi_mat_vec_mul.c ├── 09_serial_mat_vec_mul_time.c ├── 10_mpi_mat_vec_mul_time.c ├── 11_serial_odd_even_sort.c ├── 12_mpi_odd_even_sort_unsafe.c ├── 13_mpi_odd_even_sort_safe.c └── 14_mpi_mat_mul.c ├── ocv_mat_mul.c └── pthread ├── 00_pth_hello.c ├── 01_pth_mat_vec_mul.c ├── 02_pth_pi.c ├── 03_pth_pi_busy1.c ├── 04_pth_pi_busy2.c ├── 05_pth_pi_mutex.c ├── 06_pth_message.c ├── 07_pth_message_sem.c ├── 08_pth_busy_barrier.c ├── 09_pth_sem_barrier.c ├── 10_pth_cond_barrier.c ├── 11_pth_posix_barrier.c ├── 12_pth_tokenize.c └── 13_pth_mat_mul.c /.gitignore: -------------------------------------------------------------------------------- 1 | * 2 | !*.* 3 | !*/ 4 | 5 | .vscode/ 6 | 7 | *.exe 8 | *.exp 9 | *.pdb 10 | *.ncu* 11 | *.lib -------------------------------------------------------------------------------- /CUDA/AOSandSOA/AoS.cu: -------------------------------------------------------------------------------- 1 | /***************************************************************************** 2 | * File: AoS.cu 3 | * Description: This is a simple example of using an array of 
structures to 4 | * store data on the device. 5 | * 6 | * Compile: nvcc -O3 -o AoS AoS.cu -I.. 7 | * Run: ./AoS [n] 8 | * [n] : the number of threads in a block 9 | *****************************************************************************/ 10 | 11 | #include 12 | #include 13 | #include "common/common.h" 14 | 15 | #define LEN 1 << 20 16 | 17 | struct innerStruct { 18 | float x; 19 | float y; 20 | }; 21 | 22 | void initialInnerStruct(innerStruct* in, const int N) 23 | { 24 | for (int i = 0; i < N; i++) { 25 | in[i].x = (rand() & 0xFF) / 100.f; 26 | in[i].y = (rand() & 0xFF) / 100.f; 27 | } 28 | } 29 | 30 | void testInnerStructHost(innerStruct* data, innerStruct* result, const int N) 31 | { 32 | for (int i = 0; i < N; i++) { 33 | result[i].x = data[i].x + 10.f; 34 | result[i].y = data[i].y + 20.f; 35 | } 36 | } 37 | 38 | void checkInnerStruct(innerStruct* hostRef, innerStruct* gpuRef, const int N) 39 | { 40 | double epsilon = 1.0e-8; 41 | 42 | for (int i = 0; i < N; i++) { 43 | if (abs(hostRef[i].x - gpuRef[i].x) > epsilon) { 44 | printf("different on %dth element: host %f gpu %f\n", i, hostRef[i].x, gpuRef[i].x); 45 | printf("Arrays do not match.\n\n"); 46 | 47 | break; 48 | } 49 | if (abs(hostRef[i].y - gpuRef[i].y) > epsilon) { 50 | printf("different on %dth element: host %f gpu %f\n", i, hostRef[i].y, gpuRef[i].y); 51 | printf("Arrays do not match.\n\n"); 52 | 53 | break; 54 | } 55 | } 56 | } 57 | 58 | __global__ 59 | void testInnerStruct(innerStruct* data, innerStruct* result, const int N) 60 | { 61 | unsigned int idx = blockDim.x * blockIdx.x + threadIdx.x; 62 | 63 | if (idx < N) { 64 | innerStruct tmp = data[idx]; 65 | tmp.x += 10.f; 66 | tmp.y += 20.f; 67 | result[idx] = tmp; 68 | } 69 | } 70 | 71 | __global__ 72 | void warmup(innerStruct* data, innerStruct* result, const int N) 73 | { 74 | unsigned int idx = blockDim.x * blockIdx.x + threadIdx.x; 75 | 76 | if (idx < N) { 77 | innerStruct tmp = data[idx]; 78 | tmp.x += 10.f; 79 | tmp.y += 20.f; 80 | 
result[idx] = tmp; 81 | } 82 | } 83 | 84 | int main(int argc, char** argv) 85 | { 86 | // setup device 87 | int dev = 0; 88 | cudaDeviceProp deviceProp; 89 | CUDA_CHECK(cudaGetDeviceProperties(&deviceProp, dev)); 90 | printf("Test struct of array at device %d: %s\n", dev, deviceProp.name); 91 | CUDA_CHECK(cudaSetDevice(dev)); 92 | 93 | // allocate host memory 94 | int nElem = LEN; 95 | size_t nBytes = nElem * sizeof(innerStruct); 96 | innerStruct *h_A = (innerStruct*)malloc(nBytes); 97 | innerStruct *hostRef = (innerStruct*)malloc(nBytes); 98 | innerStruct *gpuRef = (innerStruct*)malloc(nBytes); 99 | 100 | // initialize host array 101 | initialInnerStruct(h_A, nElem); 102 | testInnerStructHost(h_A, hostRef, nElem); 103 | 104 | // allocate device memory 105 | innerStruct* d_A, *d_C; 106 | CUDA_CHECK(cudaMalloc((void**)&d_A, nBytes)); 107 | CUDA_CHECK(cudaMalloc((void**)&d_C, nBytes)); 108 | 109 | // copy data from host to device 110 | CUDA_CHECK(cudaMemcpy(d_A, h_A, nBytes, cudaMemcpyHostToDevice)); 111 | 112 | // setup execution configuration 113 | int threads = 128; 114 | if (argc > 1) 115 | threads = atoi(argv[1]); 116 | 117 | dim3 blocks(threads, 1); 118 | dim3 grids((blocks.x + nElem - 1) / blocks.x, 1); 119 | 120 | double start, finish; 121 | // kernel 1: warmup 122 | GET_TIME(start); 123 | warmup<<>>(d_A, d_C, nElem); 124 | cudaDeviceSynchronize(); 125 | GET_TIME(finish); 126 | //printf("warpup <<< %3d, %3d >>> elapsed %f sec\n", grids.x, blocks.x, finish-start); 127 | CUDA_CHECK(cudaMemcpy(gpuRef, d_C, nBytes, cudaMemcpyDeviceToHost)); 128 | checkInnerStruct(hostRef, gpuRef, nElem); 129 | 130 | // kernel 2: testInnerStruct 131 | GET_TIME(start); 132 | testInnerStruct<<>>(d_A, d_C, nElem); 133 | CUDA_CHECK(cudaDeviceSynchronize()); 134 | GET_TIME(finish); 135 | printf("innerstruct <<< %3d, %3d >>> elapsed %f sec\n", grids.x, blocks.x, finish-start); 136 | CUDA_CHECK(cudaMemcpy(gpuRef, d_C, nBytes, cudaMemcpyDeviceToHost)); 137 | checkInnerStruct(hostRef, 
gpuRef, nElem); 138 | 139 | // free memories bost host and device 140 | CUDA_CHECK(cudaFree(d_A)); 141 | CUDA_CHECK(cudaFree(d_C)); 142 | free(h_A); 143 | free(hostRef); 144 | free(gpuRef); 145 | 146 | CUDA_CHECK(cudaDeviceReset()); 147 | return 0; 148 | } -------------------------------------------------------------------------------- /CUDA/AOSandSOA/SoA.cu: -------------------------------------------------------------------------------- 1 | /***************************************************************************** 2 | * File: SoA.cu 3 | * Description: This is a simple example of using a structure of arrays to 4 | * store data on the device. 5 | * 6 | * Compile: nvcc -O3 -o SoA SoA.cu -I.. 7 | * Run: ./SoA [n] 8 | * [n] : the number of threads in a block 9 | *****************************************************************************/ 10 | 11 | #include 12 | #include 13 | #include "common/common.h" 14 | 15 | #define LEN 1 << 20 16 | 17 | struct innerArray { 18 | float x[LEN]; 19 | float y[LEN]; 20 | }; 21 | 22 | void initialInnerArray(innerArray* in, const int N) 23 | { 24 | for (int i = 0; i < N; i++) { 25 | in->x[i] = (rand() & 0xFF) / 100.f; 26 | in->y[i] = (rand() & 0xFF) / 100.f; 27 | } 28 | } 29 | 30 | void testInnerArrayHost(innerArray* data, innerArray* result, const int N) 31 | { 32 | for (int i = 0; i < N; i++) { 33 | result->x[i] = data->x[i] + 10.f; 34 | result->y[i] = data->y[i] + 20.f; 35 | } 36 | } 37 | 38 | void checkInnerArray(innerArray* hostRef, innerArray* gpuRef, const int N) 39 | { 40 | double epsilon = 1.0e-8; 41 | 42 | for (int i = 0; i < N; i++) { 43 | if (abs(hostRef->x[i] - gpuRef->x[i]) > epsilon) { 44 | printf("different on %dth element: host %f gpu %f\n", i, hostRef->x[i], gpuRef->x[i]); 45 | printf("Arrays do not match.\n\n"); 46 | 47 | break; 48 | } 49 | if (abs(hostRef->y[i] - gpuRef->y[i]) > epsilon) { 50 | printf("different on %dth element: host %f gpu %f\n", i, hostRef->y[i], gpuRef->y[i]); 51 | printf("Arrays do not 
match.\n\n"); 52 | 53 | break; 54 | } 55 | } 56 | } 57 | 58 | __global__ 59 | void testInnerArray(innerArray* data, innerArray* result, const int N) 60 | { 61 | unsigned int idx = blockDim.x * blockIdx.x + threadIdx.x; 62 | 63 | if (idx < N) { 64 | float tmpX = data->x[idx]; 65 | float tmpY = data->y[idx]; 66 | 67 | tmpX += 10.f; 68 | tmpY += 20.f; 69 | result->x[idx] = tmpX; 70 | result->y[idx] = tmpY; 71 | } 72 | } 73 | 74 | __global__ 75 | void warmup(innerArray* data, innerArray* result, const int N) 76 | { 77 | unsigned int idx = blockDim.x * blockIdx.x + threadIdx.x; 78 | 79 | if (idx < N) { 80 | float tmpX = data->x[idx]; 81 | float tmpY = data->y[idx]; 82 | 83 | tmpX += 10.f; 84 | tmpY += 20.f; 85 | result->x[idx] = tmpX; 86 | result->y[idx] = tmpY; 87 | } 88 | } 89 | 90 | int main(int argc, char** argv) 91 | { 92 | // setup device 93 | int dev = 0; 94 | cudaDeviceProp deviceProp; 95 | CUDA_CHECK(cudaGetDeviceProperties(&deviceProp, dev)); 96 | printf("Test struct of array at device %d: %s\n", dev, deviceProp.name); 97 | CUDA_CHECK(cudaSetDevice(dev)); 98 | 99 | // allocate host memory 100 | int nElem = LEN; 101 | size_t nBytes = sizeof(innerArray); 102 | innerArray *h_A = (innerArray*)malloc(nBytes); 103 | innerArray *hostRef = (innerArray*)malloc(nBytes); 104 | innerArray *gpuRef = (innerArray*)malloc(nBytes); 105 | 106 | // initialize host array 107 | initialInnerArray(h_A, nElem); 108 | testInnerArrayHost(h_A, hostRef, nElem); 109 | 110 | // allocate device memory 111 | innerArray* d_A, *d_C; 112 | CUDA_CHECK(cudaMalloc((void**)&d_A, nBytes)); 113 | CUDA_CHECK(cudaMalloc((void**)&d_C, nBytes)); 114 | 115 | // copy data from host to device 116 | CUDA_CHECK(cudaMemcpy(d_A, h_A, nBytes, cudaMemcpyHostToDevice)); 117 | 118 | // setup execution configuration 119 | int threads = 128; 120 | if (argc > 1) 121 | threads = atoi(argv[1]); 122 | 123 | dim3 blocks(threads, 1); 124 | dim3 grids((blocks.x + nElem - 1) / blocks.x, 1); 125 | 126 | double start, finish; 
127 | // kernel 1: warmup 128 | GET_TIME(start); 129 | warmup<<>>(d_A, d_C, nElem); 130 | cudaDeviceSynchronize(); 131 | GET_TIME(finish); 132 | //printf("warpup <<< %3d, %3d >>> elapsed %f sec\n", grids.x, blocks.x, finish-start); 133 | CUDA_CHECK(cudaMemcpy(gpuRef, d_C, nBytes, cudaMemcpyDeviceToHost)); 134 | checkInnerArray(hostRef, gpuRef, nElem); 135 | 136 | // kernel 2: testInnerArray 137 | GET_TIME(start); 138 | testInnerArray<<>>(d_A, d_C, nElem); 139 | CUDA_CHECK(cudaDeviceSynchronize()); 140 | GET_TIME(finish); 141 | printf("innerarray <<< %3d, %3d >>> elapsed %f sec\n", grids.x, blocks.x, finish-start); 142 | CUDA_CHECK(cudaMemcpy(gpuRef, d_C, nBytes, cudaMemcpyDeviceToHost)); 143 | checkInnerArray(hostRef, gpuRef, nElem); 144 | 145 | // free memories bost host and device 146 | CUDA_CHECK(cudaFree(d_A)); 147 | CUDA_CHECK(cudaFree(d_C)); 148 | free(h_A); 149 | free(hostRef); 150 | free(gpuRef); 151 | 152 | CUDA_CHECK(cudaDeviceReset()); 153 | return 0; 154 | } -------------------------------------------------------------------------------- /CUDA/Instruction/atomic-ordering.cu: -------------------------------------------------------------------------------- 1 | /***************************************************************************** 2 | * File: atomic-ordering.cu 3 | * Description: This is an example to illustrates the difference between using 4 | * atomic operations and using unsafe accesses to increment a shared 5 | * variable. 6 | * In both the atomics() and unsafe() kernels, each thread repeatedly 7 | * increments a globally shared variable by 1. Each thread also stores 8 | * the value is reads from the shared location for the first increment. 9 | * 10 | * Compile: nvcc -o atomic-ordering atomic-ordering.cu -I.. 
11 | * Run: ./atomic-ordering 12 | *****************************************************************************/ 13 | #include 14 | #include 15 | #include 16 | #include "common/common.h" 17 | /* atomics: each thread with tid < N stores the value returned by its first atomicAdd on *shared_var into values_read[tid], then issues iters more atomicAdd(shared_var, 1) calls. values_read must hold at least N ints. */ 18 | __global__ 19 | void atomics(int* shared_var, int* values_read, int N, int iters) 20 | { 21 | int tid = blockDim.x * blockIdx.x + threadIdx.x; 22 | /* valid indices are 0..N-1, so a thread with tid == N must exit as well */ 23 | if (tid >= N) 24 | return; 25 | 26 | values_read[tid] = atomicAdd(shared_var, 1); 27 | 28 | for (int i = 0; i < iters; i++) 29 | atomicAdd(shared_var, 1); 30 | } 31 | /* unsafe: same access pattern as atomics(), but the increment is a plain, non-atomic read followed by a write of *shared_var, so concurrent updates can overwrite each other. */ 32 | __global__ 33 | void unsafe(int* shared_var, int* values_read, int N, int iters) 34 | { 35 | int tid = blockDim.x * blockIdx.x + threadIdx.x; 36 | /* valid indices are 0..N-1, so a thread with tid == N must exit as well */ 37 | if (tid >= N) 38 | return; 39 | 40 | int old = *shared_var; 41 | *shared_var = old + 1; 42 | values_read[tid] = old; 43 | 44 | for (int i = 0; i < iters; i++) { 45 | int old = *shared_var; 46 | *shared_var = old + 1; 47 | } 48 | } 49 | 50 | void print_read_results(int *h_arr, int *d_arr, int N, const char* label) 51 | { 52 | int maxNumToPrint = 10; 53 | int nToPrint = N > maxNumToPrint ? 
maxNumToPrint : N; 54 | 55 | CUDA_CHECK(cudaMemcpy(h_arr, d_arr, nToPrint * sizeof(int), cudaMemcpyDeviceToHost)); 56 | printf("Threads performing %s operations read values", label); 57 | 58 | for (int i = 0; i < nToPrint; i++) { 59 | printf(" %d", h_arr[i]); 60 | } 61 | printf("\n"); 62 | } 63 | 64 | int main(int argc, char** argv) 65 | { 66 | int N = 64; 67 | int block = 32; 68 | int runs = 30; 69 | int iters = 100000; 70 | int *d_shared_var; 71 | int h_shared_var_atomic, h_shared_var_unsafe; 72 | int *d_values_read_atomic, *d_values_read_unsafe; 73 | int *h_values_read; 74 | 75 | CUDA_CHECK(cudaMalloc((void**)&d_shared_var, sizeof(int))); 76 | CUDA_CHECK(cudaMalloc((void**)&d_values_read_atomic, N * sizeof(int))); 77 | CUDA_CHECK(cudaMalloc((void**)&d_values_read_unsafe, N * sizeof(int))); 78 | h_values_read = (int*)malloc(N * sizeof(int)); 79 | 80 | double atomic_mean_time = 0; 81 | double unsafe_mean_time = 0; 82 | 83 | for (int r = 0; r < runs; r++) { 84 | double start, stop; 85 | GET_TIME(start); 86 | CUDA_CHECK(cudaMemset(d_shared_var, 0x00, sizeof(int))); 87 | atomics<<>>(d_shared_var, d_values_read_atomic, N, iters); 88 | CUDA_CHECK(cudaDeviceSynchronize()); 89 | GET_TIME(stop) 90 | atomic_mean_time += (stop - start); 91 | CUDA_CHECK(cudaMemcpy(&h_shared_var_atomic, d_shared_var, sizeof(int), cudaMemcpyDeviceToHost)); 92 | 93 | GET_TIME(start); 94 | CUDA_CHECK(cudaMemset(d_shared_var, 0x00, sizeof(int))); 95 | unsafe<<>>(d_shared_var, d_values_read_unsafe, N, iters); 96 | CUDA_CHECK(cudaDeviceSynchronize()); 97 | GET_TIME(stop); 98 | unsafe_mean_time += stop - start; 99 | CUDA_CHECK(cudaMemcpy(&h_shared_var_unsafe, d_shared_var, sizeof(int), cudaMemcpyDeviceToHost)); 100 | } 101 | 102 | printf("In total, %d runs using atomic operations took %f s\n", runs, atomic_mean_time); 103 | printf(" Using atomic operations also produced an output of %d\n", h_shared_var_atomic); 104 | printf("In total, %d runs using unsafe operations took %f s\n", runs, 
unsafe_mean_time); 105 | printf(" Using unsafe operations also produced an output of %d\n", h_shared_var_unsafe); 106 | 107 | print_read_results(h_values_read, d_values_read_atomic, N, "atomic"); 108 | print_read_results(h_values_read, d_values_read_unsafe, N, "unsafe"); 109 | 110 | return 0; 111 | } -------------------------------------------------------------------------------- /CUDA/Instruction/floating-point-accuracy.cu: -------------------------------------------------------------------------------- 1 | /***************************************************************************** 2 | * File: floating-point-accuracy.cu 3 | * Description: This is an example to demonstrate floating-point's inability to 4 | * represent certain values with a specific value as an example. 5 | * 6 | * In this example, the value 12.1 is stored in single- and 7 | * double-precision floating-point variables on both the host and 8 | * device. After retrieving the results from the device, the actual 9 | * values stored are printed to 20 decimal places and the single- and 10 | * double-precision results from the host and device are compared to 11 | * each other to verify that host and device are equally accurate for 12 | * the same type. 13 | * 14 | * Compile: nvcc -o floating-point-accuracy floating-point-accuracy.cu -I.. 
15 | * Run: ./floating-point-accuracy 16 | *****************************************************************************/ 17 | #include 18 | #include 19 | #include 20 | #include "common/common.h" 21 | /* kernel: the thread with global index 0 stores the literal 12.1 into *f (as float) and into *d (as double). */ 22 | __global__ 23 | void kernel(float* f, double* d) 24 | { 25 | int tid = blockIdx.x * blockDim.x + threadIdx.x; 26 | 27 | if (tid == 0) { 28 | *f = 12.1; 29 | *d = 12.1; 30 | } 31 | } 32 | /* main: stores 12.1 in single and double precision on the host and on the device, copies the device values back, prints all four, and reports whether host and device agree. */ 33 | int main(int argc, char **argv) 34 | { 35 | float *deviceF; 36 | float h_deviceF; 37 | double *deviceD; 38 | double h_deviceD; 39 | 40 | float hostF = 12.1; 41 | double hostD = 12.1; 42 | 43 | CUDA_CHECK(cudaMalloc((void **)&deviceF, sizeof(float))); 44 | CUDA_CHECK(cudaMalloc((void **)&deviceD, sizeof(double))); 45 | kernel<<<1, 32>>>(deviceF, deviceD); 46 | CUDA_CHECK(cudaMemcpy(&h_deviceF, deviceF, sizeof(float), 47 | cudaMemcpyDeviceToHost)); 48 | CUDA_CHECK(cudaMemcpy(&h_deviceD, deviceD, sizeof(double), 49 | cudaMemcpyDeviceToHost)); 50 | /* the "Device" lines must print the values copied back from the GPU (h_deviceF / h_deviceD), not the host variables */ 51 | printf("Host single-precision representation of 12.1 = %.20f\n", hostF); 52 | printf("Host double-precision representation of 12.1 = %.20f\n", hostD); 53 | printf("Device single-precision representation of 12.1 = %.20f\n", h_deviceF); 54 | printf("Device double-precision representation of 12.1 = %.20f\n", h_deviceD); 55 | printf("Device and host single-precision representation equal? %s\n", 56 | hostF == h_deviceF ? "yes" : "no"); 57 | printf("Device and host double-precision representation equal? %s\n", 58 | hostD == h_deviceD ? "yes" : "no"); 59 | 60 | return 0; 61 | } -------------------------------------------------------------------------------- /CUDA/Instruction/fmad.cu: -------------------------------------------------------------------------------- 1 | /***************************************************************************** 2 | * File: fmad.cu 3 | * Description: This is an example to illustrate the effect on numerical accuracy 4 | * of fusing a multiply-add into a single MAD instruction. 
5 | * 6 | * Compile: nvcc -o fmad fmad.cu -I.. [--fmad=true or false] 7 | * Run: ./fmad 8 | *****************************************************************************/ 9 | #include 10 | #include 11 | #include 12 | #include "common/common.h" 13 | 14 | __global__ 15 | void fmad_kernel(double x, double y, double *out) 16 | { 17 | int tid = blockDim.x * blockIdx.x + threadIdx.x; 18 | 19 | if (tid == 0) { 20 | *out = x * x + y; 21 | } 22 | } 23 | 24 | double host_fmad_kernel(double x, double y) 25 | { 26 | return x * x + y; 27 | } 28 | 29 | int main(int argc, char** argv) 30 | { 31 | double *d_out, h_out; 32 | double x = 2.891903; 33 | double y = -3.980364; 34 | 35 | double host_value = host_fmad_kernel(x, y); 36 | 37 | CUDA_CHECK(cudaMalloc((void**)&d_out, sizeof(double))); 38 | fmad_kernel<<<1, 32>>>(x, y, d_out); 39 | CUDA_CHECK(cudaMemcpy(&h_out, d_out, sizeof(double), cudaMemcpyDeviceToHost)); 40 | 41 | if (host_value == h_out) { 42 | printf("The device output the same value as the host.\n"); 43 | } 44 | else { 45 | printf("The device output a different value than the host, diff=%e.\n", fabs(host_value - h_out)); 46 | } 47 | 48 | return 0; 49 | } -------------------------------------------------------------------------------- /CUDA/Instruction/intrinsic-standard-comp.cu: -------------------------------------------------------------------------------- 1 | /***************************************************************************** 2 | * File: intrinsic-standard-comp.cu 3 | * Description: This is an example to demonstrate the relative performance and 4 | * accuracy of CUDA standard and intrinsic functions. 5 | * 6 | * The computational kernel of this example is the iterative 7 | * calculation of a value squared. This computation is done on the 8 | * host, on the device with a standard function. 
The results from 9 | * all three are compared for numerical accuarcy (with the host as 10 | * the baseline), and the performance of standard and intrinsic 11 | * function is also compared. 12 | * 13 | * Compile: nvcc -o intrinsic-standard-comp intrinsic-standard-comp.cu -I.. 14 | * Run: ./intrinsic-standard-comp 15 | *****************************************************************************/ 16 | #include 17 | #include 18 | #include 19 | #include "common/common.h" 20 | 21 | /* Perform iters power operations using the standard powf function. */ 22 | __global__ 23 | void standard_kernel(float a, float *out, int iters) 24 | { 25 | int tid = blockDim.x * blockIdx.x + threadIdx.x; 26 | 27 | if (tid == 0) { 28 | float tmp; 29 | 30 | for (int i = 0; i < iters; i++) 31 | tmp = powf(a, 2.0f); 32 | 33 | *out = tmp; 34 | } 35 | } 36 | 37 | /* Perform iters power operations using the intrinsic __powf function. */ 38 | __global__ 39 | void intrinsic_kernel(float a, float *out, int iters) 40 | { 41 | int tid = blockDim.x * blockIdx.x + threadIdx.x; 42 | 43 | if (tid == 0) { 44 | float tmp; 45 | 46 | for (int i = 0; i < iters; i++) 47 | tmp = __powf(a, 2.0f); 48 | 49 | *out = tmp; 50 | } 51 | } 52 | 53 | int main(int argc, char** argv) 54 | { 55 | int runs = 30; 56 | int iters = 1000; 57 | 58 | float *d_standard_out, h_standard_out; 59 | CUDA_CHECK(cudaMalloc((void**)&d_standard_out, sizeof(float))); 60 | 61 | float *d_intrinsic_out, h_intrinsic_out; 62 | CUDA_CHECK(cudaMalloc((void**)&d_intrinsic_out, sizeof(float))); 63 | 64 | float input_value = 8181.25; 65 | 66 | double mean_standard_time = 0.0; 67 | double mean_intrinsic_time = 0.0; 68 | 69 | for (int i = 0; i < runs; i++) { 70 | double start, stop; 71 | 72 | GET_TIME(start); 73 | standard_kernel<<<1, 32>>>(input_value, d_standard_out, iters); 74 | CUDA_CHECK(cudaDeviceSynchronize()); 75 | GET_TIME(stop); 76 | mean_standard_time += stop - start; 77 | 78 | GET_TIME(start); 79 | intrinsic_kernel<<<1, 32>>>(input_value, 
d_intrinsic_out, iters); 80 | CUDA_CHECK(cudaDeviceSynchronize()); 81 | GET_TIME(stop); 82 | mean_intrinsic_time += stop - start; 83 | } 84 | 85 | CUDA_CHECK(cudaMemcpy(&h_standard_out, d_standard_out, sizeof(float), cudaMemcpyDeviceToHost)); 86 | CUDA_CHECK(cudaMemcpy(&h_intrinsic_out, d_intrinsic_out, sizeof(float), cudaMemcpyDeviceToHost)); 87 | float host_value = powf(input_value, 2.0f); 88 | 89 | mean_standard_time /= runs; 90 | mean_intrinsic_time /= runs; 91 | 92 | printf("Host calculated\t\t\t%f\n", host_value); 93 | printf("Standard Device calculated\t%f\n", h_standard_out); 94 | printf("Intrinsic Device calculated\t%f\n", h_intrinsic_out); 95 | printf("Host equals Standard?\t\t%s, diff=%e\n", 96 | host_value == h_standard_out ? "Yes" : "No", 97 | fabs(host_value - h_standard_out)); 98 | printf("Host equals Intrinsic?\t\t%s, diff=%e\n", 99 | host_value == h_intrinsic_out ? "Yes" : "No", 100 | fabs(host_value - h_intrinsic_out)); 101 | printf("Standard equals Intrinsic?\t%s, diff=%e\n", 102 | h_standard_out == h_intrinsic_out ? "Yes" : "No", 103 | fabs(h_standard_out - h_intrinsic_out)); 104 | printf("\n"); 105 | printf("Mean execution time for standard function powf: %f ms\n", 106 | mean_standard_time * 1000.f); 107 | printf("Mean execution time for intrinsic function __powf: %f ms\n", 108 | mean_intrinsic_time * 1000.f); 109 | 110 | return 0; 111 | } -------------------------------------------------------------------------------- /CUDA/Instruction/my-atomic-add.cu: -------------------------------------------------------------------------------- 1 | /***************************************************************************** 2 | * File: my-atomic-add.cu 3 | * Description: This is an example to illustrates implementation of custom atomic 4 | * operations using CUDA's build-in atomicCAS function to implement 5 | * atomic signed 32-bit integer addition 6 | * 7 | * Compile: nvcc -o my-atomic-add my-atomic-add.cu -I.. 
8 | * Run: ./my-atomic-add 9 | *****************************************************************************/ 10 | #include 11 | #include 12 | #include 13 | #include "common/common.h" 14 | 15 | __device__ 16 | int myAtomicAdd(int* address, int incr) 17 | { 18 | // Create an initial guess for the value stored at *address 19 | int guess = *address; 20 | int oldValue = atomicCAS(address, guess, guess + incr); 21 | 22 | // Loop while the quess is incorrect 23 | while (oldValue != guess) { 24 | guess = oldValue; 25 | oldValue = atomicCAS(address, guess, guess + incr); 26 | } 27 | 28 | return oldValue; 29 | } 30 | 31 | __global__ 32 | void kernel(int *sharedInteger) 33 | { 34 | myAtomicAdd(sharedInteger, 1); 35 | } 36 | 37 | int main(int argc, char **argv) 38 | { 39 | int h_sharedInteger; 40 | int *d_sharedInteger; 41 | CUDA_CHECK(cudaMalloc((void **)&d_sharedInteger, sizeof(int))); 42 | CUDA_CHECK(cudaMemset(d_sharedInteger, 0x00, sizeof(int))); 43 | 44 | kernel<<<4, 128>>>(d_sharedInteger); 45 | 46 | CUDA_CHECK(cudaMemcpy(&h_sharedInteger, d_sharedInteger, sizeof(int), cudaMemcpyDeviceToHost)); 47 | printf("4 x 128 increments led to value of %d\n", h_sharedInteger); 48 | 49 | return 0; 50 | } -------------------------------------------------------------------------------- /CUDA/NVIDIA_Online_Training/Fundamentals_of_CUDA_C_C++/01_hello-gpu.cu: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | void helloCPU() 4 | { 5 | printf("Hello from the CPU.\n"); 6 | } 7 | 8 | __global__ void helloGPU() 9 | { 10 | printf("Hello from the GPU.\n"); 11 | } 12 | 13 | int main() 14 | { 15 | helloCPU(); 16 | 17 | helloGPU<<<1, 1>>>(); 18 | cudaDeviceSynchronize(); 19 | } -------------------------------------------------------------------------------- /CUDA/NVIDIA_Online_Training/Fundamentals_of_CUDA_C_C++/02_first-parallel.cu: -------------------------------------------------------------------------------- 1 | // 02_first-parallel.cu 2 
| #include 3 | 4 | __global__ 5 | void firstParallel() 6 | { 7 | printf("This is running in parallel.\n"); 8 | } 9 | 10 | int main() 11 | { 12 | firstParallel<<<5, 5>>>(); 13 | cudaDeviceSynchronize(); 14 | } -------------------------------------------------------------------------------- /CUDA/NVIDIA_Online_Training/Fundamentals_of_CUDA_C_C++/03_thread-and-block-idx.cu: -------------------------------------------------------------------------------- 1 | // 03_thread-and-block-idx.cu 2 | #include 3 | 4 | __global__ 5 | void printSuccessForCorrectExecutionConfiguration() 6 | { 7 | if (threadIdx.x == 1023 && blockIdx.x == 255) { 8 | printf("Success.\n"); 9 | } 10 | } 11 | 12 | int main() 13 | { 14 | printSuccessForCorrectExecutionConfiguration<<<256, 1024>>>(); 15 | cudaDeviceSynchronize(); 16 | } -------------------------------------------------------------------------------- /CUDA/NVIDIA_Online_Training/Fundamentals_of_CUDA_C_C++/04_single-block-loop.cu: -------------------------------------------------------------------------------- 1 | // 04_single-block-loop 2 | #include 3 | 4 | __global__ 5 | void loop() 6 | { 7 | printf("This is iteration number %d\n", threadIdx.x); 8 | } 9 | 10 | int main() 11 | { 12 | loop<<<1, 10>>>(); 13 | cudaDeviceSynchronize(); 14 | } -------------------------------------------------------------------------------- /CUDA/NVIDIA_Online_Training/Fundamentals_of_CUDA_C_C++/05_multiple-block-loop.cu: -------------------------------------------------------------------------------- 1 | // 05_multiple-block-loop 2 | #include 3 | 4 | __global__ 5 | void loop() 6 | { 7 | int idx = blockIdx.x * blockDim.x + threadIdx.x; 8 | printf("This is iteration number %d\n", idx); 9 | } 10 | 11 | int main() 12 | { 13 | loop<<<2, 5>>>(); 14 | cudaDeviceSynchronize(); 15 | } 16 | 17 | // CPU-only 18 | 19 | int N = 2<<20; 20 | size_t size = N * sizeof(int); 21 | 22 | int *a; 23 | a = (int *)malloc(size); 24 | 25 | // Use `a` in CPU-only program. 
26 | 27 | free(a); 28 | // Accelerated 29 | 30 | int N = 2<<20; 31 | size_t size = N * sizeof(int); 32 | 33 | int *a; 34 | // Note the address of `a` is passed as first argument. 35 | cudaMallocManaged(&a, size); 36 | 37 | // Use `a` on the CPU and/or on any GPU in the accelerated system. 38 | 39 | cudaFree(a); -------------------------------------------------------------------------------- /CUDA/NVIDIA_Online_Training/Fundamentals_of_CUDA_C_C++/06_double-elements.cu: -------------------------------------------------------------------------------- 1 | // 06_double-elements.cu 2 | #include <stdio.h> 3 | /* init: fills a[0..N-1] with a[i] = i. */ 4 | void init(int *a, const int N) 5 | { 6 | for (int i = 0; i < N; i++) { 7 | a[i] = i; 8 | } 9 | } 10 | /* doubleElements: one thread per element; the thread with global index idx doubles a[idx] when idx < N. The launch must supply at least N threads in total. */ 11 | __global__ 12 | void doubleElements(int *a, const int N) 13 | { 14 | int idx = blockIdx.x * blockDim.x + threadIdx.x; 15 | if (idx < N) { 16 | a[idx] *= 2; 17 | } 18 | } 19 | /* checkElementsAreDoubled: returns true only if a[i] == 2 * i for every i in [0, N). */ 20 | bool checkElementsAreDoubled(int *a, const int N) 21 | { 22 | for (int i = 0; i < N; i++) { 23 | if (a[i] != i * 2) 24 | return false; 25 | } 26 | 27 | return true; 28 | } 29 | 30 | int main() 31 | { 32 | int N = 1000; 33 | int *a; 34 | 35 | size_t size = N * sizeof(int); 36 | 37 | // Use 'cudaMallocManaged' to allocate pointer 'a' available 38 | // on both the host and the device. 39 | cudaMallocManaged(&a, size); 40 | 41 | init(a, N); 42 | 43 | size_t threads_per_block = 256; 44 | size_t number_of_blocks = (N + threads_per_block - 1) / threads_per_block; 45 | /* ceil-divided block count so that number_of_blocks * threads_per_block >= N */ 46 | doubleElements<<<number_of_blocks, threads_per_block>>>(a, N); 47 | cudaDeviceSynchronize(); 48 | 49 | bool areDoubled = checkElementsAreDoubled(a, N); 50 | printf("All elements were doubled? %s\n", areDoubled ? 
"TRUE" : "FALSE"); 51 | 52 | // Use 'cudaFree' to free memory allocated with 'cudaMallocManaged' 53 | cudaFree(a); 54 | } -------------------------------------------------------------------------------- /CUDA/NVIDIA_Online_Training/Fundamentals_of_CUDA_C_C++/07_grid-stride-double.cu: -------------------------------------------------------------------------------- 1 | // 07_grid-stride-double.cu 2 | #include 3 | 4 | void init(int *a, const int N) 5 | { 6 | for (int i = 0; i < N; i++) { 7 | a[i] = i; 8 | } 9 | } 10 | 11 | __global__ 12 | void doubleElements(int *a, const int N) 13 | { 14 | int idx = blockIdx.x * blockDim.x + threadIdx.x; 15 | int stride = gridDim.x * blockDim.x; 16 | for (int i = idx; i < N; i += stride) { 17 | a[i] *= 2; 18 | } 19 | } 20 | 21 | bool checkElementsAreDoubled(int *a, const int N) 22 | { 23 | for (int i = 0; i < N; i++) { 24 | if (a[i] != i * 2) 25 | return false; 26 | } 27 | 28 | return true; 29 | } 30 | 31 | int main() 32 | { 33 | int N = 1000; 34 | int *a; 35 | 36 | size_t size = N * sizeof(int); 37 | 38 | // Use 'cudaMallocManaged' to allocate pointer 'a' available 39 | // on both the host and the device. 40 | cudaMallocManaged(&a, size); 41 | 42 | init(a, N); 43 | 44 | size_t threads_per_block = 256; 45 | size_t number_of_blocks = (N + threads_per_block - 1) / threads_per_block; 46 | 47 | doubleElements<<>>(a, N); 48 | cudaDeviceSynchronize(); 49 | 50 | bool areDoubled = checkElementsAreDoubled(a, N); 51 | printf("All elements were doubled? %s\n", areDoubled ? 
"TRUE" : "FALSE"); 52 | 53 | // Use 'cudaFree' to free memory allocated with 'cudaMallocManaged' 54 | cudaFree(a); 55 | } -------------------------------------------------------------------------------- /CUDA/NVIDIA_Online_Training/Fundamentals_of_CUDA_C_C++/08_add-error-handling.cu: -------------------------------------------------------------------------------- 1 | // 08_add-error-handling.cu 2 | #include 3 | 4 | void init(int *a, const int N) 5 | { 6 | for (int i = 0; i < N; i++) { 7 | a[i] = i; 8 | } 9 | } 10 | 11 | __global__ 12 | void doubleElements(int *a, const int N) 13 | { 14 | int idx = blockIdx.x * blockDim.x + threadIdx.x; 15 | int stride = gridDim.x * blockDim.x; 16 | for (int i = idx; i < N + stride; i += stride) { 17 | a[i] *= 2; 18 | } 19 | } 20 | 21 | bool checkElementsAreDoubled(int *a, const int N) 22 | { 23 | for (int i = 0; i < N; i++) { 24 | if (a[i] != i * 2) 25 | return false; 26 | } 27 | 28 | return true; 29 | } 30 | 31 | int main() 32 | { 33 | int N = 1000; 34 | int *a; 35 | 36 | size_t size = N * sizeof(int); 37 | 38 | // Use 'cudaMallocManaged' to allocate pointer 'a' available 39 | // on both the host and the device. 40 | cudaMallocManaged(&a, size); 41 | 42 | init(a, N); 43 | 44 | size_t threads_per_block = 1024; 45 | size_t number_of_blocks = 32; 46 | 47 | cudaError_t syncErr, asyncErr; 48 | 49 | doubleElements<<>>(a, N); 50 | 51 | // catch errors for both the kernel launch above and any errors that 52 | // occur during the asynchronous 'doubleElements' kernel execution. 53 | syncErr = cudaGetLastError(); 54 | asyncErr = cudaDeviceSynchronize(); 55 | 56 | // print errors should they exist. 57 | if (syncErr != cudaSuccess) 58 | printf("Error(sync): %s\n", cudaGetErrorString(syncErr)); 59 | if (asyncErr != cudaSuccess) 60 | printf("Error(async): %s\n", cudaGetErrorString(asyncErr)); 61 | 62 | bool areDoubled = checkElementsAreDoubled(a, N); 63 | printf("All elements were doubled? %s\n", areDoubled ? 
"TRUE" : "FALSE"); 64 | 65 | // Use 'cudaFree' to free memory allocated with 'cudaMallocManaged' 66 | cudaFree(a); 67 | } -------------------------------------------------------------------------------- /CUDA/NVIDIA_Online_Training/Fundamentals_of_CUDA_C_C++/09_vector-add.cu: -------------------------------------------------------------------------------- 1 | // 09_vector-add.cu 2 | #include 3 | #include 4 | 5 | inline cudaError_t checkCuda(cudaError_t result) 6 | { 7 | if (result != cudaSuccess) { 8 | fprintf(stderr, "CUDA Runtime Error: %s\n", cudaGetErrorString(result)); 9 | assert(result == cudaSuccess); 10 | } 11 | return result; 12 | } 13 | 14 | void initWith(float num, float* a, const int N) 15 | { 16 | for (int i = 0; i < N; i++) { 17 | a[i] = num; 18 | } 19 | } 20 | 21 | __global__ 22 | void addVectorsInto(float* result, float* a, float* b, const int N) 23 | { 24 | int idx = threadIdx.x + blockIdx.x * blockDim.x; 25 | int stride = blockDim.x * gridDim.x; 26 | 27 | for (int i = idx; i < N; i += stride) { 28 | result[i] = a[i] + b[i]; 29 | } 30 | } 31 | 32 | void checkElementsAre(float target, float* array, const int N) 33 | { 34 | for (int i = 0; i < N; i++) { 35 | if (array[i] != target) { 36 | printf("FAIL: array[%d] - %0.0f does not equal %0.0f\n", i, array[i], target); 37 | exit(1); 38 | } 39 | } 40 | printf("SUCCESS! 
All values added correctly.\n"); 41 | } 42 | 43 | int main() 44 | { 45 | const int N = 2 << 20; 46 | size_t size = N * sizeof(float); 47 | 48 | float *a, *b, *c; 49 | 50 | checkCuda(cudaMallocManaged(&a, size)); 51 | checkCuda(cudaMallocManaged(&b, size)); 52 | checkCuda(cudaMallocManaged(&c, size)); 53 | 54 | initWith(3, a, N); 55 | initWith(4, b, N); 56 | initWith(0, c, N); 57 | 58 | size_t threadsPerBlock = 1024; 59 | size_t numberOfBlocks = 1; 60 | 61 | addVectorsInto<<<numberOfBlocks, threadsPerBlock>>>(c, a, b, N); 62 | 63 | checkCuda(cudaGetLastError()); 64 | checkCuda(cudaDeviceSynchronize()); 65 | 66 | checkElementsAre(7, c, N); 67 | 68 | checkCuda(cudaFree(a)); 69 | checkCuda(cudaFree(b)); 70 | checkCuda(cudaFree(c)); 71 | } -------------------------------------------------------------------------------- /CUDA/NVIDIA_Online_Training/Fundamentals_of_CUDA_C_C++/10_matrix-multiply-2d.cu: -------------------------------------------------------------------------------- 1 | // 10_matrix-multiply-2d.cu 2 | #include <stdio.h> 3 | 4 | #define N 64 5 | // Kernel: each thread computes one element c[row * N + col] of the N x N product; threads with row or col >= N do nothing. 6 | __global__ 7 | void matrixMulGPU(int* a, int* b, int* c) 8 | { 9 | int val = 0; 10 | 11 | int row = blockIdx.x * blockDim.x + threadIdx.x; 12 | int col = blockIdx.y * blockDim.y + threadIdx.y; 13 | 14 | if (row < N && col < N) { 15 | for (int k = 0; k < N; k++) { 16 | val += a[row * N + k] * b[k * N + col]; 17 | } 18 | c[row * N + col] = val; 19 | } 20 | } 21 | // Host reference: computes the same N x N product serially into c. 22 | void matrixMulCPU(int* a, int* b, int* c) 23 | { 24 | int val = 0; 25 | 26 | for (int row = 0; row < N; row++) { 27 | for (int col = 0; col < N; col++) { 28 | val = 0; 29 | for (int k = 0; k < N; k++) { 30 | val += a[row * N + k] * b[k * N + col]; 31 | } 32 | c[row * N + col] = val; 33 | } 34 | } 35 | } 36 | 37 | int main() 38 | { 39 | int *a, *b, *c_cpu, *c_gpu; 40 | 41 | size_t size = N * N * sizeof(int); // The number of bytes of an N x N matrix 42 | 43 | // Allocate Memory 44 | cudaMallocManaged(&a, size); 45 | cudaMallocManaged(&b, size); 46 | cudaMallocManaged(&c_cpu, size); 47 | 
cudaMallocManaged(&c_gpu, size); 48 | 49 | // Initialize Memory 50 | for (int row = 0; row < N; row++) { 51 | for (int col = 0; col < N; col++) { 52 | a[row * N + col] = row; 53 | b[row * N + col] = col + 2; 54 | c_cpu[row * N + col] = 0; 55 | c_gpu[row * N + col] = 0; 56 | } 57 | } 58 | 59 | // configuration 60 | dim3 threads_per_block(16, 16, 1); // A 16 x 16 block threads 61 | dim3 number_of_blocks((N + threads_per_block.x - 1) / threads_per_block.x, (N + threads_per_block.y - 1) / threads_per_block.y, 1); // ceil-div: just enough blocks to cover N in each dimension 62 | 63 | matrixMulGPU<<<number_of_blocks, threads_per_block>>>(a, b, c_gpu); 64 | 65 | cudaDeviceSynchronize(); 66 | 67 | // Call the CPU version to check 68 | matrixMulCPU(a, b, c_cpu); 69 | 70 | // Compare the two answers; both loops stop at the first mismatch 71 | bool error = false; 72 | for (int row = 0; row < N && !error; row++) { 73 | for (int col = 0; col < N; col++) { 74 | if (c_cpu[row * N + col] != c_gpu[row * N + col]) { 75 | printf("FOUND ERROR at c[%d][%d]\n", row, col); 76 | error = true; 77 | break; 78 | } 79 | } 80 | } 81 | 82 | if (!error) { 83 | printf("Success!\n"); 84 | } 85 | 86 | // Free all allocated memory 87 | cudaFree(a); 88 | cudaFree(b); 89 | cudaFree(c_cpu); 90 | cudaFree(c_gpu); 91 | } -------------------------------------------------------------------------------- /CUDA/NVIDIA_Online_Training/Fundamentals_of_CUDA_C_C++/11_get-device-properties.cu: -------------------------------------------------------------------------------- 1 | // 11_get-device-properties 2 | #include <stdio.h> 3 | 4 | int main() 5 | { 6 | /* 7 | * Device ID is required first to query the device. 8 | */ 9 | 10 | int deviceId; 11 | cudaGetDevice(&deviceId); 12 | 13 | cudaDeviceProp props; 14 | cudaGetDeviceProperties(&props, deviceId); 15 | 16 | /* 17 | * `props` now contains several properties about the current device. 
18 | */ 19 | 20 | int computeCapabilityMajor = props.major; 21 | int computeCapabilityMinor = props.minor; 22 | int multiProcessorCount = props.multiProcessorCount; 23 | int warpSize = props.warpSize; 24 | 25 | printf("Device ID: %d\nNumber of SMs: %d\nCompute Capability Major: %d\nCompute Capability Minor: %d\nWarp Size: %d\n", deviceId, multiProcessorCount, computeCapabilityMajor, computeCapabilityMinor, warpSize); 26 | } -------------------------------------------------------------------------------- /CUDA/NVIDIA_Online_Training/Fundamentals_of_CUDA_C_C++/12_page-faults.cu: -------------------------------------------------------------------------------- 1 | // 12_page-faults.cu 2 | // Kernel: grid-stride loop writing a[i] = i for i in [0, N); idx must combine threadIdx.x with the block offset so every element is covered. 3 | __global__ 4 | void deviceKernel(int *a, const int N) 5 | { 6 | int idx = threadIdx.x + blockIdx.x * blockDim.x; 7 | int stride = blockDim.x * gridDim.x; 8 | 9 | for (int i = idx; i < N; i += stride) { 10 | a[i] = i; 11 | } 12 | } 13 | // Host: writes a[i] = i for every i in [0, N). 14 | void hostFunction(int *a, const int N) 15 | { 16 | for (int i = 0; i < N; i++) { 17 | a[i] = i; 18 | } 19 | } 20 | 21 | int main() 22 | { 23 | int N = 2 << 24; 24 | size_t size = N * sizeof(int); 25 | 26 | int *a; 27 | cudaMallocManaged(&a, size); 28 | 29 | /* 30 | * Conduct experiments to learn more about the behavior of 31 | * `cudaMallocManaged`. 32 | * 33 | * What happens when unified memory is accessed only by the GPU? 34 | * deviceKernel<<<256, 256>>>(a, N); 35 | * cudaDeviceSynchronize(); 36 | * What happens when unified memory is accessed only by the CPU? 37 | * hostFunction(a, N); 38 | * cudaDeviceSynchronize(); 39 | * What happens when unified memory is accessed first by the GPU then the CPU? 40 | * deviceKernel<<<256, 256>>>(a, N); 41 | * cudaDeviceSynchronize(); 42 | * hostFunction(a, N); 43 | * What happens when unified memory is accessed first by the CPU then the GPU? 
44 | * hostFunction(a, N); 45 | * deviceKernel<<<256, 256>>>(a, N); 46 | * cudaDeviceSynchronize(); 47 | * 48 | * Hypothesize about UM behavior, page faulting specifically, before each 49 | * experiment, and then verify by running `nsys`. 50 | */ 51 | 52 | 53 | cudaFree(a); 54 | } -------------------------------------------------------------------------------- /CUDA/NVIDIA_Online_Training/Fundamentals_of_CUDA_C_C++/13_print-numbers.cu: -------------------------------------------------------------------------------- 1 | // 13_print-numbers.cu 2 | #include <stdio.h> 3 | // Kernel: prints its integer argument followed by a newline. 4 | __global__ 5 | void printNumber(int number) 6 | { 7 | printf("%d\n", number); 8 | } 9 | // Launches printNumber five times, each in a freshly created stream, then waits for the device. 10 | int main() 11 | { 12 | for (int i = 0; i < 5; i++) { 13 | cudaStream_t stream; 14 | cudaStreamCreate(&stream); 15 | printNumber<<<1, 1, 0, stream>>>(i); 16 | cudaStreamDestroy(stream); 17 | } 18 | 19 | cudaDeviceSynchronize(); 20 | } -------------------------------------------------------------------------------- /CUDA/NVIDIA_Online_Training/Fundamentals_of_CUDA_C_C++/14_n-body.cu: -------------------------------------------------------------------------------- 1 | #include <math.h> 2 | #include <stdio.h> 3 | #include <stdlib.h> 4 | #include "timer.h" 5 | #include "files.h" 6 | 7 | #define SOFTENING 1e-9f 8 | 9 | /* 10 | * Each body contains x, y, and z coordinate positions, 11 | * as well as velocities in the x, y, and z directions. 12 | */ 13 | 14 | typedef struct { float x, y, z, vx, vy, vz; } Body; 15 | 16 | /* 17 | * Calculate the gravitational impact of all bodies in the system 18 | * on all others. 
19 | */ 20 | 21 | __global__ 22 | void bodyForce(Body *p, float dt, int n) { 23 | int idx = threadIdx.x + blockIdx.x * blockDim.x; 24 | int stride = blockDim.x * gridDim.x; 25 | // Grid-stride loop: each thread handles bodies idx, idx + stride, ... below n. 26 | for (int i = idx; i < n; i += stride) { 27 | float Fx = 0.f, Fy = 0.f, Fz = 0.f; 28 | // Accumulate the contribution of every body j (including j == i) on body i. 29 | for (int j = 0; j < n; j++) { 30 | float dx = p[j].x - p[i].x; 31 | float dy = p[j].y - p[i].y; 32 | float dz = p[j].z - p[i].z; 33 | float distSqr = dx*dx + dy*dy + dz*dz + SOFTENING; 34 | float invDist = rsqrtf(distSqr); 35 | float invDist3 = invDist * invDist * invDist; 36 | 37 | Fx += dx * invDist3; 38 | Fy += dy * invDist3; 39 | Fz += dz * invDist3; 40 | } 41 | // Only the velocity fields are written here; positions are read-only in this kernel. 42 | p[i].vx += dt * Fx; 43 | p[i].vy += dt * Fy; 44 | p[i].vz += dt * Fz; 45 | } 46 | } 47 | // Kernel: adds velocity * dt to the position of each body, using the same grid-stride indexing as bodyForce. 48 | __global__ 49 | void intergratePosition(Body *p, float dt, int n) 50 | { 51 | int idx = threadIdx.x + blockIdx.x * blockDim.x; 52 | int stride = blockDim.x * gridDim.x; 53 | 54 | for (int i = idx ; i < n; i += stride) 55 | { 56 | p[i].x += p[i].vx * dt; 57 | p[i].y += p[i].vy * dt; 58 | p[i].z += p[i].vz * dt; 59 | } 60 | } 61 | // Entry point; when argv[1] is given it is used as a shift count: nBodies = 2 << atoi(argv[1]). 62 | int main(const int argc, const char **argv) 63 | { 64 | int nBodies = 2 << 11; 65 | if (argc > 1) 66 | nBodies = 2 << atoi(argv[1]); 67 | // NOTE(review): checkCuda is not defined in the visible part of this file -- presumably it comes from timer.h or files.h; confirm. 68 | int deviceId; 69 | checkCuda(cudaGetDevice(&deviceId)); 70 | cudaDeviceProp props; 71 | checkCuda(cudaGetDeviceProperties(&props, deviceId)); 72 | 73 | size_t threadsPerBlock = props.maxThreadsPerBlock; 74 | size_t numberOfBlocks = props.multiProcessorCount; 75 | 76 | // The assessment will pass hidden initialized values to check for correctness. 77 | // You should not make changes to these files, or else the assessment will not work. 
78 | const char *initialized_values; 79 | const char *solution_values; 80 | 81 | if (nBodies == 2 << 11) 82 | { 83 | initialized_values = "09-nbody/files/initialized_4096"; 84 | solution_values = "09-nbody/files/solution_4096"; 85 | } 86 | else 87 | { // nBodies == 2<<15 88 | initialized_values = "09-nbody/files/initialized_65536"; 89 | solution_values = "09-nbody/files/solution_65536"; 90 | } 91 | 92 | if (argc > 2) 93 | initialized_values = argv[2]; 94 | if (argc > 3) 95 | solution_values = argv[3]; 96 | 97 | const float dt = 0.01f; // Time step 98 | const int nIters = 10; // Simulation iterations 99 | 100 | int bytes = nBodies * sizeof(Body); 101 | float *buf; 102 | 103 | cudaMallocManaged(&buf, bytes); 104 | 105 | Body *p = (Body *)buf; 106 | 107 | cudaMemPrefetchAsync(buf, bytes, cudaCpuDeviceId); 108 | read_values_from_file(initialized_values, buf, bytes); 109 | 110 | double totalTime = 0.0; 111 | 112 | /* 113 | * This simulation will run for 10 cycles of time, calculating gravitational 114 | * interaction amongst bodies, and adjusting their positions to reflect. 115 | */ 116 | cudaMemPrefetchAsync(buf, bytes, deviceId); 117 | 118 | for (int iter = 0; iter < nIters; iter++) 119 | { 120 | StartTimer(); 121 | 122 | bodyForce<<<numberOfBlocks, threadsPerBlock>>>(p, dt, nBodies); // compute interbody forces 123 | intergratePosition<<<numberOfBlocks, threadsPerBlock>>>(p, dt, nBodies); 124 | 125 | cudaDeviceSynchronize(); 126 | 127 | const double tElapsed = GetTimer() / 1000.0; 128 | totalTime += tElapsed; 129 | } 130 | 131 | double avgTime = totalTime / (double)(nIters); 132 | float billionsOfOpsPerSecond = 1e-9 * nBodies * nBodies / avgTime; 133 | 134 | cudaMemPrefetchAsync(buf, bytes, cudaCpuDeviceId); 135 | write_values_to_file(solution_values, buf, bytes); 136 | 137 | // You will likely enjoy watching this value grow as you accelerate the application, 138 | // but beware that a failure to correctly synchronize the device might result in 139 | // unrealistically high values. 
140 | printf("%0.3f Billion Interactions / second\n", billionsOfOpsPerSecond); 141 | 142 | cudaFree(buf); 143 | } -------------------------------------------------------------------------------- /CUDA/NVIDIA_Online_Training/Fundamentals_of_CUDA_C_C++/15_vector-add-manual-alloc.cu: -------------------------------------------------------------------------------- 1 | // 15_vector-add-manual-alloc.cu 2 | #include <stdio.h> 3 | #include <assert.h> 4 | // Prints the CUDA error string to stderr and asserts when result is not cudaSuccess; returns result unchanged. 5 | inline cudaError_t checkCuda(cudaError_t result) 6 | { 7 | if (result != cudaSuccess) { 8 | fprintf(stderr, "CUDA Runtime Error: %s\n", cudaGetErrorString(result)); 9 | assert(result == cudaSuccess); 10 | } 11 | return result; 12 | } 13 | // Kernel: grid-stride loop writing a[i] = num for i in [0, N). 14 | __global__ 15 | void initWith(float num, float *a, int N) 16 | { 17 | int idx = threadIdx.x + blockIdx.x * blockDim.x; 18 | int stride = blockDim.x * gridDim.x; 19 | 20 | for (int i = idx; i < N; i += stride) 21 | { 22 | a[i] = num; 23 | } 24 | } 25 | // Kernel: grid-stride loop storing a[i] + b[i] into result[i] for i in [0, N). 26 | __global__ 27 | void addVectorsInto(float* result, float* a, float* b, const int N) 28 | { 29 | int idx = threadIdx.x + blockIdx.x * blockDim.x; 30 | int stride = blockDim.x * gridDim.x; 31 | 32 | for (int i = idx; i < N; i += stride) { 33 | result[i] = a[i] + b[i]; 34 | } 35 | } 36 | // Host: prints a FAIL line and exits with status 1 at the first element that differs from target. 37 | void checkElementsAre(float target, float* array, const int N) 38 | { 39 | for (int i = 0; i < N; i++) { 40 | if (array[i] != target) { 41 | printf("FAIL: array[%d] - %0.0f does not equal %0.0f\n", i, array[i], target); 42 | exit(1); 43 | } 44 | } 45 | printf("SUCCESS! 
All values added correctly.\n"); 46 | } 47 | 48 | int main() 49 | { 50 | const int N = 2 << 20; 51 | size_t size = N * sizeof(float); 52 | 53 | int deviceId; 54 | checkCuda(cudaGetDevice(&deviceId)); 55 | 56 | cudaDeviceProp props; 57 | checkCuda(cudaGetDeviceProperties(&props, deviceId)); 58 | 59 | float *a, *b, *c, *h_c; 60 | 61 | checkCuda(cudaMalloc(&a, size)); 62 | checkCuda(cudaMalloc(&b, size)); 63 | checkCuda(cudaMalloc(&c, size)); 64 | checkCuda(cudaMallocHost(&h_c, size)); 65 | 66 | size_t threadsPerBlock = props.maxThreadsPerBlock; 67 | size_t numberOfBlocks = props.multiProcessorCount; 68 | 69 | cudaStream_t stream1, stream2, stream3; 70 | checkCuda(cudaStreamCreate(&stream1)); 71 | checkCuda(cudaStreamCreate(&stream2)); 72 | checkCuda(cudaStreamCreate(&stream3)); 73 | 74 | initWith<<<numberOfBlocks, threadsPerBlock, 0, stream1>>>(3, a, N); 75 | initWith<<<numberOfBlocks, threadsPerBlock, 0, stream2>>>(4, b, N); 76 | initWith<<<numberOfBlocks, threadsPerBlock, 0, stream3>>>(0, c, N); 77 | // NOTE(review): the add is launched in the default stream; this relies on legacy default-stream semantics to order it after stream1..3 -- confirm the build does not use per-thread default streams. 78 | addVectorsInto<<<numberOfBlocks, threadsPerBlock>>>(c, a, b, N); 79 | checkCuda(cudaGetLastError()); // surface launch errors from the kernels above 80 | checkCuda(cudaMemcpy(h_c, c, size, cudaMemcpyDeviceToHost)); 81 | 82 | checkElementsAre(7, h_c, N); 83 | 84 | checkCuda(cudaStreamDestroy(stream1)); 85 | checkCuda(cudaStreamDestroy(stream2)); 86 | checkCuda(cudaStreamDestroy(stream3)); 87 | 88 | checkCuda(cudaFree(a)); 89 | checkCuda(cudaFree(b)); 90 | checkCuda(cudaFree(c)); 91 | checkCuda(cudaFreeHost(h_c)); 92 | } -------------------------------------------------------------------------------- /CUDA/NVIDIA_Online_Training/Fundamentals_of_CUDA_C_C++/16_vector-add-overlap-xfer.cu: -------------------------------------------------------------------------------- 1 | // 16_vector-add-overlap-xfer.cu 2 | #include <stdio.h> 3 | #include <assert.h> 4 | // Prints the CUDA error string to stderr and asserts when result is not cudaSuccess; returns result unchanged. 5 | inline cudaError_t checkCuda(cudaError_t result) 6 | { 7 | if (result != cudaSuccess) { 8 | fprintf(stderr, "CUDA Runtime Error: %s\n", cudaGetErrorString(result)); 9 | assert(result == cudaSuccess); 10 | } 11 | return result; 12 | } 13 | 14 | __global__ 15 | void initWith(float num, float *a, int N) 16 | { 17 | int idx = threadIdx.x + blockIdx.x * blockDim.x; 18 | int stride = 
blockDim.x * gridDim.x; 19 | 20 | for (int i = idx; i < N; i += stride) 21 | { 22 | a[i] = num; 23 | } 24 | } 25 | // Kernel: grid-stride loop storing a[i] + b[i] into result[i] for i in [0, N). 26 | __global__ 27 | void addVectorsInto(float* result, float* a, float* b, const int N) 28 | { 29 | int idx = threadIdx.x + blockIdx.x * blockDim.x; 30 | int stride = blockDim.x * gridDim.x; 31 | 32 | for (int i = idx; i < N; i += stride) { 33 | result[i] = a[i] + b[i]; 34 | } 35 | } 36 | // Host: prints a FAIL line and exits with status 1 at the first element that differs from target. 37 | void checkElementsAre(float target, float* array, const int N) 38 | { 39 | for (int i = 0; i < N; i++) { 40 | if (array[i] != target) { 41 | printf("FAIL: array[%d] - %0.0f does not equal %0.0f\n", i, array[i], target); 42 | exit(1); 43 | } 44 | } 45 | printf("SUCCESS! All values added correctly.\n"); 46 | } 47 | 48 | int main() 49 | { 50 | const int N = 2 << 20; 51 | size_t size = N * sizeof(float); 52 | 53 | int deviceId; 54 | checkCuda(cudaGetDevice(&deviceId)); 55 | 56 | cudaDeviceProp props; 57 | checkCuda(cudaGetDeviceProperties(&props, deviceId)); 58 | 59 | float *a, *b, *c, *h_c; 60 | 61 | checkCuda(cudaMalloc(&a, size)); 62 | checkCuda(cudaMalloc(&b, size)); 63 | checkCuda(cudaMalloc(&c, size)); 64 | checkCuda(cudaMallocHost(&h_c, size)); 65 | 66 | size_t threadsPerBlock = props.maxThreadsPerBlock; 67 | size_t numberOfBlocks = props.multiProcessorCount; 68 | 69 | cudaStream_t stream1, stream2, stream3; 70 | checkCuda(cudaStreamCreate(&stream1)); 71 | checkCuda(cudaStreamCreate(&stream2)); 72 | checkCuda(cudaStreamCreate(&stream3)); 73 | 74 | initWith<<<numberOfBlocks, threadsPerBlock, 0, stream1>>>(3, a, N); 75 | initWith<<<numberOfBlocks, threadsPerBlock, 0, stream2>>>(4, b, N); 76 | initWith<<<numberOfBlocks, threadsPerBlock, 0, stream3>>>(0, c, N); 77 | checkCuda(cudaDeviceSynchronize()); // the per-chunk streams below do not wait on stream1..3, so finish initialization first 78 | for (int i = 0; i < 4; ++i) { 79 | cudaStream_t stream; 80 | checkCuda(cudaStreamCreate(&stream)); 81 | 82 | addVectorsInto<<<numberOfBlocks, threadsPerBlock, 0, stream>>>(&c[i * N / 4], &a[i * N / 4], &b[i * N / 4], N / 4); 83 | checkCuda(cudaMemcpyAsync(&h_c[i * N / 4], &c[i * N / 4], size / 4, cudaMemcpyDeviceToHost, stream)); 84 | checkCuda(cudaStreamDestroy(stream)); 85 | } 86 | checkCuda(cudaDeviceSynchronize()); 87 | 88 | checkElementsAre(7, h_c, N); 89 | 90 | 
checkCuda(cudaStreamDestroy(stream1)); 91 | checkCuda(cudaStreamDestroy(stream2)); 92 | checkCuda(cudaStreamDestroy(stream3)); 93 | 94 | checkCuda(cudaFree(a)); 95 | checkCuda(cudaFree(b)); 96 | checkCuda(cudaFree(c)); 97 | checkCuda(cudaFreeHost(h_c)); 98 | } -------------------------------------------------------------------------------- /CUDA/StreamsAndEvents/asyncAPI.cu: -------------------------------------------------------------------------------- 1 | /***************************************************************************** 2 | * File: asyncAPI.cu 3 | * Description: This is an example of using CUDA events to control asynchronous 4 | * work launched on the GPU. In this example, asynchronous copies 5 | * and an asynchronous kernel are used. A CUDA event is used to 6 | * determine when that work has completed. 7 | * 8 | * Compile: nvcc -o asyncAPI asyncAPI.cu -I.. 9 | * Run: ./asyncAPI 10 | *****************************************************************************/ 11 | #include <stdio.h> 12 | #include <string.h> 13 | #include <cuda_runtime.h> 14 | #include "common/common.h" 15 | // Kernel: adds value to g_data[idx]; there is no bounds check, so the launch must cover exactly the array length. 16 | __global__ 17 | void kernel(float* g_data, float value) 18 | { 19 | int idx = blockDim.x * blockIdx.x + threadIdx.x; 20 | g_data[idx] = g_data[idx] + value; 21 | } 22 | // Host: returns false (after printing the first mismatch) unless data[i] == x for every i in [0, N). 23 | bool checkResult(float* data, const int N, const float x) 24 | { 25 | for (int i = 0; i < N; i++) { 26 | if (data[i] != x) { 27 | printf("Error! 
data[%d] = %f, ref = %f\n", i, data[i], x); 28 | return false; 29 | } 30 | } 31 | 32 | return true; 33 | } 34 | 35 | int main(int argc, char** argv) 36 | { 37 | int dev = 0; 38 | cudaDeviceProp deviceProp; 39 | CUDA_CHECK(cudaGetDeviceProperties(&deviceProp, dev)); 40 | printf("> Using device %d: %s\n", dev, deviceProp.name); 41 | CUDA_CHECK(cudaSetDevice(dev)); 42 | 43 | int num = 1 << 24; 44 | int nBytes = num * sizeof(float); 45 | float value = 10.0f; 46 | 47 | // allocate host memory 48 | float *h_a; 49 | CUDA_CHECK(cudaMallocHost((void**)&h_a, nBytes)); 50 | memset(h_a, 0, nBytes); 51 | 52 | // allocate device memory 53 | float *d_a; 54 | CUDA_CHECK(cudaMalloc((void**)&d_a, nBytes)); 55 | CUDA_CHECK(cudaMemset(d_a, 255, nBytes)); 56 | 57 | // set kernel launch configuration 58 | dim3 block = dim3(512); 59 | dim3 grid = dim3((num + block.x - 1) / block.x); 60 | 61 | // create cuda event handles 62 | cudaEvent_t stop; 63 | CUDA_CHECK(cudaEventCreate(&stop)); 64 | 65 | // asynchronously issue work to the GPU (all to stream 0) 66 | CUDA_CHECK(cudaMemcpyAsync(d_a, h_a, nBytes, cudaMemcpyHostToDevice)); 67 | kernel<<<grid, block>>>(d_a, value); 68 | CUDA_CHECK(cudaMemcpyAsync(h_a, d_a, nBytes, cudaMemcpyDeviceToHost)); 69 | CUDA_CHECK(cudaEventRecord(stop)); 70 | 71 | // have CPU do some work while waiting for stage 1 to finish 72 | unsigned long int counter = 0; 73 | while (cudaEventQuery(stop) == cudaErrorNotReady) 74 | counter++; 75 | 76 | // print the cpu and gpu times 77 | printf("CPU executed %lu iterations while waiting for GPU to finish\n", counter); 78 | 79 | // check the output for correctness 80 | bool results = checkResult(h_a, num, value); 81 | 82 | // release resources 83 | CUDA_CHECK(cudaEventDestroy(stop)); 84 | CUDA_CHECK(cudaFreeHost(h_a)); 85 | CUDA_CHECK(cudaFree(d_a)); 86 | CUDA_CHECK(cudaDeviceReset()); 87 | // report a failed verification through the exit status instead of discarding it 88 | return results ? 0 : 1; 89 | } -------------------------------------------------------------------------------- /CUDA/StreamsAndEvents/simpleCallback.cu: 
-------------------------------------------------------------------------------- 1 | /***************************************************************************** 2 | * File: simpleCallback.cu 3 | * Description: This is an example of using CUDA callbacks to trigger work on 4 | * the host after the completion of asynchronous work on the device. 5 | * In this example, NSTREAM CUDA streams are created and 4 kernels 6 | * are launched asynchronously in each. Then, a callback is added 7 | * at the completion of those asynchronous kernels that prints 8 | * diagnostic information. 9 | * 10 | * Compile: nvcc -o simpleCallback simpleCallback.cu -I.. 11 | * Run: ./simpleCallback 12 | *****************************************************************************/ 13 | #include <stdio.h> 14 | #include <stdlib.h> 15 | #include <cuda_runtime.h> 16 | #include "common/common.h" 17 | 18 | #define N 100000 19 | #define NSTREAM 4 20 | // Stream callback: prints the int that data points to (main passes the address of the stream's index). 21 | void CUDART_CB my_callback(cudaStream_t stream, cudaError_t status, void* data) 22 | { 23 | printf("callback from stream %d\n", *((int*)data)); 24 | } 25 | // kernel_1..kernel_4: identical dummy loops of N iterations; the accumulated sum is never stored anywhere. 26 | __global__ void kernel_1() 27 | { 28 | double sum = 0.0; 29 | for (int i = 0; i < N; i++) { 30 | sum = sum + tan(0.1) * tan(0.1); 31 | } 32 | } 33 | 34 | __global__ void kernel_2() 35 | { 36 | double sum = 0.0; 37 | for (int i = 0; i < N; i++) { 38 | sum = sum + tan(0.1) * tan(0.1); 39 | } 40 | } 41 | 42 | __global__ void kernel_3() 43 | { 44 | double sum = 0.0; 45 | for (int i = 0; i < N; i++) { 46 | sum = sum + tan(0.1) * tan(0.1); 47 | } 48 | } 49 | 50 | __global__ void kernel_4() 51 | { 52 | double sum = 0.0; 53 | for (int i = 0; i < N; i++) { 54 | sum = sum + tan(0.1) * tan(0.1); 55 | } 56 | } 57 | 58 | int main(int argc, char** argv) 59 | { 60 | int dev = 0; 61 | cudaDeviceProp deviceProp; 62 | CUDA_CHECK(cudaGetDeviceProperties(&deviceProp, dev)); 63 | printf("> Using device %d: %s\n", dev, deviceProp.name); 64 | CUDA_CHECK(cudaSetDevice(dev)); 65 | 66 | // check if device support hyper-Q 67 | if (deviceProp.major 
< 3 || (deviceProp.major == 3 && deviceProp.minor < 5)) { 68 | if (deviceProp.concurrentKernels == 0) { 69 | printf("> GPU does not support concurrent kernel execution (SM 3.5 or higher required)\n"); 70 | printf("> CUDA kernel runs will be serialized\n"); 71 | } 72 | else { 73 | printf("> GPU does not support HyperQ\n"); 74 | printf("> CUDA kernel runs will have limited concurrency\n"); 75 | } 76 | } 77 | 78 | printf("> Compute Capability %d.%d hardware with %d multi-processors\n", deviceProp.major, 79 | deviceProp.minor, deviceProp.multiProcessorCount); 80 | 81 | // set up max connection (const: a string literal must not be bound to a non-const char*) 82 | const char* iname = "CUDA_DEVICE_MAX_CONNECTIONS"; 83 | _putenv_s(iname, "8"); 84 | char* ivalue = getenv(iname); 85 | printf("> %s = %s\n", iname, ivalue); 86 | printf("> with streams = %d\n", NSTREAM); 87 | 88 | // allocate and initialize an array of stream handles 89 | cudaStream_t *streams = (cudaStream_t*)malloc(NSTREAM * sizeof(cudaStream_t)); 90 | for (int i = 0; i < NSTREAM; i++) { 91 | CUDA_CHECK(cudaStreamCreate(&streams[i])); 92 | } 93 | 94 | dim3 block(1); 95 | dim3 grid(1); 96 | cudaEvent_t start, stop; 97 | CUDA_CHECK(cudaEventCreate(&start)); 98 | CUDA_CHECK(cudaEventCreate(&stop)); 99 | 100 | int stream_ids[NSTREAM]; 101 | // depth-first submission: all four kernels plus the callback are queued in stream i before moving to stream i + 1 102 | CUDA_CHECK(cudaEventRecord(start, 0)); 103 | for (int i = 0; i < NSTREAM; i++) { 104 | stream_ids[i] = i; 105 | kernel_1<<<grid, block, 0, streams[i]>>>(); 106 | kernel_2<<<grid, block, 0, streams[i]>>>(); 107 | kernel_3<<<grid, block, 0, streams[i]>>>(); 108 | kernel_4<<<grid, block, 0, streams[i]>>>(); 109 | CUDA_CHECK(cudaStreamAddCallback(streams[i], my_callback, (void*)(stream_ids + i), 0)); 110 | } 111 | CUDA_CHECK(cudaEventRecord(stop, 0)); 112 | CUDA_CHECK(cudaEventSynchronize(stop)); 113 | 114 | float elapsed_time; 115 | CUDA_CHECK(cudaEventElapsedTime(&elapsed_time, start, stop)); 116 | printf("Measured time for parallel execution = %.3fs\n", elapsed_time / 1000.f); 117 | 118 | // release all stream 119 | for (int i = 0; i < NSTREAM; i++) { 120 | CUDA_CHECK(cudaStreamDestroy(streams[i])); 121 | } 122 | free(streams); 123 | CUDA_CHECK(cudaEventDestroy(start)); CUDA_CHECK(cudaEventDestroy(stop)); // release the timing events as well 124 | 
CUDA_CHECK(cudaDeviceReset()); 125 | return 0; 126 | } -------------------------------------------------------------------------------- /CUDA/StreamsAndEvents/simpleHyperQBreadth.cu: -------------------------------------------------------------------------------- 1 | /***************************************************************************** 2 | * File: simpleHyperQBreadth.cu 3 | * Description: This example demonstrates that submitting work to a CUDA 4 | * Stream in breadth-first order prevents false-dependencies from 5 | * reducing the parallelism of an application. kernel_1, kernel_2, 6 | * kernel_3, and kernel_4 simply implement identical, dummy computation. 7 | * Separate kernels are used to make the scheduling of these kernels 8 | * simpler to visualize in the Visual Profiler. 9 | * 10 | * Compile: nvcc -o simpleHyperQBreadth simpleHyperQBreadth.cu -I.. 11 | * Run: ./simpleHyperQBreadth 12 | *****************************************************************************/ 13 | #include <stdio.h> 14 | #include <stdlib.h> 15 | #include <cuda_runtime.h> 16 | #include "common/common.h" 17 | 18 | #define N 1000 19 | #define NSTREAM 4 20 | // kernel_1..kernel_4: identical dummy loops; every thread prints the running sum on each of the N iterations. 21 | __global__ 22 | void kernel_1() 23 | { 24 | double sum = 0.0; 25 | 26 | for (int i = 0; i < N; i++) { 27 | sum = sum + tan(0.1) * tan(0.1); 28 | printf("%f\n", sum); 29 | } 30 | } 31 | 32 | __global__ 33 | void kernel_2() 34 | { 35 | double sum = 0.0; 36 | 37 | for (int i = 0; i < N; i++) { 38 | sum = sum + tan(0.1) * tan(0.1); 39 | printf("%f\n", sum); 40 | } 41 | } 42 | 43 | __global__ 44 | void kernel_3() 45 | { 46 | double sum = 0.0; 47 | 48 | for (int i = 0; i < N; i++) { 49 | sum = sum + tan(0.1) * tan(0.1); 50 | printf("%f\n", sum); 51 | } 52 | } 53 | 54 | __global__ 55 | void kernel_4() 56 | { 57 | double sum = 0.0; 58 | 59 | for (int i = 0; i < N; i++) { 60 | sum = sum + tan(0.1) * tan(0.1); 61 | printf("%f\n", sum); 62 | } 63 | } 64 | 65 | int main(int argc, char** argv) 66 | { 67 | int n_streams = NSTREAM; 68 | int isize = 1; 69 | int iblock = 1; 70 
| int bigcase = 0; 71 | 72 | // get argument from command line: argv[1] = number of streams, argv[2] = 1 selects the larger launch 73 | if (argc > 1) 74 | n_streams = atoi(argv[1]); 75 | if (argc > 2) 76 | bigcase = atoi(argv[2]); 77 | 78 | float elapsed_time; 79 | 80 | // set up max connection (const: a string literal must not be bound to a non-const char*) 81 | const char* iname = "CUDA_DEVICE_MAX_CONNECTIONS"; 82 | _putenv_s(iname, "32"); 83 | char* ivalue = getenv(iname); 84 | printf("%s = %s\n", iname, ivalue); 85 | 86 | int dev = 0; 87 | cudaDeviceProp deviceProp; 88 | CUDA_CHECK(cudaGetDeviceProperties(&deviceProp, dev)); 89 | printf("> Using Device %d: %s with num_streams=%d\n", dev, deviceProp.name, n_streams); 90 | CUDA_CHECK(cudaSetDevice(dev)); 91 | 92 | // check if device support hyper-Q 93 | if (deviceProp.major < 3 || (deviceProp.major == 3 && deviceProp.minor < 5)) { 94 | if (deviceProp.concurrentKernels == 0) { 95 | printf("> GPU does not support concurrent kernel execution (SM 3.5 or higher required)\n"); 96 | printf("> CUDA kernel runs will be serialized\n"); 97 | } 98 | else { 99 | printf("> GPU does not support HyperQ\n"); 100 | printf("> CUDA kernel runs will have limited concurrency\n"); 101 | } 102 | } 103 | 104 | printf("> Compute Capability %d.%d hardware with %d multi-processors\n", deviceProp.major, 105 | deviceProp.minor, deviceProp.multiProcessorCount); 106 | 107 | // Allocate and initialize an array of stream handles 108 | cudaStream_t *streams = (cudaStream_t*)malloc(n_streams * sizeof(cudaStream_t)); 109 | 110 | for (int i = 0; i < n_streams; i++) { 111 | CUDA_CHECK(cudaStreamCreate(&(streams[i]))); 112 | } 113 | 114 | // run kernel with more threads 115 | if (bigcase == 1) { 116 | iblock = 512; 117 | isize = 1 << 12; 118 | } 119 | 120 | // setup execution configuration 121 | dim3 block(iblock); 122 | dim3 grid(isize / iblock); 123 | printf("> grid %d block %d\n", grid.x, block.x); 124 | 125 | // create events 126 | cudaEvent_t start, stop; 127 | CUDA_CHECK(cudaEventCreate(&start)); 128 | CUDA_CHECK(cudaEventCreate(&stop)); 129 | 130 | // record start event 131 | 
CUDA_CHECK(cudaEventRecord(start, 0)); 132 | 133 | // dispatch job with breadth first ordering: kernel_k is queued in every stream before kernel_k+1 is queued in any 134 | for (int i = 0; i < n_streams; i++) 135 | kernel_1<<<grid, block, 0, streams[i]>>>(); 136 | for (int i = 0; i < n_streams; i++) 137 | kernel_2<<<grid, block, 0, streams[i]>>>(); 138 | for (int i = 0; i < n_streams; i++) 139 | kernel_3<<<grid, block, 0, streams[i]>>>(); 140 | for (int i = 0; i < n_streams; i++) 141 | kernel_4<<<grid, block, 0, streams[i]>>>(); 142 | 143 | // record stop event 144 | CUDA_CHECK(cudaEventRecord(stop, 0)); 145 | CUDA_CHECK(cudaEventSynchronize(stop)); 146 | 147 | // calculate elapsed time 148 | CUDA_CHECK(cudaEventElapsedTime(&elapsed_time, start, stop)); 149 | printf("Measured time for parallel execution = %fs\n", elapsed_time / 1000.f); 150 | 151 | // release all streams 152 | for (int i = 0; i < n_streams; i++) { 153 | CUDA_CHECK(cudaStreamDestroy(streams[i])); 154 | } 155 | free(streams); 156 | 157 | // destroy events 158 | CUDA_CHECK(cudaEventDestroy(start)); 159 | CUDA_CHECK(cudaEventDestroy(stop)); 160 | 161 | // reset device 162 | CUDA_CHECK(cudaDeviceReset()); 163 | 164 | return 0; 165 | } -------------------------------------------------------------------------------- /CUDA/StreamsAndEvents/simpleHyperQDepth.cu: -------------------------------------------------------------------------------- 1 | /***************************************************************************** 2 | * File: simpleHyperQDepth.cu 3 | * Description: This example demonstrates submitting work to a CUDA 4 | * Stream in depth-first order. Work submission in depth-first order 5 | * may introduce false-dependencies between unrelated tasks in 6 | * different CUDA streams, limiting the parallelism of a CUDA application. 7 | * kernel_1, kernel_2, kernel_3, and kernel_4 simply implement 8 | * identical, dummy computation. Separate kernels are used to make 9 | * the scheduling of these kernels simpler to visualize in the Visual 10 | * Profiler. 11 | * 12 | * Compile: nvcc -o simpleHyperQDepth simpleHyperQDepth.cu -I.. 
13 | * Run: ./simpleHyperQDepth 14 | *****************************************************************************/ 15 | #include 16 | #include 17 | #include 18 | #include "common/common.h" 19 | 20 | #define N 1000 21 | #define NSTREAM 4 22 | 23 | __global__ 24 | void kernel_1() 25 | { 26 | double sum = 0.0; 27 | 28 | for (int i = 0; i < N; i++) { 29 | sum = sum + tan(0.1) * tan(0.1); 30 | printf("%f\n", sum); 31 | } 32 | } 33 | 34 | __global__ 35 | void kernel_2() 36 | { 37 | double sum = 0.0; 38 | 39 | for (int i = 0; i < N; i++) { 40 | sum = sum + tan(0.1) * tan(0.1); 41 | printf("%f\n", sum); 42 | } 43 | } 44 | 45 | __global__ 46 | void kernel_3() 47 | { 48 | double sum = 0.0; 49 | 50 | for (int i = 0; i < N; i++) { 51 | sum = sum + tan(0.1) * tan(0.1); 52 | printf("%f\n", sum); 53 | } 54 | } 55 | 56 | __global__ 57 | void kernel_4() 58 | { 59 | double sum = 0.0; 60 | 61 | for (int i = 0; i < N; i++) { 62 | sum = sum + tan(0.1) * tan(0.1); 63 | printf("%f\n", sum); 64 | } 65 | } 66 | 67 | int main(int argc, char** argv) 68 | { 69 | int n_streams = NSTREAM; 70 | int isize = 1; 71 | int iblock = 1; 72 | int bigcase = 0; 73 | 74 | // get argument from command line 75 | if (argc > 1) 76 | n_streams = atoi(argv[1]); 77 | if (argc > 2) 78 | bigcase = atoi(argv[2]); 79 | 80 | float elapsed_time; 81 | 82 | // set up max connection 83 | char* iname = "CUDA_DEVICE_MAX_CONNECTIONS"; 84 | _putenv_s(iname, "32"); 85 | char* ivalue = getenv(iname); 86 | printf("%s = %s\n", iname, ivalue); 87 | 88 | int dev = 0; 89 | cudaDeviceProp deviceProp; 90 | CUDA_CHECK(cudaGetDeviceProperties(&deviceProp, dev)); 91 | printf("> Using Device %d: %s with num_streams=%d\n", dev, deviceProp.name, n_streams); 92 | CUDA_CHECK(cudaSetDevice(dev)); 93 | 94 | // check if device support hyper-Q 95 | if (deviceProp.major < 3 || (deviceProp.major == 3 && deviceProp.minor < 5)) { 96 | if (deviceProp.concurrentKernels == 0) { 97 | printf("> GPU does not support concurrent kernel execution (SM 3.5 or 
higher required)\n"); 98 | printf("> CUDA kernel runs will be serialized\n"); 99 | } 100 | else { 101 | printf("> GPU does not support HyperQ\n"); 102 | printf("> CUDA kernel runs will have limited concurrency\n"); 103 | } 104 | } 105 | 106 | printf("> Compute Capability %d.%d hardware with %d multi-processors\n", deviceProp.major, 107 | deviceProp.minor, deviceProp.multiProcessorCount); 108 | 109 | // Allocate and initialize an array of stream handles 110 | cudaStream_t *streams = (cudaStream_t*)malloc(n_streams * sizeof(cudaStream_t)); 111 | 112 | for (int i = 0; i < n_streams; i++) { 113 | CUDA_CHECK(cudaStreamCreate(&(streams[i]))); 114 | } 115 | 116 | // run kernel with more threads 117 | if (bigcase == 1) { 118 | iblock = 512; 119 | isize = 1 << 12; 120 | } 121 | 122 | // setup execution configuration: isize threads in total, iblock threads per block 123 | dim3 block(iblock); 124 | dim3 grid(isize / iblock); 125 | printf("> grid %d block %d\n", grid.x, block.x); 126 | 127 | // create events 128 | cudaEvent_t start, stop; 129 | CUDA_CHECK(cudaEventCreate(&start)); 130 | CUDA_CHECK(cudaEventCreate(&stop)); 131 | 132 | // record start event 133 | CUDA_CHECK(cudaEventRecord(start, 0)); 134 | 135 | // dispatch job with depth first ordering: queue all four kernels on stream i before moving on to stream i+1 136 | for (int i = 0;i < n_streams; i++) { 137 | kernel_1<<<grid, block, 0, streams[i]>>>(); 138 | kernel_2<<<grid, block, 0, streams[i]>>>(); 139 | kernel_3<<<grid, block, 0, streams[i]>>>(); 140 | kernel_4<<<grid, block, 0, streams[i]>>>(); 141 | } 142 | 143 | // record stop event 144 | CUDA_CHECK(cudaEventRecord(stop, 0)); 145 | CUDA_CHECK(cudaEventSynchronize(stop)); 146 | 147 | // calculate elapsed time (cudaEventElapsedTime reports milliseconds, printed as seconds) 148 | CUDA_CHECK(cudaEventElapsedTime(&elapsed_time, start, stop)); 149 | printf("Measured time for parallel execution = %fs\n", elapsed_time / 1000.f); 150 | 151 | // release all streams 152 | for (int i = 0; i < n_streams; i++) { 153 | CUDA_CHECK(cudaStreamDestroy(streams[i])); 154 | } 155 | free(streams); 156 | 157 | // destroy events 158 | CUDA_CHECK(cudaEventDestroy(start)); 159 | CUDA_CHECK(cudaEventDestroy(stop)); 160 | 161 | // reset device 162 | 
CUDA_CHECK(cudaDeviceReset()); 163 | 164 | return 0; 165 | } -------------------------------------------------------------------------------- /CUDA/UnifiedMemory/matrixAddWithUnifiedMemory.cu: -------------------------------------------------------------------------------- 1 | /***************************************************************************** 2 | * File: matrixAddWithUnifiedMemory.cu 3 | * Description: This is an example to demonstrates the use of CUDA managed memory 4 | * to implement matrix addition. In this example, arbitrary pointers 5 | * can be dereferenced on the host and device. 6 | * CUDA will automatically manage the transfer of data to and from 7 | * the GPU as needed by the application. 8 | * 9 | * There is no need for the programmer to use cudaMemcpy, 10 | * cudaHostGetDevicePointer, or any other CUDA API involved with 11 | * explicitly transferring data. 12 | * 13 | * 14 | * Compile: nvcc -O3 -o managed matrixAddWithUnifiedMemory.cu -I.. 15 | * Run: ./managed 16 | * [n]: power to set size of input matrix (default: 12) 17 | *****************************************************************************/ 18 | #include 19 | #include 20 | #include "common/common.h" 21 | 22 | void initialData(float* in, const int size) 23 | { 24 | for (int i = 0; i < size; i++) 25 | in[i] = (rand() & 0xFF) / 10.f; 26 | } 27 | 28 | void sumMatrixOnHost(float *A, float *B, float *C, const int nx, const int ny) 29 | { 30 | float* ia = A; 31 | float* ib = B; 32 | float* ic = C; 33 | 34 | for (int iy = 0; iy < ny; iy++) { 35 | for (int ix = 0; ix < nx; ix++) { 36 | ic[ix] = ia[ix] + ib[ix]; 37 | } 38 | 39 | ia += nx; 40 | ib += nx; 41 | ic += nx; 42 | } 43 | } 44 | 45 | void checkResult(float* hostRef, float* gpuRef, const int size) 46 | { 47 | double epsilon = 1.0e-8; 48 | 49 | for (int i = 0; i < size; i++) { 50 | if (abs(hostRef[i] - gpuRef[i]) > epsilon) { 51 | printf("different on %dth element: host %f gpu %f\n", i, hostRef[i], gpuRef[i]); 52 | break; 53 | } 54 | 
} 55 | } 56 | // One thread per matrix element: (ix, iy) comes from the 2D grid and is guarded against the nx x ny bounds 57 | __global__ 58 | void sumMatrixOnGPU(float* A, float* B, float* C, const int nx, const int ny) 59 | { 60 | unsigned int ix = blockDim.x * blockIdx.x + threadIdx.x; 61 | unsigned int iy = blockDim.y * blockIdx.y + threadIdx.y; 62 | unsigned int idx = iy * nx + ix; 63 | 64 | if (ix < nx && iy < ny) 65 | C[idx] = A[idx] + B[idx]; 66 | } 67 | 68 | int main(int argc, char** argv) 69 | { 70 | // setup device 71 | int dev = 0; 72 | cudaDeviceProp deviceProp; 73 | CUDA_CHECK(cudaGetDeviceProperties(&deviceProp, dev)); 74 | printf("Starting sumMatrix at device %d: %s\n", dev, deviceProp.name); 75 | CUDA_CHECK(cudaSetDevice(dev)); 76 | 77 | // setup size of matrix 78 | int nx, ny; 79 | int power = 12; 80 | if (argc > 1) 81 | power = atoi(argv[1]); 82 | nx = ny = 1 << power; 83 | 84 | int nxy = nx * ny; 85 | size_t nBytes = nxy * sizeof(float); 86 | printf("Matrix size: nx %d ny %d\n", nx, ny); 87 | 88 | // allocate managed memory (the same pointers are used by the host code and by the kernel) 89 | float *A, *B, *hostRef, *gpuRef; 90 | CUDA_CHECK(cudaMallocManaged((void**)&A, nBytes)); 91 | CUDA_CHECK(cudaMallocManaged((void**)&B, nBytes)); 92 | CUDA_CHECK(cudaMallocManaged((void**)&hostRef, nBytes)); 93 | CUDA_CHECK(cudaMallocManaged((void**)&gpuRef, nBytes)); 94 | 95 | double start, finish; 96 | // initialize data at host side 97 | GET_TIME(start); 98 | initialData(A, nxy); 99 | initialData(B, nxy); 100 | GET_TIME(finish); 101 | printf("initialization: \t %f sec\n", finish - start); 102 | 103 | memset(hostRef, 0, nBytes); 104 | memset(gpuRef, 0, nBytes); 105 | 106 | // add matrix at host side for result check 107 | GET_TIME(start); 108 | sumMatrixOnHost(A, B, hostRef, nx, ny); 109 | GET_TIME(finish); 110 | printf("sumMatrix on host:\t %f sec\n", finish - start); 111 | 112 | // invoke kernel at host side: 32x32 threads per block, grid rounded up to cover nx x ny 113 | int dimX = 32; 114 | int dimY = 32; 115 | dim3 blocks(dimX, dimY); 116 | dim3 grids((nx + blocks.x - 1) / blocks.x, (ny + blocks.y - 1) / blocks.y); 117 | 118 | // warm-up kernel 119 | sumMatrixOnGPU<<<grids, blocks>>>(A, B, gpuRef, 
nx, ny); 120 | CUDA_CHECK(cudaDeviceSynchronize()); 121 | 122 | GET_TIME(start); 123 | sumMatrixOnGPU<<<grids, blocks>>>(A, B, gpuRef, nx, ny); 124 | CUDA_CHECK(cudaDeviceSynchronize()); 125 | GET_TIME(finish); 126 | printf("sumMatrix on gpu :\t %f sec <<<(%d,%d), (%d,%d)>>>\n", finish-start, grids.x, grids.y, blocks.x, blocks.y); 127 | 128 | // check device results 129 | checkResult(hostRef, gpuRef, nxy); 130 | 131 | // free managed memory 132 | CUDA_CHECK(cudaFree(A)); 133 | CUDA_CHECK(cudaFree(B)); 134 | CUDA_CHECK(cudaFree(hostRef)); 135 | CUDA_CHECK(cudaFree(gpuRef)); 136 | 137 | CUDA_CHECK(cudaDeviceReset()); 138 | return 0; 139 | } -------------------------------------------------------------------------------- /CUDA/UnifiedMemory/matrixAddWithoutUnifiedMemory.cu: -------------------------------------------------------------------------------- 1 | /***************************************************************************** 2 | * File: matrixAddWithoutUnifiedMemory.cu 3 | * Description: This example demonstrates the use of explicit CUDA memory 4 | * transfer to implement matrix addition. This code contrasts with 5 | * matrixAddWithUnifiedMemory.cu, where CUDA managed memory is used to 6 | * remove all explicit memory transfers and abstract away the concept 7 | * of physically separate address spaces. 8 | * 9 | * 10 | * Compile: nvcc -O3 -o manual matrixAddWithoutUnifiedMemory.cu -I.. 
11 | * Run: ./manual 12 | * [n]: power to set size of input matrix (default: 12) 13 | *****************************************************************************/ 14 | #include 15 | #include 16 | #include "common/common.h" 17 | 18 | void initialData(float* in, const int size) 19 | { 20 | for (int i = 0; i < size; i++) 21 | in[i] = (rand() & 0xFF) / 10.f; 22 | } 23 | 24 | void sumMatrixOnHost(float *A, float *B, float *C, const int nx, const int ny) 25 | { 26 | float* ia = A; 27 | float* ib = B; 28 | float* ic = C; 29 | 30 | for (int iy = 0; iy < ny; iy++) { 31 | for (int ix = 0; ix < nx; ix++) { 32 | ic[ix] = ia[ix] + ib[ix]; 33 | } 34 | 35 | ia += nx; 36 | ib += nx; 37 | ic += nx; 38 | } 39 | } 40 | 41 | void checkResult(float* hostRef, float* gpuRef, const int size) 42 | { 43 | double epsilon = 1.0e-8; 44 | 45 | for (int i = 0; i < size; i++) { 46 | if (abs(hostRef[i] - gpuRef[i]) > epsilon) { 47 | printf("different on %dth element: host %f gpu %f\n", i, hostRef[i], gpuRef[i]); 48 | break; 49 | } 50 | } 51 | } 52 | 53 | __global__ 54 | void sumMatrixOnGPU(float* A, float* B, float* C, const int nx, const int ny) 55 | { 56 | unsigned int ix = blockDim.x * blockIdx.x + threadIdx.x; 57 | unsigned int iy = blockDim.y * blockIdx.y + threadIdx.y; 58 | unsigned int idx = iy * nx + ix; 59 | 60 | if (ix < nx && iy < ny) 61 | C[idx] = A[idx] + B[idx]; 62 | } 63 | 64 | int main(int argc, char** argv) 65 | { 66 | // setup device 67 | int dev = 0; 68 | cudaDeviceProp deviceProp; 69 | CUDA_CHECK(cudaGetDeviceProperties(&deviceProp, dev)); 70 | printf("Starting sumMatrix at device %d: %s\n", dev, deviceProp.name); 71 | CUDA_CHECK(cudaSetDevice(dev)); 72 | 73 | // setup size of matrix 74 | int nx, ny; 75 | int power = 12; 76 | if (argc > 1) 77 | power = atoi(argv[1]); 78 | nx = ny = 1 << power; 79 | 80 | int nxy = nx * ny; 81 | size_t nBytes = nxy * sizeof(float); 82 | printf("Matrix size: nx %d ny %d\n", nx, ny); 83 | 84 | // malloc host memory 85 | float *h_A, *h_B, 
*hostRef, *gpuRef; 86 | h_A = (float*)malloc(nBytes); 87 | h_B = (float*)malloc(nBytes); 88 | hostRef = (float*)malloc(nBytes); 89 | gpuRef = (float*)malloc(nBytes); 90 | 91 | double start, finish; 92 | // initialize data at host side 93 | GET_TIME(start); 94 | initialData(h_A, nxy); 95 | initialData(h_B, nxy); 96 | GET_TIME(finish); 97 | printf("initialization: \t %f sec\n", finish - start); 98 | 99 | memset(hostRef, 0, nBytes); 100 | memset(gpuRef, 0, nBytes); 101 | 102 | // add matrix at host side for result check 103 | GET_TIME(start); 104 | sumMatrixOnHost(h_A, h_B, hostRef, nx, ny); 105 | GET_TIME(finish); 106 | printf("sumMatrix on host:\t %f sec\n", finish - start); 107 | 108 | // malloc device global memory 109 | float *d_A, *d_B, *d_C; 110 | CUDA_CHECK(cudaMalloc((void**)&d_A, nBytes)); 111 | CUDA_CHECK(cudaMalloc((void**)&d_B, nBytes)); 112 | CUDA_CHECK(cudaMalloc((void**)&d_C, nBytes)); 113 | 114 | // invoke kernel at host side: 32x32 threads per block, grid rounded up to cover nx x ny 115 | int dimX = 32; 116 | int dimY = 32; 117 | dim3 blocks(dimX, dimY); 118 | dim3 grids((nx + blocks.x - 1) / blocks.x, (ny + blocks.y - 1) / blocks.y); 119 | 120 | // warm-up kernel on zero-filled inputs (cudaMemset takes a byte value); d_C is overwritten by the timed run below 121 | CUDA_CHECK(cudaMemset(d_A, 0, nBytes)); 122 | CUDA_CHECK(cudaMemset(d_B, 0, nBytes)); 123 | sumMatrixOnGPU<<<grids, blocks>>>(d_A, d_B, d_C, nx, ny); 124 | CUDA_CHECK(cudaDeviceSynchronize()); 125 | 126 | // transfer data from host to device 127 | CUDA_CHECK(cudaMemcpy(d_A, h_A, nBytes, cudaMemcpyHostToDevice)); 128 | CUDA_CHECK(cudaMemcpy(d_B, h_B, nBytes, cudaMemcpyHostToDevice)); 129 | 130 | GET_TIME(start); 131 | sumMatrixOnGPU<<<grids, blocks>>>(d_A, d_B, d_C, nx, ny); 132 | CUDA_CHECK(cudaDeviceSynchronize()); 133 | GET_TIME(finish); 134 | printf("sumMatrix on gpu :\t %f sec <<<(%d,%d), (%d,%d)>>>\n", finish-start, grids.x, grids.y, blocks.x, blocks.y); 135 | 136 | CUDA_CHECK(cudaMemcpy(gpuRef, d_C, nBytes, cudaMemcpyDeviceToHost)); 137 | 138 | // check device results 139 | checkResult(hostRef, gpuRef, nxy); 140 | 141 | // free device global memory 142 | 
CUDA_CHECK(cudaFree(d_A)); 143 | CUDA_CHECK(cudaFree(d_B)); 144 | CUDA_CHECK(cudaFree(d_C)); 145 | 146 | // free host memory 147 | free(h_A); 148 | free(h_B); 149 | free(hostRef); 150 | free(gpuRef); 151 | 152 | CUDA_CHECK(cudaDeviceReset()); 153 | return 0; 154 | } -------------------------------------------------------------------------------- /CUDA/bezierCurves/bezierCurves.cuh: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #define MAX_TESS_POINTS 32 5 | #define N_LINES 256 6 | #define BLOCK_DIM 32 7 | 8 | // A structure containing all paramters needed to tessellate a Bezier line 9 | struct BezierLine 10 | { 11 | float2 CP[3]; // Control Points for the line 12 | float2 vertexPos[MAX_TESS_POINTS]; // Vertex position array to tessellate into 13 | int nVertices; // Number of tessellated vertices 14 | }; 15 | 16 | __forceinline__ __device__ float2 operator+(float2 a, float2 b) 17 | { 18 | float2 c; 19 | c.x = a.x + b.x; 20 | c.y = a.y + b.y; 21 | return c; 22 | } 23 | 24 | __forceinline__ __device__ float2 operator-(float2 a, float2 b) 25 | { 26 | float2 c; 27 | c.x = a.x - b.x; 28 | c.y = a.y - b.y; 29 | return c; 30 | } 31 | 32 | __forceinline__ __device__ float2 operator*(float a, float2 b) 33 | { 34 | float2 c; 35 | c.x = a * b.x; 36 | c.y = a * b.y; 37 | return c; 38 | } 39 | 40 | __forceinline__ __device__ float length(float2 a) 41 | { 42 | return sqrtf((a.x * a.x) + (a.y * a.y)); 43 | } 44 | 45 | __forceinline__ __device__ float computeCurvature(BezierLine *bLines) 46 | { 47 | int bIdx = blockIdx.x; 48 | float curvature = length(bLines[bIdx].CP[1] - 0.5f * (bLines[bIdx].CP[0] + bLines[bIdx].CP[2])) 49 | / length(bLines[bIdx].CP[2] - bLines[bIdx].CP[0]); 50 | 51 | return curvature; 52 | } 53 | 54 | void initializeBLines(BezierLine *bLines_h) 55 | { 56 | float2 last = {0, 0}; 57 | for (int i = 0; i < N_LINES; i++) 58 | { 59 | // Set first point of this line to last point of previous line 60 | 
bLines_h[i].CP[0] = last; 61 | for (int j = 1; j < 3; j++) 62 | { 63 | // Assign random corrdinate between 0 and 1 64 | bLines_h[i].CP[j].x = (float)rand() / RAND_MAX; 65 | bLines_h[i].CP[j].y = (float)rand() / RAND_MAX; 66 | } 67 | last = bLines_h[i].CP[2]; // keep the last point of this line 68 | // Set numbeer of tessellated vertices to zero 69 | bLines_h[i].nVertices = 0; 70 | } 71 | } -------------------------------------------------------------------------------- /CUDA/bezierCurves/bezierCurves1.cu: -------------------------------------------------------------------------------- 1 | /***************************************************************************** 2 | * File: bezierCurves1.cu 3 | * Description: Implement Bezier Curve Calculation without dynamic parallelism 4 | * 5 | * Compile: nvcc -o bezierCurves1 bezierCurves1.cu -I.. -I. $(pkg-config opencv4 --libs --cflags) 6 | * Run: ./bezierCurves1 7 | * Argument: n.a 8 | *****************************************************************************/ 9 | #include 10 | #include 11 | #include 12 | 13 | #include 14 | #include 15 | 16 | #include 17 | #include "bezierCurves.cuh" 18 | 19 | __global__ void computeBezierLines(BezierLine* bLines, int nLines) 20 | { 21 | int bIdx = blockIdx.x; 22 | if (bIdx < nLines) { 23 | // Compute the curvature of the line 24 | float curvature = computeCurvature(bLines); 25 | 26 | // From the curvature, compute the number of tessellation points 27 | int nTessPoints = min(max((int)(curvature*16.0f), 4), 32); 28 | bLines[bIdx].nVertices = nTessPoints; 29 | 30 | // Loop through vertices to be tessellated, incrementing by blockDim.x 31 | for (int i = 0; i < nTessPoints; i += blockDim.x) { 32 | int idx = i + threadIdx.x; // compute a unique index for this point 33 | if (idx < nTessPoints) { 34 | float u = (float)idx / (nTessPoints - 1); // Compute u from idx 35 | float omu = 1.0f - u; // pre-compute one minus u 36 | float B3u[3]; // Compute quadratic Bezier coefficients 37 | B3u[0] = 
omu * omu; 38 | B3u[1] = 2.0f * u * omu; 39 | B3u[2] = u * u; 40 | float2 position = {0, 0}; 41 | for (int j = 0; j < 3; j++) { 42 | // Add the contribution of the j'th control point to position 43 | position = position + (B3u[j] * bLines[bIdx].CP[j]); 44 | } 45 | // Assign value of vertex position to the correct array element 46 | bLines[bIdx].vertexPos[idx] = position; 47 | } 48 | } 49 | } 50 | } 51 | 52 | // Main function 53 | int main(int argc, char **argv) 54 | { 55 | CUDA_CHECK(cudaSetDevice(0)); 56 | 57 | BezierLine *bLines_h = new BezierLine[N_LINES]; 58 | initializeBLines(bLines_h); 59 | 60 | BezierLine *bLines_d; 61 | CUDA_CHECK(cudaMalloc((void **)&bLines_d, N_LINES * sizeof(BezierLine))); 62 | CUDA_CHECK(cudaMemcpy(bLines_d, bLines_h, N_LINES * sizeof(BezierLine), cudaMemcpyHostToDevice)); 63 | // one block per Bezier line (the kernel indexes lines by blockIdx.x), BLOCK_DIM threads tessellating each line 64 | double start, finish; 65 | GET_TIME(start); 66 | computeBezierLines<<<N_LINES, BLOCK_DIM>>>(bLines_d, N_LINES); 67 | CUDA_CHECK(cudaMemcpy(bLines_h, bLines_d, N_LINES*sizeof(BezierLine), cudaMemcpyDeviceToHost)); 68 | GET_TIME(finish); 69 | 70 | printf("Elapsed time: %.6f msec\n", (finish - start)*1000); 71 | 72 | const int rows = 4; 73 | const int cols = 4; 74 | const int img_width = 196; 75 | cv::Mat dstImage(img_width * (rows + 1), img_width * (cols + 1), CV_8UC3, cv::Scalar(255, 255, 255)); 76 | 77 | int max_points = 0; 78 | const int numberOfdisplay = 16; 79 | for (int i = 0; i < numberOfdisplay; i++) { 80 | const int r = i / cols; 81 | const int c = i % cols; 82 | for (int j = 0; j < 2; j++) { 83 | cv::line(dstImage, 84 | cv::Point((r*img_width) + ((img_width/4) + bLines_h[i].CP[j].x*img_width), (c*img_width) + ((img_width/4) + bLines_h[i].CP[j].y*img_width)), 85 | cv::Point((r*img_width) + ((img_width/4) + bLines_h[i].CP[j+1].x*img_width), (c*img_width) + ((img_width/4) + bLines_h[i].CP[j+1].y*img_width)), 86 | cv::Scalar(0,0,0), 2); 87 | } 88 | 89 | if (bLines_h[i].nVertices > max_points) 90 | max_points = bLines_h[i].nVertices; 91 | } 92 | 93 | 94 | for (int k = 0; k < 
max_points - 1; k++) { 95 | for (int i = 0; i < numberOfdisplay; i++) { 96 | const int r = i / cols; 97 | const int c = i % cols; 98 | 99 | if (k < bLines_h[i].nVertices - 1) { 100 | cv::line(dstImage, 101 | cv::Point(r*img_width + ((img_width/4) + bLines_h[i].vertexPos[k].x*img_width), c*img_width + ((img_width/4) + bLines_h[i].vertexPos[k].y*img_width)), 102 | cv::Point(r*img_width + ((img_width/4) + bLines_h[i].vertexPos[k+1].x*img_width), c*img_width + ((img_width/4) + bLines_h[i].vertexPos[k+1].y*img_width)), 103 | cv::Scalar(255,0,0), 2); 104 | }; 105 | } 106 | cv::imshow("win", dstImage); 107 | cv::waitKey(500); 108 | } 109 | cv::waitKey(0); 110 | 111 | CUDA_CHECK(cudaFree(bLines_d)); 112 | delete[] bLines_h; 113 | 114 | return 0; 115 | } -------------------------------------------------------------------------------- /CUDA/common/common_string.h: -------------------------------------------------------------------------------- 1 | #ifndef __COMMON_STRING_H__ 2 | #define __COMMON_STRING_H__ 3 | 4 | #include 5 | #include 6 | 7 | #if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) 8 | #define strncasecmp _strnicmp 9 | #define strcasecmp strcmpi 10 | #endif 11 | 12 | inline int stringRemoveDelimiter(char delimiter, const char* string) 13 | { 14 | int str_start = 0; 15 | 16 | while (string[str_start] == delimiter) { 17 | str_start++; 18 | } 19 | 20 | if (str_start >= static_cast(strlen(string))) { 21 | return 0; 22 | } 23 | 24 | return str_start; 25 | } 26 | 27 | inline bool checkCmdLineFlag(int argc, const char** argv, const char* str_ref) 28 | { 29 | bool found = false; 30 | 31 | if (argc >= 1) { 32 | for (int i = 1; i < argc; i++) { 33 | int str_start = stringRemoveDelimiter('-', argv[i]); 34 | const char *str_argv = &argv[i][str_start]; 35 | const char *equal_pos = strchr(str_argv, '='); 36 | 37 | int argv_length = static_cast(equal_pos == 0 ? 
strlen(str_argv) : equal_pos - str_argv); 38 | int length = static_cast<int>(strlen(str_ref)); 39 | 40 | if (length == argv_length && !strncasecmp(str_argv, str_ref, length)) { 41 | found = true; 42 | continue; 43 | } 44 | } 45 | } 46 | 47 | return found; 48 | } 49 | // Finds an argument whose text (after leading '-') starts with str_ref; *str_retval points just past the one separator character that follows the name, or is NULL when nothing matches 50 | inline bool getCmdLineArgumentString(int argc, const char** argv, const char* str_ref, char** str_retval) 51 | { 52 | bool found = false; 53 | 54 | if (argc >= 1) { 55 | for (int i = 1; i < argc; i++) { 56 | int str_start = stringRemoveDelimiter('-', argv[i]); 57 | char* str_argv = const_cast<char*>(&argv[i][str_start]); 58 | int length = static_cast<int>(strlen(str_ref)); 59 | 60 | if (!strncasecmp(str_argv, str_ref, length)) { 61 | *str_retval = &str_argv[length + 1]; 62 | found = true; 63 | continue; 64 | } 65 | } 66 | } 67 | 68 | if (!found) 69 | *str_retval = NULL; 70 | 71 | return found; 72 | } 73 | 74 | inline int getCmdLineArgumentInt(int argc, const char** argv, const char* str_ref) 75 | { 76 | bool found = false; 77 | int value = -1; 78 | 79 | if (argc >= 1) { 80 | for (int i = 1; i < argc; i++) { 81 | int str_start = stringRemoveDelimiter('-', argv[i]); 82 | char* str_argv = const_cast<char*>(&argv[i][str_start]); 83 | int length = static_cast<int>(strlen(str_ref)); 84 | 85 | if (!strncasecmp(str_argv, str_ref, length)) { 86 | if (length + 1 <= static_cast<int>(strlen(str_argv))) { 87 | int auto_inc = (str_argv[length] == '=') ? 
1 : 0; 88 | value = strtol(&str_argv[length + auto_inc], NULL, 10); 89 | } 90 | else { 91 | value = 0; 92 | } 93 | } 94 | 95 | found = true; 96 | continue; 97 | } 98 | } 99 | 100 | if (found) 101 | return value; 102 | else 103 | return 0; 104 | } 105 | 106 | #endif -------------------------------------------------------------------------------- /CUDA/deviceQuery/simpleDeviceQuery.cu: -------------------------------------------------------------------------------- 1 | /***************************************************************************** 2 | * File: simpleDeviceQuery.cu 3 | * Description: Query device information 4 | * 5 | * Compile: nvcc -o simpleDeviceQuery simpleDeviceQuery.cu 6 | * Run: ./simpleDeviceQuery 7 | *****************************************************************************/ 8 | #include 9 | #include 10 | 11 | int main(int argc, char** argv) 12 | { 13 | int dev = 0; 14 | cudaDeviceProp devProp; 15 | cudaGetDeviceProperties(&devProp, dev); 16 | 17 | printf("Device %d: %s\n", dev, devProp.name); 18 | printf("Number of multiprocessors: %d\n", devProp.multiProcessorCount); 19 | printf("Total amount of constant memory: %4.2f KB\n", devProp.totalConstMem/1024.0); 20 | printf("Total amount of shared memory per block: %4.2f KB\n", devProp.sharedMemPerBlock/1024.0); 21 | printf("Total number of registers available per block: %d\n", devProp.regsPerBlock); 22 | printf("Warp size: %d\n", devProp.warpSize); 23 | printf("Maximum number of threads per multiprocessor: %d\n", devProp.maxThreadsPerMultiProcessor); 24 | printf("Maximum number of warps per multiprocessor: %d\n", devProp.maxThreadsPerMultiProcessor/devProp.warpSize); 25 | 26 | return 0; 27 | } -------------------------------------------------------------------------------- /CUDA/imageProcessing/convertColorToGrey.cu: -------------------------------------------------------------------------------- 1 | /***************************************************************************** 2 | * File: 
convertColorToGrey.cu 3 | * Description: Convert color scale to grey scale of input image. 4 | * This program doesn't save result image, and just show the result. 5 | * For reading image, OpenCV library should be used. 6 | * 7 | * Compile: nvcc -o convertColorToGrey convertColorToGrey.cu -I.. -lcuda $(pkg-config opencv4 --libs --cflags) 8 | * Run: ./convertColorToGrey 9 | *****************************************************************************/ 10 | #include 11 | #include 12 | #include 13 | 14 | #include 15 | #include 16 | #include 17 | 18 | #define CHANNELS 3 19 | 20 | void Usage(char prog_name[]); 21 | __global__ 22 | void colorToGreyscaleConversion(unsigned char* in, unsigned char* out, int width, int height); 23 | 24 | int main(int argc, char** argv) 25 | { 26 | if (argc != 2) { 27 | Usage(argv[0]); 28 | } 29 | 30 | const char* file_name = argv[1]; 31 | int width, height, channels; 32 | unsigned char *h_origImg, *h_resultImg; 33 | // open image file 34 | cv::Mat origImg = cv::imread(file_name); 35 | 36 | width = origImg.cols; 37 | height = origImg.rows; 38 | channels = origImg.channels(); 39 | printf("Image size = (%d x %d x %d)\n", width, height, channels); 40 | assert(channels == CHANNELS); 41 | 42 | cv::Mat half; 43 | cv::resize(origImg, half, cv::Size(width/2, height/2)); 44 | cv::imshow("image", half); 45 | cv::waitKey(0); 46 | 47 | h_origImg = (unsigned char*)malloc(width * height * channels * sizeof(unsigned char)); 48 | h_resultImg = (unsigned char*)malloc(width * height * sizeof(unsigned char)); 49 | (void)memcpy(h_origImg, origImg.data, width * height * channels); 50 | 51 | unsigned char *d_origImg, *d_resultImg; 52 | CUDA_CHECK(cudaMalloc((void**)&d_origImg, width * height * channels * sizeof(unsigned char))); 53 | CUDA_CHECK(cudaMalloc((void**)&d_resultImg, width * height * sizeof(unsigned char))); 54 | 55 | // Copy the host input in host memory to the device input in device memory 56 | CUDA_CHECK(cudaMemcpy(d_origImg, h_origImg, width * height 
* channels * sizeof(unsigned char), cudaMemcpyHostToDevice)); 57 | 58 | // Launch the Kernel: 16x16 threads per block, grid rounded up to cover the whole image 59 | const int block_size = 16; 60 | dim3 threads(block_size, block_size); 61 | dim3 grid(ceil(width / (double)threads.x), ceil(height / (double)threads.y)); 62 | colorToGreyscaleConversion<<<grid, threads>>>(d_origImg, d_resultImg, width, height); 63 | 64 | // Copy the device result in device memory to the host result in host memory 65 | CUDA_CHECK(cudaMemcpy(h_resultImg, d_resultImg, width * height * sizeof(unsigned char), cudaMemcpyDeviceToHost)); 66 | 67 | cv::Mat resultImg(height, width, CV_8UC1); 68 | memcpy(resultImg.data, h_resultImg, width * height); 69 | 70 | // Free device global memory 71 | CUDA_CHECK(cudaFree(d_origImg)); 72 | CUDA_CHECK(cudaFree(d_resultImg)); 73 | 74 | // Free host memory 75 | free(h_origImg); 76 | free(h_resultImg); 77 | 78 | // show result 79 | //cv::Mat resizeImg; 80 | cv::resize(resultImg, resultImg, cv::Size(width/2, height/2)); 81 | cv::imshow("image", resultImg); 82 | cv::waitKey(0); 83 | 84 | return 0; 85 | } 86 | 87 | void Usage(char prog_name[]) 88 | { 89 | fprintf(stderr, "Usage: %s \n", prog_name); 90 | exit(EXIT_FAILURE); 91 | } 92 | 93 | // Input image has 3 interleaved channels in B, G, R order (the layout cv::imread produces for color images) 94 | // The input image is encoded as unsigned characters [0, 255] 95 | __global__ 96 | void colorToGreyscaleConversion(unsigned char* in, unsigned char* out, int width, int height) 97 | { 98 | int Row = blockIdx.y * blockDim.y + threadIdx.y; 99 | int Col = blockIdx.x * blockDim.x + threadIdx.x; 100 | 101 | if (Row < height && Col < width) { 102 | int offset = Row*width + Col; 103 | int rgbOffset = offset*CHANNELS; 104 | 105 | unsigned char b = in[rgbOffset]; // blue value for pixel 106 | unsigned char g = in[rgbOffset + 1]; // green value for pixel 107 | unsigned char r = in[rgbOffset + 2]; // red value for pixel 108 | 109 | out[offset] = 0.21f * r + 0.71f * g + 0.07f * b; 110 | } 111 | }
-------------------------------------------------------------------------------- /CUDA/imageProcessing/imageBlur.cu: -------------------------------------------------------------------------------- 1 | /***************************************************************************** 2 | * File: imageBlur.cu 3 | * Description: Blur input image using 3D blocks. 4 | * This program doesn't save result image, and just show the result. 5 | * For reading image, OpenCV library should be used. 6 | * 7 | * Compile: nvcc -o imageBlur imageBlur.cu -I.. -lcuda $(pkg-config opencv4 --libs --cflags) 8 | * Run: ./imageBlur 9 | *****************************************************************************/ 10 | #include 11 | #include 12 | #include 13 | 14 | #include 15 | #include 16 | #include 17 | 18 | #define CHANNELS 3 19 | #define BLUR_SIZE 10 20 | 21 | void Usage(char prog_name[]); 22 | __global__ 23 | void blurKernel(unsigned char* in, unsigned char* out, int width, int height, int channel); 24 | 25 | int main(int argc, char** argv) 26 | { 27 | if (argc != 2) { 28 | Usage(argv[0]); 29 | } 30 | 31 | const char* file_name = argv[1]; 32 | int width, height, channels; 33 | unsigned char *h_origImg, *h_resultImg; 34 | // open image file 35 | cv::Mat origImg = cv::imread(file_name); 36 | 37 | width = origImg.cols; 38 | height = origImg.rows; 39 | channels = origImg.channels(); 40 | printf("Image size = (%d x %d x %d)\n", width, height, channels); 41 | assert(channels == CHANNELS); 42 | 43 | cv::Mat half; 44 | cv::resize(origImg, half, cv::Size(width/2, height/2)); 45 | cv::imshow("image", half); 46 | cv::waitKey(0); 47 | 48 | h_origImg = (unsigned char*)malloc(width * height * channels * sizeof(unsigned char)); 49 | h_resultImg = (unsigned char*)malloc(width * height * channels * sizeof(unsigned char)); 50 | (void)memcpy(h_origImg, origImg.data, width * height * channels); 51 | 52 | unsigned char *d_origImg, *d_resultImg; 53 | CUDA_CHECK(cudaMalloc((void**)&d_origImg, width * height * 
channels * sizeof(unsigned char))); 54 | CUDA_CHECK(cudaMalloc((void**)&d_resultImg, width * height * channels * sizeof(unsigned char))); 55 | 56 | // Copy the host input in host memory to the device input in device memory 57 | CUDA_CHECK(cudaMemcpy(d_origImg, h_origImg, width * height * channels * sizeof(unsigned char), cudaMemcpyHostToDevice)); 58 | 59 | // Launch the blur Kernel: 16x16 pixels per block, with the block's z dimension covering the color channels 60 | const int block_size = 16; 61 | dim3 threads(block_size, block_size, channels); 62 | dim3 grid(ceil(width / (double)threads.x), ceil(height / (double)threads.y)); 63 | blurKernel<<<grid, threads>>>(d_origImg, d_resultImg, width, height, channels); 64 | 65 | // Copy the device result in device memory to the host result in host memory 66 | CUDA_CHECK(cudaMemcpy(h_resultImg, d_resultImg, width * height * channels * sizeof(unsigned char), cudaMemcpyDeviceToHost)); 67 | 68 | cv::Mat resultImg(height, width, CV_8UC3); 69 | memcpy(resultImg.data, h_resultImg, width * height * channels); 70 | 71 | // Free device global memory 72 | CUDA_CHECK(cudaFree(d_origImg)); 73 | CUDA_CHECK(cudaFree(d_resultImg)); 74 | 75 | // Free host memory 76 | free(h_origImg); 77 | free(h_resultImg); 78 | 79 | // show result 80 | //cv::Mat resizeImg; 81 | cv::resize(resultImg, resultImg, cv::Size(width/2, height/2)); 82 | cv::imshow("image", resultImg); 83 | cv::waitKey(0); 84 | 85 | return 0; 86 | } 87 | 88 | void Usage(char prog_name[]) 89 | { 90 | fprintf(stderr, "Usage: %s \n", prog_name); 91 | exit(EXIT_FAILURE); 92 | } 93 | // Box blur: each thread handles one channel (Plane) of one pixel (Row, Col) of an interleaved image 94 | __global__ 95 | void blurKernel(unsigned char* in, unsigned char* out, int width, int height, int channel) 96 | { 97 | int Plane = blockIdx.z * blockDim.z + threadIdx.z; 98 | int Row = blockIdx.y * blockDim.y + threadIdx.y; 99 | int Col = blockIdx.x * blockDim.x + threadIdx.x; 100 | 101 | if (Row < height && Col < width && Plane < channel) { 102 | int pixelVal = 0; 103 | int pixelCnt = 0; 104 | // inclusive bounds give a (2*BLUR_SIZE+1) x (2*BLUR_SIZE+1) window centred on the pixel 105 | for (int bRow = -BLUR_SIZE; bRow <= BLUR_SIZE; bRow++) { 106 | for (int bCol = -BLUR_SIZE; bCol <= 
BLUR_SIZE; bCol++) { 107 | int curRow = Row + bRow; 108 | int curCol = Col + bCol; 109 | 110 | if (curRow >= 0 && curRow < height && curCol >= 0 && curCol < width) { 111 | pixelVal += in[(curRow * width + curCol) * channel + Plane]; 112 | pixelCnt++; 113 | } 114 | } 115 | } 116 | 117 | out[(Row * width + Col) * channel + Plane] = (unsigned char)(pixelVal / pixelCnt); 118 | } 119 | } -------------------------------------------------------------------------------- /CUDA/imageProcessing/lena.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/junstar92/parallel_programming_study/9886cc1e6f630c7b89b402ad2ffa60653ce8edfa/CUDA/imageProcessing/lena.jpg -------------------------------------------------------------------------------- /CUDA/matrixAdd/matrixAdd.cu: -------------------------------------------------------------------------------- 1 | /***************************************************************************** 2 | * File: matrixAdd.cu 3 | * Description: Matrix addition, C = A + B 4 | * A,B and C have m x n dimensions. 5 | * 6 | * Compile: nvcc -o matrixAdd matrixAdd.cu -I.. -lcuda 7 | * Run: ./matrixAdd 8 | * : the number of rows in Matrix A, B 9 | * : the number of columns in Matrix A, B. 
void Usage(char prog_name[]);
__global__ void matrixAdd(const float *A, const float *B, float *C, const int M, const int N);

/*****************************************************************************
 * Function: main
 * Purpose:  Add two random m x n matrices on the GPU, verify the result on
 *           the host and report the kernel performance.
 * Args:     argv[1] - number of rows, argv[2] - number of columns
 *****************************************************************************/
int main(int argc, char* argv[])
{
    if (argc != 3) {
        Usage(argv[0]);
    }

    int m = strtol(argv[1], NULL, 10);
    int n = strtol(argv[2], NULL, 10);
    // non-numeric or non-positive sizes would give empty allocations and an
    // invalid (zero-sized) grid
    if (m <= 0 || n <= 0) {
        Usage(argv[0]);
    }
    printf("[Matrix addition, C = A + B]\n");
    printf("\tA, B, and C are (%d x %d) matrix\n", m, n);

    // widen before multiplying so m * n cannot overflow int
    const size_t bytes = (size_t)m * n * sizeof(float);

    // Allocate the host matrix A, B, C
    float *h_A = (float*)malloc(bytes);
    float *h_B = (float*)malloc(bytes);
    float *h_C = (float*)malloc(bytes);

    // Verify that allocations succeeded
    if (h_A == NULL || h_B == NULL || h_C == NULL) {
        fprintf(stderr, "Failed to allocate host matrix!\n");
        exit(EXIT_FAILURE);   // an allocation failure is not a success
    }

    // Initialize that host matrix
    common_random_init_matrix(h_A, m, n);
    common_random_init_matrix(h_B, m, n);

    // Allocate the device matrix A, B, C
    float *d_A, *d_B, *d_C;
    CUDA_CHECK(cudaMalloc((void**)&d_A, bytes));
    CUDA_CHECK(cudaMalloc((void**)&d_B, bytes));
    CUDA_CHECK(cudaMalloc((void**)&d_C, bytes));

    // Copy the host input matrix A and B in host memory
    // to the device input matrix in device memory
    printf("Copy input data from the host memory to the CUDA device\n");
    CUDA_CHECK(cudaMemcpy(d_A, h_A, bytes, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_B, h_B, bytes, cudaMemcpyHostToDevice));

    // Allocate CUDA events for estimating
    cudaEvent_t start, stop;
    CUDA_CHECK(cudaEventCreate(&start));
    CUDA_CHECK(cudaEventCreate(&stop));

    // Launch the Matrix Add CUDA Kernel.
    // grid.x spans the columns and grid.y the rows, matching the kernel.
    const int block_size = 16;
    dim3 threads(block_size, block_size);
    dim3 grid((n + threads.x - 1) / threads.x, (m + threads.y - 1) / threads.y);
    printf("CUDA kernel launch with (%d x %d) blocks of (%d x %d) threads\n", grid.x, grid.y, threads.x, threads.y);

    CUDA_CHECK(cudaDeviceSynchronize());
    CUDA_CHECK(cudaEventRecord(start));

    matrixAdd<<<grid, threads>>>(d_A, d_B, d_C, m, n);
    CUDA_CHECK(cudaGetLastError());          // launch-configuration errors

    // record stop right behind the kernel and wait on the event, so the
    // interval covers the kernel and the event is complete before it is read
    CUDA_CHECK(cudaEventRecord(stop));
    CUDA_CHECK(cudaEventSynchronize(stop));

    // Copy the device result matrix in device memory
    // to the host result matrix in host memory
    printf("Copy output data from the CUDA device to the host memory\n");
    CUDA_CHECK(cudaMemcpy(h_C, d_C, bytes, cudaMemcpyDeviceToHost));

    // Verify that the result matrix is correct
    common_verify_matAdd(h_A, h_B, h_C, m, n);

    // Compute and Print the performance
    COMPUTE_MATADD_PERFORMANCE(start, stop, m, n, threads.x * threads.y);

    // Free device global memory
    CUDA_CHECK(cudaFree(d_A));
    CUDA_CHECK(cudaFree(d_B));
    CUDA_CHECK(cudaFree(d_C));
    CUDA_CHECK(cudaEventDestroy(start));
    CUDA_CHECK(cudaEventDestroy(stop));

    // Free host memory
    free(h_A);
    free(h_B);
    free(h_C);

    printf("Done\n");

    return 0;
}

/*****************************************************************************
 * Function: Usage
 * Purpose:  Print how the program should be started and terminate with a
 *           failure status.
 *****************************************************************************/
void Usage(char prog_name[])
{
    fprintf(stderr, "Usage: %s \n", prog_name);
    fprintf(stderr, "\t : the number of rows in matrix A, B.\n");
    fprintf(stderr, "\t : the number of columns in matrix A, B.\n");
    exit(EXIT_FAILURE);
}

/*****************************************************************************
 * Kernel:   matrixAdd
 * Purpose:  C = A + B for row-major M x N matrices, one element per thread.
 * Layout:   x -> column, y -> row, so threads that are adjacent in x touch
 *           adjacent addresses. The grid must cover N in x and M in y;
 *           surplus threads are masked by the bounds check.
 *****************************************************************************/
__global__
void matrixAdd(const float *A, const float *B, float *C, const int M, const int N)
{
    int COL = blockIdx.x * blockDim.x + threadIdx.x;
    int ROW = blockIdx.y * blockDim.y + threadIdx.y;

    if (ROW < M && COL < N) {
        int idx = (ROW * N) + COL;
        C[idx] = A[idx] + B[idx];
    }
}
/* Fill p[0..size) with pseudo-random values in [0, 25.5]. */
void initialData(float* p, const int size)
{
    for (int i = 0; i < size; i++) {
        p[i] = (float)(rand() & 0xFF) / 10.0f;
    }
}

/* Host reference: C = A + B for row-major matrices with nx columns, ny rows. */
void sumMatrixOnHost(float* A, float* B, float* C, const int nx, const int ny)
{
    for (int y = 0; y < ny; y++) {
        for (int x = 0; x < nx; x++) {
            C[y*nx + x] = A[y*nx + x] + B[y*nx + x];
        }
    }
}

/* Compare the first N elements of both arrays; report the first mismatch. */
void checkResult(float* hostRef, float* gpuRef, const int N)
{
    double epsilon = 1.0e-8;

    for (int i = 0; i < N; i++) {
        // explicit floating-point comparison: a plain abs() can resolve to
        // the integer overload and truncate the difference to 0
        double diff = (double)hostRef[i] - (double)gpuRef[i];
        if (diff > epsilon || diff < -epsilon) {
            printf("host %f gpu %f ", hostRef[i], gpuRef[i]);
            printf("Arrays do not match.\n\n");
            break;
        }
    }
}

// grid 2D block 2D
// One thread per element: x -> column (ix), y -> row (iy). Threads outside
// the NX x NY matrix are masked by the bounds check.
__global__
void sumMatrixOnGPU2D(float* A, float* B, float* C, int NX, int NY)
{
    unsigned int ix = blockDim.x*blockIdx.x + threadIdx.x;
    unsigned int iy = blockDim.y*blockIdx.y + threadIdx.y;

    if (ix < (unsigned int)NX && iy < (unsigned int)NY) {
        unsigned int idx = iy*NX + ix;
        C[idx] = A[idx] + B[idx];
    }
}

/*****************************************************************************
 * Function: main
 * Purpose:  Time the 2D matrix-add kernel on a 2^14 x 2^14 matrix.
 * Args:     optional argv[1], argv[2] - block dimensions x and y
 *           (default 32 x 32)
 *****************************************************************************/
int main(int argc, char** argv)
{
    // setup device
    int dev = 0;
    CUDA_CHECK(cudaSetDevice(dev));

    // setup data size of matrix
    int nx = 1 << 14;
    int ny = 1 << 14;
    int nxy = nx * ny;
    size_t nBytes = (size_t)nxy * sizeof(float);

    // malloc host memory
    float *h_A, *h_B, *hostRef, *gpuRef;
    h_A = (float*)malloc(nBytes);
    h_B = (float*)malloc(nBytes);
    hostRef = (float*)malloc(nBytes);
    gpuRef = (float*)malloc(nBytes);
    if (h_A == NULL || h_B == NULL || hostRef == NULL || gpuRef == NULL) {
        fprintf(stderr, "Failed to allocate host matrix!\n");
        exit(EXIT_FAILURE);
    }

    // initialize data at host
    initialData(h_A, nxy);
    initialData(h_B, nxy);
    memset(hostRef, 0, nBytes);
    memset(gpuRef, 0, nBytes);

    // the host reference run is disabled, as in the original:
    //sumMatrixOnHost(h_A, h_B, hostRef, nx, ny);

    // malloc device global memory
    float *d_A, *d_B, *d_C;
    CUDA_CHECK(cudaMalloc((void**)&d_A, nBytes));
    CUDA_CHECK(cudaMalloc((void**)&d_B, nBytes));
    CUDA_CHECK(cudaMalloc((void**)&d_C, nBytes));

    // transfer data from host to device
    CUDA_CHECK(cudaMemcpy(d_A, h_A, nBytes, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_B, h_B, nBytes, cudaMemcpyHostToDevice));

    // invoke kernel at host
    int dimx = 32;
    int dimy = 32;

    if (argc > 2) {
        dimx = atoi(argv[1]);
        dimy = atoi(argv[2]);
    }
    // atoi returns 0 for non-numeric text; a zero dimension would divide by
    // zero in the grid computation below
    if (dimx <= 0 || dimy <= 0) {
        fprintf(stderr, "Block dimensions must be positive\n");
        exit(EXIT_FAILURE);
    }

    dim3 block(dimx, dimy);
    dim3 grid((nx + block.x - 1) / block.x, (ny + block.y - 1) / block.y);

    // execute the kernel
    double start, finish;
    CUDA_CHECK(cudaDeviceSynchronize());
    GET_TIME(start);
    sumMatrixOnGPU2D<<<grid, block>>>(d_A, d_B, d_C, nx, ny);
    CUDA_CHECK(cudaGetLastError());   // e.g. more threads per block than the device allows
    CUDA_CHECK(cudaDeviceSynchronize());
    GET_TIME(finish);
    printf("sumMatrixOnGPU2D <<<(%d,%d), (%d,%d)>>> elapsed %f ms\n", grid.x, grid.y, block.x, block.y, (finish-start)*1000.f);

    // copy kernel result back to host
    CUDA_CHECK(cudaMemcpy(gpuRef, d_C, nBytes, cudaMemcpyDeviceToHost));

    // the device result check is disabled together with the host reference:
    //checkResult(hostRef, gpuRef, nxy);

    // free device global memory
    CUDA_CHECK(cudaFree(d_A));
    CUDA_CHECK(cudaFree(d_B));
    CUDA_CHECK(cudaFree(d_C));

    // free host memory
    free(h_A);
    free(h_B);
    free(hostRef);
    free(gpuRef);

    // reset device
    CUDA_CHECK(cudaDeviceReset());

    return 0;
}
void Usage(char prog_name[]);
__global__ void matrixMul(const float *A, const float *B, float *C, const int M, const int K, const int N);

/*****************************************************************************
 * Function: main
 * Purpose:  Multiply a random m x k matrix by a random k x n matrix on the
 *           GPU, verify the result on the host and report the performance.
 * Args:     argv[1] - m, argv[2] - k, argv[3] - n
 *****************************************************************************/
int main(int argc, char* argv[])
{
    if (argc != 4) {
        Usage(argv[0]);
    }

    int m = strtol(argv[1], NULL, 10);
    int k = strtol(argv[2], NULL, 10);
    int n = strtol(argv[3], NULL, 10);
    // non-numeric or non-positive sizes would give empty allocations and an
    // invalid (zero-sized) grid
    if (m <= 0 || k <= 0 || n <= 0) {
        Usage(argv[0]);
    }
    printf("[Matrix multiplication, C = AB]\n");
    printf("\tA is (%d x %d) matrix, B is (%d x %d) matrix, and \n", m, k, k, n);
    printf("\tC is (%d x %d) matrix.\n", m, n);

    // widen before multiplying so the element counts cannot overflow int
    const size_t bytesA = (size_t)m * k * sizeof(float);
    const size_t bytesB = (size_t)k * n * sizeof(float);
    const size_t bytesC = (size_t)m * n * sizeof(float);

    // Allocate the host matrix A, B, C
    float *h_A = (float*)malloc(bytesA);
    float *h_B = (float*)malloc(bytesB);
    float *h_C = (float*)malloc(bytesC);

    // Verify that allocations succeeded
    if (h_A == NULL || h_B == NULL || h_C == NULL) {
        fprintf(stderr, "Failed to allocate host matrix!\n");
        exit(EXIT_FAILURE);   // an allocation failure is not a success
    }

    // Initialize that host matrix
    common_random_init_matrix(h_A, m, k);
    common_random_init_matrix(h_B, k, n);

    // Allocate the device matrix A, B, C
    float *d_A, *d_B, *d_C;
    CUDA_CHECK(cudaMalloc((void**)&d_A, bytesA));
    CUDA_CHECK(cudaMalloc((void**)&d_B, bytesB));
    CUDA_CHECK(cudaMalloc((void**)&d_C, bytesC));

    // Copy the host input matrix A and B in host memory
    // to the device input matrix in device memory
    printf("Copy input data from the host memory to the CUDA device\n");
    CUDA_CHECK(cudaMemcpy(d_A, h_A, bytesA, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_B, h_B, bytesB, cudaMemcpyHostToDevice));

    // Allocate CUDA events for estimating
    cudaEvent_t start, stop;
    CUDA_CHECK(cudaEventCreate(&start));
    CUDA_CHECK(cudaEventCreate(&stop));

    // Launch the Matrix Multiplication CUDA Kernel.
    // The kernel maps x to the columns of C (n) and y to its rows (m), so
    // grid.x must be sized from n and grid.y from m. Sizing them the other
    // way round leaves part of C uncomputed whenever m != n.
    const int block_size = 16;
    dim3 threads(block_size, block_size);
    dim3 grid((n + threads.x - 1) / threads.x, (m + threads.y - 1) / threads.y);
    printf("CUDA kernel launch with (%d x %d) blocks of (%d x %d) threads\n", grid.x, grid.y, threads.x, threads.y);

    CUDA_CHECK(cudaDeviceSynchronize());
    CUDA_CHECK(cudaEventRecord(start));

    matrixMul<<<grid, threads>>>(d_A, d_B, d_C, m, k, n);
    CUDA_CHECK(cudaGetLastError());          // launch-configuration errors

    // record stop right behind the kernel and wait on the event, so the
    // interval covers the kernel and the event is complete before it is read
    CUDA_CHECK(cudaEventRecord(stop));
    CUDA_CHECK(cudaEventSynchronize(stop));

    // Copy the device result matrix in device memory
    // to the host result matrix in host memory
    printf("Copy output data from the CUDA device to the host memory\n");
    CUDA_CHECK(cudaMemcpy(h_C, d_C, bytesC, cudaMemcpyDeviceToHost));

    // Verify that the result matrix is correct (L2-norm error)
    common_verify_matMul_l2ne(h_A, h_B, h_C, m, k, n);

    // Compute and Print the performance
    COMPUTE_MATMUL_PERFORMANCE(start, stop, m, k, n, threads.x * threads.y);

    // Free device global memory
    CUDA_CHECK(cudaFree(d_A));
    CUDA_CHECK(cudaFree(d_B));
    CUDA_CHECK(cudaFree(d_C));
    CUDA_CHECK(cudaEventDestroy(start));
    CUDA_CHECK(cudaEventDestroy(stop));

    // Free host memory
    free(h_A);
    free(h_B);
    free(h_C);

    printf("Done\n");

    return 0;
}

/*****************************************************************************
 * Function: Usage
 * Purpose:  Print how the program should be started and terminate with a
 *           failure status.
 *****************************************************************************/
void Usage(char prog_name[])
{
    fprintf(stderr, "Usage: %s \n", prog_name);
    fprintf(stderr, "\t : the number of rows in matrix A.\n");
    fprintf(stderr, "\t : the number of columns in Matrix A, it is also\n");
    fprintf(stderr, "\t the number of rows in Matrix B.\n");
    fprintf(stderr, "\t : the number of columns in matrix B.\n");
    exit(EXIT_FAILURE);
}

/*****************************************************************************
 * Kernel:   matrixMul
 * Purpose:  C = A * B for row-major matrices: A is M x K, B is K x N and
 *           C is M x N. Each thread computes one element of C.
 * Layout:   x -> column of C, y -> row of C. The grid must cover N in x and
 *           M in y; surplus threads are masked by the bounds check.
 *****************************************************************************/
__global__
void matrixMul(const float *A, const float *B, float *C, const int M, const int K, const int N)
{
    int Row = blockIdx.y * blockDim.y + threadIdx.y;
    int Col = blockIdx.x * blockDim.x + threadIdx.x;

    if (Row < M && Col < N) {
        float value = 0.0f;   // float literal: keep the accumulator single precision
        for (int i = 0; i < K; i++) {
            value += A[(Row * K) + i] * B[(N * i) + Col];
        }
        C[(Row * N) + Col] = value;
    }
}
/*
 * All kernels below write c[tid] for every launched thread without a bounds
 * check, so the buffer passed in must hold gridDim.x * blockDim.x floats.
 */

/* Branch on the parity of the thread index: even threads take the first
 * path and odd threads the second. */
__global__ void mathKernel1(float* c)
{
    int tid = blockDim.x * blockIdx.x + threadIdx.x;
    float a, b;
    a = b = 0.0f;

    if (tid % 2 == 0) {
        a = 100.0f;
    }
    else {
        b = 200.0f;
    }
    c[tid] = a + b;
}

/* Branch on the parity of tid / warpSize, so the condition has the same
 * value for every group of warpSize consecutive thread indices. */
__global__ void mathKernel2(float* c)
{
    int tid = blockDim.x * blockIdx.x + threadIdx.x;
    float a, b;
    a = b = 0.0f;

    if ((tid / warpSize) % 2 == 0) {
        a = 100.0f;
    }
    else {
        b = 200.0f;
    }
    c[tid] = a + b;
}

/* Same per-thread parity test as mathKernel1, written as two independent
 * `if` statements on a precomputed predicate. */
__global__ void mathKernel3(float* c)
{
    int tid = blockDim.x * blockIdx.x + threadIdx.x;
    float a, b;
    a = b = 0.0f;

    bool pred = (tid % 2 == 0);

    if (pred) {
        a = 100.0f;
    }

    if (!pred) {
        b = 200.0f;
    }

    c[tid] = a + b;
}

/* Branch on the parity of tid >> 5 (tid / 32), using shift and mask instead
 * of division and modulo. */
__global__ void mathKernel4(float* c)
{
    int tid = blockDim.x * blockIdx.x + threadIdx.x;
    float a, b;
    a = b = 0.0f;

    int itid = tid >> 5;

    // parentheses are required: `itid & 0x01 == 0` parses as
    // `itid & (0x01 == 0)`, i.e. `itid & 0`, which is always false
    if ((itid & 0x01) == 0) {
        a = 100.0f;
    }
    else {
        b = 200.0f;
    }

    c[tid] = a + b;
}

/* Same body as mathKernel2; launched first so that one-time start-up cost is
 * not charged to the measured kernels. */
__global__ void warmingup(float *c)
{
    int tid = blockDim.x * blockIdx.x + threadIdx.x;
    float a, b;
    a = b = 0.0f;

    if ((tid / warpSize) % 2 == 0) {
        a = 100.0f;
    }
    else {
        b = 200.0f;
    }
    c[tid] = a + b;
}

/* Abort with a message when a CUDA runtime call did not succeed. */
static void checkCuda(cudaError_t err, const char* what)
{
    if (err != cudaSuccess) {
        fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }
}

/* Launch `kernel` once on d_C and print the wall-clock time of the launch
 * plus the synchronization that waits for it. */
static void timeKernel(const char* label, void (*kernel)(float*), dim3 grid, dim3 block, float* d_C)
{
    double start, finish;

    checkCuda(cudaDeviceSynchronize(), "cudaDeviceSynchronize");
    GET_TIME(start);
    kernel<<<grid, block>>>(d_C);
    checkCuda(cudaGetLastError(), label);
    checkCuda(cudaDeviceSynchronize(), label);
    GET_TIME(finish);
    printf("%s <<< %4d %4d >>> elapsed %f sec\n", label, grid.x, block.x, finish-start);
}

/*****************************************************************************
 * Function: main
 * Purpose:  Time the warm-up kernel and the four math kernels on the same
 *           launch configuration.
 * Args:     optional argv[1] - block size (default 64)
 *           optional argv[2] - data size (default 64)
 *****************************************************************************/
int main(int argc, char** argv)
{
    // set up device
    int dev = 0;
    cudaDeviceProp deviceProp;
    checkCuda(cudaGetDeviceProperties(&deviceProp, dev), "cudaGetDeviceProperties");
    printf("%s using Device %d: %s\n", argv[0], dev, deviceProp.name);

    // set up data size
    int size = 64;
    int blockSize = 64;
    if (argc > 1)
        blockSize = atoi(argv[1]);
    if (argc > 2)
        size = atoi(argv[2]);
    // atoi returns 0 for non-numeric text; a zero block size would divide by
    // zero below and a zero size would give an empty grid
    if (blockSize <= 0 || size <= 0) {
        fprintf(stderr, "block size and data size must be positive\n");
        return EXIT_FAILURE;
    }
    printf("Data size: %d ", size);

    // set up execution configuration
    dim3 block(blockSize, 1);
    dim3 grid((size+block.x-1) / block.x, 1);
    printf("Excution Configure (block %d grid %d)\n", block.x, grid.x);

    // allocate gpu memory for every launched thread, not just `size`
    // elements: the kernels have no bounds check, so a size that is not a
    // multiple of the block size would otherwise write past the buffer
    float *d_C;
    size_t nBytes = (size_t)grid.x * block.x * sizeof(float);
    checkCuda(cudaMalloc((void**)&d_C, nBytes), "cudaMalloc");

    // run a warmup kernel to remove overhead
    timeKernel("warmup", warmingup, grid, block, d_C);

    // run kernels 1 to 4
    timeKernel("mathKernel1", mathKernel1, grid, block, d_C);
    timeKernel("mathKernel2", mathKernel2, grid, block, d_C);
    timeKernel("mathKernel3", mathKernel3, grid, block, d_C);
    timeKernel("mathKernel4", mathKernel4, grid, block, d_C);

    // free gpu memory and reset device
    checkCuda(cudaFree(d_C), "cudaFree");
    checkCuda(cudaDeviceReset(), "cudaDeviceReset");

    return 0;
}
void Usage(char prog_name[]);
void vecAdd(const float *h_A, const float *h_B, float *h_C, int numElements);
__global__ void vecAddKernel(const float *A, const float *B, float *C, int numElements);

/*****************************************************************************
 * Function: main
 * Purpose:  Build two random host vectors and add them on the GPU.
 * Args:     argv[1] - number of elements in each vector
 *****************************************************************************/
int main(int argc, char* argv[])
{
    if (argc != 2) {
        Usage(argv[0]);
    }

    int numElements = strtol(argv[1], NULL, 10);
    // non-numeric or non-positive counts would give empty allocations and a
    // zero-block launch
    if (numElements <= 0) {
        Usage(argv[0]);
    }
    printf("[Vector addition of %d elements]\n", numElements);

    const size_t bytes = (size_t)numElements * sizeof(float);

    // Allocate the host input vectors A, B, C
    float *h_A = (float*)malloc(bytes);
    float *h_B = (float*)malloc(bytes);
    float *h_C = (float*)malloc(bytes);

    // Verify that allocations succeeded
    if (h_A == NULL || h_B == NULL || h_C == NULL) {
        fprintf(stderr, "Failed to allocate host vectors!\n");
        exit(EXIT_FAILURE);   // an allocation failure is not a success
    }

    // Initialize that host input vectors
    common_random_init_vector(h_A, numElements);
    common_random_init_vector(h_B, numElements);

    // call vecAdd function
    vecAdd(h_A, h_B, h_C, numElements);

    // Free host memory
    free(h_A);
    free(h_B);
    free(h_C);

    printf("Done\n");
    return 0;
}

/*****************************************************************************
 * Function: Usage
 * Purpose:  Print how the program should be started and terminate with a
 *           failure status.
 *****************************************************************************/
void Usage(char prog_name[])
{
    fprintf(stderr, "Usage: %s \n", prog_name);
    fprintf(stderr, "\t : the number of elements in vector\n");
    exit(EXIT_FAILURE);
}

/*****************************************************************************
 * Function: vecAdd
 * Purpose:  Compute h_C = h_A + h_B on the GPU: copy the inputs to the
 *           device, run vecAddKernel, copy the result back, verify every
 *           element on the host and print the kernel performance.
 * In args:  h_A, h_B    - host input vectors of numElements floats
 *           numElements - vector length
 * Out arg:  h_C         - host result vector of numElements floats
 * Note:     terminates the process if verification fails.
 *****************************************************************************/
void vecAdd(const float *h_A, const float *h_B, float *h_C, int numElements)
{
    const size_t bytes = (size_t)numElements * sizeof(float);

    // Allocate the device input vectors A, B, C
    float *d_A, *d_B, *d_C;
    CUDA_CHECK(cudaMalloc((void**)&d_A, bytes));
    CUDA_CHECK(cudaMalloc((void**)&d_B, bytes));
    CUDA_CHECK(cudaMalloc((void**)&d_C, bytes));

    // Copy the host input vector A and B in host memory
    // to the device input vectors in device memory
    printf("Copy input data from the host memory to the CUDA device\n");
    CUDA_CHECK(cudaMemcpy(d_A, h_A, bytes, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_B, h_B, bytes, cudaMemcpyHostToDevice));

    // Allocate CUDA events for estimating
    cudaEvent_t start, stop;
    CUDA_CHECK(cudaEventCreate(&start));
    CUDA_CHECK(cudaEventCreate(&stop));

    // Launch the Vector Add CUDA Kernel
    int threadsPerBlock = 256;
    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
    printf("CUDA kernel launch with %d blocks of %d threads\n", blocksPerGrid, threadsPerBlock);

    CUDA_CHECK(cudaDeviceSynchronize());
    CUDA_CHECK(cudaEventRecord(start));
    vecAddKernel<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, numElements);
    CUDA_CHECK(cudaGetLastError());          // launch-configuration errors

    // record stop right behind the kernel and wait on the event, so the
    // interval covers the kernel and the event is complete before it is read
    CUDA_CHECK(cudaEventRecord(stop));
    CUDA_CHECK(cudaEventSynchronize(stop));

    // Copy the device result vector in device memory
    // to the host result vector in host memory
    printf("Copy output data from the CUDA device to the host memory\n");
    CUDA_CHECK(cudaMemcpy(h_C, d_C, bytes, cudaMemcpyDeviceToHost));

    // Verify that the result vector is correct (every element is checked)
    printf("Verifying vector addition...\n");
    for (int idx = 0; idx < numElements; idx++) {
        if (fabs(h_A[idx] + h_B[idx] - h_C[idx]) > 1e-5) {
            fprintf(stderr, "Result verification failed at element %d\n", idx);
            exit(EXIT_FAILURE);
        }
    }
    printf(".....\n");
    printf("Test PASSED\n");

    // Compute and Print the performance (one addition per element)
    float msecTotal = 0.0f;
    CUDA_CHECK(cudaEventElapsedTime(&msecTotal, start, stop));
    double flopsPerVecAdd = static_cast<double>(numElements);
    double gigaFlops = (flopsPerVecAdd * 1.0e-9f) / (msecTotal / 1000.0f);
    printf("Performance= %.2f GFlop/s, Time= %.3f msec, Size = %.0f Ops, "
           "WorkgroupSize= %u threads/block\n",
           gigaFlops, msecTotal, flopsPerVecAdd, (unsigned int)threadsPerBlock);

    // Free device global memory
    CUDA_CHECK(cudaFree(d_A));
    CUDA_CHECK(cudaFree(d_B));
    CUDA_CHECK(cudaFree(d_C));
    CUDA_CHECK(cudaEventDestroy(start));
    CUDA_CHECK(cudaEventDestroy(stop));
}

/*****************************************************************************
 * Kernel:   vecAddKernel
 * Purpose:  C[i] = A[i] + B[i], one element per thread on a 1D grid.
 *           Threads whose index is not below numElements do nothing.
 *****************************************************************************/
__global__
void vecAddKernel(const float *A, const float *B, float *C, int numElements)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;

    if (i < numElements)
        C[i] = A[i] + B[i];
}
/* thread function */
void Hello(void);

/*****************************************************************************
 * Function: main
 * Purpose:  Start a parallel region with the runtime's default team and let
 *           every thread print a greeting.
 * Note:     Command-line arguments are ignored. The previous code parsed
 *           argv[1] without checking argc, which crashed when the program
 *           was started without arguments, and never used the parsed value
 *           because the num_threads clause was commented out.
 *****************************************************************************/
int main(int argc, char* argv[])
{
    (void)argc;
    (void)argv;

#   pragma omp parallel
    Hello();

    return 0;
}

/*****************************************************************************
 * Function: Hello
 * Purpose:  Thread function that prints message
 *****************************************************************************/
void Hello(void)
{
    int my_rank = omp_get_thread_num();
    int thread_count = omp_get_num_threads();

    printf("Hello from thread %d of %d\n", my_rank, thread_count);
}
/*****************************************************************************
 * Function: Usage
 * Purpose:  Print a message indicating how program should be started
 *           and terminate with a failure status, so that scripts can tell
 *           a rejected command line from a successful run.
 *****************************************************************************/
void Usage(char* prog_name)
{
    fprintf(stderr, "Usage: %s \n", prog_name);
    fprintf(stderr, " thread_count should be positive\n");
    exit(EXIT_FAILURE);
}

/*****************************************************************************
 * Function: Hello
 * Purpose:  Thread function that prints message
 * In arg:   thread_count - number of threads that was requested; thread 0
 *           warns on stderr when the team actually started has another size.
 * Note:     Without OpenMP support the function reports rank 0 of 1.
 *****************************************************************************/
void Hello(int thread_count)
{
#ifdef _OPENMP
    int my_rank = omp_get_thread_num();
    int actual_thread_count = omp_get_num_threads();
#else
    int my_rank = 0;
    int actual_thread_count = 1;
#endif

    if (my_rank == 0 && thread_count != actual_thread_count)
        fprintf(stderr, "Number of threads started != %d\n", thread_count);
    printf("Hello from thread %d of %d\n", my_rank, actual_thread_count);
}
void Usage(char* prog_name);
double f(double x); /* function we're integrating */
void Trap(double a, double b, int n, double* global_result_p);

/*****************************************************************************
 * Function: main
 * Purpose:  Read a, b and n from stdin and estimate the integral of f over
 *           [a, b] with n trapezoids; every thread adds its share to
 *           global_result inside Trap.
 * Args:     argv[1] - number of threads; must be positive and divide n
 *****************************************************************************/
int main(int argc, char* argv[])
{
    if (argc != 2)
        Usage(argv[0]);

    int thread_count = strtol(argv[1], NULL, 10);
    /* a zero count would divide by zero in the check below */
    if (thread_count <= 0)
        Usage(argv[0]);

    double a, b;
    int n;
    printf("Enter a, b, and n\n");
    /* a short read would leave a, b or n uninitialized */
    if (scanf("%lf %lf %d", &a, &b, &n) != 3) {
        fprintf(stderr, "Failed to read a, b, and n\n");
        exit(EXIT_FAILURE);
    }
    /* n must be positive: the step width is (b-a)/n */
    if (n <= 0 || n % thread_count != 0)
        Usage(argv[0]);

    double global_result = 0.0;
#   pragma omp parallel num_threads(thread_count)
    Trap(a, b, n, &global_result);

    printf("With n = %d trapezoids, our estimate\n", n);
    printf("of the integral from %f to %f = %f\n", a, b, global_result);

    return 0;
}
/*****************************************************************************
 * Function: Usage
 * Purpose:  Print a message indicating how program should be started
 *           and terminate with a failure status.
 *****************************************************************************/
void Usage(char* prog_name)
{
    fprintf(stderr, "Usage: %s \n", prog_name);
    fprintf(stderr, " number of trapezoids must be evenly divisible by number of threads\n");
    exit(EXIT_FAILURE);
}

/*****************************************************************************
 * Function:   f
 * Purpose:    Compute value of function to be integrated
 * Input arg:  x
 * Return val: f(x)
 *****************************************************************************/
double f(double x)
{
    return x*x;
}

/*****************************************************************************
 * Function:   Trap
 * Purpose:    Use trapezoidal rule to estimate the calling thread's share of
 *             a definite integral and add it to a shared total.
 * Input arg:
 *    a: left endpoint
 *    b: right endpoint
 *    n: total number of trapezoids (split evenly over the threads)
 * In/out arg:
 *    p_global_result: shared accumulator; this thread's partial estimate is
 *                     added to it inside a critical section
 * Note:       Without OpenMP support the caller is treated as thread 0 of 1
 *             and therefore integrates the whole interval.
 *****************************************************************************/
void Trap(double a, double b, int n, double* p_global_result)
{
#ifdef _OPENMP
    int my_rank = omp_get_thread_num();
    int thread_count = omp_get_num_threads();
#else
    int my_rank = 0;
    int thread_count = 1;
#endif

    double h = (b-a)/n;                        /* width of one trapezoid */
    int local_n = n/thread_count;              /* trapezoids per thread */
    double local_a = a + my_rank*local_n*h;    /* this thread's sub-interval */
    double local_b = local_a + local_n*h;

    double my_result = (f(local_a) + f(local_b))/2.0;
    for (int i = 1; i < local_n; i++)
        my_result += f(local_a + i*h);
    my_result = my_result*h;

    /* one thread at a time may update the shared total */
#   pragma omp critical
    *p_global_result += my_result;
}
void Usage(char* prog_name);
double f(double x); /* function we're integrating */
double Local_trap(double a, double b, int n);

/*****************************************************************************
 * Function: main
 * Purpose:  Read a, b and n from stdin and estimate the integral of f over
 *           [a, b] with n trapezoids. Each thread computes its share with
 *           Local_trap and adds it to global_result in a critical section
 *           (hand-coded reduction). The parallel part is timed.
 * Args:     argv[1] - number of threads; must be positive and divide n
 *****************************************************************************/
int main(int argc, char* argv[])
{
    if (argc != 2)
        Usage(argv[0]);

    int thread_count = strtol(argv[1], NULL, 10);
    /* a zero count would divide by zero in the check below */
    if (thread_count <= 0)
        Usage(argv[0]);

    double a, b;
    int n;
    printf("Enter a, b, and n\n");
    /* a short read would leave a, b or n uninitialized */
    if (scanf("%lf %lf %d", &a, &b, &n) != 3) {
        fprintf(stderr, "Failed to read a, b, and n\n");
        exit(EXIT_FAILURE);
    }
    /* n must be positive: the step width is (b-a)/n */
    if (n <= 0 || n % thread_count != 0)
        Usage(argv[0]);

    double start, finish;
    start = omp_get_wtime();
    double global_result = 0.0;
#   pragma omp parallel num_threads(thread_count)
    {
        double my_result = 0.0;
        my_result += Local_trap(a, b, n);
#       pragma omp critical
        global_result += my_result;
    }
    finish = omp_get_wtime();

    printf("With n = %d trapezoids, our estimate\n", n);
    printf("of the integral from %f to %f = %f\n", a, b, global_result);
    printf("Elapsed time = %f seconds\n", finish-start);

    return 0;
}
/*****************************************************************************
 * Function:   Local_trap
 * Purpose:    Use trapezoidal rule to estimate the calling thread's part of
 *             a definite integral
 * Input arg:
 *    a: left endpoint of the whole interval
 *    b: right endpoint of the whole interval
 *    n: total number of trapezoids
 * Return:     estimate of integral from local_a to local_b
 * Note:       The partition is based on the actual team size reported by
 *             omp_get_num_threads(). When that size does not divide n, the
 *             first (n % team size) threads take one extra trapezoid so that
 *             no trapezoid is dropped. When it divides n the partition is
 *             the same as an even split.
 *****************************************************************************/
double Local_trap(double a, double b, int n)
{
    int my_rank = omp_get_thread_num();
    int thread_count = omp_get_num_threads();

    double h = (b-a)/n;
    int base = n/thread_count;           /* trapezoids every thread gets   */
    int extra = n % thread_count;        /* left over, one each to low ranks */
    int local_n = base + (my_rank < extra ? 1 : 0);
    /* index of this thread's first trapezoid */
    int first = my_rank*base + (my_rank < extra ? my_rank : extra);

    /* a thread without trapezoids contributes no area */
    if (local_n < 1)
        return 0.0;

    double local_a = a + first*h;
    double local_b = local_a + local_n*h;

    double my_result = (f(local_a) + f(local_b))/2.0;
    for (int i = 1; i < local_n; i++)
        my_result += f(local_a + i*h);
    my_result = my_result*h;

    return my_result;
}
/*****************************************************************************
 * Function:   main
 * Purpose:    Read the thread count from the command line and a, b, n from
 *             stdin, then estimate the integral with an OpenMP reduction
 *             clause and print the estimate.
 * Return val: 0 on success, 1 if the values read from stdin are unusable
 *             (Usage terminates the program for bad command line input)
 *****************************************************************************/
int main(int argc, char* argv[])
{
    if (argc != 2)
        Usage(argv[0]);

    int thread_count = strtol(argv[1], NULL, 10);
    /* strtol yields 0 for non-numeric text; a zero divisor below would trap */
    if (thread_count < 1)
        Usage(argv[0]);

    double a, b;
    int n;
    printf("Enter a, b, and n\n");
    /* without this check a, b and n could be used uninitialized */
    if (scanf("%lf %lf %d", &a, &b, &n) != 3 || n < 1) {
        fprintf(stderr, "Expected two endpoints and a positive number of trapezoids\n");
        return 1;
    }
    if (n % thread_count != 0)
        Usage(argv[0]);

    double global_result = 0.0;
    /* every thread adds into a private copy; copies are summed at the end */
    #pragma omp parallel num_threads(thread_count) \
        reduction(+: global_result)
    global_result += Local_trap(a, b, n);

    printf("With n = %d trapezoids, our estimate\n", n);
    printf("of the integral from %f to %f = %f\n", a, b, global_result);

    return 0;
}
/*****************************************************************************
 * Function:   Usage
 * Purpose:    Print a message indicating how program should be started
 *             and terminate.
 * In arg:     prog_name: name the program was invoked with
 * Note:       Does not return. Exits with EXIT_FAILURE because it is only
 *             reached on invalid input, so shells and scripts must not see
 *             a success status.
 *****************************************************************************/
void Usage(char* prog_name)
{
    /* NOTE(review): the argument placeholder after %s appears to be missing
     * from the usage text - confirm against the intended wording */
    fprintf(stderr, "Usage: %s \n", prog_name);
    fprintf(stderr, " number of trapezoids must be evenly divisible by number of threads\n");
    exit(EXIT_FAILURE);
}
/*****************************************************************************
 * Function:   main
 * Purpose:    Read the thread count from the command line and a, b, n from
 *             stdin, call Trap (parallel for version) and print the estimate.
 * Return val: 0 on success, 1 if the values read from stdin are unusable
 *             (Usage terminates the program for bad command line input)
 *****************************************************************************/
int main(int argc, char* argv[])
{
    if (argc != 2)
        Usage(argv[0]);

    int thread_count = strtol(argv[1], NULL, 10);
    /* strtol yields 0 for non-numeric text; a team needs at least 1 thread */
    if (thread_count < 1)
        Usage(argv[0]);

    double a, b;
    int n;
    printf("Enter a, b, and n\n");
    /* n must be positive: Trap divides the interval width by n */
    if (scanf("%lf %lf %d", &a, &b, &n) != 3 || n < 1) {
        fprintf(stderr, "Expected two endpoints and a positive number of trapezoids\n");
        return 1;
    }

    double global_result = 0.0;
    global_result = Trap(a, b, n, thread_count);

    printf("With n = %d trapezoids, our estimate\n", n);
    printf("of the integral from %f to %f = %f\n", a, b, global_result);

    return 0;
}
/*****************************************************************************
 * Function:   f
 * Purpose:    Integrand evaluated by Trap
 * Input arg:  x
 * Return val: x squared
 *****************************************************************************/
double f(double x)
{
    const double value = x*x;
    return value;
}

/*****************************************************************************
 * Function:   Trap
 * Purpose:    Estimate the integral of f over [a, b] with the trapezoidal
 *             rule; the interior points are summed by a parallel for loop.
 * Input arg:
 *    a: left endpoint
 *    b: right endpoint
 *    n: number of trapezoids
 *    thread_count: number of threads requested for the loop
 * Return:     estimate of integral from a to b of f(x)
 *****************************************************************************/
double Trap(double a, double b, int n, int thread_count)
{
    const double width = (b-a)/n;

    /* the two end points carry half weight */
    double total = (f(a) + f(b))/2.0;

    /* interior points: per-thread partial sums are combined by the reduction */
    #pragma omp parallel for num_threads(thread_count) \
        reduction(+: total)
    for (int k = 1; k < n; k++)
        total += f(a + k*width);

    return total*width;
}
/*****************************************************************************
 * Function:   main
 * Purpose:    Fill an array with the first n Fibonacci numbers using a
 *             parallel for loop and print it.
 * Note:       The parallel loop is deliberately left with its loop-carried
 *             dependence (fibo[i] needs fibo[i-1] and fibo[i-2]); showing
 *             the resulting wrong output is the point of this program.
 * Return val: 0 on success, 1 if the array cannot be allocated
 *****************************************************************************/
int main(int argc, char* argv[])
{
    int thread_count, n;
    long long* fibo;

    if (argc != 3)
        Usage(argv[0]);

    thread_count = strtol(argv[1], NULL, 10);
    n = strtol(argv[2], NULL, 10);
    /* reject zero/negative/non-numeric values before sizing the array */
    if (thread_count < 1 || n < 1)
        Usage(argv[0]);

    fibo = (long long*)malloc(n * sizeof(long long));
    if (fibo == NULL) {
        fprintf(stderr, "Can't allocate memory for %d numbers\n", n);
        return 1;
    }

    /* seed values; fibo[1] only exists when n > 1 */
    fibo[0] = 1;
    if (n > 1)
        fibo[1] = 1;

    /* intentionally unsafe: iterations depend on earlier iterations */
    #pragma omp parallel for num_threads(thread_count)
    for (int i = 2; i < n; i++)
        fibo[i] = fibo[i-1] + fibo[i-2];

    printf("The first n Fibonacci numbers:\n");
    for (int i = 0; i < n; i++)
        printf("%d\t%lld\n", i, fibo[i]);

    free(fibo);

    return 0;
}
/*****************************************************************************
 * Function:   main
 * Purpose:    Estimate pi with n terms of the alternating series
 *             4*(1 - 1/3 + 1/5 - ...) using a parallel for loop with a
 *             reduction, and print it next to 4*atan(1).
 * Return val: 0 (Usage terminates the program for bad command line input)
 *****************************************************************************/
int main(int argc, char* argv[])
{
    int thread_count;
    long long n;

    if (argc != 3)
        Usage(argv[0]);
    thread_count = strtol(argv[1], NULL, 10);
    n = strtoll(argv[2], NULL, 10);
    if (thread_count < 1 || n < 1)
        Usage(argv[0]);

    double factor, sum = 0.0;
    /* factor is recomputed from i in every iteration, so it can be private.
     * The index has the same type as n: an int index would overflow (and
     * 2*i + 1 with it) once n exceeds INT_MAX. */
    #pragma omp parallel for num_threads(thread_count) \
        reduction(+: sum) private(factor)
    for (long long i = 0; i < n; i++) {
        factor = (i % 2 == 0) ? 1.0 : -1.0;
        sum += factor/(2*i + 1);
#ifdef DEBUG
        printf("Thread %d > i = %lld, my_sum = %f\n", omp_get_thread_num(), i, sum);
#endif
    }

    sum = 4.0*sum;
    printf("With n = %lld terms and %d threads,\n", n, thread_count);
    printf(" Our estimate of pi = %.14f\n", sum);
    printf(" pi = %.14f\n", 4.0*atan(1.0));

    return 0;
}
/*****************************************************************************
 * Function:   main
 * Purpose:    Show what a private variable holds at the start of a parallel
 *             block, after assignment inside the block, and after the block.
 * Note:       x is printed inside the block before it is assigned on
 *             purpose: the private copy is not initialized from the outer x,
 *             and demonstrating that is the point of the program.
 * Return val: 0 on success, 1 if the thread count argument is missing or
 *             not a positive number
 *****************************************************************************/
int main(int argc, char* argv[])
{
    /* argv[1] was read unconditionally before; guard against a missing arg */
    if (argc != 2) {
        fprintf(stderr, "Usage: %s <number of threads>\n", argv[0]);
        return 1;
    }

    int x = 5;
    int thread_count = strtol(argv[1], NULL, 10);
    if (thread_count < 1) {
        fprintf(stderr, "number of threads must be >= 1\n");
        return 1;
    }

    #pragma omp parallel num_threads(thread_count) \
        private(x)
    {
        int my_rank = omp_get_thread_num();
        printf("Thread %d > before initialization, x = %d\n", my_rank, x);
        x = 2*my_rank + 2;
        printf("Thread %d > after initialization, x = %d\n", my_rank, x);
    }
    printf("After parallel block, x = %d\n", x);

    return 0;
}
5 | *****************************************************************************/ 6 | #ifndef _QUEUE_H_ 7 | #define _QUEUE_H_ 8 | #ifdef USE_OMP_LOCK 9 | #include 10 | #endif 11 | 12 | typedef struct queue_node_s { 13 | int src; 14 | int msg; 15 | struct queue_node_s* next_p; 16 | } QNode; 17 | 18 | typedef struct queue_s { 19 | #ifdef USE_OMP_LOCK 20 | omp_lock_t lock; 21 | #endif 22 | int enqueued; 23 | int dequeued; 24 | QNode* front_p; 25 | QNode* tail_p; 26 | } Queue; 27 | 28 | Queue* Allocate_queue(void); 29 | void Free_queue(Queue* q); 30 | void Print_queue(Queue* q); 31 | void Enqueue(Queue* q, int src, int msg); 32 | int Dequeue(Queue* q, int* src, int* msg); 33 | int Search(Queue* q, int msg, int* src); 34 | 35 | #endif -------------------------------------------------------------------------------- /cblas_mat_mul.c: -------------------------------------------------------------------------------- 1 | /***************************************************************************** 2 | * File: cblas_mat_mul.c 3 | * Purpose: Compute a matrix-matrix product by using OpenBLAS library. 
4 | * Compile: g++ -Wall -o cblas_mat_mul cblas_mat_mul.c -lopenblas 5 | * Run: ./cblas_mat_mul 6 | * : the rows of matrix A 7 | * : the columns of matrix A and the rows of matrix B 8 | * : the columns of matrix B 9 | * 10 | * Input: A, B 11 | * Output: 12 | * C: the product matrix, C = AB 13 | * Elapsed time each multiplication and average elapsed time of 14 | * 100 multiplications 15 | *****************************************************************************/ 16 | 17 | #include 18 | #include 19 | #include 20 | #include 21 | 22 | #define GET_TIME(now) { \ 23 | struct timeval t; \ 24 | gettimeofday(&t, NULL); \ 25 | now = t.tv_sec + t.tv_usec/1000000.0; \ 26 | } 27 | 28 | const int RMAX = 1000000; 29 | #ifdef DEBUG 30 | const int NCOUNT = 1; // number of multiplication 31 | #else 32 | const int NCOUNT = 100; // number of multiplication 33 | #endif 34 | 35 | void Get_args(int argc, char* argv[], int* m, int* n, int* k); 36 | void Usage(char* prog_name); 37 | void Generate_matrix(double mat[], int m, int n); 38 | void Print_matrix(double mat[], int m, int n, char* title); 39 | 40 | int main(int argc, char* argv[]) 41 | { 42 | int m, n, k; 43 | Get_args(argc, argv, &m, &n, &k); 44 | 45 | double *A, *B, *C; 46 | A = (double*)malloc(m * n * sizeof(double)); 47 | B = (double*)malloc(n * k * sizeof(double)); 48 | C = (double*)malloc(m * k * sizeof(double)); 49 | 50 | Generate_matrix(A, m, n); 51 | Generate_matrix(B, n, k); 52 | #ifdef DEBUG 53 | Print_matrix(A, m, n, "A"); 54 | Print_matrix(B, n, k, "B"); 55 | #endif 56 | 57 | double start, finish, avg_elapsed = 0.0; 58 | for (int count = 0; count < NCOUNT; count++) { 59 | GET_TIME(start); 60 | cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, m, k, n, 1.0, A, n, B, k, 0, C, k); 61 | GET_TIME(finish); 62 | 63 | printf("[%3d] Elapsed time = %.6f seconds\n", count+1, finish-start); 64 | avg_elapsed += (finish - start) / NCOUNT; 65 | } 66 | 67 | #ifdef DEBUG 68 | Print_matrix(C, m, k, "The product is"); 69 | #endif 
70 | 71 | printf("Average elapsed time = %.6f seconds\n", avg_elapsed); 72 | 73 | free(A); 74 | free(B); 75 | free(C); 76 | 77 | return 0; 78 | } 79 | 80 | /***************************************************************************** 81 | * Function: Get_args 82 | * Purpose: Get and check command list arguments 83 | * In args: argc, argv 84 | * Out args: m, n, k 85 | *****************************************************************************/ 86 | void Get_args(int argc, char* argv[], int* m, int* n, int* k) 87 | { 88 | if (argc != 4) 89 | Usage(argv[0]); 90 | 91 | *m = strtol(argv[1], NULL, 10); 92 | *n = strtol(argv[2], NULL, 10); 93 | *k = strtol(argv[3], NULL, 10); 94 | if (*m <= 0 || *n <= 0 || *k <= 0) 95 | Usage(argv[0]); 96 | } 97 | 98 | /***************************************************************************** 99 | * Function: Usage 100 | * Purpose: Print a message indicating how program should be started 101 | * and terminate. 102 | * In arg: prog_name 103 | *****************************************************************************/ 104 | void Usage(char* prog_name) 105 | { 106 | fprintf(stderr, "Usage: %s \n", prog_name); 107 | exit(0); 108 | } 109 | 110 | /***************************************************************************** 111 | * Function: Generate_matrix 112 | * Purpose: Generate matrix entries by using the random number generator 113 | * In args: m, n 114 | * Out arg: mat 115 | *****************************************************************************/ 116 | void Generate_matrix(double mat[], int m, int n) 117 | { 118 | for (int i = 0; i < m; i++) 119 | for (int j = 0; j < n; j++) 120 | mat[i*n + j] = (rand() % RMAX) / (RMAX / 10.0); 121 | } 122 | 123 | /***************************************************************************** 124 | * Function: Print_matrix 125 | * Purpose: Print the matrix 126 | * In args: mat, m, n, title 127 | *****************************************************************************/ 128 | void 
Print_matrix(double mat[], int m, int n, char* title) 129 | { 130 | printf("%s\n", title); 131 | for (int i = 0; i < m; i++) { 132 | for (int j = 0; j < n; j++) 133 | printf("%f ", mat[i*n + j]); 134 | printf("\n"); 135 | } 136 | } -------------------------------------------------------------------------------- /mkl_mat_mul.c: -------------------------------------------------------------------------------- 1 | /***************************************************************************** 2 | * File: mkl_mat_mul.c 3 | * Purpose: Compute a matrix-matrix product by using Intel MKL library. 4 | * Compile: gcc -Wall -o mkl_mat_mul mkl_mat_mul.c $(pkg-config mkl-static-lp64-iomp --libs --cflags) 5 | * Run: ./mkl_mat_mul 6 | * : the rows of matrix A 7 | * : the columns of matrix A and the rows of matrix B 8 | * : the columns of matrix B 9 | * 10 | * Input: A, B 11 | * Output: 12 | * C: the product matrix, C = AB 13 | * Elapsed time each multiplication and average elapsed time of 14 | * 100 multiplications 15 | *****************************************************************************/ 16 | 17 | #include 18 | #include 19 | #include 20 | #include 21 | 22 | #define GET_TIME(now) { \ 23 | struct timeval t; \ 24 | gettimeofday(&t, NULL); \ 25 | now = t.tv_sec + t.tv_usec/1000000.0; \ 26 | } 27 | 28 | const int RMAX = 1000000; 29 | #ifdef DEBUG 30 | const int NCOUNT = 1; // number of multiplication 31 | #else 32 | const int NCOUNT = 100; // number of multiplication 33 | #endif 34 | 35 | void Get_args(int argc, char* argv[], int* m, int* n, int* k); 36 | void Usage(char* prog_name); 37 | void Generate_matrix(double mat[], int m, int n); 38 | void Print_matrix(double mat[], int m, int n, char* title); 39 | 40 | int main(int argc, char* argv[]) 41 | { 42 | int m, n, k; 43 | Get_args(argc, argv, &m, &n, &k); 44 | 45 | double *A, *B, *C; 46 | A = (double*)mkl_malloc(m * n * sizeof(double), 64); 47 | B = (double*)mkl_malloc(n * k * sizeof(double), 64); 48 | C = 
(double*)mkl_malloc(m * k * sizeof(double), 64); 49 | 50 | Generate_matrix(A, m, n); 51 | Generate_matrix(B, n, k); 52 | #ifdef DEBUG 53 | Print_matrix(A, m, n, "A"); 54 | Print_matrix(B, n, k, "B"); 55 | #endif 56 | 57 | 58 | double start, finish, avg_elapsed = 0.0; 59 | for (int count = 0; count < NCOUNT; count++) { 60 | GET_TIME(start); 61 | cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, m, k, n, 1.0, A, n, B, k, 0, C, k); 62 | GET_TIME(finish); 63 | 64 | printf("[%3d] Elapsed time = %.6f seconds\n", count+1, finish-start); 65 | avg_elapsed += (finish - start) / NCOUNT; 66 | } 67 | 68 | #ifdef DEBUG 69 | Print_matrix(C, m, k, "The product is"); 70 | #endif 71 | 72 | printf("Average elapsed time = %.6f seconds\n", avg_elapsed); 73 | 74 | mkl_free(A); 75 | mkl_free(B); 76 | mkl_free(C); 77 | 78 | return 0; 79 | } 80 | 81 | /***************************************************************************** 82 | * Function: Get_args 83 | * Purpose: Get and check command list arguments 84 | * In args: argc, argv 85 | * Out args: m, n, k 86 | *****************************************************************************/ 87 | void Get_args(int argc, char* argv[], int* m, int* n, int* k) 88 | { 89 | if (argc != 4) 90 | Usage(argv[0]); 91 | 92 | *m = strtol(argv[1], NULL, 10); 93 | *n = strtol(argv[2], NULL, 10); 94 | *k = strtol(argv[3], NULL, 10); 95 | if (*m <= 0 || *n <= 0 || *k <= 0) 96 | Usage(argv[0]); 97 | } 98 | 99 | /***************************************************************************** 100 | * Function: Usage 101 | * Purpose: Print a message indicating how program should be started 102 | * and terminate. 
103 | * In arg: prog_name 104 | *****************************************************************************/ 105 | void Usage(char* prog_name) 106 | { 107 | fprintf(stderr, "Usage: %s \n", prog_name); 108 | exit(0); 109 | } 110 | 111 | /***************************************************************************** 112 | * Function: Generate_matrix 113 | * Purpose: Generate matrix entries by using the random number generator 114 | * In args: m, n 115 | * Out arg: mat 116 | *****************************************************************************/ 117 | void Generate_matrix(double mat[], int m, int n) 118 | { 119 | for (int i = 0; i < m; i++) 120 | for (int j = 0; j < n; j++) 121 | mat[i*n + j] = (rand() % RMAX) / (RMAX / 10.0); 122 | } 123 | 124 | /***************************************************************************** 125 | * Function: Print_matrix 126 | * Purpose: Print the matrix 127 | * In args: mat, m, n, title 128 | *****************************************************************************/ 129 | void Print_matrix(double mat[], int m, int n, char* title) 130 | { 131 | printf("%s\n", title); 132 | for (int i = 0; i < m; i++) { 133 | for (int j = 0; j < n; j++) 134 | printf("%f ", mat[i*n + j]); 135 | printf("\n"); 136 | } 137 | } -------------------------------------------------------------------------------- /mpi/00_mpi_hello.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | const int MAX_STRING = 100; 6 | 7 | int main(void) 8 | { 9 | char greeting[MAX_STRING]; 10 | int comm_sz; 11 | int my_rank; 12 | 13 | MPI_Init(NULL, NULL); 14 | MPI_Comm_size(MPI_COMM_WORLD, &comm_sz); 15 | MPI_Comm_rank(MPI_COMM_WORLD, &my_rank); 16 | 17 | if (my_rank != 0) { 18 | sprintf(greeting, "Greetings from process %d of %d!", my_rank, comm_sz); 19 | MPI_Send(greeting, strlen(greeting) + 1, MPI_CHAR, 0, 0, MPI_COMM_WORLD); 20 | } 21 | else { 22 | printf("Greetings from process %d of 
/*
 * Function:   f
 * Purpose:    Function being integrated
 * Input arg:  x
 * Return val: x squared
 */
double f(double x)
{
    return x*x;
}

/*
 * Function:   Trap
 * Purpose:    Estimate the integral of f over [a, b] with the trapezoidal
 *             rule
 * Input args:
 *    a: left end-point
 *    b: right end-point
 *    n: number of subintervals
 *    h: width of one subinterval, (b-a)/n
 * Return val: h * ( (f(a) + f(b))/2 + f(a+h) + ... + f(a+(n-1)h) )
 */
double Trap(double a, double b, int n, double h)
{
    double integral;

    /* the two end points carry half weight */
    integral = (f(a) + f(b)) / 2.0;

    /* interior points only: starting at k = 0 would count f(a) again */
    for (int k = 1; k < n; k++) {
        integral += f(a + k*h);
    }
    integral = integral * h;

    return integral;
}
/*
 * Function:   f
 * Purpose:    Function being integrated
 * Input arg:  x
 * Return val: x squared
 */
double f(double x)
{
    return x*x;
}

/*
 * Function:   Trap
 * Purpose:    Estimate the integral of f over [a, b] with the trapezoidal
 *             rule; each process calls it on its own subinterval
 * Input args:
 *    a: left end-point
 *    b: right end-point
 *    n: number of trapezoids in [a, b]
 *    h: width of one trapezoid
 * Return val: h * ( (f(a) + f(b))/2 + f(a+h) + ... + f(a+(n-1)h) )
 */
double Trap(double a, double b, int n, double h)
{
    double integral;

    /* the two end points carry half weight */
    integral = (f(a) + f(b)) / 2.0;

    /* interior points only: starting at k = 0 would count f(a) again,
     * once per process */
    for (int k = 1; k < n; k++) {
        integral += f(a + k*h);
    }
    integral = integral * h;

    return integral;
}
/*
 * Function:   f
 * Purpose:    Function being integrated
 * Input arg:  x
 * Return val: x squared
 */
double f(double x)
{
    return x*x;
}

/*
 * Function:   Trap
 * Purpose:    Estimate the integral of f over [a, b] with the trapezoidal
 *             rule; each process calls it on its own subinterval
 * Input args:
 *    a: left end-point
 *    b: right end-point
 *    n: number of trapezoids in [a, b]
 *    h: width of one trapezoid
 * Return val: h * ( (f(a) + f(b))/2 + f(a+h) + ... + f(a+(n-1)h) )
 */
double Trap(double a, double b, int n, double h)
{
    double integral;

    /* the two end points carry half weight */
    integral = (f(a) + f(b)) / 2.0;

    /* interior points only: starting at k = 0 would count f(a) again,
     * once per process */
    for (int k = 1; k < n; k++) {
        integral += f(a + k*h);
    }
    integral = integral * h;

    return integral;
}
MPI_Recv(p_n, 1, MPI_INT, 0, 0, MPI_COMM_WORLD, 94 | MPI_STATUS_IGNORE); 95 | } 96 | } -------------------------------------------------------------------------------- /mpi/05_mpi_trap3.c: -------------------------------------------------------------------------------- 1 | /* 2 | * File: 05_mpi_trap3.c 3 | * Purpose: Use MPI to implement a parallel version of the trapezoidal rule. 4 | * This version uses collective communications to distribute 5 | * the input data and compute the global sum. 6 | * Compile: mpicc -Wall -o 05_mpi_trap3 05_mpi_trap3.c 7 | * Run: mpiexec -n ./05_mpi_trap3 8 | * 9 | * Algorithm: 10 | * 1. Each process calculates "its" interval of 11 | * integration. 12 | * 2. Each process estimates the integral of f(x) 13 | * over its interval using the trapezoidal rule. 14 | * 3a. Each process != 0 sends its integral to 0. 15 | * 3b. Process 0 sums the calculations received from 16 | * the individual processes and prints the result. 17 | */ 18 | #include 19 | #include 20 | #include 21 | 22 | double Trap(double a, double b, int n, double h); 23 | double f(double x); 24 | void Get_input(int my_rank, int comm_sz, double* p_a, double* p_b, int* p_n); 25 | 26 | int main(void) 27 | { 28 | int my_rank, comm_sz, n, local_n; 29 | double a, b, h, local_a, local_b, local_int, total_int; 30 | 31 | MPI_Init(NULL, NULL); 32 | MPI_Comm_size(MPI_COMM_WORLD, &comm_sz); 33 | MPI_Comm_rank(MPI_COMM_WORLD, &my_rank); 34 | 35 | Get_input(my_rank, comm_sz, &a, &b, &n); 36 | 37 | h = (b-a)/n; 38 | local_n = n/comm_sz; 39 | 40 | local_a = a + my_rank*local_n*h; 41 | local_b = local_a + local_n*h; 42 | local_int = Trap(local_a, local_b, local_n, h); 43 | 44 | MPI_Reduce(&local_int, &total_int, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD); 45 | 46 | if (my_rank == 0) { 47 | printf("With n = %d trapezoids, our estimate\n", n); 48 | printf("of the integral from %f to %f = %.15f\n", a, b, total_int); 49 | } 50 | 51 | MPI_Finalize(); 52 | 53 | return 0; 54 | } 55 | 56 | double 
Trap(double a, double b, int n, double h) 57 | { 58 | double integral; 59 | 60 | integral = (f(a) + f(b)) / 2.0; 61 | 62 | for(int k = 0; k < n; k++) { 63 | integral += f(a + k*h); 64 | } 65 | integral = integral * h; 66 | 67 | return integral; 68 | } 69 | 70 | double f(double x) 71 | { 72 | return x*x; 73 | } 74 | 75 | void Get_input(int my_rank, int comm_sz, double* p_a, double* p_b, int* p_n) 76 | { 77 | if (my_rank == 0) { 78 | printf("Enter a, b, and n\n"); 79 | scanf("%lf %lf %d", p_a, p_b, p_n); 80 | } 81 | 82 | MPI_Bcast(p_a, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD); 83 | MPI_Bcast(p_b, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD); 84 | MPI_Bcast(p_n, 1, MPI_INT, 0, MPI_COMM_WORLD); 85 | } -------------------------------------------------------------------------------- /mpi/06_mpi_trap4.c: -------------------------------------------------------------------------------- 1 | /* 2 | * File: 06_mpi_trap4.c 3 | * Purpose: Use MPI to implement a parallel version of the trapezoidal rule. 4 | * This version uses collective communications and 5 | * MPI derived datatypes to distribute the input data and 6 | * compute the global sum. 7 | * Compile: mpicc -Wall -o 06_mpi_trap4 06_mpi_trap4.c 8 | * Run: mpiexec -n ./06_mpi_trap4 9 | * 10 | * Algorithm: 11 | * 1. Each process calculates "its" interval of 12 | * integration. 13 | * 2. Each process estimates the integral of f(x) 14 | * over its interval using the trapezoidal rule. 15 | * 3a. Each process != 0 sends its integral to 0. 16 | * 3b. Process 0 sums the calculations received from 17 | * the individual processes and prints the result. 
18 | */ 19 | #include 20 | #include 21 | #include 22 | 23 | double Trap(double a, double b, int n, double h); 24 | double f(double x); 25 | void Get_input(int my_rank, int comm_sz, double* p_a, double* p_b, int* p_n); 26 | void Build_mpi_type(double* p_a, double* p_b, int* p_n, MPI_Datatype* p_input_mpi_t); 27 | 28 | int main(void) 29 | { 30 | int my_rank, comm_sz, n, local_n; 31 | double a, b, h, local_a, local_b, local_int, total_int; 32 | 33 | MPI_Init(NULL, NULL); 34 | MPI_Comm_size(MPI_COMM_WORLD, &comm_sz); 35 | MPI_Comm_rank(MPI_COMM_WORLD, &my_rank); 36 | 37 | Get_input(my_rank, comm_sz, &a, &b, &n); 38 | 39 | h = (b-a)/n; 40 | local_n = n/comm_sz; 41 | 42 | local_a = a + my_rank*local_n*h; 43 | local_b = local_a + local_n*h; 44 | local_int = Trap(local_a, local_b, local_n, h); 45 | 46 | MPI_Reduce(&local_int, &total_int, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD); 47 | 48 | if (my_rank == 0) { 49 | printf("With n = %d trapezoids, our estimate\n", n); 50 | printf("of the integral from %f to %f = %.15f\n", a, b, total_int); 51 | } 52 | 53 | MPI_Finalize(); 54 | 55 | return 0; 56 | } 57 | 58 | double Trap(double a, double b, int n, double h) 59 | { 60 | double integral; 61 | 62 | integral = (f(a) + f(b)) / 2.0; 63 | 64 | for(int k = 0; k < n; k++) { 65 | integral += f(a + k*h); 66 | } 67 | integral = integral * h; 68 | 69 | return integral; 70 | } 71 | 72 | double f(double x) 73 | { 74 | return x*x; 75 | } 76 | 77 | void Get_input(int my_rank, int comm_sz, double* p_a, double* p_b, int* p_n) 78 | { 79 | MPI_Datatype input_mpi_t; 80 | 81 | Build_mpi_type(p_a, p_b, p_n, &input_mpi_t); 82 | 83 | if (my_rank == 0) { 84 | printf("Enter a, b, and n\n"); 85 | scanf("%lf %lf %d", p_a, p_b, p_n); 86 | } 87 | 88 | MPI_Bcast(p_a, 1, input_mpi_t, 0, MPI_COMM_WORLD); 89 | 90 | MPI_Type_free(&input_mpi_t); 91 | } 92 | 93 | void Build_mpi_type(double* p_a, double* p_b, int* p_n, MPI_Datatype* p_input_mpi_t) 94 | { 95 | int array_of_blocklengths[3] = {1, 1, 1}; 96 | 
MPI_Datatype array_of_types[3] = {MPI_DOUBLE, MPI_DOUBLE, MPI_INT}; 97 | MPI_Aint a_addr, b_addr, n_addr; 98 | MPI_Aint array_of_displacements[3] = {0}; 99 | 100 | MPI_Get_address(p_a, &a_addr); 101 | MPI_Get_address(p_b, &b_addr); 102 | MPI_Get_address(p_n, &n_addr); 103 | 104 | array_of_displacements[1] = b_addr - a_addr; 105 | array_of_displacements[2] = n_addr - a_addr; 106 | 107 | MPI_Type_create_struct(3, array_of_blocklengths, array_of_displacements, 108 | array_of_types, p_input_mpi_t); 109 | MPI_Type_commit(p_input_mpi_t); 110 | } -------------------------------------------------------------------------------- /ocv_mat_mul.c: -------------------------------------------------------------------------------- 1 | /***************************************************************************** 2 | * File: ocv_mat_mul.c 3 | * Purpose: Compute a matrix-matrix product by using OpenCV library. 4 | * Compile: g++ -Wall -o ocv_mat_mul ocv_mat_mul.c $(pkg-config opencv4 --libs --cflags) 5 | * Run: ./ocv_mat_mul 6 | * : the rows of matrix A 7 | * : the columns of matrix A and the rows of matrix B 8 | * : the columns of matrix B 9 | * 10 | * Input: A, B 11 | * Output: 12 | * C: the product matrix, C = AB 13 | * Elapsed time each multiplication and average elapsed time of 14 | * 100 multiplications 15 | *****************************************************************************/ 16 | #include 17 | #include 18 | #include 19 | #include 20 | 21 | #define GET_TIME(now) { \ 22 | struct timeval t; \ 23 | gettimeofday(&t, NULL); \ 24 | now = t.tv_sec + t.tv_usec/1000000.0; \ 25 | } 26 | 27 | const int RMAX = 1000000; 28 | #ifdef DEBUG 29 | const int NCOUNT = 1; // number of multiplication 30 | #else 31 | const int NCOUNT = 100; // number of multiplication 32 | #endif 33 | 34 | void Get_args(int argc, char* argv[], int* m, int* n, int* k); 35 | void Usage(char* prog_name); 36 | void Generate_matrix(double mat[], int m, int n); 37 | void Print_matrix(double mat[], int m, int n, 
char* title); 38 | 39 | int main(int argc, char* argv[]) 40 | { 41 | int m, n, k; 42 | Get_args(argc, argv, &m, &n, &k); 43 | 44 | double *A, *B, *C; 45 | A = (double*)malloc(m * n * sizeof(double)); 46 | B = (double*)malloc(n * k * sizeof(double)); 47 | //C = (double*)malloc(m * k * sizeof(double)); 48 | 49 | Generate_matrix(A, m, n); 50 | Generate_matrix(B, n, k); 51 | #ifdef DEBUG 52 | Print_matrix(A, m, n, "A"); 53 | Print_matrix(B, n, k, "B"); 54 | #endif 55 | 56 | 57 | double start, finish, avg_elapsed = 0.0; 58 | cv::Mat cvC; 59 | for (int count = 0; count < NCOUNT; count++) { 60 | GET_TIME(start); 61 | cv::Mat cvA(m, n, CV_64FC1, A); 62 | cv::Mat cvB(n, k, CV_64FC1, B); 63 | cvC = cvA * cvB; 64 | //cv::gemm(cvA, cvB, 1.0, NULL, 0, cvC); 65 | //C = reinterpret_cast(cvC.data); 66 | GET_TIME(finish); 67 | 68 | printf("[%3d] Elapsed time = %.6f seconds\n", count+1, finish-start); 69 | avg_elapsed += (finish - start) / NCOUNT; 70 | } 71 | 72 | #ifdef DEBUG 73 | printf("The product is\n"); 74 | cv::print(cvC); 75 | printf("\n\n"); 76 | #endif 77 | 78 | printf("Average elapsed time = %.6f seconds\n", avg_elapsed); 79 | 80 | free(A); 81 | free(B); 82 | //free(C); 83 | 84 | return 0; 85 | } 86 | 87 | /***************************************************************************** 88 | * Function: Get_args 89 | * Purpose: Get and check command list arguments 90 | * In args: argc, argv 91 | * Out args: m, n, k 92 | *****************************************************************************/ 93 | void Get_args(int argc, char* argv[], int* m, int* n, int* k) 94 | { 95 | if (argc != 4) 96 | Usage(argv[0]); 97 | 98 | *m = strtol(argv[1], NULL, 10); 99 | *n = strtol(argv[2], NULL, 10); 100 | *k = strtol(argv[3], NULL, 10); 101 | if (*m <= 0 || *n <= 0 || *k <= 0) 102 | Usage(argv[0]); 103 | } 104 | 105 | /***************************************************************************** 106 | * Function: Usage 107 | * Purpose: Print a message indicating how program should be 
started 108 | * and terminate. 109 | * In arg: prog_name 110 | *****************************************************************************/ 111 | void Usage(char* prog_name) 112 | { 113 | fprintf(stderr, "Usage: %s \n", prog_name); 114 | exit(0); 115 | } 116 | 117 | /***************************************************************************** 118 | * Function: Generate_matrix 119 | * Purpose: Generate matrix entries by using the random number generator 120 | * In args: m, n 121 | * Out arg: mat 122 | *****************************************************************************/ 123 | void Generate_matrix(double mat[], int m, int n) 124 | { 125 | for (int i = 0; i < m; i++) 126 | for (int j = 0; j < n; j++) 127 | mat[i*n + j] = (rand() % RMAX) / (RMAX / 10.0); 128 | } 129 | 130 | /***************************************************************************** 131 | * Function: Print_matrix 132 | * Purpose: Print the matrix 133 | * In args: mat, m, n, title 134 | *****************************************************************************/ 135 | void Print_matrix(double mat[], int m, int n, char* title) 136 | { 137 | printf("%s\n", title); 138 | for (int i = 0; i < m; i++) { 139 | for (int j = 0; j < n; j++) 140 | printf("%f ", mat[i*n + j]); 141 | printf("\n"); 142 | } 143 | } -------------------------------------------------------------------------------- /pthread/00_pth_hello.c: -------------------------------------------------------------------------------- 1 | /***************************************************************************** 2 | * File: 00_pth_hello.c 3 | * Purpose: Illustrate basic use of threads: create some threads, 4 | * each of which prints a mssage. 
5 | * Compile: gcc -Wall -o 00_pth_hello 00_pth_hello.c -pthread 6 | * Run: ./00_pth_hello 7 | * 8 | * Input: none 9 | * Output: message from each thread 10 | *****************************************************************************/ 11 | #include 12 | #include 13 | #include 14 | 15 | const int MAX_THREADS = 64; 16 | 17 | /* global variables: accesible to all threads */ 18 | int thread_count; 19 | 20 | void Usage(char* prog_name); 21 | void* Hello(void* rank); 22 | 23 | int main(int argc, char* argv[]) 24 | { 25 | if (argc != 2) { 26 | Usage(argv[0]); 27 | } 28 | 29 | /* Get number of threads from command line */ 30 | thread_count = strtol(argv[1], NULL, 10); 31 | if (thread_count <= 0 || thread_count > MAX_THREADS) { 32 | Usage(argv[0]); 33 | } 34 | 35 | pthread_t* thread_handles; 36 | thread_handles = malloc(thread_count*sizeof(pthread_t)); 37 | 38 | for (long thread = 0; thread < thread_count; thread++) { 39 | pthread_create(&thread_handles[thread], NULL, Hello, (void*)thread); 40 | } 41 | 42 | printf("Hello from the main thread\n"); 43 | 44 | for (long thread = 0; thread < thread_count; thread++) { 45 | pthread_join(thread_handles[thread], NULL); 46 | } 47 | 48 | free(thread_handles); 49 | 50 | return 0; 51 | } 52 | 53 | void Usage(char* prog_name) 54 | { 55 | fprintf(stderr, "Usage %s \n", prog_name); 56 | fprintf(stderr, "0 < number of threads <= %d\n", MAX_THREADS); 57 | exit(0); 58 | } 59 | 60 | void* Hello(void* rank) 61 | { 62 | long my_rank = (long)rank; 63 | 64 | printf("Hello from thread %ld of %d\n", my_rank, thread_count); 65 | 66 | return NULL; 67 | } -------------------------------------------------------------------------------- /pthread/02_pth_pi.c: -------------------------------------------------------------------------------- 1 | /***************************************************************************** 2 | * File: 02_pth_pi.c 3 | * Purpose: Estimate pi using serise 4 | * 5 | * pi = 4*[1 - 1/3 + 1/5 - 1/7 + 1/9 - . . .] 
6 | * 7 | * This version has a 'very serious bug' 8 | * 9 | * Compile: gcc -Wall -o 02_pth_pi 02_pth_pi.c -pthread [-lm] 10 | * Run: ./02_pth_pi 11 | * :the number of terms of the Maclarin series. It should be 12 | * evenly divisible by the number of threads 13 | * 14 | * Input: none 15 | * Output: The estimate of pi using multiple threads, one thread, and the 16 | * value computed by the math library arctan function. 17 | * Also elapsed times for the multithreaded and singlethreaded 18 | * computations. 19 | *****************************************************************************/ 20 | #include 21 | #include 22 | #include 23 | #include 24 | #include 25 | 26 | #define GET_TIME(now) { \ 27 | struct timeval t; \ 28 | gettimeofday(&t, NULL); \ 29 | now = t.tv_sec + t.tv_usec/1000000.0; \ 30 | } 31 | 32 | const int MAX_THREADS = 1024; 33 | 34 | /* global variables */ 35 | long thread_count; 36 | long long n; 37 | double sum; 38 | 39 | void *Thread_sum(void* rank); 40 | 41 | void Get_args(int argc, char* argv[]); 42 | double Serial_pi(long long n); 43 | 44 | int main(int argc, char* argv[]) 45 | { 46 | pthread_t* thread_handles; 47 | 48 | Get_args(argc, argv); 49 | 50 | thread_handles = (pthread_t*)malloc(thread_count * sizeof(pthread_t)); 51 | sum = 0.0; 52 | 53 | for (long thread = 0; thread < thread_count; thread++) 54 | pthread_create(&thread_handles[thread], NULL, Thread_sum, (void*)thread); 55 | 56 | for (long thread = 0; thread < thread_count; thread++) 57 | pthread_join(thread_handles[thread], NULL); 58 | 59 | sum *= 4.0; 60 | 61 | printf("With n = %lld terms,\n", n); 62 | printf(" Multi-threaded estimate of pi = %.15f\n", sum); 63 | 64 | sum = Serial_pi(n); 65 | printf(" Single-threaded estimate of pi = %.15f\n", sum); 66 | printf(" Math library estimate of pi = %.15f\n", 4.0*atan(1.0)); 67 | 68 | free(thread_handles); 69 | 70 | return 0; 71 | } 72 | 73 | /***************************************************************************** 74 | * Function: 
Thread_sum 75 | * Purpose: Add in the terms computed by the thread running this 76 | * In args: rank 77 | * Return: ignored(NULL) 78 | * Globals in: n, thread_count 79 | * Global in/out: sum 80 | *****************************************************************************/ 81 | void* Thread_sum(void* rank) 82 | { 83 | long my_rank = (long)rank; 84 | long long my_n = n / thread_count; 85 | long long my_first_i = my_n * my_rank; 86 | long long my_last_i = my_first_i + my_n; 87 | 88 | double factor; 89 | if (my_first_i % 2 == 0) 90 | factor = 1.0; 91 | else 92 | factor = -1.0; 93 | 94 | for (long long i = my_first_i; i < my_last_i; i++, factor = -factor) 95 | sum += factor/(2*i + 1); 96 | 97 | return NULL; 98 | } 99 | 100 | /***************************************************************************** 101 | * Function: Get_args 102 | * Purpose: Get and check command list arguments 103 | * In args: argc, argv 104 | * Globals out: thread_count, n 105 | *****************************************************************************/ 106 | void Get_args(int argc, char* argv[]) 107 | { 108 | int ok = 1; 109 | if (argc == 3) { 110 | thread_count = strtol(argv[1], NULL, 10); 111 | if (thread_count < 0 || thread_count > MAX_THREADS) 112 | ok = 0; 113 | 114 | n = strtoll(argv[2], NULL, 10); 115 | if (n <= 0) 116 | ok = 0; 117 | } 118 | else 119 | ok = 0; 120 | 121 | if (ok == 0) { 122 | fprintf(stderr, "Usage: %s \n", argv[0]); 123 | fprintf(stderr, " n is the number of terms and should be >= 1\n"); 124 | fprintf(stderr, " n should be evenly divisible by the number of threads\n"); 125 | exit(0); 126 | } 127 | } 128 | 129 | /***************************************************************************** 130 | * Function: Serial_pi 131 | * Purpose: Estimate pi using 1 thread 132 | * In args: n 133 | * Return: Estimate of pi using n terms of Maclaurin series 134 | *****************************************************************************/ 135 | double Serial_pi(long long n) 136 | 
{ 137 | double sum = 0.0; 138 | double factor = 1.0; 139 | 140 | for (long long i = 0; i < n; i++, factor = -factor) 141 | sum += factor / (2*i + 1); 142 | 143 | return 4.0 * sum; 144 | } -------------------------------------------------------------------------------- /pthread/03_pth_pi_busy1.c: -------------------------------------------------------------------------------- 1 | /***************************************************************************** 2 | * File: 03_pth_pi_busy1.c 3 | * Purpose: Estimate pi using serise 4 | * 5 | * pi = 4*[1 - 1/3 + 1/5 - 1/7 + 1/9 - . . .] 6 | * 7 | * This version uses busy-waiting to control access to the 8 | * critical section. 9 | * 10 | * Compile: gcc -Wall -o 03_pth_pi_busy1 03_pth_pi_busy1.c -pthread [-lm] 11 | * Run: ./03_pth_pi_busy1 12 | * :the number of terms of the Maclarin series. It should be 13 | * evenly divisible by the number of threads 14 | * 15 | * Input: none 16 | * Output: The estimate of pi using multiple threads, one thread, and the 17 | * value computed by the math library arctan function. 18 | * Also elapsed times for the multithreaded and singlethreaded 19 | * computations. 
20 | *****************************************************************************/ 21 | #include 22 | #include 23 | #include 24 | #include 25 | #include 26 | 27 | #define GET_TIME(now) { \ 28 | struct timeval t; \ 29 | gettimeofday(&t, NULL); \ 30 | now = t.tv_sec + t.tv_usec/1000000.0; \ 31 | } 32 | 33 | const int MAX_THREADS = 1024; 34 | 35 | /* global variables */ 36 | long thread_count; 37 | long long n; 38 | double sum; 39 | int flag; 40 | 41 | void *Thread_sum(void* rank); 42 | 43 | void Get_args(int argc, char* argv[]); 44 | double Serial_pi(long long n); 45 | 46 | int main(int argc, char* argv[]) 47 | { 48 | pthread_t* thread_handles; 49 | 50 | Get_args(argc, argv); 51 | 52 | thread_handles = (pthread_t*)malloc(thread_count * sizeof(pthread_t)); 53 | 54 | double start, finish; 55 | GET_TIME(start); 56 | sum = 0.0; 57 | flag = 0; 58 | for (long thread = 0; thread < thread_count; thread++) 59 | pthread_create(&thread_handles[thread], NULL, Thread_sum, (void*)thread); 60 | 61 | for (long thread = 0; thread < thread_count; thread++) 62 | pthread_join(thread_handles[thread], NULL); 63 | sum *= 4.0; 64 | GET_TIME(finish); 65 | 66 | printf("With n = %lld terms,\n", n); 67 | printf(" Multi-threaded estimate of pi = %.15f\n", sum); 68 | printf(" Elapsed time = %f seconds\n", finish-start); 69 | 70 | GET_TIME(start); 71 | sum = Serial_pi(n); 72 | GET_TIME(finish); 73 | printf(" Single-threaded estimate of pi = %.15f\n", sum); 74 | printf(" Elapsed time = %f seconds\n", finish-start); 75 | printf(" Math library estimate of pi = %.15f\n", 4.0*atan(1.0)); 76 | 77 | free(thread_handles); 78 | 79 | return 0; 80 | } 81 | 82 | /***************************************************************************** 83 | * Function: Thread_sum 84 | * Purpose: Add in the terms computed by the thread running this 85 | * In args: rank 86 | * Return: ignored(NULL) 87 | * Globals in: n, thread_count 88 | * Global in/out: sum 89 | 
*****************************************************************************/ 90 | void* Thread_sum(void* rank) 91 | { 92 | long my_rank = (long)rank; 93 | long long my_n = n / thread_count; 94 | long long my_first_i = my_n * my_rank; 95 | long long my_last_i = my_first_i + my_n; 96 | 97 | double factor; 98 | if (my_first_i % 2 == 0) 99 | factor = 1.0; 100 | else 101 | factor = -1.0; 102 | 103 | for (long long i = my_first_i; i < my_last_i; i++, factor = -factor) { 104 | while (flag != my_rank); 105 | sum += factor/(2*i + 1); 106 | flag = (flag + 1) % thread_count; 107 | } 108 | 109 | return NULL; 110 | } 111 | 112 | /***************************************************************************** 113 | * Function: Get_args 114 | * Purpose: Get and check command list arguments 115 | * In args: argc, argv 116 | * Globals out: thread_count, n 117 | *****************************************************************************/ 118 | void Get_args(int argc, char* argv[]) 119 | { 120 | int ok = 1; 121 | if (argc == 3) { 122 | thread_count = strtol(argv[1], NULL, 10); 123 | if (thread_count < 0 || thread_count > MAX_THREADS) 124 | ok = 0; 125 | 126 | n = strtoll(argv[2], NULL, 10); 127 | if (n <= 0) 128 | ok = 0; 129 | } 130 | else 131 | ok = 0; 132 | 133 | if (ok == 0) { 134 | fprintf(stderr, "Usage: %s \n", argv[0]); 135 | fprintf(stderr, " n is the number of terms and should be >= 1\n"); 136 | fprintf(stderr, " n should be evenly divisible by the number of threads\n"); 137 | exit(0); 138 | } 139 | } 140 | 141 | /***************************************************************************** 142 | * Function: Serial_pi 143 | * Purpose: Estimate pi using 1 thread 144 | * In args: n 145 | * Return: Estimate of pi using n terms of Maclaurin series 146 | *****************************************************************************/ 147 | double Serial_pi(long long n) 148 | { 149 | double sum = 0.0; 150 | double factor = 1.0; 151 | 152 | for (long long i = 0; i < n; i++, factor 
= -factor) 153 | sum += factor / (2*i + 1); 154 | 155 | return 4.0 * sum; 156 | } -------------------------------------------------------------------------------- /pthread/04_pth_pi_busy2.c: -------------------------------------------------------------------------------- 1 | /***************************************************************************** 2 | * File: 04_pth_pi_busy2.c 3 | * Purpose: Estimate pi using serise 4 | * 5 | * pi = 4*[1 - 1/3 + 1/5 - 1/7 + 1/9 - . . .] 6 | * 7 | * This is the second version that uses busy-waiting. 8 | * The critical section now follows the main loop. 9 | * 10 | * Compile: gcc -Wall -o 04_pth_pi_busy2 04_pth_pi_busy2.c -pthread [-lm] 11 | * Run: ./04_pth_pi_busy2 12 | * :the number of terms of the Maclarin series. It should be 13 | * evenly divisible by the number of threads 14 | * 15 | * Input: none 16 | * Output: The estimate of pi using multiple threads, one thread, and the 17 | * value computed by the math library arctan function. 18 | * Also elapsed times for the multithreaded and singlethreaded 19 | * computations. 
20 | *****************************************************************************/ 21 | #include 22 | #include 23 | #include 24 | #include 25 | #include 26 | 27 | #define GET_TIME(now) { \ 28 | struct timeval t; \ 29 | gettimeofday(&t, NULL); \ 30 | now = t.tv_sec + t.tv_usec/1000000.0; \ 31 | } 32 | 33 | const int MAX_THREADS = 1024; 34 | 35 | /* global variables */ 36 | long thread_count; 37 | long long n; 38 | double sum; 39 | int flag; 40 | 41 | void *Thread_sum(void* rank); 42 | 43 | void Get_args(int argc, char* argv[]); 44 | double Serial_pi(long long n); 45 | 46 | int main(int argc, char* argv[]) 47 | { 48 | pthread_t* thread_handles; 49 | 50 | Get_args(argc, argv); 51 | 52 | thread_handles = (pthread_t*)malloc(thread_count * sizeof(pthread_t)); 53 | 54 | double start, finish; 55 | GET_TIME(start); 56 | sum = 0.0; 57 | flag = 0; 58 | for (long thread = 0; thread < thread_count; thread++) 59 | pthread_create(&thread_handles[thread], NULL, Thread_sum, (void*)thread); 60 | 61 | for (long thread = 0; thread < thread_count; thread++) 62 | pthread_join(thread_handles[thread], NULL); 63 | sum *= 4.0; 64 | GET_TIME(finish); 65 | 66 | printf("With n = %lld terms,\n", n); 67 | printf(" Multi-threaded estimate of pi = %.15f\n", sum); 68 | printf(" Elapsed time = %f seconds\n", finish-start); 69 | 70 | GET_TIME(start); 71 | sum = Serial_pi(n); 72 | GET_TIME(finish); 73 | printf(" Single-threaded estimate of pi = %.15f\n", sum); 74 | printf(" Elapsed time = %f seconds\n", finish-start); 75 | printf(" Math library estimate of pi = %.15f\n", 4.0*atan(1.0)); 76 | 77 | free(thread_handles); 78 | 79 | return 0; 80 | } 81 | 82 | /***************************************************************************** 83 | * Function: Thread_sum 84 | * Purpose: Add in the terms computed by the thread running this 85 | * In args: rank 86 | * Return: ignored(NULL) 87 | * Globals in: n, thread_count 88 | * Global in/out: sum 89 | 
*****************************************************************************/ 90 | void* Thread_sum(void* rank) 91 | { 92 | long my_rank = (long)rank; 93 | long long my_n = n / thread_count; 94 | long long my_first_i = my_n * my_rank; 95 | long long my_last_i = my_first_i + my_n; 96 | 97 | double factor, my_sum = 0.0; 98 | if (my_first_i % 2 == 0) 99 | factor = 1.0; 100 | else 101 | factor = -1.0; 102 | 103 | for (long long i = my_first_i; i < my_last_i; i++, factor = -factor) { 104 | my_sum += factor/(2*i + 1); 105 | } 106 | 107 | while (flag != my_rank); 108 | sum += my_sum; 109 | flag = (flag + 1) % thread_count; 110 | 111 | return NULL; 112 | } 113 | 114 | /***************************************************************************** 115 | * Function: Get_args 116 | * Purpose: Get and check command list arguments 117 | * In args: argc, argv 118 | * Globals out: thread_count, n 119 | *****************************************************************************/ 120 | void Get_args(int argc, char* argv[]) 121 | { 122 | int ok = 1; 123 | if (argc == 3) { 124 | thread_count = strtol(argv[1], NULL, 10); 125 | if (thread_count < 0 || thread_count > MAX_THREADS) 126 | ok = 0; 127 | 128 | n = strtoll(argv[2], NULL, 10); 129 | if (n <= 0) 130 | ok = 0; 131 | } 132 | else 133 | ok = 0; 134 | 135 | if (ok == 0) { 136 | fprintf(stderr, "Usage: %s \n", argv[0]); 137 | fprintf(stderr, " n is the number of terms and should be >= 1\n"); 138 | fprintf(stderr, " n should be evenly divisible by the number of threads\n"); 139 | exit(0); 140 | } 141 | } 142 | 143 | /***************************************************************************** 144 | * Function: Serial_pi 145 | * Purpose: Estimate pi using 1 thread 146 | * In args: n 147 | * Return: Estimate of pi using n terms of Maclaurin series 148 | *****************************************************************************/ 149 | double Serial_pi(long long n) 150 | { 151 | double sum = 0.0; 152 | double factor = 1.0; 153 | 
154 | for (long long i = 0; i < n; i++, factor = -factor) 155 | sum += factor / (2*i + 1); 156 | 157 | return 4.0 * sum; 158 | } -------------------------------------------------------------------------------- /pthread/06_pth_message.c: -------------------------------------------------------------------------------- 1 | /***************************************************************************** 2 | * File: 06_pth_message.c 3 | * Purpose: Illustrate a synchronization problem with pthreads: 4 | * create some threads, each of which creates and sends it to 5 | * another thread, by copying it into that thread's buffer. 6 | * 7 | * Compile: gcc -Wall -o 06_pth_message 06_pth_message.c -pthread [-lm] 8 | * Run: ./06_pth_message 9 | * 10 | * Input: none 11 | * Output: message from each thread 12 | *****************************************************************************/ 13 | #include 14 | #include 15 | #include 16 | 17 | const int MAX_THREADS = 1024; 18 | const int MSG_MAX = 100; 19 | 20 | /* Global variables */ 21 | long thread_count; 22 | char** messages; 23 | 24 | void* Send_message(void* rank); /* Thread function */ 25 | 26 | int main(int argc, char* argv[]) 27 | { 28 | pthread_t* thread_handles; 29 | 30 | if (argc != 2) { 31 | fprintf(stderr, "Usage: %s \n", argv[0]); 32 | exit(0); 33 | } 34 | 35 | thread_count = strtol(argv[1], NULL, 10); 36 | if (thread_count <= 0 || thread_count > MAX_THREADS) { 37 | fprintf(stderr, "The number of threads should be > 0 and < %d\n", MAX_THREADS); 38 | exit(0); 39 | } 40 | 41 | thread_handles = (pthread_t*)malloc(thread_count * sizeof(pthread_t)); 42 | messages = (char**)malloc(thread_count * sizeof(char*)); 43 | 44 | for (long thread = 0; thread < thread_count; thread++) 45 | messages[thread] = NULL; 46 | for (long thread = 0; thread < thread_count; thread++) 47 | pthread_create(&thread_handles[thread], NULL, Send_message, (void*)thread); 48 | for (long thread = 0; thread < thread_count; thread++) 49 | 
pthread_join(thread_handles[thread], NULL); 50 | 51 | for (long thread = 0; thread < thread_count; thread++) 52 | free(messages[thread]); 53 | free(messages); 54 | 55 | free(thread_handles); 56 | 57 | return 0; 58 | } 59 | 60 | /***************************************************************************** 61 | * Function: Send_message 62 | * Purpose: Create a message and send it by copying it into 63 | * global messages array. Receive a message and print it. 64 | * In args: rank 65 | * Global in: thread_count 66 | * Global in/out: messages 67 | * Return: ignored(NULL) 68 | * Note: The my_msg buffer is freed in main function 69 | *****************************************************************************/ 70 | void* Send_message(void* rank) 71 | { 72 | long my_rank = (long)rank; 73 | long dest = (my_rank + 1) % thread_count; 74 | long src = (my_rank + thread_count - 1) % thread_count; 75 | char* my_msg = (char*)malloc(MSG_MAX * sizeof(char)); 76 | 77 | sprintf(my_msg, "Hello to %ld from %ld", dest, my_rank); 78 | messages[dest] = my_msg; 79 | 80 | if (messages[my_rank] != NULL) 81 | printf("Thread %ld > %s\n", my_rank, messages[my_rank]); 82 | else 83 | printf("Thread %ld > No message from %ld\n", my_rank, src); 84 | 85 | return NULL; 86 | } -------------------------------------------------------------------------------- /pthread/07_pth_message_sem.c: -------------------------------------------------------------------------------- 1 | /***************************************************************************** 2 | * File: 07_pth_message_sem.c 3 | * Purpose: Illustrate a synchronization problem with pthreads: 4 | * create some threads, each of which creates and sends it to 5 | * another thread, by copying it into that thread's buffer. 
6 | * This version uses semaphores to solve the synchronization problem 7 | * 8 | * Compile: gcc -Wall -o 07_pth_message_sem 07_pth_message_sem.c -pthread [-lm] 9 | * Run: ./07_pth_message_sem 10 | * 11 | * Input: none 12 | * Output: message from each thread 13 | *****************************************************************************/ 14 | #include 15 | #include 16 | #include 17 | #include 18 | 19 | const int MAX_THREADS = 1024; 20 | const int MSG_MAX = 100; 21 | 22 | /* Global variables */ 23 | long thread_count; 24 | char** messages; 25 | sem_t* semaphores; 26 | 27 | void* Send_message(void* rank); /* Thread function */ 28 | 29 | int main(int argc, char* argv[]) 30 | { 31 | pthread_t* thread_handles; 32 | 33 | if (argc != 2) { 34 | fprintf(stderr, "Usage: %s \n", argv[0]); 35 | exit(0); 36 | } 37 | 38 | thread_count = strtol(argv[1], NULL, 10); 39 | if (thread_count <= 0 || thread_count > MAX_THREADS) { 40 | fprintf(stderr, "The number of threads should be > 0 and < %d\n", MAX_THREADS); 41 | exit(0); 42 | } 43 | 44 | thread_handles = (pthread_t*)malloc(thread_count * sizeof(pthread_t)); 45 | messages = (char**)malloc(thread_count * sizeof(char*)); 46 | semaphores = (sem_t*)malloc(thread_count * sizeof(sem_t)); 47 | 48 | for (long thread = 0; thread < thread_count; thread++) { 49 | messages[thread] = NULL; 50 | sem_init(&semaphores[thread], 0, 0); 51 | } 52 | for (long thread = 0; thread < thread_count; thread++) 53 | pthread_create(&thread_handles[thread], NULL, Send_message, (void*)thread); 54 | for (long thread = 0; thread < thread_count; thread++) 55 | pthread_join(thread_handles[thread], NULL); 56 | 57 | for (long thread = 0; thread < thread_count; thread++) { 58 | free(messages[thread]); 59 | sem_destroy(&semaphores[thread]); 60 | } 61 | free(messages); 62 | free(semaphores); 63 | free(thread_handles); 64 | 65 | return 0; 66 | } 67 | 68 | /***************************************************************************** 69 | * Function: Send_message 70 | * 
Purpose: Create a message and send it by copying it into 71 | * global messages array. Receive a message and print it. 72 | * In args: rank 73 | * Global in: thread_count 74 | * Global in/out: messages, semaphores 75 | * Return: ignored(NULL) 76 | * Note: The my_msg buffer is freed in main function 77 | *****************************************************************************/ 78 | void* Send_message(void* rank) 79 | { 80 | long my_rank = (long)rank; 81 | long dest = (my_rank + 1) % thread_count; 82 | char* my_msg = (char*)malloc(MSG_MAX * sizeof(char)); 83 | 84 | sprintf(my_msg, "Hello to %ld from %ld", dest, my_rank); 85 | messages[dest] = my_msg; 86 | sem_post(&semaphores[dest]); // increase semaphores[dest] by 1 -> 'unlock' the semaphore of dest 87 | 88 | sem_wait(&semaphores[my_rank]); // decrease semaphores[my_rank] by 1 and return -> wait for our semaphore to be unlocked 89 | printf("Thread %ld > %s\n", my_rank, messages[my_rank]); 90 | 91 | return NULL; 92 | } -------------------------------------------------------------------------------- /pthread/08_pth_busy_barrier.c: -------------------------------------------------------------------------------- 1 | /***************************************************************************** 2 | * File: 08_pth_busy_barrier.c 3 | * Purpose: Use busy-waiting and mutex barriers to synchronize threads 4 | * 5 | * Compile: gcc -Wall -o 08_pth_busy_barrier 08_pth_busy_barrier.c -pthread 6 | * [-DDEBUG] 7 | * Run: ./08_pth_busy_barrier 8 | * 9 | * Input: none 10 | * Output: Time for BARRIER_COUNT barriers 11 | * 12 | * Note: Verbose output can be enabled with the compile flag -DDEBUG 13 | *****************************************************************************/ 14 | #include 15 | #include 16 | #include 17 | #include 18 | 19 | #define GET_TIME(now) \ 20 | { \ 21 | struct timeval t; \ 22 | gettimeofday(&t, NULL); \ 23 | now = t.tv_sec + t.tv_usec / 1000000.0; \ 24 | } 25 | 26 | #define BARRIER_COUNT 100 27 | 28 | /* 
Global variables */ 29 | long thread_count; 30 | int barrier_thread_counts[BARRIER_COUNT]; 31 | pthread_mutex_t barrier_mutex; 32 | 33 | void* Thread_work(void* rank); 34 | 35 | int main(int argc, char* argv[]) 36 | { 37 | pthread_t* thread_handles; 38 | 39 | if (argc != 2) { 40 | fprintf(stderr, "Usage: %s \n", argv[0]); 41 | exit(0); 42 | } 43 | 44 | thread_count = strtol(argv[1], NULL, 10); 45 | if (thread_count <= 0) { 46 | fprintf(stderr, "The number of threads should be > 0\n"); 47 | exit(0); 48 | } 49 | 50 | thread_handles = (pthread_t*)malloc(thread_count * sizeof(pthread_t)); 51 | for (int i = 0; i < BARRIER_COUNT; i++) 52 | barrier_thread_counts[i] = 0; 53 | pthread_mutex_init(&barrier_mutex, NULL); 54 | 55 | double start, finish; 56 | GET_TIME(start); 57 | for (long thread = 0; thread < thread_count; thread++) 58 | pthread_create(&thread_handles[thread], NULL, Thread_work, (void*)thread); 59 | for (long thread = 0; thread < thread_count; thread++) 60 | pthread_join(thread_handles[thread], NULL); 61 | GET_TIME(finish); 62 | 63 | printf("Elapsed time = %f seconds\n", finish - start); 64 | 65 | pthread_mutex_destroy(&barrier_mutex); 66 | free(thread_handles); 67 | 68 | return 0; 69 | } 70 | 71 | /***************************************************************************** 72 | * Function: Thread_work 73 | * Purpose: Run BARRIER_COUNT barriers 74 | * In args: rank 75 | * Global var: thread_count, barrier_thread_counts, barrier_mutex 76 | * Return: ignored(NULL) 77 | *****************************************************************************/ 78 | void* Thread_work(void* rank) 79 | { 80 | #ifdef DEBUG 81 | long my_rank = (long)rank; 82 | #endif 83 | 84 | for (int i = 0; i < BARRIER_COUNT; i++) { 85 | pthread_mutex_lock(&barrier_mutex); 86 | barrier_thread_counts[i]++; 87 | pthread_mutex_unlock(&barrier_mutex); 88 | while (barrier_thread_counts[i] < thread_count); 89 | #ifdef DEBUG 90 | if (my_rank == 0) { 91 | printf("All threads completed barrier %d\n", 
i); 92 | fflush(stdout); 93 | } 94 | #endif 95 | } 96 | 97 | return NULL; 98 | } -------------------------------------------------------------------------------- /pthread/09_pth_sem_barrier.c: -------------------------------------------------------------------------------- 1 | /***************************************************************************** 2 | * File: 09_pth_sem_barrier.c 3 | * Purpose: Use semaphore barriers to synchronize threads 4 | * 5 | * Compile: gcc -Wall -o 09_pth_sem_barrier 09_pth_sem_barrier.c -pthread 6 | * [-DDEBUG] 7 | * Run: ./09_pth_sem_barrier 8 | * 9 | * Input: none 10 | * Output: Time for BARRIER_COUNT barriers 11 | * 12 | * Note: Verbose output can be enabled with the compile flag -DDEBUG 13 | *****************************************************************************/ 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | 20 | #define GET_TIME(now) \ 21 | { \ 22 | struct timeval t; \ 23 | gettimeofday(&t, NULL); \ 24 | now = t.tv_sec + t.tv_usec / 1000000.0; \ 25 | } 26 | 27 | #define BARRIER_COUNT 100 28 | 29 | /* Global variables */ 30 | long thread_count; 31 | long counter; 32 | sem_t barrier_sems[BARRIER_COUNT]; 33 | sem_t count_sem; 34 | 35 | void* Thread_work(void* rank); 36 | 37 | int main(int argc, char* argv[]) 38 | { 39 | pthread_t* thread_handles; 40 | 41 | if (argc != 2) { 42 | fprintf(stderr, "Usage: %s \n", argv[0]); 43 | exit(0); 44 | } 45 | 46 | thread_count = strtol(argv[1], NULL, 10); 47 | if (thread_count <= 0) { 48 | fprintf(stderr, "The number of threads should be > 0\n"); 49 | exit(0); 50 | } 51 | 52 | thread_handles = (pthread_t*)malloc(thread_count * sizeof(pthread_t)); 53 | for (int i = 0; i < BARRIER_COUNT; i++) 54 | sem_init(&barrier_sems[i], 0, 0); 55 | sem_init(&count_sem, 0, 1); 56 | 57 | double start, finish; 58 | GET_TIME(start); 59 | for (long thread = 0; thread < thread_count; thread++) 60 | pthread_create(&thread_handles[thread], NULL, Thread_work, (void*)thread); 61 | for 
(long thread = 0; thread < thread_count; thread++) 62 | pthread_join(thread_handles[thread], NULL); 63 | GET_TIME(finish); 64 | 65 | printf("Elapsed time = %f seconds\n", finish - start); 66 | 67 | sem_destroy(&count_sem); 68 | for (int i = 0; i < BARRIER_COUNT; i++) 69 | sem_destroy(&barrier_sems[i]); 70 | free(thread_handles); 71 | 72 | return 0; 73 | } 74 | 75 | /***************************************************************************** 76 | * Function: Thread_work 77 | * Purpose: Run BARRIER_COUNT barriers 78 | * In args: rank 79 | * Global var: thread_count, count, barrier_sems, count_sem 80 | * Return: ignored(NULL) 81 | *****************************************************************************/ 82 | void* Thread_work(void* rank) 83 | { 84 | #ifdef DEBUG 85 | long my_rank = (long)rank; 86 | #endif 87 | 88 | for (int i = 0; i < BARRIER_COUNT; i++) { 89 | sem_wait(&count_sem); 90 | if (counter == thread_count - 1) { 91 | counter = 0; 92 | sem_post(&count_sem); 93 | for (int j = 0; j < thread_count - 1; j++) 94 | sem_post(&barrier_sems[i]); 95 | } 96 | else { 97 | counter++; 98 | sem_post(&count_sem); 99 | sem_wait(&barrier_sems[i]); 100 | } 101 | #ifdef DEBUG 102 | if (my_rank == 0) { 103 | printf("All threads completed barrier %d\n", i); 104 | fflush(stdout); 105 | } 106 | #endif 107 | } 108 | 109 | return NULL; 110 | } -------------------------------------------------------------------------------- /pthread/10_pth_cond_barrier.c: -------------------------------------------------------------------------------- 1 | /***************************************************************************** 2 | * File: 10_pth_cond_barrier.c 3 | * Purpose: Use condition wait barriers to synchronize threads 4 | * 5 | * Compile: gcc -Wall -o 10_pth_cond_barrier 10_pth_cond_barrier.c -pthread 6 | * [-DDEBUG] 7 | * Run: ./10_pth_cond_barrier 8 | * 9 | * Input: none 10 | * Output: Time for BARRIER_COUNT barriers 11 | * 12 | * Note: Verbose output can be enabled with the 
compile flag -DDEBUG 13 | *****************************************************************************/ 14 | #include 15 | #include 16 | #include 17 | #include 18 | 19 | #define GET_TIME(now) \ 20 | { \ 21 | struct timeval t; \ 22 | gettimeofday(&t, NULL); \ 23 | now = t.tv_sec + t.tv_usec / 1000000.0; \ 24 | } 25 | //#define DEBUG 26 | #define BARRIER_COUNT 100 27 | 28 | /* Global variables */ 29 | long thread_count, barrier_thread_count; 30 | pthread_mutex_t barrier_mutex; 31 | pthread_cond_t ok_to_proceed; 32 | 33 | void* Thread_work(void* rank); 34 | 35 | int main(int argc, char* argv[]) 36 | { 37 | pthread_t* thread_handles; 38 | 39 | if (argc != 2) { 40 | fprintf(stderr, "Usage: %s \n", argv[0]); 41 | exit(0); 42 | } 43 | 44 | thread_count = strtol(argv[1], NULL, 10); 45 | if (thread_count <= 0) { 46 | fprintf(stderr, "The number of threads should be > 0\n"); 47 | exit(0); 48 | } 49 | 50 | thread_handles = (pthread_t*)malloc(thread_count * sizeof(pthread_t)); 51 | pthread_mutex_init(&barrier_mutex, NULL); 52 | pthread_cond_init(&ok_to_proceed, NULL); 53 | 54 | double start, finish; 55 | GET_TIME(start); 56 | for (long thread = 0; thread < thread_count; thread++) 57 | pthread_create(&thread_handles[thread], NULL, Thread_work, (void*)thread); 58 | for (long thread = 0; thread < thread_count; thread++) 59 | pthread_join(thread_handles[thread], NULL); 60 | GET_TIME(finish); 61 | 62 | printf("Elapsed time = %f seconds\n", finish - start); 63 | 64 | pthread_mutex_destroy(&barrier_mutex); 65 | pthread_cond_destroy(&ok_to_proceed); 66 | free(thread_handles); 67 | 68 | return 0; 69 | } 70 | 71 | /***************************************************************************** 72 | * Function: Thread_work 73 | * Purpose: Run BARRIER_COUNT barriers 74 | * In args: rank 75 | * Global var: thread_count, barrier_thread_count, barrier_mutex 76 | * Return: ignored(NULL) 77 | *****************************************************************************/ 78 | void* 
Thread_work(void* rank) 79 | { 80 | #ifdef DEBUG 81 | long my_rank = (long)rank; 82 | #endif 83 | 84 | for (int i = 0; i < BARRIER_COUNT; i++) { 85 | pthread_mutex_lock(&barrier_mutex); 86 | barrier_thread_count++; 87 | 88 | if (barrier_thread_count == thread_count) { 89 | barrier_thread_count = 0; 90 | #ifdef DEBUG 91 | printf("Thread %ld > Signalling other threads in barrier %d\n", my_rank, i); 92 | fflush(stdout); 93 | #endif 94 | pthread_cond_broadcast(&ok_to_proceed); 95 | } 96 | else { 97 | // Wait unlocks mutex and puts thread to sleep. 98 | // Put wait in while loop in case some other 99 | // event awakens thread. 100 | while (pthread_cond_wait(&ok_to_proceed, &barrier_mutex) != 0); 101 | // Mutex is relocked at this point. 102 | #ifdef DEBUG 103 | printf("Thread %ld > Awakened in barrier %d\n", my_rank, i); 104 | #endif 105 | } 106 | pthread_mutex_unlock(&barrier_mutex); 107 | #ifdef DEBUG 108 | if (my_rank == 0) { 109 | printf("All threads completed barrier %d\n", i); 110 | fflush(stdout); 111 | } 112 | #endif 113 | } 114 | 115 | return NULL; 116 | } -------------------------------------------------------------------------------- /pthread/11_pth_posix_barrier.c: -------------------------------------------------------------------------------- 1 | /***************************************************************************** 2 | * File: 11_pth_posix_barrier.c 3 | * Purpose: Use POSIX barrier to synchronize threads 4 | * 5 | * Compile: gcc -Wall -o 11_pth_posix_barrier 11_pth_posix_barrier.c -pthread 6 | * [-DDEBUG] 7 | * Run: ./11_pth_posix_barrier 8 | * 9 | * Input: none 10 | * Output: Time for BARRIER_COUNT barriers 11 | * 12 | * Note: Verbose output can be enabled with the compile flag -DDEBUG 13 | *****************************************************************************/ 14 | #include 15 | #include 16 | #include 17 | #include 18 | 19 | #define GET_TIME(now) \ 20 | { \ 21 | struct timeval t; \ 22 | gettimeofday(&t, NULL); \ 23 | now = t.tv_sec + 
t.tv_usec / 1000000.0; \ 24 | } 25 | 26 | #define BARRIER_COUNT 100 27 | 28 | /* Global variables */ 29 | long thread_count; 30 | pthread_barrier_t barrier; 31 | 32 | void* Thread_work(void* rank); 33 | 34 | int main(int argc, char* argv[]) 35 | { 36 | pthread_t* thread_handles; 37 | 38 | if (argc != 2) { 39 | fprintf(stderr, "Usage: %s \n", argv[0]); 40 | exit(0); 41 | } 42 | 43 | thread_count = strtol(argv[1], NULL, 10); 44 | if (thread_count <= 0) { 45 | fprintf(stderr, "The number of threads should be > 0\n"); 46 | exit(0); 47 | } 48 | 49 | thread_handles = (pthread_t*)malloc(thread_count * sizeof(pthread_t)); 50 | pthread_barrier_init(&barrier, NULL, thread_count); 51 | 52 | double start, finish; 53 | GET_TIME(start); 54 | for (long thread = 0; thread < thread_count; thread++) 55 | pthread_create(&thread_handles[thread], NULL, Thread_work, (void*)thread); 56 | for (long thread = 0; thread < thread_count; thread++) 57 | pthread_join(thread_handles[thread], NULL); 58 | GET_TIME(finish); 59 | 60 | printf("Elapsed time = %f seconds\n", finish - start); 61 | 62 | pthread_barrier_destroy(&barrier); 63 | free(thread_handles); 64 | 65 | return 0; 66 | } 67 | 68 | /***************************************************************************** 69 | * Function: Thread_work 70 | * Purpose: Run BARRIER_COUNT barriers 71 | * In args: rank 72 | * Global var: thread_count, barrier 73 | * Return: ignored(NULL) 74 | *****************************************************************************/ 75 | void* Thread_work(void* rank) 76 | { 77 | #ifdef DEBUG 78 | long my_rank = (long)rank; 79 | #endif 80 | 81 | for (int i = 0; i < BARRIER_COUNT; i++) { 82 | pthread_barrier_wait(&barrier); 83 | #ifdef DEBUG 84 | if (my_rank == 0) { 85 | printf("All threads completed barrier %d\n", i); 86 | fflush(stdout); 87 | } 88 | #endif 89 | } 90 | 91 | return NULL; 92 | } -------------------------------------------------------------------------------- /pthread/12_pth_tokenize.c: 
-------------------------------------------------------------------------------- 1 | /***************************************************************************** 2 | * File: 12_pth_tokenize.c 3 | * Purpose: Try to use threads to tokenize text input. Illustrate problems 4 | * with function that isn't thread-safe. 5 | * 6 | * * This program deinitely has problems. 7 | * 8 | * Compile: gcc -Wall -o 12_pth_tokenize 12_pth_tokenize.c -pthread 9 | * Run: ./12_pth_tokenize 10 | * 11 | * Input: Lines of text 12 | * Output: For each line of input: 13 | * the line read by the program, and the tokens identified by 14 | * strtok 15 | *****************************************************************************/ 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include 21 | 22 | const int MAX = 1000; 23 | 24 | long thread_count; 25 | sem_t* sems; 26 | 27 | void Usage(char* prog_name); 28 | void *Tokenize(void* rank); /* thread function */ 29 | 30 | int main(int argc, char* argv[]) 31 | { 32 | if (argc != 2) 33 | Usage(argv[0]); 34 | thread_count = atoi(argv[1]); 35 | 36 | pthread_t* thread_handles = (pthread_t*)malloc(thread_count * sizeof(pthread_t)); 37 | sems = (sem_t*)malloc(thread_count * sizeof(sem_t)); 38 | // sems[0] should be unlocked, the others should be locked 39 | sem_init(&sems[0], 0, 1); 40 | for (long thread = 1; thread < thread_count; thread++) 41 | sem_init(&sems[thread], 0, 0); 42 | 43 | printf("Enter text\n"); 44 | for (long thread = 0; thread < thread_count; thread++) 45 | pthread_create(&thread_handles[thread], NULL, Tokenize, (void*)thread); 46 | 47 | for (long thread = 0; thread < thread_count; thread++) 48 | pthread_join(thread_handles[thread], NULL); 49 | 50 | for (long thread = 0; thread < thread_count; thread++) 51 | sem_destroy(&sems[thread]); 52 | 53 | free(sems); 54 | free(thread_handles); 55 | return 0; 56 | } 57 | 58 | /***************************************************************************** 59 | * Function: Usage 60 | * 
Purpose: Print command line for function and terminate 61 | * In args: prog_name 62 | *****************************************************************************/ 63 | void Usage(char* prog_name) 64 | { 65 | fprintf(stderr, "Usage: %s \n", prog_name); 66 | exit(0); 67 | } 68 | 69 | /***************************************************************************** 70 | * Function: Tokenize 71 | * Purpose: Tokenize lines of input 72 | * In args: rank 73 | * Global var: thread_count, sems 74 | * Return: ignored(NULL) 75 | *****************************************************************************/ 76 | void* Tokenize(void* rank) 77 | { 78 | long my_rank = (long)rank; 79 | int count; 80 | int next = (my_rank + 1) % thread_count; 81 | char* fg_rv; 82 | char my_line[MAX]; 83 | char* my_string; 84 | 85 | /* Force sequential reading of the input */ 86 | sem_wait(&sems[my_rank]); 87 | fg_rv = fgets(my_line, MAX, stdin); 88 | sem_post(&sems[next]); 89 | 90 | while (fg_rv != NULL) { 91 | printf("Thread %ld > my_line = %s", my_rank, my_line); 92 | 93 | count = 0; 94 | my_string = strtok(my_line, " \t\n"); 95 | while (my_string != NULL) { 96 | count++; 97 | printf("Thread %ld > string %d = %s\n", my_rank, count, my_string); 98 | my_string = strtok(NULL, " \t\n"); 99 | } 100 | 101 | //if (my_line != NULL) 102 | //printf("Thread %ld > After tokenizing, my_line = %s\n", my_rank, my_line); 103 | 104 | sem_wait(&sems[my_rank]); 105 | fg_rv = fgets(my_line, MAX, stdin); 106 | sem_post(&sems[next]); 107 | } 108 | 109 | return NULL; 110 | } --------------------------------------------------------------------------------