├── .gitignore ├── Chapter01 └── 01_cuda_introduction │ ├── 01_hello_world │ ├── Makefile │ └── hello_world.cu │ ├── 02_vector_addition │ ├── Makefile │ ├── vector_addition.cu │ ├── vector_addition_gpu_block_only.cu │ ├── vector_addition_gpu_thread_block.cu │ └── vector_addition_gpu_thread_only.cu │ └── Makefile ├── Chapter02 └── 02_memory_overview │ ├── 01_sgemm │ ├── Makefile │ └── sgemm.cu │ ├── 02_vector_addition │ ├── Makefile │ └── vector_addition_gpu_thread_block.cu │ ├── 03_aos_soa │ ├── Makefile │ ├── aos.cu │ └── soa.cu │ ├── 04_matrix_transpose │ ├── Makefile │ ├── conflict_solved.cu │ └── matrix_transpose.cu │ ├── 05_image_scaling │ ├── Makefile │ ├── aerosmith-double.pgm │ ├── image_scaling.cu │ ├── scrImagePgmPpmPackage.cpp │ ├── scrImagePgmPpmPackage.h │ └── voyager2.pgm │ ├── 06_unified_memory │ ├── Makefile │ ├── unified_memory.cu │ ├── unified_memory_64align.cu │ ├── unified_memory_initialized.cu │ └── unified_memory_prefetch.cu │ └── Makefile ├── Chapter03 └── 03_cuda_thread_programming │ ├── 01_warp_and_thread_block │ ├── Makefile │ └── cuda_thread_block.cu │ ├── 02_cuda_occupancy │ ├── Makefile │ └── sgemm.cu │ ├── 03_threadsync_and_reduction │ ├── Makefile │ ├── reduction.h │ ├── reduction_global.cpp │ ├── reduction_global_kernel.cu │ ├── reduction_shared.cpp │ └── reduction_shared_kernel.cu │ ├── 04_performance_limiter │ ├── Makefile │ ├── reduction.h │ ├── reduction_shared.cpp │ ├── reduction_shared_kernel.cu │ └── sgemm.cu │ ├── 05_warp_divergence │ ├── Makefile │ ├── reduction.cpp │ ├── reduction.h │ ├── reduction_kernel_interleaving.cu │ └── reduction_kernel_sequential.cu │ ├── 06_limiter_balancing │ ├── Makefile │ ├── reduction.cpp │ ├── reduction.h │ ├── reduction_kernel.cu │ └── reduction_kernel_opt.cu │ ├── 07_warp_synchronous_programming │ ├── Makefile │ ├── reduction.cpp │ ├── reduction.h │ └── reduction_wp_kernel.cu │ ├── 08_cooperative_group │ ├── Makefile │ ├── reduction.cpp │ ├── reduction.h │ ├── reduction_cg_kernel.cu │ └── reduction_cg_shift_kernel.cu │ ├── 09_loop_unrolling │ ├── Makefile │ ├── reduction.cpp │ ├── reduction.h │ ├── reduction_cg_kernel.cu │ └── reduction_wp_kernel.cu │ ├── 10_atomic_operation │ ├── Makefile │ ├── reduction.cpp │ ├── reduction.h │ ├── reduction_blk_atmc_kernel.cu │ ├── reduction_kernel.cu │ └── reduction_wrp_atmc_kernel.cu │ ├── 11_mixed_precision_operation │ ├── Makefile │ ├── mixed_precision.cu │ ├── mixed_precision_half.cu │ ├── mixed_precision_int.cu │ └── util.cuh │ └── Makefile ├── Chapter04 └── 04_kernel_execution │ ├── 01_cuda_stream │ ├── 1_cuda_default_stream.cu │ ├── 2_cuda_multi_stream.cu │ ├── 3_cuda_multi_stream_with_sync.cu │ ├── 4_cuda_multi_stream_with_default.cu │ └── Makefile │ ├── 02_pipelining │ ├── Makefile │ └── cuda_pipelining.cu │ ├── 03_cuda_callback │ ├── Makefile │ └── cuda_callback.cu │ ├── 04_stream_priority │ ├── Makefile │ └── prioritized_cuda_stream.cu │ ├── 05_cuda_event │ ├── Makefile │ ├── cuda_event.cu │ └── cuda_event_with_streams.cu │ ├── 06_dynamic_parallelism │ ├── Makefile │ ├── dynamic_parallelism.cu │ └── recursion.cu │ ├── 07_grid_level_cg │ ├── Makefile │ ├── reduction.cpp │ ├── reduction.h │ └── reduction_kernel.cu │ ├── 08_openmp_cuda │ ├── Makefile │ ├── openmp.cu │ ├── openmp_default_stream.cu │ └── openmp_gpus.cu │ ├── 09_mps │ ├── Makefile │ ├── install_mpi.sh │ └── simpleMPI.cu │ ├── 10_kernel_execution_overhead │ ├── Makefile │ └── cuda_kernel.cu │ └── Makefile ├── Chapter05 └── 05_debug_profiling │ ├── .gitignore │ ├── 01_focused_profile │ ├── Makefile │ └── 
sgemm.cu │ ├── 02_nvtx │ ├── Makefile │ └── sgemm.cu │ ├── 03_cuda_error │ ├── Makefile │ └── sgemm.cu │ ├── 04_cuda_assert │ ├── Makefile │ └── sgemm.cu │ ├── 05_debug_with_vs │ ├── debug_vs.vcxproj │ └── simple_sgemm.cu │ ├── 06_debug_with_eclipse │ ├── .cproject │ ├── .gitignore │ ├── .project │ ├── .settings │ │ └── org.eclipse.ltk.core.refactoring.prefs │ ├── Debug │ │ ├── makefile │ │ ├── objects.mk │ │ ├── sources.mk │ │ └── src │ │ │ ├── simple_sgemm.d │ │ │ └── subdir.mk │ ├── java-7-install.md │ └── src │ │ └── simple_sgemm.cu │ ├── 07_debug_with_gdb │ ├── Makefile │ └── simple_sgemm.cu │ ├── 08_memcheck │ ├── Makefile │ ├── simple_sgemm.cu │ ├── simple_sgemm_mem_leak.cu │ └── simple_sgemm_oob.cu │ └── Makefile ├── Chapter06 └── 06_multigpu │ ├── 01_gaussian_single_gpu │ ├── config.h │ ├── cpuSolver.h │ ├── gaussian_sequential.cu │ ├── gaussian_single_gpu.cu │ ├── gaussian_single_gpu.h │ ├── gpuSolver.cu │ ├── gpuSolverFunctions.cu │ ├── linearSystemOps.cu │ └── linearSystemOps.h │ ├── 02_gaussian_multi_gpu │ ├── Makefile │ ├── config.h │ ├── gaussian_multi_gpu_p2p.cu │ ├── gaussian_multi_gpu_p2p.h │ ├── gpuSolver.cu │ ├── gpuSolverFunctions.cu │ ├── linearSystemOps.cu │ └── utilities.cu │ ├── 03_helloWorldMPI │ └── helloWorldMPI.c │ ├── 04_gaussian_multi_node │ ├── Makefile │ ├── config.h │ ├── elementUtilities.cu │ ├── gaussian_multi_gpu_rdma.c │ ├── gaussian_multi_gpu_rdma.h │ ├── gpuSolver.cu │ ├── gpuSolverFunctions.cu │ ├── linearSystemOps.c │ ├── mpiUtils.h │ └── utilities.cu │ ├── 05_streams │ ├── Makefile │ ├── cat.pgm │ ├── dog.pgm │ ├── image_merging.cu │ ├── scrImagePgmPpmPackage.cu │ ├── scrImagePgmPpmPackage.h │ └── vector_addition.cu │ ├── 06_nccl │ ├── Makefile │ └── nccl.cu │ └── Makefile ├── Chapter07 └── 07_parallel_programming_pattern │ ├── 01_sgemm_optimization │ ├── Makefile │ └── sgemm.cu │ ├── 02_convolution │ ├── Makefile │ └── convolution.cu │ ├── 03_scan │ ├── Makefile │ ├── scan.cu │ ├── scan.h │ ├── scan_v1.cu │ ├── scan_v2.cu │ └── utils.h │ ├── 04_pack_n_split │ ├── Makefile │ └── pack_n_split.cu │ ├── 05_n-body │ ├── Makefile │ ├── n_body.cu │ └── n_body.h │ ├── 06_quicksort │ ├── Makefile │ └── quick_sort.cu │ ├── 07_radixsort │ ├── Makefile │ ├── radix_warp_sort.cu │ └── thrust_radix_sort.cu │ ├── 08_histogram │ ├── Makefile │ ├── aerosmith-double.pgm │ ├── image_histogram.cu │ ├── scrImagePgmPpmPackage.cpp │ └── scrImagePgmPpmPackage.h │ └── Makefile ├── Chapter08 └── 08_cuda_libs_and_other_languages │ ├── 01_sgemm │ ├── Makefile │ ├── cublasSgemm.cpp │ ├── cublasSgemm_async.cpp │ └── cublasXtSgemm.cpp │ ├── 02_sgemm_mixed_precision │ ├── Makefile │ ├── cublasGemmEx.cu │ └── helper.cuh │ ├── 03_curand │ ├── Makefile │ ├── curand_device.cu │ ├── curand_host.cpp │ ├── fp16.cu │ ├── fp16.cuh │ └── gemm_with_curand_host.cpp │ ├── 04_cufft │ ├── Makefile │ ├── complex.cu │ ├── cufft.1d.cpp │ ├── cufft.half.cpp │ ├── cufft.mgpu.cu │ ├── fp16.cu │ ├── fp16.cuh │ └── helper.cuh │ ├── 05_npp │ ├── Makefile │ ├── flower.jpg │ ├── imageFilter.cpp │ ├── output.jpg │ └── statisticsNPP.cpp │ ├── 06_opencv │ ├── Makefile │ ├── blur.cpp │ ├── blur_cuvid.cpp │ ├── blur_stream.cpp │ ├── flower.JPG │ ├── install_opencv.sh │ └── test.cpp │ ├── 07_python_cuda │ ├── cupy_op.py │ ├── numba_matmul.py │ ├── numba_saxpy.py │ ├── pycuda_matmul.py │ └── pycuda_matmul_simple.py │ ├── 08_nvblas │ ├── exec_fft.m │ ├── fft.R │ ├── nvblas.conf │ ├── sgemm.R │ └── sgemm.m │ ├── 09_matlab │ ├── cuda.m │ └── host.m │ └── Makefile ├── Chapter09 └── 09_openacc │ ├── Makefile │ ├── 
cat.pgm │ ├── dog.pgm │ ├── image_merging.cpp │ ├── scrImagePgmPpmPackage.cpp │ └── scrImagePgmPpmPackage.h ├── Chapter10 └── 10_deep_learning │ ├── 01_ann │ ├── Makefile │ ├── ann.vcxproj │ ├── download_mnist.bat │ ├── download_mnist.sh │ ├── src │ │ ├── blob.h │ │ ├── helper.h │ │ ├── layer.cu │ │ ├── layer.h │ │ ├── loss.cu │ │ ├── loss.h │ │ ├── mnist.cpp │ │ ├── mnist.h │ │ ├── network.cpp │ │ └── network.h │ └── train.cpp │ ├── 02_cnn │ ├── Makefile │ ├── cnn.vcxproj │ ├── download_mnist.bat │ ├── download_mnist.sh │ ├── src │ │ ├── blob.h │ │ ├── helper.h │ │ ├── layer.cu │ │ ├── layer.h │ │ ├── loss.cu │ │ ├── loss.h │ │ ├── mnist.cpp │ │ ├── mnist.h │ │ ├── network.cpp │ │ └── network.h │ └── train.cpp │ ├── 03_rnn │ ├── Makefile │ └── rnn.cpp │ ├── 04_framework_profile │ ├── pytorch │ │ ├── README.md │ │ ├── RN50v1.5 │ │ │ ├── GPU_1.log │ │ │ ├── README.md │ │ │ ├── examples │ │ │ │ ├── RN50_FP16_1GPU.sh │ │ │ │ ├── RN50_FP16_4GPU.sh │ │ │ │ ├── RN50_FP16_8GPU.sh │ │ │ │ ├── RN50_FP16_EVAL.sh │ │ │ │ ├── RN50_FP16_INFERENCE_BENCHMARK.sh │ │ │ │ ├── RN50_FP32_1GPU.sh │ │ │ │ ├── RN50_FP32_4GPU.sh │ │ │ │ ├── RN50_FP32_8GPU.sh │ │ │ │ ├── RN50_FP32_EVAL.sh │ │ │ │ └── RN50_FP32_INFERENCE_BENCHMARK.sh │ │ │ ├── image_classification │ │ │ │ ├── __init__.py │ │ │ │ ├── dataloaders.py │ │ │ │ ├── logger.py │ │ │ │ ├── mixup.py │ │ │ │ ├── resnet.py │ │ │ │ ├── smoothing.py │ │ │ │ ├── training.py │ │ │ │ └── utils.py │ │ │ ├── img │ │ │ │ ├── .gitkeep │ │ │ │ ├── DGX2_250_loss.png │ │ │ │ ├── DGX2_250_top1.png │ │ │ │ ├── DGX2_250_top5.png │ │ │ │ ├── training_accuracy.png │ │ │ │ ├── training_loss.png │ │ │ │ └── validation_accuracy.png │ │ │ ├── main.py │ │ │ ├── multiproc.py │ │ │ ├── resnet50_pyt.qdrep │ │ │ ├── resnet50_pyt_2g.qdrep │ │ │ ├── resnet50v1.5 │ │ │ │ ├── README.md │ │ │ │ └── training │ │ │ │ │ ├── DGX1_RN50_FP16_250E.sh │ │ │ │ │ ├── DGX1_RN50_FP16_50E.sh │ │ │ │ │ ├── DGX1_RN50_FP16_90E.sh │ │ │ │ │ ├── DGX1_RN50_FP32_250E.sh │ │ │ │ │ ├── DGX1_RN50_FP32_50E.sh │ │ │ │ │ ├── DGX1_RN50_FP32_90E.sh │ │ │ │ │ ├── DGX2_RN50_FP16_250E.sh │ │ │ │ │ ├── DGX2_RN50_FP16_50E.sh │ │ │ │ │ ├── DGX2_RN50_FP16_90E.sh │ │ │ │ │ ├── DGX2_RN50_FP32_250E.sh │ │ │ │ │ ├── DGX2_RN50_FP32_50E.sh │ │ │ │ │ └── DGX2_RN50_FP32_90E.sh │ │ │ └── test.qdrep │ │ └── nsys-nvtx.sh │ └── tensorflow │ │ ├── RN50v1.5 │ │ ├── .gitignore │ │ ├── .style.yapf │ │ ├── README.md │ │ ├── baseline.qdrep │ │ ├── dllogger │ │ │ ├── __init__.py │ │ │ ├── autologging.py │ │ │ ├── logger.py │ │ │ └── tags.py │ │ ├── main.py │ │ ├── model │ │ │ ├── __init__.py │ │ │ ├── blocks │ │ │ │ ├── __init__.py │ │ │ │ ├── conv2d_block.py │ │ │ │ └── resnet_bottleneck_block.py │ │ │ ├── layers │ │ │ │ ├── __init__.py │ │ │ │ ├── activation.py │ │ │ │ ├── conv2d.py │ │ │ │ ├── dense.py │ │ │ │ ├── math_ops.py │ │ │ │ ├── normalization.py │ │ │ │ ├── padding.py │ │ │ │ └── pooling.py │ │ │ └── resnet_v1_5.py │ │ ├── requirements.txt │ │ ├── results │ │ │ ├── events.out.tfevents.1566195554.5b8c84c05f4e │ │ │ ├── model.ckpt-1000.index │ │ │ └── model.ckpt-2000.data-00001-of-00002 │ │ ├── runtime │ │ │ ├── __init__.py │ │ │ ├── runner.py │ │ │ └── runner_utils.py │ │ ├── scripts │ │ │ ├── RN50_FP16_16GPU.sh │ │ │ ├── RN50_FP16_1GPU.sh │ │ │ ├── RN50_FP16_4GPU.sh │ │ │ ├── RN50_FP16_8GPU.sh │ │ │ ├── RN50_FP16_EVAL.sh │ │ │ ├── RN50_FP32_16GPU.sh │ │ │ ├── RN50_FP32_1GPU.sh │ │ │ ├── RN50_FP32_4GPU.sh │ │ │ ├── RN50_FP32_8GPU.sh │ │ │ ├── RN50_FP32_EVAL.sh │ │ │ ├── benchmarking │ │ │ │ ├── DGX1V_inferbench_fp16.sh │ │ │ │ ├── 
DGX1V_inferbench_fp32.sh │ │ │ │ ├── DGX1V_trainbench_fp16.sh │ │ │ │ ├── DGX1V_trainbench_fp32.sh │ │ │ │ ├── DGX2_inferbench_fp16.sh │ │ │ │ ├── DGX2_inferbench_fp32.sh │ │ │ │ ├── DGX2_trainbench_fp16.sh │ │ │ │ ├── DGX2_trainbench_fp32.sh │ │ │ │ ├── baselines │ │ │ │ │ ├── DGX1V_RN50_tensorflow_infer_fp16.json │ │ │ │ │ ├── DGX1V_RN50_tensorflow_infer_fp32.json │ │ │ │ │ ├── DGX1V_RN50_tensorflow_train_fp16.json │ │ │ │ │ ├── DGX1V_RN50_tensorflow_train_fp32.json │ │ │ │ │ ├── DGX2_RN50_tensorflow_infer_fp16.json │ │ │ │ │ ├── DGX2_RN50_tensorflow_infer_fp32.json │ │ │ │ │ ├── DGX2_RN50_tensorflow_train_fp16.json │ │ │ │ │ └── DGX2_RN50_tensorflow_train_fp32.json │ │ │ │ └── benchmark.py │ │ │ └── docker │ │ │ │ ├── build.sh │ │ │ │ └── interactive.sh │ │ └── utils │ │ │ ├── __init__.py │ │ │ ├── cmdline_helper.py │ │ │ ├── dali_utils.py │ │ │ ├── data_utils.py │ │ │ ├── hooks │ │ │ ├── __init__.py │ │ │ ├── benchmark_hooks.py │ │ │ ├── prefill_hook.py │ │ │ └── training_hooks.py │ │ │ ├── hvd_utils.py │ │ │ ├── image_processing.py │ │ │ ├── learning_rate.py │ │ │ ├── optimizers.py │ │ │ └── var_storage.py │ │ ├── nsys-nvtx-2g.sh │ │ └── nsys-nvtx.sh │ └── Makefile ├── LICENSE └── README.md /.gitignore: -------------------------------------------------------------------------------- 1 | # Ignore all 2 | * 3 | 4 | # Unignore all with extensions 5 | !*.* 6 | 7 | # Unignore all dirs 8 | !*/ 9 | 10 | ### Above combination will ignore all files without extension ### 11 | 12 | *.o 13 | *.nvvp 14 | *.pyc 15 | .DS_Store 16 | __pycache__ 17 | 18 | !Makefile 19 | -------------------------------------------------------------------------------- /Chapter01/01_cuda_introduction/01_hello_world/Makefile: -------------------------------------------------------------------------------- 1 | CUDA_PATH=/usr/local/cuda 2 | HOST_COMPILER ?= g++ 3 | NVCC=${CUDA_PATH}/bin/nvcc -ccbin ${HOST_COMPILER} 4 | TARGET=hello_world 5 | 6 | INCLUDES= -I${CUDA_PATH}/samples/common/inc 7 | NVCC_FLAGS=-m64 -lineinfo 8 | 9 | IS_CUDA_11:=${shell expr `$(NVCC) --version | grep compilation | grep -Eo -m 1 '[0-9]+.[0-9]' | head -1` \>= 11.0} 10 | 11 | # Gencode argumentes 12 | SMS = 35 37 50 52 60 61 70 75 13 | ifeq "$(IS_CUDA_11)" "1" 14 | SMS = 52 60 61 70 75 80 15 | endif 16 | $(foreach sm, ${SMS}, $(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm))) 17 | 18 | hello_world: hello_world.cu 19 | ${NVCC} ${INCLUDES} ${ALL_CCFLAGS} ${GENCODE_FLAGS} -o $@ $< 20 | 21 | clean: 22 | rm -f ${TARGET} 23 | -------------------------------------------------------------------------------- /Chapter01/01_cuda_introduction/01_hello_world/hello_world.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | __global__ void print_from_gpu(void) { 5 | printf("Hello World! 
from thread [%d,%d] \ 6 | From device\n", threadIdx.x,blockIdx.x); 7 | } 8 | 9 | int main(void) { 10 | printf("Hello World from host!\n"); 11 | print_from_gpu<<<1,1>>>(); 12 | cudaDeviceSynchronize(); 13 | return 0; 14 | } 15 | 16 | -------------------------------------------------------------------------------- /Chapter01/01_cuda_introduction/02_vector_addition/Makefile: -------------------------------------------------------------------------------- 1 | CUDA_PATH=/usr/local/cuda 2 | HOST_COMPILER ?= g++ 3 | NVCC=${CUDA_PATH}/bin/nvcc -ccbin ${HOST_COMPILER} 4 | TARGET=vector_addition vector_addition_blocks vector_addition_threads vector_addition_threads_blocks 5 | 6 | INCLUDES= -I${CUDA_PATH}/samples/common/inc 7 | NVCC_FLAGS=-m64 -lineinfo 8 | 9 | IS_CUDA_11:=${shell expr `$(NVCC) --version | grep compilation | grep -Eo -m 1 '[0-9]+.[0-9]' | head -1` \>= 11.0} 10 | 11 | # Gencode argumentes 12 | SMS = 35 37 50 52 60 61 70 75 13 | ifeq "$(IS_CUDA_11)" "1" 14 | SMS = 52 60 61 70 75 80 15 | endif 16 | $(foreach sm, ${SMS}, $(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm))) 17 | 18 | all : ${TARGET} 19 | 20 | vector_addition: vector_addition.cu 21 | ${NVCC} ${INCLUDES} ${ALL_CCFLAGS} ${GENCODE_FLAGS} -o $@ $< 22 | 23 | vector_addition_blocks: vector_addition_gpu_block_only.cu 24 | ${NVCC} ${INCLUDES} ${ALL_CCFLAGS} ${GENCODE_FLAGS} -o $@ $< 25 | 26 | vector_addition_threads: vector_addition_gpu_thread_only.cu 27 | ${NVCC} ${INCLUDES} ${ALL_CCFLAGS} ${GENCODE_FLAGS} -o $@ $< 28 | 29 | vector_addition_threads_blocks: vector_addition_gpu_thread_block.cu 30 | ${NVCC} ${INCLUDES} ${ALL_CCFLAGS} ${GENCODE_FLAGS} -o $@ $< 31 | 32 | clean: 33 | rm -f ${TARGET} 34 | -------------------------------------------------------------------------------- /Chapter01/01_cuda_introduction/02_vector_addition/vector_addition.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #define N 512 5 | 6 | void host_add(int *a, int *b, int *c) { 7 | for(int idx=0;idx 2 | #include 3 | 4 | #define N 512 5 | 6 | void host_add(int *a, int *b, int *c) { 7 | for(int idx=0;idx>>(d_a,d_b,d_c); 49 | 50 | // Copy result back to host 51 | cudaMemcpy(c, d_c, size, cudaMemcpyDeviceToHost); 52 | 53 | print_output(a,b,c); 54 | 55 | free(a); free(b); free(c); 56 | cudaFree(d_a); cudaFree(d_b); cudaFree(d_c); 57 | 58 | 59 | 60 | return 0; 61 | } 62 | -------------------------------------------------------------------------------- /Chapter01/01_cuda_introduction/02_vector_addition/vector_addition_gpu_thread_only.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #define N 512 5 | 6 | void host_add(int *a, int *b, int *c) { 7 | for(int idx=0;idx>>(d_a,d_b,d_c); 49 | 50 | // Copy result back to host 51 | cudaMemcpy(c, d_c, size, cudaMemcpyDeviceToHost); 52 | 53 | print_output(a,b,c); 54 | 55 | free(a); free(b); free(c); 56 | cudaFree(d_a); cudaFree(d_b); cudaFree(d_c); 57 | 58 | 59 | 60 | return 0; 61 | } 62 | -------------------------------------------------------------------------------- /Chapter01/01_cuda_introduction/Makefile: -------------------------------------------------------------------------------- 1 | # Project folders that contain CUDA receipts 2 | PROJECTS ?= $(shell find \ 3 | $(shell ls -d */) \ 4 | -name Makefile) 5 | 6 | %.ph_build: 7 | +@$(MAKE) -C $(dir $*) $(MAKECMDGOALS) 8 | 9 | %.ph_clean: 10 | +@$(MAKE) -C $(dir $*) clean $(USE_DEVICE) 11 | 12 | all: $(addsuffix 
.ph_build,$(PROJECTS)) 13 | @echo "Finished building CUDA Receipts" 14 | 15 | build: $(addsuffix .ph_build,$(PROJECTS)) 16 | 17 | tidy: 18 | @find * | egrep "#" | xargs rm -f 19 | @find * | egrep "\~" | xargs rm -f 20 | @find * | egrep "nvvp" | xargs rm -f 21 | 22 | .PHONY: clean 23 | clean: tidy $(addsuffix .ph_clean,$(PROJECTS)) 24 | 25 | test: 26 | echo $(DIRECTORY) 27 | -------------------------------------------------------------------------------- /Chapter02/02_memory_overview/01_sgemm/Makefile: -------------------------------------------------------------------------------- 1 | CUDA_PATH=/usr/local/cuda 2 | HOST_COMPILER ?= g++ 3 | NVCC=${CUDA_PATH}/bin/nvcc -ccbin ${HOST_COMPILER} 4 | TARGET=sgemm 5 | 6 | INCLUDES= -I${CUDA_PATH}/samples/common/inc 7 | NVCC_FLAGS=-m64 -lineinfo 8 | 9 | IS_CUDA_11:=${shell expr `$(NVCC) --version | grep compilation | grep -Eo -m 1 '[0-9]+.[0-9]' | head -1` \>= 11.0} 10 | 11 | # Gencode argumentes 12 | SMS = 35 37 50 52 60 61 70 75 13 | ifeq "$(IS_CUDA_11)" "1" 14 | SMS = 52 60 61 70 75 80 15 | endif 16 | $(foreach sm, ${SMS}, $(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm))) 17 | 18 | all : ${TARGET} 19 | 20 | sgemm: sgemm.cu 21 | ${NVCC} ${INCLUDES} ${ALL_CCFLAGS} ${GENCODE_FLAGS} -o $@ $< 22 | 23 | clean: 24 | rm -f ${TARGET} 25 | -------------------------------------------------------------------------------- /Chapter02/02_memory_overview/02_vector_addition/Makefile: -------------------------------------------------------------------------------- 1 | CUDA_PATH=/usr/local/cuda 2 | HOST_COMPILER ?= g++ 3 | NVCC=${CUDA_PATH}/bin/nvcc -ccbin ${HOST_COMPILER} 4 | TARGET=vector_addition_gpu_thread_block 5 | 6 | INCLUDES= -I${CUDA_PATH}/samples/common/inc 7 | NVCC_FLAGS=-m64 -lineinfo 8 | 9 | IS_CUDA_11:=${shell expr `$(NVCC) --version | grep compilation | grep -Eo -m 1 '[0-9]+.[0-9]' | head -1` \>= 11.0} 10 | 11 | # Gencode argumentes 12 | SMS = 35 37 50 52 60 61 70 75 13 | ifeq "$(IS_CUDA_11)" "1" 14 | SMS = 52 60 61 70 75 80 15 | endif 16 | $(foreach sm, ${SMS}, $(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm))) 17 | 18 | all : ${TARGET} 19 | 20 | vector_addition_gpu_thread_block: vector_addition_gpu_thread_block.cu 21 | ${NVCC} ${INCLUDES} ${ALL_CCFLAGS} ${GENCODE_FLAGS} -o $@ $< 22 | 23 | clean: 24 | rm -f ${TARGET} 25 | -------------------------------------------------------------------------------- /Chapter02/02_memory_overview/03_aos_soa/Makefile: -------------------------------------------------------------------------------- 1 | CUDA_PATH=/usr/local/cuda 2 | HOST_COMPILER ?= g++ 3 | NVCC=${CUDA_PATH}/bin/nvcc -ccbin ${HOST_COMPILER} 4 | TARGET=aos soa 5 | 6 | INCLUDES= -I${CUDA_PATH}/samples/common/inc 7 | NVCC_FLAGS=-m64 -lineinfo 8 | 9 | IS_CUDA_11:=${shell expr `$(NVCC) --version | grep compilation | grep -Eo -m 1 '[0-9]+.[0-9]' | head -1` \>= 11.0} 10 | 11 | # Gencode argumentes 12 | SMS = 35 37 50 52 60 61 70 75 13 | ifeq "$(IS_CUDA_11)" "1" 14 | SMS = 52 60 61 70 75 80 15 | endif 16 | $(foreach sm, ${SMS}, $(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm))) 17 | 18 | all : ${TARGET} 19 | 20 | aos: aos.cu 21 | ${NVCC} ${INCLUDES} ${ALL_CCFLAGS} ${GENCODE_FLAGS} -o $@ $< 22 | 23 | soa: soa.cu 24 | ${NVCC} ${INCLUDES} ${ALL_CCFLAGS} ${GENCODE_FLAGS} -o $@ $< 25 | 26 | clean: 27 | rm -f ${TARGET} 28 | -------------------------------------------------------------------------------- /Chapter02/02_memory_overview/03_aos_soa/aos.cu: 
-------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | 7 | #define NUM_THREADS 256 8 | 9 | #define IMG_SIZE 1048576 10 | 11 | // Coefficients with Array of Structure 12 | struct Coefficients_AOS { 13 | int r; 14 | int b; 15 | int g; 16 | int hue; 17 | int saturation; 18 | int maxVal; 19 | int minVal; 20 | int finalVal; 21 | }; 22 | 23 | 24 | __global__ 25 | void complicatedCalculation(Coefficients_AOS* data) 26 | { 27 | int i = blockIdx.x*blockDim.x + threadIdx.x; 28 | 29 | 30 | int grayscale = (data[i].r + data[i].g + data[i].b)/data[i].maxVal; 31 | int hue_sat = data[i].hue * data[i].saturation / data[i].minVal; 32 | data[i].finalVal = grayscale*hue_sat; 33 | } 34 | 35 | void complicatedCalculation() 36 | { 37 | 38 | Coefficients_AOS* d_x; 39 | 40 | cudaMalloc(&d_x, IMG_SIZE*sizeof(Coefficients_AOS)); 41 | 42 | int num_blocks = IMG_SIZE/NUM_THREADS; 43 | 44 | complicatedCalculation<<>>(d_x); 45 | 46 | cudaFree(d_x); 47 | } 48 | 49 | 50 | 51 | int main(int argc, char*argv[]) 52 | { 53 | 54 | complicatedCalculation(); 55 | return 0; 56 | } 57 | 58 | 59 | 60 | 61 | 62 | 63 | -------------------------------------------------------------------------------- /Chapter02/02_memory_overview/03_aos_soa/soa.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | 7 | #define NUM_THREADS 256 8 | 9 | #define IMG_SIZE 1048576 10 | 11 | // Coefficients with Structure of Array 12 | struct Coefficients_SOA { 13 | int* r; 14 | int* b; 15 | int* g; 16 | int* hue; 17 | int* saturation; 18 | int* maxVal; 19 | int* minVal; 20 | int* finalVal; 21 | }; 22 | 23 | 24 | __global__ 25 | void complicatedCalculation(Coefficients_SOA data) 26 | { 27 | int i = blockIdx.x*blockDim.x + threadIdx.x; 28 | int grayscale = (data.r[i] + data.g[i] + data.b[i])/data.maxVal[i]; 29 | int hue_sat = data.hue[i] * data.saturation[i] / data.minVal[i]; 30 | 31 | data.finalVal[i] = grayscale*hue_sat; 32 | } 33 | 34 | void complicatedCalculation() 35 | { 36 | 37 | Coefficients_SOA d_x; 38 | 39 | cudaMalloc(&d_x.r, IMG_SIZE*sizeof(int)); 40 | cudaMalloc(&d_x.g, IMG_SIZE*sizeof(int)); 41 | cudaMalloc(&d_x.b, IMG_SIZE*sizeof(int)); 42 | cudaMalloc(&d_x.hue, IMG_SIZE*sizeof(int)); 43 | cudaMalloc(&d_x.saturation, IMG_SIZE*sizeof(int)); 44 | cudaMalloc(&d_x.maxVal, IMG_SIZE*sizeof(int)); 45 | cudaMalloc(&d_x.minVal, IMG_SIZE*sizeof(int)); 46 | cudaMalloc(&d_x.finalVal, IMG_SIZE*sizeof(int)); 47 | 48 | int num_blocks = IMG_SIZE/NUM_THREADS; 49 | 50 | complicatedCalculation<<>>(d_x); 51 | 52 | cudaFree(d_x.r); 53 | cudaFree(d_x.g); 54 | cudaFree(d_x.b); 55 | cudaFree(d_x.hue); 56 | cudaFree(d_x.saturation); 57 | cudaFree(d_x.maxVal); 58 | cudaFree(d_x.maxVal); 59 | cudaFree(d_x.minVal); 60 | cudaFree(d_x.finalVal); 61 | } 62 | 63 | 64 | 65 | int main(int argc, char*argv[]) 66 | { 67 | 68 | complicatedCalculation(); 69 | return 0; 70 | } 71 | 72 | 73 | 74 | 75 | 76 | 77 | -------------------------------------------------------------------------------- /Chapter02/02_memory_overview/04_matrix_transpose/Makefile: -------------------------------------------------------------------------------- 1 | CUDA_PATH=/usr/local/cuda 2 | HOST_COMPILER ?= g++ 3 | NVCC=${CUDA_PATH}/bin/nvcc -ccbin ${HOST_COMPILER} 4 | TARGET=matrix_transpose conflict_solved 5 | 6 | INCLUDES= -I${CUDA_PATH}/samples/common/inc 7 | NVCC_FLAGS=-m64 -lineinfo 8 | 9 | IS_CUDA_11:=${shell expr `$(NVCC) --version | 
grep compilation | grep -Eo -m 1 '[0-9]+.[0-9]' | head -1` \>= 11.0} 10 | 11 | # Gencode argumentes 12 | SMS = 35 37 50 52 60 61 70 75 13 | ifeq "$(IS_CUDA_11)" "1" 14 | SMS = 52 60 61 70 75 80 15 | endif 16 | $(foreach sm, ${SMS}, $(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm))) 17 | 18 | all : ${TARGET} 19 | 20 | matrix_transpose: matrix_transpose.cu 21 | ${NVCC} ${INCLUDES} ${ALL_CCFLAGS} ${GENCODE_FLAGS} -o $@ $< 22 | 23 | conflict_solved: conflict_solved.cu 24 | ${NVCC} ${INCLUDES} ${ALL_CCFLAGS} ${GENCODE_FLAGS} -o $@ $< 25 | 26 | clean: 27 | rm -f ${TARGET} 28 | -------------------------------------------------------------------------------- /Chapter02/02_memory_overview/05_image_scaling/Makefile: -------------------------------------------------------------------------------- 1 | CUDA_PATH=/usr/local/cuda 2 | HOST_COMPILER ?= g++ 3 | NVCC=${CUDA_PATH}/bin/nvcc -ccbin ${HOST_COMPILER} 4 | TARGET=image_scaling 5 | 6 | INCLUDES= -I${CUDA_PATH}/samples/common/inc 7 | NVCC_FLAGS=-m64 -lineinfo 8 | 9 | IS_CUDA_11:=${shell expr `$(NVCC) --version | grep compilation | grep -Eo -m 1 '[0-9]+.[0-9]' | head -1` \>= 11.0} 10 | 11 | # Gencode argumentes 12 | SMS = 35 37 50 52 60 61 70 75 13 | ifeq "$(IS_CUDA_11)" "1" 14 | SMS = 52 60 61 70 75 80 15 | endif 16 | $(foreach sm, ${SMS}, $(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm))) 17 | 18 | all : ${TARGET} 19 | 20 | INCS = scrImagePgmPpmPackage.h 21 | 22 | scrImagePgmPpmPackage.o: scrImagePgmPpmPackage.cpp ${INCS} 23 | $(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $< 24 | 25 | image_scaling.o: image_scaling.cu ${INCS} 26 | $(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $< 27 | 28 | image_scaling: scrImagePgmPpmPackage.o image_scaling.o 29 | $(EXEC) $(NVCC) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ $+ 30 | 31 | clean: 32 | rm -f ${TARGET} 33 | -------------------------------------------------------------------------------- /Chapter02/02_memory_overview/05_image_scaling/aerosmith-double.pgm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Learn-CUDA-Programming/10bd9b59bfcc680d608ea1417da5089f73d91b11/Chapter02/02_memory_overview/05_image_scaling/aerosmith-double.pgm -------------------------------------------------------------------------------- /Chapter02/02_memory_overview/05_image_scaling/scrImagePgmPpmPackage.h: -------------------------------------------------------------------------------- 1 | 2 | #include 3 | 4 | 5 | int scr_read_pgm( char* name, unsigned char* image, int irows, int icols ); 6 | void scr_write_pgm( char* name, unsigned char* image, int rows, int cols, char* comment ); 7 | int scr_read_ppm( char* name, unsigned char* image, int irows, int icols ); 8 | void scr_write_ppm( char* name, unsigned char* image, int rows, int cols, char* comment ); 9 | void get_PgmPpmParams(char * , int *, int *); 10 | void getout_comment(FILE * ); 11 | -------------------------------------------------------------------------------- /Chapter02/02_memory_overview/05_image_scaling/voyager2.pgm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Learn-CUDA-Programming/10bd9b59bfcc680d608ea1417da5089f73d91b11/Chapter02/02_memory_overview/05_image_scaling/voyager2.pgm -------------------------------------------------------------------------------- /Chapter02/02_memory_overview/06_unified_memory/Makefile: 
-------------------------------------------------------------------------------- 1 | CUDA_PATH=/usr/local/cuda 2 | HOST_COMPILER ?= g++ 3 | NVCC=${CUDA_PATH}/bin/nvcc -ccbin ${HOST_COMPILER} 4 | TARGET=unified_simple unified_initialized unified_prefetch unified_64align 5 | 6 | INCLUDES= -I${CUDA_PATH}/samples/common/inc 7 | NVCC_FLAGS=-m64 -lineinfo 8 | 9 | IS_CUDA_11:=${shell expr `$(NVCC) --version | grep compilation | grep -Eo -m 1 '[0-9]+.[0-9]' | head -1` \>= 11.0} 10 | 11 | # Gencode argumentes 12 | SMS = 35 37 50 52 60 61 70 75 13 | ifeq "$(IS_CUDA_11)" "1" 14 | SMS = 52 60 61 70 75 80 15 | endif 16 | $(foreach sm, ${SMS}, $(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm))) 17 | 18 | all : ${TARGET} 19 | 20 | unified_simple: unified_memory.cu 21 | ${NVCC} ${INCLUDES} ${ALL_CCFLAGS} ${GENCODE_FLAGS} -o $@.out $< 22 | 23 | unified_initialized: unified_memory_initialized.cu 24 | ${NVCC} ${INCLUDES} ${ALL_CCFLAGS} ${GENCODE_FLAGS} -o $@.out $< 25 | 26 | unified_prefetch: unified_memory_prefetch.cu 27 | ${NVCC} ${INCLUDES} ${ALL_CCFLAGS} ${GENCODE_FLAGS} -o $@.out $< 28 | 29 | unified_64align: unified_memory_64align.cu 30 | ${NVCC} ${INCLUDES} ${ALL_CCFLAGS} ${GENCODE_FLAGS} -o $@.out $< 31 | 32 | clean: 33 | rm -f *.out 34 | -------------------------------------------------------------------------------- /Chapter02/02_memory_overview/06_unified_memory/unified_memory.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | // CUDA kernel to add elements of two arrays 5 | __global__ 6 | void add(int n, float *x, float *y) 7 | { 8 | int index = blockIdx.x * blockDim.x + threadIdx.x; 9 | int stride = blockDim.x * gridDim.x; 10 | for (int i = index; i < n; i += stride) 11 | y[i] = x[i] + y[i]; 12 | } 13 | 14 | int main(void) 15 | { 16 | int N = 1<<20; 17 | float *x, *y; 18 | 19 | // Allocate Unified Memory -- accessible from CPU or GPU 20 | cudaMallocManaged(&x, N*sizeof(float)); 21 | cudaMallocManaged(&y, N*sizeof(float)); 22 | 23 | // initialize x and y arrays on the host 24 | for (int i = 0; i < N; i++) { 25 | x[i] = 1.0f; 26 | y[i] = 2.0f; 27 | } 28 | 29 | // Launch kernel on 1M elements on the GPU 30 | int blockSize = 256; 31 | int numBlocks = (N + blockSize - 1) / blockSize; 32 | add<<>>(N, x, y); 33 | 34 | // Wait for GPU to finish before accessing on host 35 | cudaDeviceSynchronize(); 36 | 37 | // Check for errors (all values should be 3.0f) 38 | float maxError = 0.0f; 39 | for (int i = 0; i < N; i++) 40 | maxError = fmax(maxError, fabs(y[i]-3.0f)); 41 | std::cout << "Max error: " << maxError << std::endl; 42 | 43 | // Free memory 44 | cudaFree(x); 45 | cudaFree(y); 46 | 47 | return 0; 48 | } 49 | -------------------------------------------------------------------------------- /Chapter02/02_memory_overview/06_unified_memory/unified_memory_initialized.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | __global__ void init(int n, float *x, float *y) { 5 | int index = threadIdx.x + blockIdx.x * blockDim.x; 6 | int stride = blockDim.x * gridDim.x; 7 | for (int i = index; i < n; i += stride) { 8 | x[i] = 1.0f; 9 | y[i] = 2.0f; 10 | } 11 | } 12 | 13 | // CUDA kernel to add elements of two arrays 14 | __global__ 15 | void add(int n, float *x, float *y) 16 | { 17 | int index = blockIdx.x * blockDim.x + threadIdx.x; 18 | int stride = blockDim.x * gridDim.x; 19 | for (int i = index; i < n; i += stride) 20 | y[i] = x[i] + y[i]; 21 | } 22 | 23 | int 
main(void) 24 | { 25 | int N = 1<<20; 26 | float *x, *y; 27 | 28 | // Allocate Unified Memory -- accessible from CPU or GPU 29 | cudaMallocManaged(&x, N*sizeof(float)); 30 | cudaMallocManaged(&y, N*sizeof(float)); 31 | 32 | // Launch kernel on 1M elements on the GPU 33 | int blockSize = 256; 34 | int numBlocks = (N + blockSize - 1) / blockSize; 35 | init<<>>(N, x, y); 36 | add<<>>(N, x, y); 37 | 38 | // Wait for GPU to finish before accessing on host 39 | cudaDeviceSynchronize(); 40 | 41 | // Check for errors (all values should be 3.0f) 42 | float maxError = 0.0f; 43 | for (int i = 0; i < N; i++) 44 | maxError = fmax(maxError, fabs(y[i]-3.0f)); 45 | std::cout << "Max error: " << maxError << std::endl; 46 | 47 | // Free memory 48 | cudaFree(x); 49 | cudaFree(y); 50 | 51 | return 0; 52 | } 53 | -------------------------------------------------------------------------------- /Chapter02/02_memory_overview/06_unified_memory/unified_memory_prefetch.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | // CUDA kernel to add elements of two arrays 5 | __global__ 6 | void add(int n, float *x, float *y) 7 | { 8 | int index = blockIdx.x * blockDim.x + threadIdx.x; 9 | int stride = blockDim.x * gridDim.x; 10 | for (int i = index; i < n; i += stride) 11 | y[i] = x[i] + y[i]; 12 | } 13 | 14 | int main(void) 15 | { 16 | int N = 1<<20; 17 | float *x, *y; 18 | int device = -1; 19 | 20 | // Allocate Unified Memory -- accessible from CPU or GPU 21 | cudaMallocManaged(&x, N*sizeof(float)); 22 | cudaMallocManaged(&y, N*sizeof(float)); 23 | 24 | // initialize x and y arrays on the host 25 | for (int i = 0; i < N; i++) { 26 | x[i] = 1.0f; 27 | y[i] = 2.0f; 28 | } 29 | 30 | cudaGetDevice(&device); 31 | // GPU prefetches unified memory memory 32 | cudaMemPrefetchAsync(x, N*sizeof(float), device, NULL); 33 | cudaMemPrefetchAsync(y, N*sizeof(float), device, NULL); 34 | 35 | // Launch kernel on 1M elements on the GPU 36 | int blockSize = 256; 37 | int numBlocks = (N + blockSize - 1) / blockSize; 38 | add<<>>(N, x, y); 39 | // Host prefecthes Memory 40 | cudaMemPrefetchAsync(y, N*sizeof(float), cudaCpuDeviceId, NULL); 41 | // Wait for GPU to finish before accessing on host 42 | cudaDeviceSynchronize(); 43 | 44 | // Check for errors (all values should be 3.0f) 45 | float maxError = 0.0f; 46 | for (int i = 0; i < N; i++) 47 | maxError = fmax(maxError, fabs(y[i]-3.0f)); 48 | std::cout << "Max error: " << maxError << std::endl; 49 | 50 | // Free memory 51 | cudaFree(x); 52 | cudaFree(y); 53 | 54 | return 0; 55 | } 56 | -------------------------------------------------------------------------------- /Chapter02/02_memory_overview/Makefile: -------------------------------------------------------------------------------- 1 | # Project folders that contain CUDA receipts 2 | PROJECTS ?= $(shell find \ 3 | $(shell ls -d */) \ 4 | -name Makefile) 5 | 6 | %.ph_build: 7 | +@$(MAKE) -C $(dir $*) $(MAKECMDGOALS) 8 | 9 | %.ph_clean: 10 | +@$(MAKE) -C $(dir $*) clean $(USE_DEVICE) 11 | 12 | all: $(addsuffix .ph_build,$(PROJECTS)) 13 | @echo "Finished building CUDA Receipts" 14 | 15 | build: $(addsuffix .ph_build,$(PROJECTS)) 16 | 17 | tidy: 18 | @find * | egrep "#" | xargs rm -f 19 | @find * | egrep "\~" | xargs rm -f 20 | @find * | egrep "nvvp" | xargs rm -f 21 | 22 | .PHONY: clean 23 | clean: tidy $(addsuffix .ph_clean,$(PROJECTS)) 24 | 25 | test: 26 | echo $(DIRECTORY) 27 | -------------------------------------------------------------------------------- 
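
The prefetch recipe above (06_unified_memory/unified_memory_prefetch.cu) moves the managed arrays to the GPU with cudaMemPrefetchAsync before the kernel launch, but it never measures what the prefetch buys. A minimal sketch of how that could be timed with CUDA events follows; it is not part of the repository, and it assumes the same 1M-element arrays, 256-thread blocks, and add kernel as the recipe it accompanies.

#include <cstdio>
#include <cuda_runtime.h>

// Same element-wise add as the unified-memory recipes (assumed signature).
__global__ void add(int n, float *x, float *y)
{
    int index = blockIdx.x * blockDim.x + threadIdx.x;
    int stride = blockDim.x * gridDim.x;
    for (int i = index; i < n; i += stride)
        y[i] = x[i] + y[i];
}

int main(void)
{
    const int N = 1 << 20;
    float *x, *y;
    cudaMallocManaged(&x, N * sizeof(float));
    cudaMallocManaged(&y, N * sizeof(float));

    // Touch the managed memory on the host first, so the pages start CPU-resident.
    for (int i = 0; i < N; i++) { x[i] = 1.0f; y[i] = 2.0f; }

    int device = 0;
    cudaGetDevice(&device);

    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    int blockSize = 256;
    int numBlocks = (N + blockSize - 1) / blockSize;

    // Prefetch the managed arrays to the GPU before the launch; comment these
    // two calls out to compare against on-demand page migration.
    cudaMemPrefetchAsync(x, N * sizeof(float), device, 0);
    cudaMemPrefetchAsync(y, N * sizeof(float), device, 0);

    cudaEventRecord(start);
    add<<<numBlocks, blockSize>>>(N, x, y);
    cudaEventRecord(stop);
    cudaEventSynchronize(stop);

    float ms = 0.0f;
    cudaEventElapsedTime(&ms, start, stop);
    printf("add kernel: %.3f ms\n", ms);

    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    cudaFree(x);
    cudaFree(y);
    return 0;
}

Running the sketch twice, once with the two cudaMemPrefetchAsync calls commented out, makes the cost of on-demand page migration visible in the reported kernel time.
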
/Chapter03/03_cuda_thread_programming/01_warp_and_thread_block/Makefile: -------------------------------------------------------------------------------- 1 | CUDA_PATH=/usr/local/cuda 2 | HOST_COMPILER ?= g++ 3 | NVCC=${CUDA_PATH}/bin/nvcc -ccbin ${HOST_COMPILER} 4 | 5 | INCLUDES= -I${CUDA_PATH}/samples/common/inc 6 | NVCC_FLAGS=-m64 -lineinfo 7 | 8 | IS_CUDA_11:=${shell expr `$(NVCC) --version | grep compilation | grep -Eo -m 1 '[0-9]+.[0-9]' | head -1` \>= 11.0} 9 | 10 | # Gencode argumentes 11 | SMS = 35 37 50 52 60 61 70 75 12 | ifeq "$(IS_CUDA_11)" "1" 13 | SMS = 52 60 61 70 75 80 14 | endif 15 | $(foreach sm, ${SMS}, $(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm))) 16 | 17 | cuda_thread_block: cuda_thread_block.cu 18 | ${NVCC} ${INCLUDES} ${ALL_CCFLAGS} ${GENCODE_FLAGS} -o $@ $< 19 | 20 | clean: 21 | rm -f cuda_thread_block 22 | -------------------------------------------------------------------------------- /Chapter03/03_cuda_thread_programming/01_warp_and_thread_block/cuda_thread_block.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | /** 5 | * In this section, we will discover concurrent operation in CUDA 6 | * 1) blocks in grid: concurrent tasks, no gurantee their order of execution (no synchronization) 7 | * 2) warp in blocks: concurrent threads, explicitly synchronizable (it will be discussed in next section) 8 | * 3) thread in warp: implicitly synchronized 9 | */ 10 | 11 | __global__ void idx_print() 12 | { 13 | int idx = blockIdx.x * blockDim.x + threadIdx.x; 14 | int warp_idx = threadIdx.x / warpSize; 15 | int lane_idx = threadIdx.x & (warpSize - 1); 16 | 17 | if ((lane_idx & (warpSize/2 - 1)) == 0) 18 | // thread, block, warp, lane" 19 | printf(" %5d\t%5d\t %2d\t%2d\n", idx, blockIdx.x, warp_idx, lane_idx); 20 | } 21 | 22 | int main(int argc, char* argv[]) 23 | { 24 | if (argc == 1) { 25 | puts("Please put Block Size and Thread Block Size.."); 26 | puts("./cuda_thread_block [grid size] [block size]"); 27 | puts("e.g.) 
./cuda_thread_block 4 128"); 28 | 29 | exit(1); 30 | } 31 | 32 | int gridSize = atoi(argv[1]); 33 | int blockSize = atoi(argv[2]); 34 | 35 | puts("thread, block, warp, lane"); 36 | idx_print<<>>(); 37 | cudaDeviceSynchronize(); 38 | } 39 | -------------------------------------------------------------------------------- /Chapter03/03_cuda_thread_programming/02_cuda_occupancy/Makefile: -------------------------------------------------------------------------------- 1 | CUDA_PATH=/usr/local/cuda 2 | HOST_COMPILER ?= g++ 3 | NVCC=${CUDA_PATH}/bin/nvcc -ccbin ${HOST_COMPILER} 4 | TARGET=sgemm 5 | 6 | INCLUDES = -I${CUDA_PATH}/samples/common/inc 7 | NVCC_FLAGS=-m64 -lineinfo --resource-usage 8 | 9 | IS_CUDA_11:=${shell expr `$(NVCC) --version | grep compilation | grep -Eo -m 1 '[0-9]+.[0-9]' | head -1` \>= 11.0} 10 | 11 | # Gencode argumentes 12 | SMS = 35 37 50 52 60 61 70 75 13 | ifeq "$(IS_CUDA_11)" "1" 14 | SMS = 52 60 61 70 75 80 15 | endif 16 | $(foreach sm, ${SMS}, $(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm))) 17 | 18 | all: ${TARGET} 19 | 20 | # SEGMM 21 | sgemm: sgemm.cu 22 | $(EXEC) $(NVCC) $(INCLUDES) $(NVCC_FLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES) 23 | 24 | clean: 25 | rm -f ${TARGET} *.o 26 | -------------------------------------------------------------------------------- /Chapter03/03_cuda_thread_programming/03_threadsync_and_reduction/Makefile: -------------------------------------------------------------------------------- 1 | CUDA_PATH=/usr/local/cuda 2 | HOST_COMPILER ?= g++ 3 | NVCC=${CUDA_PATH}/bin/nvcc -ccbin ${HOST_COMPILER} 4 | TARGET=reduction_global reduction_shared 5 | 6 | INCLUDES = -I${CUDA_PATH}/samples/common/inc 7 | NVCC_FLAGS=-m64 # --resource-usage 8 | 9 | IS_CUDA_11:=${shell expr `$(NVCC) --version | grep compilation | grep -Eo -m 1 '[0-9]+.[0-9]' | head -1` \>= 11.0} 10 | 11 | # Gencode argumentes 12 | SMS = 35 37 50 52 60 61 70 75 13 | ifeq "$(IS_CUDA_11)" "1" 14 | SMS = 52 60 61 70 75 80 15 | endif 16 | $(foreach sm, ${SMS}, $(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm))) 17 | 18 | # Openmp 19 | LIBRARIES += 20 | ALL_CCFLAGS += 21 | 22 | all : ${TARGET} 23 | 24 | 25 | reduction_global_kernel.o: reduction_global_kernel.cu 26 | $(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $< 27 | 28 | reduction_global.o: reduction_global.cpp 29 | $(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $< 30 | 31 | reduction_global: reduction_global.o reduction_global_kernel.o 32 | $(EXEC) $(NVCC) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES) 33 | 34 | reduction_shared_kernel.o: reduction_shared_kernel.cu 35 | $(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $< 36 | 37 | reduction_shared.o: reduction_shared.cpp 38 | $(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $< 39 | 40 | reduction_shared: reduction_shared.o reduction_shared_kernel.o 41 | $(EXEC) $(NVCC) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES) 42 | 43 | clean: 44 | rm -f ${TARGET} *.o *.nvvp 45 | -------------------------------------------------------------------------------- /Chapter03/03_cuda_thread_programming/03_threadsync_and_reduction/reduction.h: -------------------------------------------------------------------------------- 1 | #ifndef _REDUCTION_H_ 2 | #define _REDUCTION_H_ 3 | 4 | // @reduction_kernel.cu 5 | void reduction(float *d_out, float *d_in, int n_threads, int size); 6 | 7 | // @naive_reduction_kernel.cu 8 | void global_reduction(float *d_out, float *d_in, int 
n_threads, int size); 9 | // void atomic_reduction(float *d_out, float *d_in, int n_threads, int size); 10 | 11 | #endif // _REDUCTION_H_ 12 | -------------------------------------------------------------------------------- /Chapter03/03_cuda_thread_programming/03_threadsync_and_reduction/reduction_global_kernel.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | __global__ void 5 | global_reduction_kernel(float *data_out, float *data_in, int stride, int size) 6 | { 7 | int idx_x = blockIdx.x * blockDim.x + threadIdx.x; 8 | 9 | if (idx_x + stride < size) { 10 | data_out[idx_x] += data_in[idx_x + stride]; 11 | } 12 | } 13 | 14 | void global_reduction(float *d_out, float *d_in, int n_threads, int size) 15 | { 16 | int n_blocks = (size + n_threads - 1) / n_threads; 17 | for (int stride = 1; stride < size; stride *= 2) { 18 | global_reduction_kernel<<>>(d_out, d_in, stride, size); 19 | } 20 | } 21 | -------------------------------------------------------------------------------- /Chapter03/03_cuda_thread_programming/03_threadsync_and_reduction/reduction_shared_kernel.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include "reduction.h" 3 | 4 | /* 5 | Parallel sum reduction using shared memory 6 | - takes log(n) steps for n input elements 7 | - uses n threads 8 | - only works for power-of-2 arrays 9 | */ 10 | 11 | // cuda thread synchronization 12 | __global__ void 13 | reduction_kernel(float* d_out, float* d_in, unsigned int size) 14 | { 15 | unsigned int idx_x = blockIdx.x * blockDim.x + threadIdx.x; 16 | 17 | extern __shared__ float s_data[]; 18 | 19 | s_data[threadIdx.x] = (idx_x < size) ? d_in[idx_x] : 0.f; 20 | 21 | __syncthreads(); 22 | 23 | // do reduction 24 | for (unsigned int stride = 1; stride < blockDim.x; stride *= 2) 25 | { 26 | // thread synchronous reduction 27 | if ( (idx_x % (stride * 2)) == 0 ) 28 | s_data[threadIdx.x] += s_data[threadIdx.x + stride]; 29 | 30 | __syncthreads(); 31 | } 32 | 33 | if (threadIdx.x == 0) 34 | d_out[blockIdx.x] = s_data[0]; 35 | } 36 | 37 | void reduction(float *d_out, float *d_in, int n_threads, int size) 38 | { 39 | cudaMemcpy(d_out, d_in, size * sizeof(float), cudaMemcpyDeviceToDevice); 40 | while(size > 1) 41 | { 42 | int n_blocks = (size + n_threads - 1) / n_threads; 43 | reduction_kernel<<< n_blocks, n_threads, n_threads * sizeof(float), 0 >>>(d_out, d_out, size); 44 | size = n_blocks; 45 | } 46 | } -------------------------------------------------------------------------------- /Chapter03/03_cuda_thread_programming/04_performance_limiter/Makefile: -------------------------------------------------------------------------------- 1 | # CUDA_PATH=${CUDA_ROOT} 2 | CUDA_PATH=/usr/local/cuda 3 | HOST_COMPILER ?= g++ 4 | NVCC=${CUDA_PATH}/bin/nvcc -ccbin ${HOST_COMPILER} 5 | TARGET=sgemm 6 | 7 | INCLUDES = -I${CUDA_PATH}/samples/common/inc 8 | NVCC_FLAGS=-m64 -lineinfo 9 | 10 | IS_CUDA_11:=${shell expr `$(NVCC) --version | grep compilation | grep -Eo -m 1 '[0-9]+.[0-9]' | head -1` \>= 11.0} 11 | 12 | # Gencode argumentes 13 | SMS = 35 37 50 52 60 61 70 75 14 | ifeq "$(IS_CUDA_11)" "1" 15 | SMS = 52 60 61 70 75 80 16 | endif 17 | $(foreach sm, ${SMS}, $(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm))) 18 | 19 | ALL_CCFLAGS += $(NVCC_FLAGS) 20 | 21 | all: ${TARGET} 22 | 23 | # SEGMM 24 | sgemm: sgemm.cu 25 | $(EXEC) $(NVCC) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES) 26 | 27 | clean: 28 | rm -f ${TARGET} 
*.o 29 | -------------------------------------------------------------------------------- /Chapter03/03_cuda_thread_programming/04_performance_limiter/reduction.h: -------------------------------------------------------------------------------- 1 | #ifndef _REDUCTION_H_ 2 | #define _REDUCTION_H_ 3 | 4 | // @reduction_kernel.cu 5 | void reduction(float *d_out, float *d_in, int n_threads, int size); 6 | 7 | // @naive_reduction_kernel.cu 8 | void global_reduction(float *d_out, float *d_in, int n_threads, int size); 9 | // void atomic_reduction(float *d_out, float *d_in, int n_threads, int size); 10 | 11 | #endif // _REDUCTION_H_ 12 | -------------------------------------------------------------------------------- /Chapter03/03_cuda_thread_programming/04_performance_limiter/reduction_shared_kernel.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include "reduction.h" 3 | 4 | /* 5 | Parallel sum reduction using shared memory 6 | - takes log(n) steps for n input elements 7 | - uses n threads 8 | - only works for power-of-2 arrays 9 | */ 10 | 11 | // cuda thread synchronization 12 | __global__ void 13 | reduction_kernel(float* d_out, float* d_in, unsigned int size) 14 | { 15 | unsigned int idx_x = blockIdx.x * blockDim.x + threadIdx.x; 16 | 17 | extern __shared__ float s_data[]; 18 | 19 | s_data[threadIdx.x] = (idx_x < size) ? d_in[idx_x] : 0.f; 20 | 21 | __syncthreads(); 22 | 23 | // do reduction 24 | for (unsigned int stride = 1; stride < blockDim.x; stride *= 2) 25 | { 26 | // thread synchronous reduction 27 | // to reduce the compute utilization, we can switch the operation 28 | // if ( (idx_x % (stride * 2)) == 0 ) // 0.433 ms 29 | if ( (idx_x & (stride * 2 - 1)) == 0 ) // 0.399 ms 30 | s_data[threadIdx.x] += s_data[threadIdx.x + stride]; 31 | 32 | __syncthreads(); 33 | } 34 | 35 | if (threadIdx.x == 0) 36 | d_out[blockIdx.x] = s_data[0]; 37 | } 38 | 39 | void reduction(float *d_out, float *d_in, int n_threads, int size) 40 | { 41 | cudaMemcpy(d_out, d_in, size * sizeof(float), cudaMemcpyDeviceToDevice); 42 | while(size > 1) 43 | { 44 | int n_blocks = (size + n_threads - 1) / n_threads; 45 | reduction_kernel<<< n_blocks, n_threads, n_threads * sizeof(float), 0 >>>(d_out, d_out, size); 46 | size = n_blocks; 47 | } 48 | } -------------------------------------------------------------------------------- /Chapter03/03_cuda_thread_programming/05_warp_divergence/Makefile: -------------------------------------------------------------------------------- 1 | CUDA_PATH=/usr/local/cuda 2 | HOST_COMPILER ?= g++ 3 | NVCC=${CUDA_PATH}/bin/nvcc -ccbin ${HOST_COMPILER} 4 | TARGET=reduction_sequential reduction_interleaving 5 | 6 | INCLUDES = -I${CUDA_PATH}/samples/common/inc 7 | NVCC_FLAGS=-m64 -lineinfo # --resource-usage 8 | 9 | IS_CUDA_11:=${shell expr `$(NVCC) --version | grep compilation | grep -Eo -m 1 '[0-9]+.[0-9]' | head -1` \>= 11.0} 10 | 11 | # Gencode argumentes 12 | SMS = 35 37 50 52 60 61 70 75 13 | ifeq "$(IS_CUDA_11)" "1" 14 | SMS = 52 60 61 70 75 80 15 | endif 16 | $(foreach sm, ${SMS}, $(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm))) 17 | 18 | # Openmp 19 | LIBRARIES += -lgomp 20 | ALL_CCFLAGS += -g -Xcompiler -fopenmp -rdc=true ${NVCC_FLAGS} 21 | 22 | all : ${TARGET} 23 | 24 | reduction_kernel_interleaving.o: reduction_kernel_interleaving.cu 25 | $(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $< 26 | 27 | reduction_kernel_sequential.o: reduction_kernel_sequential.cu 28 | $(EXEC) $(NVCC) 
$(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $< 29 | 30 | reduction.o: reduction.cpp 31 | $(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $< 32 | 33 | reduction_sequential: reduction.o reduction_kernel_sequential.o 34 | $(EXEC) $(NVCC) ${INCLUDES} -o $@ $(ALL_CCFLAGS) $(GENCODE_FLAGS) $(LIBRARIES) $+ 35 | 36 | reduction_interleaving: reduction.o reduction_kernel_interleaving.o 37 | $(EXEC) $(NVCC) ${INCLUDES} -o $@ $(ALL_CCFLAGS) $(GENCODE_FLAGS) $(LIBRARIES) $+ 38 | 39 | clean: 40 | rm -f ${TARGET} *.o 41 | -------------------------------------------------------------------------------- /Chapter03/03_cuda_thread_programming/05_warp_divergence/reduction.h: -------------------------------------------------------------------------------- 1 | #ifndef _REDUCTION_H_ 2 | #define _REDUCTION_H_ 3 | 4 | // @ calling the reduction kernel 5 | int reduction(float *g_outPtr, float *g_inPtr, int size, int n_threads); 6 | 7 | #define max(a, b) (a) > (b) ? (a) : (b) 8 | 9 | #endif // _REDUCTION_H_ -------------------------------------------------------------------------------- /Chapter03/03_cuda_thread_programming/05_warp_divergence/reduction_kernel_interleaving.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include "reduction.h" 3 | 4 | /* 5 | Parallel sum reduction using shared memory 6 | - takes log(n) steps for n input elements 7 | - uses n threads 8 | - only works for power-of-2 arrays 9 | */ 10 | 11 | // cuda thread synchronization 12 | __global__ void 13 | reduction_kernel_1(float* g_out, float* g_in, unsigned int size) 14 | { 15 | unsigned int idx_x = blockIdx.x * blockDim.x + threadIdx.x; 16 | 17 | extern __shared__ float s_data[]; 18 | 19 | s_data[threadIdx.x] = (idx_x < size) ? g_in[idx_x] : 0.f; 20 | 21 | __syncthreads(); 22 | 23 | // do reduction 24 | // interleaved addressing 25 | for (unsigned int stride = 1; stride < blockDim.x; stride *= 2) 26 | { 27 | int index = 2 * stride * threadIdx.x; 28 | 29 | if (index < blockDim.x) 30 | s_data[index] += s_data[index + stride]; 31 | 32 | __syncthreads(); 33 | } 34 | 35 | if (threadIdx.x == 0) 36 | g_out[blockIdx.x] = s_data[0]; 37 | } 38 | 39 | int reduction(float *g_outPtr, float *g_inPtr, int size, int n_threads) 40 | { 41 | int n_blocks = (size + n_threads - 1) / n_threads; 42 | reduction_kernel_1<<< n_blocks, n_threads, n_threads * sizeof(float), 0 >>>(g_outPtr, g_inPtr, size); 43 | return n_blocks; 44 | } 45 | -------------------------------------------------------------------------------- /Chapter03/03_cuda_thread_programming/05_warp_divergence/reduction_kernel_sequential.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include "reduction.h" 3 | 4 | /* 5 | Parallel sum reduction using shared memory 6 | - takes log(n) steps for n input elements 7 | - uses n threads 8 | - only works for power-of-2 arrays 9 | */ 10 | __global__ void 11 | reduction_kernel_2(float *g_out, float *g_in, unsigned int size) 12 | { 13 | unsigned int idx_x = blockIdx.x * blockDim.x + threadIdx.x; 14 | 15 | extern __shared__ float s_data[]; 16 | 17 | s_data[threadIdx.x] = (idx_x < size) ? 
g_in[idx_x] : 0.f; 18 | 19 | __syncthreads(); 20 | 21 | // do reduction 22 | // sequential addressing 23 | for (unsigned int stride = blockDim.x / 2; stride > 0; stride >>= 1) 24 | { 25 | if (threadIdx.x < stride) 26 | s_data[threadIdx.x] += s_data[threadIdx.x + stride]; 27 | 28 | __syncthreads(); 29 | } 30 | 31 | if (threadIdx.x == 0) 32 | g_out[blockIdx.x] = s_data[0]; 33 | } 34 | 35 | int reduction(float *g_outPtr, float *g_inPtr, int size, int n_threads) 36 | { 37 | int n_blocks = (size + n_threads - 1) / n_threads; 38 | reduction_kernel_2<<< n_blocks, n_threads, n_threads * sizeof(float), 0 >>>(g_outPtr, g_inPtr, size); 39 | return n_blocks; 40 | } -------------------------------------------------------------------------------- /Chapter03/03_cuda_thread_programming/06_limiter_balancing/Makefile: -------------------------------------------------------------------------------- 1 | CUDA_PATH=/usr/local/cuda 2 | HOST_COMPILER ?= g++ 3 | NVCC=${CUDA_PATH}/bin/nvcc -ccbin ${HOST_COMPILER} 4 | TARGET=reduction 5 | 6 | INCLUDES = -I${CUDA_PATH}/samples/common/inc 7 | NVCC_FLAGS=-m64 -lineinfo # --resource-usage 8 | 9 | IS_CUDA_11:=${shell expr `$(NVCC) --version | grep compilation | grep -Eo -m 1 '[0-9]+.[0-9]' | head -1` \>= 11.0} 10 | 11 | # Gencode argumentes 12 | SMS = 35 37 50 52 60 61 70 75 13 | ifeq "$(IS_CUDA_11)" "1" 14 | SMS = 52 60 61 70 75 80 15 | endif 16 | $(foreach sm, ${SMS}, $(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm))) 17 | 18 | # Openmp 19 | LIBRARIES += -lgomp 20 | ALL_CCFLAGS += -g -Xcompiler -fopenmp -rdc=true 21 | 22 | all : ${TARGET} 23 | 24 | reduction_kernel.o: reduction_kernel.cu 25 | $(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $< 26 | 27 | reduction.o: reduction.cpp 28 | $(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $< 29 | 30 | reduction: reduction.o reduction_kernel.o 31 | $(EXEC) $(NVCC) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES) 32 | 33 | clean: 34 | rm -f ${TARGET} *.o 35 | -------------------------------------------------------------------------------- /Chapter03/03_cuda_thread_programming/06_limiter_balancing/reduction.h: -------------------------------------------------------------------------------- 1 | #ifndef _REDUCTION_H_ 2 | #define _REDUCTION_H_ 3 | 4 | // @reduction_kernel.cu 5 | int reduction(float *g_outPtr, float *g_inPtr, int size, int n_threads); 6 | 7 | #endif // _REDUCTION_H_ -------------------------------------------------------------------------------- /Chapter03/03_cuda_thread_programming/06_limiter_balancing/reduction_kernel.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include "reduction.h" 3 | 4 | /* 5 | Parallel sum reduction using shared memory 6 | - takes log(n) steps for n input elements 7 | - uses n threads 8 | - only works for power-of-2 arrays 9 | */ 10 | __global__ void 11 | reduction_kernel(float *g_out, float *g_in, unsigned int size) 12 | { 13 | unsigned int idx_x = blockIdx.x * blockDim.x + threadIdx.x; 14 | 15 | extern __shared__ float s_data[]; 16 | 17 | // cumulates input with grid-stride loop and save to share memory 18 | float input = 0.f; 19 | for (int i = idx_x; i < size; i += blockDim.x * gridDim.x) 20 | input += g_in[i]; 21 | s_data[threadIdx.x] = input; 22 | 23 | __syncthreads(); 24 | 25 | // do reduction 26 | for (unsigned int stride = blockDim.x / 2; stride > 0; stride >>= 1) 27 | { 28 | if (threadIdx.x < stride) 29 | s_data[threadIdx.x] += s_data[threadIdx.x + stride]; 30 | 31 
| __syncthreads(); 32 | } 33 | 34 | if (threadIdx.x == 0) { 35 | g_out[blockIdx.x] = s_data[0]; 36 | } 37 | } 38 | 39 | int reduction(float *g_outPtr, float *g_inPtr, int size, int n_threads) 40 | { 41 | int num_sms; 42 | int num_blocks_per_sm; 43 | cudaDeviceGetAttribute(&num_sms, cudaDevAttrMultiProcessorCount, 0); 44 | cudaOccupancyMaxActiveBlocksPerMultiprocessor(&num_blocks_per_sm, reduction_kernel, n_threads, n_threads*sizeof(float)); 45 | int n_blocks = min(num_blocks_per_sm * num_sms, (size + n_threads - 1) / n_threads); 46 | 47 | reduction_kernel<<>>(g_outPtr, g_inPtr, size); 48 | reduction_kernel<<<1, n_threads, n_threads * sizeof(float), 0>>>(g_outPtr, g_outPtr, n_blocks); 49 | 50 | return 1; 51 | } 52 | -------------------------------------------------------------------------------- /Chapter03/03_cuda_thread_programming/07_warp_synchronous_programming/Makefile: -------------------------------------------------------------------------------- 1 | CUDA_PATH=/usr/local/cuda 2 | HOST_COMPILER ?= g++ 3 | NVCC=${CUDA_PATH}/bin/nvcc -ccbin ${HOST_COMPILER} 4 | TARGET=reduction_wp 5 | 6 | INCLUDES = -I${CUDA_PATH}/samples/common/inc 7 | NVCC_FLAGS=-m64 -lineinfo # --resource-usage 8 | 9 | IS_CUDA_11:=${shell expr `$(NVCC) --version | grep compilation | grep -Eo -m 1 '[0-9]+.[0-9]' | head -1` \>= 11.0} 10 | 11 | # Gencode argumentes 12 | SMS = 35 37 50 52 60 61 70 75 13 | ifeq "$(IS_CUDA_11)" "1" 14 | SMS = 52 60 61 70 75 80 15 | endif 16 | $(foreach sm, ${SMS}, $(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm))) 17 | 18 | # Openmp 19 | LIBRARIES += -lgomp 20 | ALL_CCFLAGS += -g -Xcompiler -fopenmp -rdc=true $(NVCC_FLAGS) 21 | 22 | all : ${TARGET} 23 | 24 | reduction_wp_kernel.o: reduction_wp_kernel.cu 25 | $(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $< 26 | 27 | reduction.o: reduction.cpp 28 | $(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $< 29 | 30 | reduction_wp: reduction.o reduction_wp_kernel.o 31 | $(EXEC) $(NVCC) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES) 32 | 33 | clean: 34 | rm -f ${TARGET} *.o 35 | -------------------------------------------------------------------------------- /Chapter03/03_cuda_thread_programming/07_warp_synchronous_programming/reduction.h: -------------------------------------------------------------------------------- 1 | #ifndef _REDUCTION_H_ 2 | #define _REDUCTION_H_ 3 | 4 | // @reduction_kernel.cu 5 | void reduction(float *g_outPtr, float *g_inPtr, int size, int n_threads); 6 | 7 | #endif // _REDUCTION_H_ -------------------------------------------------------------------------------- /Chapter03/03_cuda_thread_programming/08_cooperative_group/Makefile: -------------------------------------------------------------------------------- 1 | CUDA_PATH=/usr/local/cuda 2 | HOST_COMPILER ?= g++ 3 | NVCC=${CUDA_PATH}/bin/nvcc -ccbin ${HOST_COMPILER} 4 | TARGET=reduction_cg reduction_cg_shift 5 | 6 | INCLUDES = -I${CUDA_PATH}/samples/common/inc 7 | NVCC_FLAGS=-m64 -lineinfo # --resource-usage 8 | 9 | IS_CUDA_11:=${shell expr `$(NVCC) --version | grep compilation | grep -Eo -m 1 '[0-9]+.[0-9]' | head -1` \>= 11.0} 10 | 11 | # Gencode argumentes 12 | SMS = 35 37 50 52 60 61 70 75 13 | ifeq "$(IS_CUDA_11)" "1" 14 | SMS = 52 60 61 70 75 80 15 | endif 16 | $(foreach sm, ${SMS}, $(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm))) 17 | 18 | # Openmp 19 | LIBRARIES += -lgomp 20 | ALL_CCFLAGS += -g -Xcompiler -fopenmp -rdc=true 21 | 22 | all : ${TARGET} 23 | 24 | reduction.o: 
reduction.cpp 25 | $(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $< 26 | 27 | reduction_cg_kernel.o: reduction_cg_kernel.cu 28 | $(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $< 29 | 30 | reduction_cg: reduction.o reduction_cg_kernel.o 31 | $(EXEC) $(NVCC) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES) 32 | 33 | reduction_cg_shift_kernel.o: reduction_cg_shift_kernel.cu 34 | $(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $< 35 | 36 | reduction_cg_shift: reduction.o reduction_cg_shift_kernel.o 37 | $(EXEC) $(NVCC) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES) 38 | 39 | clean: 40 | rm -f ${TARGET} *.o 41 | -------------------------------------------------------------------------------- /Chapter03/03_cuda_thread_programming/08_cooperative_group/reduction.h: -------------------------------------------------------------------------------- 1 | #ifndef _REDUCTION_H_ 2 | #define _REDUCTION_H_ 3 | 4 | // @reduction_kernel.cu 5 | void reduction(float *g_outPtr, float *g_inPtr, int size, int n_threads); 6 | 7 | #endif // _REDUCTION_H_ -------------------------------------------------------------------------------- /Chapter03/03_cuda_thread_programming/09_loop_unrolling/Makefile: -------------------------------------------------------------------------------- 1 | CUDA_PATH=/usr/local/cuda 2 | HOST_COMPILER ?= g++ 3 | NVCC=${CUDA_PATH}/bin/nvcc -ccbin ${HOST_COMPILER} 4 | TARGET=reduction_wp reduction_cg 5 | 6 | INCLUDES = -I${CUDA_PATH}/samples/common/inc 7 | NVCC_FLAGS=-m64 -lineinfo# --resource-usage 8 | 9 | IS_CUDA_11:=${shell expr `$(NVCC) --version | grep compilation | grep -Eo -m 1 '[0-9]+.[0-9]' | head -1` \>= 11.0} 10 | 11 | # Gencode argumentes 12 | SMS = 35 37 50 52 60 61 70 75 13 | ifeq "$(IS_CUDA_11)" "1" 14 | SMS = 52 60 61 70 75 80 15 | endif 16 | $(foreach sm, ${SMS}, $(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm))) 17 | 18 | # Openmp 19 | LIBRARIES += -lgomp 20 | ALL_CCFLAGS += -g -Xcompiler -fopenmp -rdc=true $(NVCC_FLAGS) 21 | 22 | all : ${TARGET} 23 | 24 | reduction_wp_kernel.o: reduction_wp_kernel.cu 25 | $(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $< 26 | 27 | reduction_cg_kernel.o: reduction_cg_kernel.cu 28 | $(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $< 29 | 30 | reduction.o: reduction.cpp 31 | $(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $< 32 | 33 | reduction_wp: reduction.o reduction_wp_kernel.o 34 | $(EXEC) $(NVCC) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES) 35 | 36 | reduction_cg: reduction.o reduction_cg_kernel.o 37 | $(EXEC) $(NVCC) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES) 38 | 39 | clean: 40 | rm -f ${TARGET} *.o 41 | -------------------------------------------------------------------------------- /Chapter03/03_cuda_thread_programming/09_loop_unrolling/reduction.h: -------------------------------------------------------------------------------- 1 | #ifndef _REDUCTION_H_ 2 | #define _REDUCTION_H_ 3 | 4 | // @reduction_kernel.cu 5 | void reduction(float *g_outPtr, float *g_inPtr, int size, int n_threads); 6 | 7 | #endif // _REDUCTION_H_ -------------------------------------------------------------------------------- /Chapter03/03_cuda_thread_programming/10_atomic_operation/reduction.h: -------------------------------------------------------------------------------- 1 | #ifndef _REDUCTION_H_ 2 | #define _REDUCTION_H_ 3 | 4 | // @reduction_wrp_atmc_kernel.cu 5 | // 
@reduction_blk_atmc_kernel.cu 6 | // @reduction_kernel.cu 7 | void atomic_reduction(float *g_outPtr, float *g_inPtr, int size, int n_threads); 8 | 9 | #endif // _REDUCTION_H_ -------------------------------------------------------------------------------- /Chapter03/03_cuda_thread_programming/10_atomic_operation/reduction_kernel.cu: -------------------------------------------------------------------------------- 1 | #include <stdio.h> 2 | #include <cooperative_groups.h> 3 | #include "reduction.h" 4 | 5 | using namespace cooperative_groups; 6 | 7 | /* 8 | Parallel sum reduction using shared memory 9 | - takes log(n) steps for n input elements 10 | - uses n threads 11 | - only works for power-of-2 arrays 12 | */ 13 | 14 | /** 15 | Two warp level primitives are used here for this example 16 | https://devblogs.nvidia.com/faster-parallel-reductions-kepler/ 17 | https://devblogs.nvidia.com/using-cuda-warp-level-primitives/ 18 | */ 19 | 20 | __global__ void 21 | atomic_reduction_kernel(float *data_out, float *data_in, int size) 22 | { 23 | int idx_x = blockIdx.x * blockDim.x + threadIdx.x; 24 | 25 | atomicAdd(&data_out[0], data_in[idx_x]); 26 | } 27 | 28 | void atomic_reduction(float *g_outPtr, float *g_inPtr, int size, int n_threads) 29 | { 30 | int n_blocks = (size + n_threads - 1) / n_threads; 31 | atomic_reduction_kernel<<<n_blocks, n_threads>>>(g_outPtr, g_inPtr, size); 32 | } 33 | -------------------------------------------------------------------------------- /Chapter03/03_cuda_thread_programming/11_mixed_precision_operation/Makefile: -------------------------------------------------------------------------------- 1 | CUDA_PATH=/usr/local/cuda 2 | HOST_COMPILER ?= g++ 3 | NVCC=${CUDA_PATH}/bin/nvcc -ccbin ${HOST_COMPILER} 4 | TARGET=mixed_precision_single mixed_precision_half mixed_precision_int 5 | 6 | INCLUDES = -I${CUDA_PATH}/samples/common/inc 7 | NVCC_FLAGS=-m64 -lineinfo # --resource-usage 8 | 9 | IS_CUDA_11:=${shell expr `$(NVCC) --version | grep compilation | grep -Eo -m 1 '[0-9]+.[0-9]' | head -1` \>= 11.0} 10 | 11 | # Gencode arguments 12 | SMS = 35 37 50 52 60 61 70 75 13 | ifeq "$(IS_CUDA_11)" "1" 14 | SMS = 52 60 61 70 75 80 15 | endif 16 | $(foreach sm, ${SMS}, $(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm))) 17 | 18 | # Openmp 19 | LIBRARIES += -lgomp 20 | 21 | ALL_CCFLAGS += -g -Xcompiler -fopenmp -rdc=true $(NVCC_FLAGS) $(INCLUDES) 22 | 23 | all : ${TARGET} 24 | 25 | mixed_precision_half: mixed_precision_half.cu 26 | $(EXEC) $(NVCC) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES) 27 | 28 | mixed_precision_single: mixed_precision.cu 29 | $(EXEC) $(NVCC) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES) 30 | 31 | mixed_precision_int: mixed_precision_int.cu 32 | $(EXEC) $(NVCC) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES) 33 | 34 | clean: 35 | rm -f ${TARGET} *.o 36 | -------------------------------------------------------------------------------- /Chapter03/03_cuda_thread_programming/Makefile: -------------------------------------------------------------------------------- 1 | # Project folders that contain CUDA recipes 2 | PROJECTS ?= $(shell find \ 3 | $(shell ls -d */) \ 4 | -name Makefile) 5 | 6 | %.ph_build: 7 | +@$(MAKE) -C $(dir $*) $(MAKECMDGOALS) 8 | 9 | %.ph_clean: 10 | +@$(MAKE) -C $(dir $*) clean $(USE_DEVICE) 11 | 12 | all: $(addsuffix .ph_build,$(PROJECTS)) 13 | @echo "Finished building CUDA Recipes" 14 | 15 | build: $(addsuffix .ph_build,$(PROJECTS)) 16 | 17 | tidy: 18 | @find * | egrep "#" | xargs rm -f 19 | @find * | egrep "\~" | xargs rm -f 20 | @find * | egrep "nvvp" |
xargs rm -f 21 | 22 | .PHONY: clean 23 | clean: tidy $(addsuffix .ph_clean,$(PROJECTS)) 24 | 25 | test: 26 | echo $(DIRECTORY) 27 | -------------------------------------------------------------------------------- /Chapter04/04_kernel_execution/01_cuda_stream/1_cuda_default_stream.cu: -------------------------------------------------------------------------------- 1 | #include <cstdio> 2 | 3 | using namespace std; 4 | 5 | __global__ void 6 | foo_kernel(int step) 7 | { 8 | printf("loop: %d\n", step); 9 | } 10 | 11 | int main() 12 | { 13 | int n_loop = 5; 14 | 15 | // execute kernels with the default stream 16 | for (int i = 0; i < n_loop; i++) 17 | foo_kernel<<< 1, 1, 0, 0 >>>(i); 18 | 19 | cudaDeviceSynchronize(); 20 | 21 | return 0; 22 | } -------------------------------------------------------------------------------- /Chapter04/04_kernel_execution/01_cuda_stream/2_cuda_multi_stream.cu: -------------------------------------------------------------------------------- 1 | #include <cstdio> 2 | 3 | using namespace std; 4 | 5 | __global__ void 6 | foo_kernel(int step) 7 | { 8 | printf("loop: %d\n", step); 9 | } 10 | 11 | int main() 12 | { 13 | int n_stream = 5; 14 | cudaStream_t *ls_stream; 15 | ls_stream = (cudaStream_t*) new cudaStream_t[n_stream]; 16 | 17 | // create multiple streams 18 | for (int i = 0; i < n_stream; i++) 19 | cudaStreamCreate(&ls_stream[i]); 20 | 21 | // execute kernels with each CUDA stream 22 | for (int i = 0; i < n_stream; i++) 23 | foo_kernel<<< 1, 1, 0, ls_stream[i] >>>(i); 24 | 25 | // synchronize the host and GPU 26 | cudaDeviceSynchronize(); 27 | 28 | // terminates all the created CUDA streams 29 | for (int i = 0; i < n_stream; i++) 30 | cudaStreamDestroy(ls_stream[i]); 31 | delete [] ls_stream; 32 | 33 | return 0; 34 | } -------------------------------------------------------------------------------- /Chapter04/04_kernel_execution/01_cuda_stream/3_cuda_multi_stream_with_sync.cu: -------------------------------------------------------------------------------- 1 | #include <cstdio> 2 | 3 | using namespace std; 4 | 5 | __global__ void 6 | foo_kernel(int step) 7 | { 8 | printf("loop: %d\n", step); 9 | } 10 | 11 | int main() 12 | { 13 | int n_stream = 5; 14 | cudaStream_t *ls_stream; 15 | ls_stream = (cudaStream_t*) new cudaStream_t[n_stream]; 16 | 17 | // create multiple streams 18 | for (int i = 0; i < n_stream; i++) 19 | cudaStreamCreate(&ls_stream[i]); 20 | 21 | // execute kernels with each CUDA stream 22 | for (int i = 0; i < n_stream; i++) { 23 | foo_kernel<<< 1, 1, 0, ls_stream[i] >>>(i); 24 | cudaStreamSynchronize(ls_stream[i]); 25 | } 26 | 27 | // synchronize the host and GPU 28 | cudaDeviceSynchronize(); 29 | 30 | // terminates all the created CUDA streams 31 | for (int i = 0; i < n_stream; i++) 32 | cudaStreamDestroy(ls_stream[i]); 33 | delete [] ls_stream; 34 | 35 | return 0; 36 | } -------------------------------------------------------------------------------- /Chapter04/04_kernel_execution/01_cuda_stream/4_cuda_multi_stream_with_default.cu: -------------------------------------------------------------------------------- 1 | #include <cstdio> 2 | 3 | using namespace std; 4 | 5 | __global__ void 6 | foo_kernel(int step) 7 | { 8 | printf("loop: %d\n", step); 9 | } 10 | 11 | int main() 12 | { 13 | int n_stream = 5; 14 | cudaStream_t *ls_stream; 15 | ls_stream = (cudaStream_t*) new cudaStream_t[n_stream]; 16 | 17 | // create multiple streams 18 | for (int i = 0; i < n_stream; i++) 19 | cudaStreamCreate(&ls_stream[i]); 20 | 21 | // execute kernels with each CUDA stream 22 | for
(int i = 0; i < n_stream; i++) 23 | if (i == 3) 24 | foo_kernel<<< 1, 1, 0, 0 >>>(i); 25 | else 26 | foo_kernel<<< 1, 1, 0, ls_stream[i] >>>(i); 27 | 28 | // synchronize the host and GPU 29 | cudaDeviceSynchronize(); 30 | 31 | // terminates all the created CUDA streams 32 | for (int i = 0; i < n_stream; i++) 33 | cudaStreamDestroy(ls_stream[i]); 34 | delete [] ls_stream; 35 | 36 | return 0; 37 | } -------------------------------------------------------------------------------- /Chapter04/04_kernel_execution/01_cuda_stream/Makefile: -------------------------------------------------------------------------------- 1 | CUDA_PATH=/usr/local/cuda 2 | HOST_COMPILER ?= g++ 3 | NVCC=${CUDA_PATH}/bin/nvcc -ccbin ${HOST_COMPILER} 4 | TARGET=cuda_default_stream cuda_multi_stream cuda_multi_stream_with_sync cuda_multi_stream_with_default 5 | 6 | INCLUDES = -I${CUDA_PATH}/samples/common/inc 7 | NVCC_FLAGS=-m64 # --resource-usage 8 | 9 | IS_CUDA_11:=${shell expr `$(NVCC) --version | grep compilation | grep -Eo -m 1 '[0-9]+.[0-9]' | head -1` \>= 11.0} 10 | 11 | # Gencode argumentes 12 | SMS = 35 37 50 52 60 61 70 75 13 | ifeq "$(IS_CUDA_11)" "1" 14 | SMS = 52 60 61 70 75 80 15 | endif 16 | $(foreach sm, ${SMS}, $(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm))) 17 | 18 | # Openmp 19 | LIBRARIES += 20 | ALL_CCFLAGS += 21 | 22 | all : ${TARGET} 23 | 24 | cuda_default_stream: 1_cuda_default_stream.cu 25 | $(EXEC) $(NVCC) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ $< $(LIBRARIES) 26 | 27 | cuda_multi_stream: 2_cuda_multi_stream.cu 28 | $(EXEC) $(NVCC) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ $< $(LIBRARIES) 29 | 30 | cuda_multi_stream_with_sync: 3_cuda_multi_stream_with_sync.cu 31 | $(EXEC) $(NVCC) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ $< $(LIBRARIES) 32 | 33 | cuda_multi_stream_with_default: 4_cuda_multi_stream_with_default.cu 34 | $(EXEC) $(NVCC) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ $< $(LIBRARIES) 35 | 36 | clean: 37 | rm -f ${TARGET} *.o *.nvvp 38 | -------------------------------------------------------------------------------- /Chapter04/04_kernel_execution/02_pipelining/Makefile: -------------------------------------------------------------------------------- 1 | CUDA_PATH=/usr/local/cuda 2 | HOST_COMPILER ?= g++ 3 | NVCC=${CUDA_PATH}/bin/nvcc -ccbin ${HOST_COMPILER} 4 | TARGET=cuda_pipelining 5 | 6 | INCLUDES = -I${CUDA_PATH}/samples/common/inc 7 | NVCC_FLAGS=-m64 # --default-stream per-thread # --resource-usage 8 | 9 | IS_CUDA_11:=${shell expr `$(NVCC) --version | grep compilation | grep -Eo -m 1 '[0-9]+.[0-9]' | head -1` \>= 11.0} 10 | 11 | # Gencode argumentes 12 | SMS = 35 37 50 52 60 61 70 75 13 | ifeq "$(IS_CUDA_11)" "1" 14 | SMS = 52 60 61 70 75 80 15 | endif 16 | $(foreach sm, ${SMS}, $(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm))) 17 | 18 | # Openmp 19 | LIBRARIES += 20 | ALL_CCFLAGS += $(NVCC_FLAGS) 21 | 22 | all : ${TARGET} 23 | 24 | cuda_pipelining: cuda_pipelining.cu 25 | $(EXEC) $(NVCC) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES) $(INCLUDES) 26 | 27 | nvprof: cuda_pipelining 28 | nvprof -f -o $+_${STREAMS}.nvvp ./$+ ${STREAMS} 29 | 30 | clean: 31 | rm -f ${TARGET} *.o *.nvvp 32 | -------------------------------------------------------------------------------- /Chapter04/04_kernel_execution/03_cuda_callback/Makefile: -------------------------------------------------------------------------------- 1 | CUDA_PATH=/usr/local/cuda 2 | HOST_COMPILER ?= g++ 3 | NVCC=${CUDA_PATH}/bin/nvcc -ccbin ${HOST_COMPILER} 4 | TARGET=cuda_callback 5 | 6 | INCLUDES = 
-I${CUDA_PATH}/samples/common/inc 7 | NVCC_FLAGS=-m64 --default-stream per-thread # --resource-usage 8 | 9 | IS_CUDA_11:=${shell expr `$(NVCC) --version | grep compilation | grep -Eo -m 1 '[0-9]+.[0-9]' | head -1` \>= 11.0} 10 | 11 | # Gencode argumentes 12 | SMS = 35 37 50 52 60 61 70 75 13 | ifeq "$(IS_CUDA_11)" "1" 14 | SMS = 52 60 61 70 75 80 15 | endif 16 | $(foreach sm, ${SMS}, $(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm))) 17 | 18 | # Openmp 19 | LIBRARIES += 20 | ALL_CCFLAGS += $(NVCC_FLAGS) 21 | 22 | all : ${TARGET} 23 | 24 | cuda_callback: cuda_callback.cu 25 | $(EXEC) $(NVCC) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES) $(INCLUDES) 26 | 27 | nvprof: cuda_callback 28 | nvprof -f -o $+_${STREAMS}.nvvp --cpu-thread-tracing on ./$+ ${STREAMS} 29 | 30 | clean: 31 | rm -f ${TARGET} *.o *.nvvp 32 | -------------------------------------------------------------------------------- /Chapter04/04_kernel_execution/04_stream_priority/Makefile: -------------------------------------------------------------------------------- 1 | CUDA_PATH=/usr/local/cuda 2 | HOST_COMPILER ?= g++ 3 | NVCC=${CUDA_PATH}/bin/nvcc -ccbin ${HOST_COMPILER} 4 | TARGET=prioritized_cuda_stream 5 | 6 | INCLUDES = -I${CUDA_PATH}/samples/common/inc 7 | NVCC_FLAGS=-m64 --default-stream per-thread # --resource-usage 8 | 9 | IS_CUDA_11:=${shell expr `$(NVCC) --version | grep compilation | grep -Eo -m 1 '[0-9]+.[0-9]' | head -1` \>= 11.0} 10 | 11 | # Gencode argumentes 12 | SMS = 35 37 50 52 60 61 70 75 13 | ifeq "$(IS_CUDA_11)" "1" 14 | SMS = 52 60 61 70 75 80 15 | endif 16 | $(foreach sm, ${SMS}, $(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm))) 17 | 18 | # Openmp 19 | LIBRARIES += 20 | ALL_CCFLAGS += $(NVCC_FLAGS) 21 | 22 | all : ${TARGET} 23 | 24 | prioritized_cuda_stream: prioritized_cuda_stream.cu 25 | $(EXEC) $(NVCC) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES) $(INCLUDES) 26 | 27 | nvprof: prioritized_cuda_stream 28 | nvprof -f -o $+_${STREAMS}.nvvp --cpu-thread-tracing on ./$+ ${STREAMS} 29 | 30 | clean: 31 | rm -f ${TARGET} *.o *.nvvp 32 | -------------------------------------------------------------------------------- /Chapter04/04_kernel_execution/05_cuda_event/Makefile: -------------------------------------------------------------------------------- 1 | CUDA_PATH=/usr/local/cuda 2 | HOST_COMPILER ?= g++ 3 | NVCC=${CUDA_PATH}/bin/nvcc -ccbin ${HOST_COMPILER} 4 | TARGET=cuda_event cuda_event_with_streams 5 | 6 | INCLUDES = -I${CUDA_PATH}/samples/common/inc 7 | NVCC_FLAGS= #-m64 --default-stream per-thread # --resource-usage 8 | 9 | IS_CUDA_11:=${shell expr `$(NVCC) --version | grep compilation | grep -Eo -m 1 '[0-9]+.[0-9]' | head -1` \>= 11.0} 10 | 11 | # Gencode argumentes 12 | SMS = 35 37 50 52 60 61 70 75 13 | ifeq "$(IS_CUDA_11)" "1" 14 | SMS = 52 60 61 70 75 80 15 | endif 16 | $(foreach sm, ${SMS}, $(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm))) 17 | 18 | # Openmp 19 | LIBRARIES += -lgomp 20 | ALL_CCFLAGS += $(NVCC_FLAGS) -g -Xcompiler -fopenmp -rdc=true 21 | 22 | # Openmp 23 | # LIBRARIES += -lgomp 24 | # ALL_CCFLAGS += -g -Xcompiler -fopenmp -rdc=true 25 | 26 | all : ${TARGET} 27 | 28 | cuda_event: cuda_event.cu 29 | $(EXEC) $(NVCC) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES) $(INCLUDES) 30 | 31 | cuda_event_with_streams: cuda_event_with_streams.cu 32 | $(EXEC) $(NVCC) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES) $(INCLUDES) 33 | 34 | clean: 35 | rm -f ${TARGET} *.o *.nvvp 36 | 
-------------------------------------------------------------------------------- /Chapter04/04_kernel_execution/06_dynamic_parallelism/Makefile: -------------------------------------------------------------------------------- 1 | CUDA_PATH=/usr/local/cuda 2 | HOST_COMPILER ?= g++ 3 | NVCC=${CUDA_PATH}/bin/nvcc -ccbin ${HOST_COMPILER} 4 | TARGET=dynamic_parallelism recursion 5 | 6 | INCLUDES = -I${CUDA_PATH}/samples/common/inc 7 | NVCC_FLAGS=-m64 -rdc=true -lcudadevrt # --resource-usage 8 | 9 | IS_CUDA_11:=${shell expr `$(NVCC) --version | grep compilation | grep -Eo -m 1 '[0-9]+.[0-9]' | head -1` \>= 11.0} 10 | 11 | # Gencode arguments 12 | SMS = 35 37 50 52 60 61 70 75 13 | ifeq "$(IS_CUDA_11)" "1" 14 | SMS = 52 60 61 70 75 80 15 | endif 16 | $(foreach sm, ${SMS}, $(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm))) 17 | 18 | LIBRARIES += 19 | ALL_CCFLAGS += ${NVCC_FLAGS} 20 | 21 | all : ${TARGET} 22 | 23 | dynamic_parallelism: dynamic_parallelism.cu 24 | $(EXEC) $(NVCC) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES) $(INCLUDES) 25 | 26 | recursion: recursion.cu 27 | $(EXEC) $(NVCC) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES) $(INCLUDES) 28 | 29 | nvprof: dynamic_parallelism 30 | nvprof -f -o $+.nvvp ./$+ 31 | 32 | clean: 33 | rm -f ${TARGET} *.o *.nvvp 34 | -------------------------------------------------------------------------------- /Chapter04/04_kernel_execution/06_dynamic_parallelism/dynamic_parallelism.cu: -------------------------------------------------------------------------------- 1 | #include <cstdio> 2 | #include <cstdlib> 3 | #include <cuda_runtime.h> 4 | 5 | using namespace std; 6 | 7 | #define BUF_SIZE (1 << 10) 8 | #define BLOCKDIM 256 9 | 10 | __global__ void child_kernel(int *data, int seed) 11 | { 12 | int idx = blockIdx.x * blockDim.x + threadIdx.x; 13 | 14 | atomicAdd(&data[idx], seed); 15 | } 16 | 17 | __global__ void parent_kernel(int *data) 18 | { 19 | if (threadIdx.x == 0) 20 | { 21 | int child_size = BUF_SIZE/gridDim.x; 22 | child_kernel<<< child_size/BLOCKDIM, BLOCKDIM >>>(&data[child_size*blockIdx.x], blockIdx.x+1); 23 | } 24 | // synchronization for other parent's kernel output 25 | cudaDeviceSynchronize(); 26 | } 27 | 28 | int main() 29 | { 30 | int *data; 31 | int num_child = 2; 32 | 33 | cudaMallocManaged((void**)&data, BUF_SIZE * sizeof(int)); 34 | cudaMemset(data, 0, BUF_SIZE * sizeof(int)); 35 | 36 | parent_kernel<<<num_child, 1>>>(data); 37 | 38 | cudaDeviceSynchronize(); 39 | 40 | // Count elements value 41 | int counter = 0; 42 | for (int i = 0; i < BUF_SIZE; i++) { 43 | counter += data[i]; 44 | } 45 | 46 | // getting answer 47 | int counter_h = 0; 48 | for (int i = 0; i < num_child; i++) { 49 | counter_h += (i+1); 50 | } 51 | counter_h *= BUF_SIZE / num_child; 52 | 53 | if (counter_h == counter) 54 | printf("Correct!!\n"); 55 | else 56 | printf("Error!! Obtained %d.
It should be %d\n", counter, counter_h); 57 | 58 | cudaFree(data); 59 | 60 | return 0; 61 | } 62 | 63 | -------------------------------------------------------------------------------- /Chapter04/04_kernel_execution/07_grid_level_cg/Makefile: -------------------------------------------------------------------------------- 1 | CUDA_PATH=/usr/local/cuda 2 | HOST_COMPILER ?= g++ 3 | NVCC=${CUDA_PATH}/bin/nvcc -ccbin ${HOST_COMPILER} 4 | TARGET=reduction 5 | 6 | INCLUDES = -I${CUDA_PATH}/samples/common/inc 7 | NVCC_FLAGS=-m64 -lineinfo # --resource-usage 8 | 9 | IS_CUDA_11:=${shell expr `$(NVCC) --version | grep compilation | grep -Eo -m 1 '[0-9]+.[0-9]' | head -1` \>= 11.0} 10 | 11 | # Gencode argumentes 12 | SMS = 35 37 50 52 60 61 70 75 13 | ifeq "$(IS_CUDA_11)" "1" 14 | SMS = 52 60 61 70 75 80 15 | endif 16 | $(foreach sm, ${SMS}, $(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm))) 17 | 18 | # Openmp 19 | LIBRARIES += -lgomp 20 | ALL_CCFLAGS += -g -Xcompiler -fopenmp -rdc=true 21 | 22 | all : ${TARGET} 23 | 24 | reduction_kernel.o: reduction_kernel.cu 25 | $(EXEC) $(NVCC) $(NVCC_FLAGS) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $< 26 | 27 | reduction.o: reduction.cpp 28 | $(EXEC) $(NVCC) $(NVCC_FLAGS) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $< 29 | 30 | reduction: reduction.o reduction_kernel.o 31 | $(EXEC) $(NVCC) $(NVCC_FLAGS) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES) 32 | 33 | nvprof: reduction 34 | nvprof -f -o $+.nvvp --cpu-thread-tracing on ./$+ 35 | nvprof -f -o $+-metric.nvvp --analysis-metrics ./$+ 36 | 37 | clean: 38 | rm -f ${TARGET} *.o 39 | -------------------------------------------------------------------------------- /Chapter04/04_kernel_execution/07_grid_level_cg/reduction.h: -------------------------------------------------------------------------------- 1 | #ifndef _REDUCTION_H_ 2 | #define _REDUCTION_H_ 3 | 4 | // @reduction_loop_kernel.cu 5 | int reduction_grid_sync(float *g_outPtr, float *g_inPtr, int size, int n_threads); 6 | 7 | #endif // _REDUCTION_H_ -------------------------------------------------------------------------------- /Chapter04/04_kernel_execution/08_openmp_cuda/Makefile: -------------------------------------------------------------------------------- 1 | CUDA_PATH=/usr/local/cuda 2 | HOST_COMPILER ?= g++ 3 | NVCC=${CUDA_PATH}/bin/nvcc -ccbin ${HOST_COMPILER} 4 | TARGET=openmp openmp_default_stream openmp_gpus 5 | 6 | INCLUDES = -I${CUDA_PATH}/samples/common/inc 7 | NVCC_FLAGS=-m64 -lineinfo # --resource-usage 8 | 9 | IS_CUDA_11:=${shell expr `$(NVCC) --version | grep compilation | grep -Eo -m 1 '[0-9]+.[0-9]' | head -1` \>= 11.0} 10 | 11 | # Gencode argumentes 12 | SMS = 35 37 50 52 60 61 70 75 13 | ifeq "$(IS_CUDA_11)" "1" 14 | SMS = 52 60 61 70 75 80 15 | endif 16 | $(foreach sm, ${SMS}, $(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm))) 17 | 18 | # Openmp 19 | LIBRARIES += -lgomp 20 | ALL_CCFLAGS += -g -Xcompiler -fopenmp -rdc=true 21 | 22 | all : ${TARGET} 23 | 24 | openmp: openmp.cu 25 | $(EXEC) $(NVCC) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES) $(INCLUDES) 26 | 27 | openmp_gpus: openmp_gpus.cu 28 | $(EXEC) $(NVCC) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES) $(INCLUDES) 29 | 30 | openmp_default_stream: openmp_default_stream.cu 31 | $(EXEC) $(NVCC) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES) $(INCLUDES) 32 | 33 | nvprof: openmp 34 | nvprof -f -o $+.nvvp --cpu-thread-tracing on ./$+ 35 | 36 | n_ops: openmp_gpus 37 | nvprof -f -o $+_${STREAMS}.nvvp 
--cpu-thread-tracing on ./$+ ${STREAMS} 38 | 39 | clean: 40 | rm -f ${TARGET} *.o *.nvvp 41 | -------------------------------------------------------------------------------- /Chapter04/04_kernel_execution/09_mps/Makefile: -------------------------------------------------------------------------------- 1 | CUDA_PATH=/usr/local/cuda 2 | HOST_COMPILER ?= g++ 3 | NVCC=${CUDA_PATH}/bin/nvcc -ccbin ${HOST_COMPILER} 4 | MPICC ?= mpicc 5 | TARGET=simpleMPI 6 | 7 | INCLUDES = -I${CUDA_PATH}/samples/common/inc -I/usr/local/include/ 8 | NVCC_FLAGS=-m64 -Xcompiler -fopenmp -rdc=true -lcudadevrt -lmpi # --resource-usage 9 | 10 | IS_CUDA_11:=${shell expr `$(NVCC) --version | grep compilation | grep -Eo -m 1 '[0-9]+.[0-9]' | head -1` \>= 11.0} 11 | 12 | # Gencode argumentes 13 | SMS = 35 37 50 52 60 61 70 75 14 | ifeq "$(IS_CUDA_11)" "1" 15 | SMS = 52 60 61 70 75 80 16 | endif 17 | $(foreach sm, ${SMS}, $(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm))) 18 | 19 | LIBRARIES += -lgomp 20 | ALL_CCFLAGS += ${NVCC_FLAGS} 21 | 22 | all : ${TARGET} 23 | 24 | simpleMPI: simpleMPI.cu 25 | $(EXEC) $(NVCC) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES) $(INCLUDES) 26 | 27 | enable_mps: 28 | export CUDA_VISIBLE_DEVICES=0 29 | sudo nvidia-smi -c 3 -i 0 30 | sudo nvidia-cuda-mps-control -d 31 | 32 | disable_mps: 33 | echo "quit" | sudo nvidia-cuda-mps-control 34 | sudo nvidia-smi -c 0 -i 0 35 | 36 | nvprof: simpleMPI 37 | mpirun -np ${PROCS} nvprof -f -o $+.%q{OMPI_COMM_WORLD_RANK}_${STREAMS}.nvvp ./$+ ${STREAMS} 38 | 39 | clean: 40 | rm -f ${TARGET} *.o *.nvvp 41 | -------------------------------------------------------------------------------- /Chapter04/04_kernel_execution/09_mps/install_mpi.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | MPI_VERSION="3.0.4" 3 | 4 | wget -O /tmp/openmpi-${MPI_VERSION}.tar.gz https://www.open-mpi.org/software/ompi/v3.0/downloads/openmpi-${MPI_VERSION}.tar.gz 5 | tar xzf /tmp/openmpi-${MPI_VERSION}.tar.gz -C /tmp 6 | cd /tmp/openmpi-${MPI_VERSION} 7 | ./configure --enable-orterun-prefix-by-default 8 | make -j $(nproc) all && sudo make install 9 | sudo ldconfig 10 | mpirun --version -------------------------------------------------------------------------------- /Chapter04/04_kernel_execution/10_kernel_execution_overhead/Makefile: -------------------------------------------------------------------------------- 1 | CUDA_PATH=/usr/local/cuda 2 | HOST_COMPILER ?= g++ 3 | NVCC=${CUDA_PATH}/bin/nvcc -ccbin ${HOST_COMPILER} 4 | MPICC ?= mpicc 5 | TARGET=cuda_kernel 6 | 7 | INCLUDES = -I${CUDA_PATH}/samples/common/inc 8 | NVCC_FLAGS=-m64 -rdc=true # --resource-usage 9 | 10 | IS_CUDA_11:=${shell expr `$(NVCC) --version | grep compilation | grep -Eo -m 1 '[0-9]+.[0-9]' | head -1` \>= 11.0} 11 | 12 | # Gencode argumentes 13 | SMS = 35 37 50 52 60 61 70 75 14 | ifeq "$(IS_CUDA_11)" "1" 15 | SMS = 52 60 61 70 75 80 16 | endif 17 | $(foreach sm, ${SMS}, $(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm))) 18 | 19 | LIBRARIES += 20 | ALL_CCFLAGS += ${NVCC_FLAGS} 21 | 22 | all : ${TARGET} 23 | 24 | cuda_kernel: cuda_kernel.cu 25 | $(EXEC) $(NVCC) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES) $(INCLUDES) 26 | 27 | nvprof: cuda_kernel 28 | nvprof -f -o $+.nvvp ./$+ 29 | 30 | clean: 31 | rm -f ${TARGET} *.o *.nvvp 32 | -------------------------------------------------------------------------------- /Chapter04/04_kernel_execution/Makefile: 
-------------------------------------------------------------------------------- 1 | # Project folders that contain CUDA receipts 2 | PROJECTS ?= $(shell find \ 3 | $(shell ls -d */) \ 4 | -name Makefile) 5 | 6 | %.ph_build: 7 | +@$(MAKE) -C $(dir $*) $(MAKECMDGOALS) 8 | 9 | %.ph_clean: 10 | +@$(MAKE) -C $(dir $*) clean $(USE_DEVICE) 11 | 12 | all: $(addsuffix .ph_build,$(PROJECTS)) 13 | @echo "Finished building CUDA Receipts" 14 | 15 | build: $(addsuffix .ph_build,$(PROJECTS)) 16 | 17 | tidy: 18 | @find * | egrep "#" | xargs rm -f 19 | @find * | egrep "\~" | xargs rm -f 20 | @find * | egrep "nvvp" | xargs rm -f 21 | 22 | clean: tidy $(addsuffix .ph_clean,$(PROJECTS)) 23 | 24 | test: 25 | echo $(DIRECTORY) -------------------------------------------------------------------------------- /Chapter05/05_debug_profiling/.gitignore: -------------------------------------------------------------------------------- 1 | /Debug/ 2 | -------------------------------------------------------------------------------- /Chapter05/05_debug_profiling/01_focused_profile/Makefile: -------------------------------------------------------------------------------- 1 | CUDA_PATH=/usr/local/cuda 2 | HOST_COMPILER ?= g++ 3 | NVCC=${CUDA_PATH}/bin/nvcc -ccbin ${HOST_COMPILER} 4 | TARGET=sgemm 5 | 6 | INCLUDES = -I${CUDA_PATH}/samples/common/inc 7 | NVCC_FLAGS=-G 8 | 9 | IS_CUDA_11:=${shell expr `$(NVCC) --version | grep compilation | grep -Eo -m 1 '[0-9]+.[0-9]' | head -1` \>= 11.0} 10 | 11 | # Gencode argumentes 12 | SMS = 35 37 50 52 60 61 70 75 13 | ifeq "$(IS_CUDA_11)" "1" 14 | SMS = 52 60 61 70 75 80 15 | endif 16 | $(foreach sm, ${SMS}, $(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm))) 17 | 18 | ALL_CCFLAGS +=-m64 -g $(NVCC_FLAGS) $(INCLUDES) 19 | 20 | all : ${TARGET} 21 | 22 | sgemm: sgemm.cu 23 | $(EXEC) $(NVCC) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES) 24 | 25 | clean: 26 | rm -f ${TARGET} *.o -------------------------------------------------------------------------------- /Chapter05/05_debug_profiling/02_nvtx/Makefile: -------------------------------------------------------------------------------- 1 | CUDA_PATH=/usr/local/cuda 2 | HOST_COMPILER ?= g++ 3 | NVCC=${CUDA_PATH}/bin/nvcc -ccbin ${HOST_COMPILER} 4 | TARGET=sgemm 5 | 6 | INCLUDES = -I${CUDA_PATH}/samples/common/inc 7 | NVCC_FLAGS= -lnvToolsExt 8 | 9 | IS_CUDA_11:=${shell expr `$(NVCC) --version | grep compilation | grep -Eo -m 1 '[0-9]+.[0-9]' | head -1` \>= 11.0} 10 | 11 | # Gencode argumentes 12 | SMS = 35 37 50 52 60 61 70 75 13 | ifeq "$(IS_CUDA_11)" "1" 14 | SMS = 52 60 61 70 75 80 15 | endif 16 | $(foreach sm, ${SMS}, $(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm))) 17 | 18 | ALL_CCFLAGS +=-m64 -g $(NVCC_FLAGS) $(INCLUDES) 19 | 20 | all : ${TARGET} 21 | 22 | sgemm: sgemm.cu 23 | $(EXEC) $(NVCC) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES) 24 | 25 | nvprof: sgemm 26 | nvprof -f --profile-from-start off -o sgemm.nvvp ./sgemm.nvvp 27 | 28 | clean: 29 | rm -f ${TARGET} *.o *.nvvp -------------------------------------------------------------------------------- /Chapter05/05_debug_profiling/03_cuda_error/Makefile: -------------------------------------------------------------------------------- 1 | CUDA_PATH=/usr/local/cuda 2 | HOST_COMPILER ?= g++ 3 | NVCC=${CUDA_PATH}/bin/nvcc -ccbin ${HOST_COMPILER} 4 | TARGET=sgemm 5 | 6 | INCLUDES = -I${CUDA_PATH}/samples/common/inc 7 | NVCC_FLAGS=-m64 -lineinfo # --resource-usage 8 | 9 | IS_CUDA_11:=${shell expr `$(NVCC) --version | grep compilation | 
grep -Eo -m 1 '[0-9]+.[0-9]' | head -1` \>= 11.0} 10 | 11 | # Gencode arguments 12 | SMS = 35 37 50 52 60 61 70 75 13 | ifeq "$(IS_CUDA_11)" "1" 14 | SMS = 52 60 61 70 75 80 15 | endif 16 | $(foreach sm, ${SMS}, $(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm))) 17 | 18 | # Openmp 19 | LIBRARIES += -lgomp 20 | 21 | ALL_CCFLAGS += -g -Xcompiler -fopenmp -rdc=true $(NVCC_FLAGS) $(INCLUDES) 22 | 23 | all : ${TARGET} 24 | 25 | sgemm: sgemm.cu 26 | $(EXEC) $(NVCC) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES) 27 | 28 | clean: 29 | rm -f ${TARGET} *.o 30 | -------------------------------------------------------------------------------- /Chapter05/05_debug_profiling/04_cuda_assert/Makefile: -------------------------------------------------------------------------------- 1 | CUDA_PATH=/usr/local/cuda 2 | HOST_COMPILER ?= g++ 3 | NVCC=${CUDA_PATH}/bin/nvcc -ccbin ${HOST_COMPILER} 4 | TARGET=sgemm 5 | 6 | INCLUDES = -I${CUDA_PATH}/samples/common/inc 7 | NVCC_FLAGS=-m64 -lineinfo -G # --resource-usage 8 | 9 | IS_CUDA_11:=${shell expr `$(NVCC) --version | grep compilation | grep -Eo -m 1 '[0-9]+.[0-9]' | head -1` \>= 11.0} 10 | 11 | # Gencode arguments 12 | SMS = 35 37 50 52 60 61 70 75 13 | ifeq "$(IS_CUDA_11)" "1" 14 | SMS = 52 60 61 70 75 80 15 | endif 16 | $(foreach sm, ${SMS}, $(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm))) 17 | 18 | # Openmp 19 | LIBRARIES += -lgomp 20 | 21 | ALL_CCFLAGS += -g -Xcompiler -fopenmp -rdc=true $(NVCC_FLAGS) $(INCLUDES) 22 | 23 | all : ${TARGET} 24 | 25 | sgemm: sgemm.cu 26 | $(EXEC) $(NVCC) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES) 27 | 28 | clean: 29 | rm -f ${TARGET} *.o 30 | -------------------------------------------------------------------------------- /Chapter05/05_debug_profiling/06_debug_with_eclipse/.gitignore: -------------------------------------------------------------------------------- 1 | /Release/ 2 | -------------------------------------------------------------------------------- /Chapter05/05_debug_profiling/06_debug_with_eclipse/.project: -------------------------------------------------------------------------------- 1 | <?xml version="1.0" encoding="UTF-8"?> 2 | <projectDescription> 3 | <name>06_debug_with_eclipse</name> 4 | <comment></comment> 5 | <projects> 6 | </projects> 7 | <buildSpec> 8 | <buildCommand> 9 | <name>org.eclipse.cdt.managedbuilder.core.genmakebuilder</name> 10 | <triggers>clean,full,incremental,</triggers> 11 | <arguments> 12 | </arguments> 13 | </buildCommand> 14 | <buildCommand> 15 | <name>org.eclipse.cdt.managedbuilder.core.ScannerConfigBuilder</name> 16 | <triggers>full,incremental,</triggers> 17 | <arguments> 18 | </arguments> 19 | </buildCommand> 20 | </buildSpec> 21 | <natures> 22 | <nature>org.eclipse.cdt.core.cnature</nature> 23 | <nature>org.eclipse.cdt.core.ccnature</nature> 24 | <nature>org.eclipse.cdt.managedbuilder.core.managedBuildNature</nature> 25 | <nature>org.eclipse.cdt.managedbuilder.core.ScannerConfigNature</nature> 26 | </natures> 27 | </projectDescription> 28 | -------------------------------------------------------------------------------- /Chapter05/05_debug_profiling/06_debug_with_eclipse/.settings/org.eclipse.ltk.core.refactoring.prefs: -------------------------------------------------------------------------------- 1 | eclipse.preferences.version=1 2 | org.eclipse.ltk.core.refactoring.enable.project.refactoring.history=false 3 | -------------------------------------------------------------------------------- /Chapter05/05_debug_profiling/06_debug_with_eclipse/Debug/objects.mk: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # Automatically-generated file. Do not edit!
3 | ################################################################################ 4 | 5 | USER_OBJS := 6 | 7 | LIBS := 8 | 9 | -------------------------------------------------------------------------------- /Chapter05/05_debug_profiling/06_debug_with_eclipse/Debug/sources.mk: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # Automatically-generated file. Do not edit! 3 | ################################################################################ 4 | 5 | O_SRCS := 6 | CPP_SRCS := 7 | C_UPPER_SRCS := 8 | C_SRCS := 9 | S_UPPER_SRCS := 10 | OBJ_SRCS := 11 | CU_SRCS := 12 | ASM_SRCS := 13 | CXX_SRCS := 14 | C++_SRCS := 15 | CC_SRCS := 16 | CU_DEPS := 17 | OBJS := 18 | C++_DEPS := 19 | C_DEPS := 20 | CC_DEPS := 21 | CPP_DEPS := 22 | EXECUTABLES := 23 | CXX_DEPS := 24 | C_UPPER_DEPS := 25 | 26 | # Every subdirectory with source files must be described here 27 | SUBDIRS := \ 28 | src \ 29 | 30 | -------------------------------------------------------------------------------- /Chapter05/05_debug_profiling/06_debug_with_eclipse/Debug/src/subdir.mk: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # Automatically-generated file. Do not edit! 3 | ################################################################################ 4 | 5 | # Add inputs and outputs from these tool invocations to the build variables 6 | CU_SRCS += \ 7 | ../src/simple_sgemm.cu 8 | 9 | CU_DEPS += \ 10 | ./src/simple_sgemm.d 11 | 12 | OBJS += \ 13 | ./src/simple_sgemm.o 14 | 15 | 16 | # Each subdirectory must supply rules for building sources it contributes 17 | src/%.o: ../src/%.cu 18 | @echo 'Building file: $<' 19 | @echo 'Invoking: NVCC Compiler' 20 | /usr/local/cuda-10.0/bin/nvcc -G -g -O0 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_61,code=sm_61 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -odir "src" -M -o "$(@:%.o=%.d)" "$<" 21 | /usr/local/cuda-10.0/bin/nvcc -G -g -O0 --compile --relocatable-device-code=false -gencode arch=compute_60,code=compute_60 -gencode arch=compute_61,code=compute_61 -gencode arch=compute_70,code=compute_70 -gencode arch=compute_75,code=compute_75 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_61,code=sm_61 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -x cu -o "$@" "$<" 22 | @echo 'Finished building: $<' 23 | @echo ' ' 24 | 25 | 26 | -------------------------------------------------------------------------------- /Chapter05/05_debug_profiling/06_debug_with_eclipse/java-7-install.md: -------------------------------------------------------------------------------- 1 | Nsight Eclipse Edition requires Java 1.7 for its Java runtime engine. 2 | 3 | However, the latest Linux platforms no longer provide Java 7, so it has to be installed manually. A JRE is installed along with the CUDA Toolkit; however, it is the latest version and does not resolve this dependency. 4 | 5 | 6 | 7 | First, download the JRE from Oracle's [site](https://www.oracle.com/technetwork/java/javase/downloads/java-archive-downloads-javase7-521261.html). 8 | 9 | Untar the file and move the files into the proper path.
10 | ```bash 11 | $ tar xzf jdk-7u80-linux-x64.tar.gz 12 | $ sudo mkdir /usr/lib/jvm 13 | $ sudo mv jdk1.7.0_80 /usr/lib/jvm 14 | ``` 15 | 16 | In general, the system will use the latest Java version. To use the older Java version, select it with the following command. 17 | 18 | ```bash 19 | $ sudo update-alternatives --config java 20 | 21 | ``` 22 | 23 | For example, update-alternatives lists the installed Java versions. 24 | 25 | ``` 26 | There are 2 choices for the alternative java (providing /usr/bin/java). 27 | 28 | Selection Path Priority Status 29 | ------------------------------------------------------------ 30 | * 0 /usr/lib/jvm/java-11-openjdk-amd64/bin/java 1111 auto mode 31 | 1 /usr/lib/jvm/java-11-openjdk-amd64/bin/java 1111 manual mode 32 | 2 /usr/lib/jvm/jre1.7.0_80/bin/java 1 manual mode 33 | 34 | Press <enter> to keep the current choice[*], or type selection number: 35 | ``` 36 | 37 | 38 | Enter 2 in this case to use Java 1.7.0. 39 | 40 | Do the same for the rest of the JRE runtime files. 41 | ```bash 42 | $ sudo update-alternatives --config javaws 43 | ``` 44 | -------------------------------------------------------------------------------- /Chapter05/05_debug_profiling/07_debug_with_gdb/Makefile: -------------------------------------------------------------------------------- 1 | CUDA_PATH=/usr/local/cuda 2 | HOST_COMPILER ?= g++ 3 | NVCC=${CUDA_PATH}/bin/nvcc -ccbin ${HOST_COMPILER} 4 | TARGET=simple_sgemm 5 | 6 | INCLUDES = -I${CUDA_PATH}/samples/common/inc 7 | NVCC_FLAGS=-m64 -G -Xcompiler -rdynamic 8 | 9 | IS_CUDA_11:=${shell expr `$(NVCC) --version | grep compilation | grep -Eo -m 1 '[0-9]+.[0-9]' | head -1` \>= 11.0} 10 | 11 | # Gencode arguments 12 | SMS = 35 37 50 52 60 61 70 75 13 | ifeq "$(IS_CUDA_11)" "1" 14 | SMS = 52 60 61 70 75 80 15 | endif 16 | $(foreach sm, ${SMS}, $(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm))) 17 | 18 | # Openmp 19 | LIBRARIES += -lgomp 20 | 21 | ALL_CCFLAGS += -g $(NVCC_FLAGS) $(INCLUDES) 22 | 23 | all : ${TARGET} 24 | 25 | simple_sgemm: simple_sgemm.cu 26 | $(EXEC) $(NVCC) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES) 27 | 28 | clean: 29 | rm -f ${TARGET} *.o 30 | -------------------------------------------------------------------------------- /Chapter05/05_debug_profiling/08_memcheck/Makefile: -------------------------------------------------------------------------------- 1 | CUDA_PATH=/usr/local/cuda 2 | HOST_COMPILER ?= g++ 3 | NVCC=${CUDA_PATH}/bin/nvcc -ccbin ${HOST_COMPILER} 4 | TARGET=simple_sgemm simple_sgemm_oob simple_sgemm_mem_leak 5 | 6 | INCLUDES = -I${CUDA_PATH}/samples/common/inc 7 | NVCC_FLAGS=-m64 8 | 9 | IS_CUDA_11:=${shell expr `$(NVCC) --version | grep compilation | grep -Eo -m 1 '[0-9]+.[0-9]' | head -1` \>= 11.0} 10 | 11 | # Gencode arguments 12 | SMS = 35 37 50 52 60 61 70 75 13 | ifeq "$(IS_CUDA_11)" "1" 14 | SMS = 52 60 61 70 75 80 15 | endif 16 | $(foreach sm, ${SMS}, $(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm))) 17 | 18 | # Openmp 19 | LIBRARIES += -lgomp 20 | 21 | ALL_CCFLAGS += -g -G -Xcompiler -rdynamic $(NVCC_FLAGS) $(INCLUDES) 22 | 23 | all : ${TARGET} 24 | 25 | simple_sgemm: simple_sgemm.cu 26 | $(EXEC) $(NVCC) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES) 27 | 28 | simple_sgemm_oob: simple_sgemm_oob.cu 29 | $(EXEC) $(NVCC) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES) 30 | 31 | simple_sgemm_mem_leak: simple_sgemm_mem_leak.cu 32 | $(EXEC) $(NVCC) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES) 33 | 34 | clean: 35 | rm -f
${TARGET} *.o 36 | -------------------------------------------------------------------------------- /Chapter05/05_debug_profiling/Makefile: -------------------------------------------------------------------------------- 1 | # Project folders that contain CUDA receipts 2 | PROJECTS ?= $(shell find \ 3 | $(shell ls -d */) \ 4 | -name Makefile) 5 | 6 | %.ph_build: 7 | +@$(MAKE) -C $(dir $*) $(MAKECMDGOALS) 8 | 9 | %.ph_clean: 10 | +@$(MAKE) -C $(dir $*) clean $(USE_DEVICE) 11 | 12 | all: $(addsuffix .ph_build,$(PROJECTS)) 13 | @echo "Finished building CUDA Receipts" 14 | 15 | build: $(addsuffix .ph_build,$(PROJECTS)) 16 | 17 | tidy: 18 | @find * | egrep "#" | xargs rm -f 19 | @find * | egrep "\~" | xargs rm -f 20 | @find * | egrep "nvvp" | xargs rm -f 21 | 22 | clean: tidy $(addsuffix .ph_clean,$(PROJECTS)) 23 | 24 | test: 25 | echo $(DIRECTORY) -------------------------------------------------------------------------------- /Chapter06/06_multigpu/01_gaussian_single_gpu/config.h: -------------------------------------------------------------------------------- 1 | #ifndef CONFIG_H 2 | #define CONFIG_H 3 | 4 | #define INPUT_TYPE RANDOM 5 | 6 | // Linear system parameters 7 | #define ROWS 300 // Number of rows in the system. 8 | #define COLS 256 // Number of columns in the system 9 | #define PERCENTAGE 50 // Density of coefficient matrix 10 | 11 | #define REFERENCE_SOLUTION "original-matrix" 12 | #define COMPUTED_SOLUTION "computed-solution" 13 | 14 | #define PACK_SIZE (8*sizeof(unsigned int)) 15 | // Ceil of numerator divided by denominator. 16 | #define intCeilDiv(numerator, denominator) (((numerator) + (denominator) - 1)/ (denominator)) 17 | #endif 18 | -------------------------------------------------------------------------------- /Chapter06/06_multigpu/02_gaussian_multi_gpu/Makefile: -------------------------------------------------------------------------------- 1 | CC = g++ 2 | NVCC = nvcc 3 | 4 | ROWS ?= 11000 5 | COLS ?= 10000 6 | 7 | PARAMS = -DROWS=$(ROWS) -DCOLS=$(COLS) 8 | 9 | CFLAGS = -O3 $(PARAMS) -I$(CUDA_PATH)/include/ 10 | #NV_CFLAGS = -gencode arch=compute_20,code=sm_20 -gencode arch=compute_35,code=sm_35 -gencode arch=compute_60,code=sm_60 -O3 --ptxas-options=-v -lineinfo $(PARAMS) -I$(CUDA_PATH)/include/ -Wno-deprecated-gpu-targets 11 | NV_CFLAGS = -gencode arch=compute_60,code=sm_60 -gencode arch=compute_70,code=sm_70 -O3 -lineinfo $(PARAMS) -I$(CUDA_PATH)/include/ -Wno-deprecated-gpu-targets 12 | 13 | BINARY = gaussian_multi_gpu_p2p.out 14 | all: $(BINARY) 15 | 16 | OBJECTS = gaussian_multi_gpu_p2p.o 17 | 18 | $(BINARY): $(OBJECTS) 19 | $(NVCC) $(NV_CFLAGS) -dlink $(OBJECTS) -o gpuObjectCode.o 20 | $(NVCC) $(NV_CFLAGS) gpuObjectCode.o $(OBJECTS) -o $(BINARY) 21 | 22 | %.o : %.c #default rule for making .o files from .c 23 | $(info --- Building '$@' from '$<' using default rule 1) 24 | $(CC) $(CFLAGS) -c -o $@ $< 25 | 26 | %.o : %.cu #default rule for making .o files from .cu 27 | $(info --- Building '$@' from '$<' using default rule 2) 28 | $(NVCC) $(NV_CFLAGS) -dc -o $@ $< 29 | 30 | clean: 31 | rm -rf *o $(BINARY) 32 | 33 | run: $(BINARY) 34 | ./$(BINARY) 35 | -------------------------------------------------------------------------------- /Chapter06/06_multigpu/02_gaussian_multi_gpu/config.h: -------------------------------------------------------------------------------- 1 | #ifndef CONFIG_H 2 | #define CONFIG_H 3 | 4 | #define INPUT_TYPE RANDOM 5 | 6 | // Linear system parameters 7 | #define ROWS 300 // Number of rows in the system. 
8 | #define COLS 256 // Number of columns in the system 9 | #define PERCENTAGE 50 // Density of coefficient matrix 10 | 11 | #define REFERENCE_SOLUTION "original-matrix" 12 | #define COMPUTED_SOLUTION "computed-solution" 13 | 14 | #define PACK_SIZE (8*sizeof(unsigned int)) 15 | // Ceil of numerator divided by denominator. 16 | #define intCeilDiv(numerator, denominator) (((numerator) + (denominator) - 1)/ (denominator)) 17 | 18 | // How many GPUs should be used by solver. Effective only with MULTI_GPU 19 | #define NUMBER_OF_GPU 2 20 | 21 | #endif 22 | -------------------------------------------------------------------------------- /Chapter06/06_multigpu/03_helloWorldMPI/helloWorldMPI.c: -------------------------------------------------------------------------------- 1 | #include <mpi.h> 2 | #include <stdio.h> 3 | int main(int argc, char *argv[]) { 4 | int rank,size; 5 | /* Initialize the MPI library */ 6 | MPI_Init(&argc,&argv); 7 | /* Determine the calling process rank and total number of ranks */ 8 | MPI_Comm_rank(MPI_COMM_WORLD,&rank); 9 | MPI_Comm_size(MPI_COMM_WORLD,&size); 10 | /* Compute based on process rank */ 11 | /* Call MPI routines like MPI_Send, MPI_Recv, ... */ 12 | printf("\n Rank %d, Size %d", rank,size); 13 | /* Shutdown MPI library */ 14 | MPI_Finalize(); 15 | return 0; 16 | } 17 | 18 | -------------------------------------------------------------------------------- /Chapter06/06_multigpu/04_gaussian_multi_node/Makefile: -------------------------------------------------------------------------------- 1 | CC = g++ 2 | NVCC = nvcc 3 | MPICC = mpiCC 4 | 5 | MPIRUN = mpirun 6 | CUDA_PATH = /usr/local/cuda 7 | 8 | ROWS ?= 11000 9 | COLS ?= 10000 10 | 11 | PARAMS = -DROWS=$(ROWS) -DCOLS=$(COLS) 12 | 13 | HOSTFILE ?= myHosts 14 | 15 | CFLAGS = -O3 $(PARAMS) -I$(CUDA_PATH)/include/ -I$(MPI_PATH)/include 16 | INCLUDES = -I$(CUDA_PATH)/include/ -I$(CUDA_PATH)/samples/inc -I$(CUDA_PATH)/targets/x86_64-linux/include 17 | 18 | IS_CUDA_11:=${shell expr `$(NVCC) --version | grep compilation | grep -Eo -m 1 '[0-9]+.[0-9]' | head -1` \>= 11.0} 19 | 20 | # Gencode arguments 21 | SMS = 35 37 50 52 60 61 70 75 22 | ifeq "$(IS_CUDA_11)" "1" 23 | SMS = 52 60 61 70 75 80 24 | endif 25 | $(foreach sm, ${SMS}, $(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm))) 26 | 27 | NV_CFLAGS = $(GENCODE_FLAGS) -O3 -lineinfo $(PARAMS) $(INCLUDES) -Wno-deprecated-gpu-targets 28 | 29 | BINARY = gaussian_multi_gpu_rdma.out 30 | all: $(BINARY) 31 | 32 | OBJECTS := gaussian_multi_gpu_rdma.o gpuSolver.o gpuSolverFunctions.o linearSystemOps.o utilities.o elementUtilities.o 33 | 34 | $(BINARY): $(OBJECTS) 35 | $(NVCC) $(NV_CFLAGS) -dlink $(OBJECTS) -o gpuObjectCode.o 36 | $(MPICC) gpuObjectCode.o $(OBJECTS) -o $(BINARY) -lcudart -L $(CUDA_PATH)/lib64/ 37 | 38 | %.o : %.c #default rule for making .o files from .c 39 | $(info --- Building '$@' from '$<' using default rule 1) 40 | $(MPICC) $(CFLAGS) -c -o $@ $< 41 | 42 | %.o : %.cu #default rule for making .o files from .cu 43 | $(info --- Building '$@' from '$<' using default rule 2) 44 | $(NVCC) $(NV_CFLAGS) -dc -o $@ $< 45 | 46 | clean: 47 | rm -rf *o $(BINARY) 48 | 49 | run: $(BINARY) 50 | $(MPIRUN) --hostfile $(HOSTFILE) ./$(BINARY) $(ROWS) $(COLS) 51 | -------------------------------------------------------------------------------- /Chapter06/06_multigpu/04_gaussian_multi_node/config.h: -------------------------------------------------------------------------------- 1 | #ifndef CONFIG_H 2 | #define CONFIG_H 3 | 4 | #define INPUT_TYPE RANDOM 5 | #define PIVOTPACK 6
| #define ROWS 33000 // Number of rows in the system 7 | #define COLS 30000 // Number of columns in the system 8 | #define PERCENTAGE 50 // Density of coefficient matrix. Useful only with INPUT_TYPE set to RANDOM 9 | 10 | 11 | 12 | #define REFERENCE_SOLUTION "original-matrix" 13 | #define COMPUTED_SOLUTION "computed-solution" 14 | 15 | // Choose one of the two 16 | // 32 consecutive matrix elements are packed together in an unsigned int 17 | // #define ELEMENT_TYPE_UINT 18 | // 128 consecutive matrix elements are packed together in an uint4 19 | #define ELEMENT_TYPE_UINT4 20 | #endif 21 | -------------------------------------------------------------------------------- /Chapter06/06_multigpu/04_gaussian_multi_node/mpiUtils.h: -------------------------------------------------------------------------------- 1 | #ifndef MPIUTILS_H 2 | #define MPIUTILS_H 3 | 4 | #include <mpi.h> 5 | 6 | #define MPI_CHECK(call) \ 7 | if((call) != MPI_SUCCESS) { \ 8 | printf("MPI error calling \"%s\"\n", #call); \ 9 | MPI_Abort(MPI_COMM_WORLD, -1); } 10 | 11 | #endif 12 | 13 | 14 | -------------------------------------------------------------------------------- /Chapter06/06_multigpu/05_streams/Makefile: -------------------------------------------------------------------------------- 1 | CUDA_PATH=/usr/local/cuda 2 | HOST_COMPILER ?= g++ 3 | NVCC=${CUDA_PATH}/bin/nvcc -ccbin ${HOST_COMPILER} 4 | TARGET=vector_addition merging_multi_gpu 5 | 6 | INCLUDES = -I${CUDA_PATH}/samples/common/inc 7 | NVCC_FLAGS=-m64 -lineinfo --default-stream per-thread -Xcompiler -fopenmp # --resource-usage 8 | 9 | LIBRARIES= -lgomp 10 | 11 | IS_CUDA_11:=${shell expr `$(NVCC) --version | grep compilation | grep -Eo -m 1 '[0-9]+.[0-9]' | head -1` \>= 11.0} 12 | 13 | # Gencode arguments 14 | SMS = 35 37 50 52 60 61 70 75 15 | ifeq "$(IS_CUDA_11)" "1" 16 | SMS = 52 60 61 70 75 80 17 | endif 18 | $(foreach sm, ${SMS}, $(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm))) 19 | 20 | all: ${TARGET} 21 | 22 | vector_addition: vector_addition.cu 23 | $(EXEC) $(NVCC) $(INCLUDES) $(NVCC_FLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES) 24 | 25 | merging_multi_gpu: image_merging.cu scrImagePgmPpmPackage.cu 26 | $(EXEC) $(NVCC) $(INCLUDES) $(NVCC_FLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES) 27 | 28 | clean: 29 | rm -f ${TARGET} *.o 30 | -------------------------------------------------------------------------------- /Chapter06/06_multigpu/05_streams/cat.pgm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Learn-CUDA-Programming/10bd9b59bfcc680d608ea1417da5089f73d91b11/Chapter06/06_multigpu/05_streams/cat.pgm -------------------------------------------------------------------------------- /Chapter06/06_multigpu/05_streams/dog.pgm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Learn-CUDA-Programming/10bd9b59bfcc680d608ea1417da5089f73d91b11/Chapter06/06_multigpu/05_streams/dog.pgm -------------------------------------------------------------------------------- /Chapter06/06_multigpu/05_streams/scrImagePgmPpmPackage.h: -------------------------------------------------------------------------------- 1 | #ifndef _Xiang_Gao_PGM_PPM_Header_ 2 | #define _Xiang_Gao_PGM_PPM_Header_ 3 | 4 | #include <stdio.h> 5 | 6 | 7 | int scr_read_pgm( char* name, unsigned char* image, int irows, int icols ); 8 | void scr_write_pgm( char* name, unsigned char* image, int rows, int cols, char* comment ); 9 | int
scr_read_ppm( char* name, unsigned char* image, int irows, int icols ); 10 | void scr_write_ppm( char* name, unsigned char* image, int rows, int cols, char* comment ); 11 | void get_PgmPpmParams(char * , int *, int *); 12 | void getout_comment(FILE * ); 13 | #endif 14 | -------------------------------------------------------------------------------- /Chapter06/06_multigpu/06_nccl/Makefile: -------------------------------------------------------------------------------- 1 | CUDA_PATH=/usr/local/cuda 2 | HOST_COMPILER ?= g++ 3 | NVCC=${CUDA_PATH}/bin/nvcc -ccbin ${HOST_COMPILER} 4 | TARGET=nccl 5 | 6 | INCLUDES = -I${CUDA_PATH}/samples/common/inc 7 | NVCC_FLAGS=-m64 -lineinfo -lnccl # --resource-usage 8 | 9 | IS_CUDA_11:=${shell expr `$(NVCC) --version | grep compilation | grep -Eo -m 1 '[0-9]+.[0-9]' | head -1` \>= 11.0} 10 | 11 | # Gencode argumentes 12 | SMS = 35 37 50 52 60 61 70 75 13 | ifeq "$(IS_CUDA_11)" "1" 14 | SMS = 52 60 61 70 75 80 15 | endif 16 | $(foreach sm, ${SMS}, $(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm))) 17 | 18 | all: ${TARGET} 19 | 20 | nccl: nccl.cu 21 | $(EXEC) $(NVCC) $(INCLUDES) $(NVCC_FLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES) 22 | 23 | clean: 24 | rm -f ${TARGET} *.o 25 | -------------------------------------------------------------------------------- /Chapter06/06_multigpu/Makefile: -------------------------------------------------------------------------------- 1 | # Project folders that contain CUDA receipts 2 | PROJECTS ?= $(shell find \ 3 | $(shell ls -d */) \ 4 | -name Makefile) 5 | 6 | %.ph_build: 7 | +@$(MAKE) -C $(dir $*) $(MAKECMDGOALS) 8 | 9 | %.ph_clean: 10 | +@$(MAKE) -C $(dir $*) clean $(USE_DEVICE) 11 | 12 | all: $(addsuffix .ph_build,$(PROJECTS)) 13 | @echo "Finished building CUDA Receipts" 14 | 15 | build: $(addsuffix .ph_build,$(PROJECTS)) 16 | 17 | tidy: 18 | @find * | egrep "#" | xargs rm -f 19 | @find * | egrep "\~" | xargs rm -f 20 | @find * | egrep "nvvp" | xargs rm -f 21 | 22 | clean: tidy $(addsuffix .ph_clean,$(PROJECTS)) 23 | 24 | test: 25 | echo $(DIRECTORY) -------------------------------------------------------------------------------- /Chapter07/07_parallel_programming_pattern/01_sgemm_optimization/Makefile: -------------------------------------------------------------------------------- 1 | CUDA_PATH=/usr/local/cuda 2 | HOST_COMPILER ?= g++ 3 | NVCC=${CUDA_PATH}/bin/nvcc -ccbin ${HOST_COMPILER} 4 | TARGET=sgemm 5 | 6 | INCLUDES = -I${CUDA_PATH}/samples/common/inc 7 | NVCC_FLAGS=-m64 -lineinfo #--resource-usage 8 | 9 | IS_CUDA_11:=${shell expr `$(NVCC) --version | grep compilation | grep -Eo -m 1 '[0-9]+.[0-9]' | head -1` \>= 11.0} 10 | 11 | # Gencode argumentes 12 | SMS = 35 37 50 52 60 61 70 75 13 | ifeq "$(IS_CUDA_11)" "1" 14 | SMS = 52 60 61 70 75 80 15 | endif 16 | $(foreach sm, ${SMS}, $(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm))) 17 | 18 | LIBRARIES += -lgomp 19 | ALL_CCFLAGS += -std=c++11 -Xcompiler -fopenmp -rdc=true $(NVCC_FLAGS) $(INCLUDES) 20 | 21 | all : ${TARGET} 22 | 23 | sgemm: sgemm.cu 24 | $(EXEC) $(NVCC) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES) 25 | 26 | nvprof: sgemm 27 | nvprof -f -o $+.nvvp --profile-from-start off ./$+ 28 | nvprof -f -o $+-metrics.nvvp --analysis-metrics ./$+ 29 | 30 | clean: 31 | rm -f ${TARGET} *.o 32 | -------------------------------------------------------------------------------- /Chapter07/07_parallel_programming_pattern/02_convolution/Makefile: -------------------------------------------------------------------------------- 1 
| CUDA_PATH=/usr/local/cuda 2 | HOST_COMPILER ?= g++ 3 | NVCC=${CUDA_PATH}/bin/nvcc -ccbin ${HOST_COMPILER} 4 | TARGET=convolution 5 | 6 | INCLUDES = -I${CUDA_PATH}/samples/common/inc 7 | NVCC_FLAGS= -lineinfo --maxrregcount=48 --resource-usage -Xcompiler -rdynamic -Xcompiler -fopenmp -rdc=true 8 | 9 | IS_CUDA_11:=${shell expr `$(NVCC) --version | grep compilation | grep -Eo -m 1 '[0-9]+.[0-9]' | head -1` \>= 11.0} 10 | 11 | # Gencode argumentes 12 | SMS = 35 37 50 52 60 61 70 75 13 | ifeq "$(IS_CUDA_11)" "1" 14 | SMS = 52 60 61 70 75 80 15 | endif 16 | $(foreach sm, ${SMS}, $(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm))) 17 | 18 | LIBRARIES += -L/usr/local/cuda/lib -lgomp 19 | ALL_CCFLAGS += -m64 -g -std=c++11 $(NVCC_FLAGS) $(INCLUDES) $(LIBRARIES) 20 | 21 | all : ${TARGET} 22 | 23 | convolution: convolution.cu 24 | $(EXEC) $(NVCC) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ $+ 25 | 26 | nvprof: convolution 27 | nvprof -f -o $+.nvvp --profile-from-start off ./$+ 28 | nvprof -f -o $+-metrics.nvvp --analysis-metrics ./$+ 29 | 30 | clean: 31 | rm -f ${TARGET} *.o *.nvvp 32 | -------------------------------------------------------------------------------- /Chapter07/07_parallel_programming_pattern/03_scan/Makefile: -------------------------------------------------------------------------------- 1 | CUDA_PATH=/usr/local/cuda 2 | HOST_COMPILER ?= g++ 3 | NVCC=${CUDA_PATH}/bin/nvcc -ccbin ${HOST_COMPILER} 4 | TARGET=scan 5 | 6 | INCLUDES = -I${CUDA_PATH}/samples/common/inc 7 | NVCC_FLAGS=-m64 --resource-usage -lineinfo 8 | 9 | IS_CUDA_11:=${shell expr `$(NVCC) --version | grep compilation | grep -Eo -m 1 '[0-9]+.[0-9]' | head -1` \>= 11.0} 10 | 11 | # Gencode argumentes 12 | SMS = 35 37 50 52 60 61 70 75 13 | ifeq "$(IS_CUDA_11)" "1" 14 | SMS = 52 60 61 70 75 80 15 | endif 16 | $(foreach sm, ${SMS}, $(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm))) 17 | 18 | LIBRARIES += -L/usr/local/cuda/lib 19 | ALL_CCFLAGS += -std=c++11 $(NVCC_FLAGS) $(INCLUDES) $(LIBRARIES) 20 | 21 | all : ${TARGET} 22 | 23 | scan_v1.o: scan_v1.cu 24 | $(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $< 25 | 26 | scan_v2.o: scan_v2.cu 27 | $(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $< 28 | 29 | scan: scan.cu scan_v1.o scan_v2.o 30 | $(EXEC) $(NVCC) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ $+ 31 | 32 | nvprof: scan 33 | nvprof -f -o $+.nvvp --profile-from-start off ./$+ 34 | nvprof -f -o $+-metrics.nvvp --analysis-metrics ./$+ 35 | 36 | clean: 37 | rm -f ${TARGET} *.o *.nvvp 38 | -------------------------------------------------------------------------------- /Chapter07/07_parallel_programming_pattern/03_scan/scan.h: -------------------------------------------------------------------------------- 1 | #ifndef _SCAN_H_ 2 | #define _SCAN_H_ 3 | 4 | #define BLOCK_DIM 512 5 | 6 | #define DEBUG_INDEX 0 7 | #define DEBUG_OUTPUT_NUM 16 8 | 9 | void scan_v1(float *d_output, float *d_input, int length); 10 | void scan_v2(float *d_output, float *d_input, int length); 11 | 12 | #endif // _SCAN_H_ -------------------------------------------------------------------------------- /Chapter07/07_parallel_programming_pattern/03_scan/scan_v1.cu: -------------------------------------------------------------------------------- 1 | #include "scan.h" 2 | 3 | __global__ void 4 | scan_v1_kernel(float *d_output, float *d_input, int length) 5 | { 6 | int idx = blockDim.x * blockIdx.x + threadIdx.x; 7 | 8 | float element = 0.f; 9 | for (int offset = 0; offset < length; offset++) { 
10 | if (idx - offset >= 0) 11 | element += d_input[idx - offset]; 12 | } 13 | d_output[idx] = element; 14 | } 15 | 16 | void scan_v1(float *d_output, float *d_input, int length) 17 | { 18 | dim3 dimBlock(BLOCK_DIM); 19 | dim3 dimGrid((length + BLOCK_DIM - 1) / BLOCK_DIM); 20 | scan_v1_kernel<<< dimGrid, dimBlock >>>(d_output, d_input, length); 21 | } -------------------------------------------------------------------------------- /Chapter07/07_parallel_programming_pattern/03_scan/utils.h: -------------------------------------------------------------------------------- 1 | #ifndef _UTILS_H_ 2 | #define _UTILS_H_ 3 | 4 | #include <stdio.h> 5 | #include <stdlib.h> 6 | #include <stdarg.h> 7 | #include <math.h> 8 | // generate input data 9 | void generate_data(float *ptr, int length) 10 | { 11 | // fill the buffer with randomly generated floats in [-0.5, 0.5) 12 | for (int i = 0; i < length; i++) 13 | ptr[i] = (rand() - RAND_MAX/2) / (float)RAND_MAX; 14 | } 15 | 16 | bool validation(float *a, float *b, int length) 17 | { 18 | float epsilon = 0.000001f; 19 | bool result = true; 20 | for (int i = 0; i < length; i++) { 21 | if (fabs(a[i] - b[i]) >= epsilon) { 22 | result = false; 23 | printf("result mismatch on %d th item. (%f) \n", i, fabs(a[i] - b[i])); 24 | } 25 | } 26 | return result; 27 | } 28 | 29 | void print_val(float *h_list, int length, ...) 30 | { 31 | va_list argptr; 32 | va_start(argptr, length); 33 | 34 | printf("%s\t", va_arg(argptr, char *)); 35 | for (int i = 0; i < length; i++) 36 | printf("%7.4f\t", h_list[i]); 37 | printf("\n"); 38 | va_end(argptr); 39 | } 40 | #endif // _UTILS_H_ -------------------------------------------------------------------------------- /Chapter07/07_parallel_programming_pattern/04_pack_n_split/Makefile: -------------------------------------------------------------------------------- 1 | CUDA_PATH=/usr/local/cuda 2 | HOST_COMPILER ?= g++ 3 | NVCC=${CUDA_PATH}/bin/nvcc -ccbin ${HOST_COMPILER} 4 | TARGET=pack_n_split 5 | 6 | INCLUDES = -I${CUDA_PATH}/samples/common/inc 7 | NVCC_FLAGS=-m64 -G # --resource-usage 8 | 9 | IS_CUDA_11:=${shell expr `$(NVCC) --version | grep compilation | grep -Eo -m 1 '[0-9]+.[0-9]' | head -1` \>= 11.0} 10 | 11 | # Gencode arguments 12 | SMS = 35 37 50 52 60 61 70 75 13 | ifeq "$(IS_CUDA_11)" "1" 14 | SMS = 52 60 61 70 75 80 15 | endif 16 | $(foreach sm, ${SMS}, $(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm))) 17 | 18 | LIBRARIES += -L/usr/local/cuda/lib 19 | ALL_CCFLAGS += -std=c++11 $(INCLUDES) $(LIBRARIES) 20 | 21 | all : ${TARGET} 22 | 23 | pack_n_split: pack_n_split.cu 24 | $(EXEC) $(NVCC) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ $+ 25 | 26 | nvprof: pack_n_split 27 | nvprof -f -o $+.nvvp --profile-from-start off ./$+ 28 | nvprof -f -o $+-metrics.nvvp --analysis-metrics ./$+ 29 | 30 | clean: 31 | rm -f ${TARGET} *.o *.nvvp 32 | -------------------------------------------------------------------------------- /Chapter07/07_parallel_programming_pattern/05_n-body/Makefile: -------------------------------------------------------------------------------- 1 | CUDA_PATH=/usr/local/cuda 2 | HOST_COMPILER ?= g++ 3 | NVCC=${CUDA_PATH}/bin/nvcc -ccbin ${HOST_COMPILER} 4 | TARGET=n-body 5 | 6 | INCLUDES = -I${CUDA_PATH}/samples/common/inc 7 | NVCC_FLAGS=-m64 # --resource-usage 8 | 9 | IS_CUDA_11:=${shell expr `$(NVCC) --version | grep compilation | grep -Eo -m 1 '[0-9]+.[0-9]' | head -1` \>= 11.0} 10 | 11 | # Gencode arguments 12 | SMS = 35 37 50 52 60 61 70 75 13 | ifeq "$(IS_CUDA_11)" "1" 14 | SMS = 52 60 61 70 75 80 15 | endif 16 | $(foreach sm, ${SMS}, $(eval GENCODE_FLAGS += -gencode 
arch=compute_$(sm),code=sm_$(sm))) 17 | 18 | LIBRARIES += -L/usr/local/cuda/lib 19 | ALL_CCFLAGS += -std=c++11 $(INCLUDES) $(LIBRARIES) 20 | 21 | all : ${TARGET} 22 | 23 | n-body: n_body.cu 24 | $(EXEC) $(NVCC) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ $+ 25 | 26 | nvprof: n-body 27 | nvprof -f -o $+.nvvp --profile-from-start off ./$+ 28 | nvprof -f -o $+-metrics.nvvp --analysis-metrics ./$+ 29 | 30 | clean: 31 | rm -f ${TARGET} *.o *.nvvp 32 | -------------------------------------------------------------------------------- /Chapter07/07_parallel_programming_pattern/05_n-body/n_body.h: -------------------------------------------------------------------------------- 1 | 2 | #define BLOCK_SIZE 128 3 | #define SOFTENING 1e-9f 4 | 5 | 6 | typedef struct { 7 | float4 *pos, *vel; 8 | } NBodySystem; 9 | 10 | void generateRandomizeBodies(float *data, int n); 11 | __global__ void calculateBodyForce(float4 *p, float4 *v, float dt, int n); 12 | -------------------------------------------------------------------------------- /Chapter07/07_parallel_programming_pattern/06_quicksort/Makefile: -------------------------------------------------------------------------------- 1 | CUDA_PATH=/usr/local/cuda 2 | HOST_COMPILER ?= g++ 3 | NVCC=${CUDA_PATH}/bin/nvcc -ccbin ${HOST_COMPILER} 4 | TARGET=quick_sort 5 | 6 | INCLUDES = -I${CUDA_PATH}/samples/common/inc 7 | NVCC_FLAGS=-m64 -lineinfo #--resource-usage 8 | 9 | IS_CUDA_11:=${shell expr `$(NVCC) --version | grep compilation | grep -Eo -m 1 '[0-9]+.[0-9]' | head -1` \>= 11.0} 10 | 11 | # Gencode arguments 12 | SMS = 35 37 50 52 60 61 70 75 13 | ifeq "$(IS_CUDA_11)" "1" 14 | SMS = 52 60 61 70 75 80 15 | endif 16 | $(foreach sm, ${SMS}, $(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm))) 17 | 18 | ALL_CCFLAGS += -rdc=true $(NVCC_FLAGS) $(INCLUDES) 19 | 20 | all : ${TARGET} 21 | 22 | quick_sort: quick_sort.cu 23 | $(EXEC) $(NVCC) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES) 24 | 25 | nvprof: quick_sort 26 | nvprof -f -o $+.nvvp --profile-from-start off ./$+ 27 | nvprof -f -o $+-metrics.nvvp --analysis-metrics ./$+ 28 | 29 | clean: 30 | rm -f ${TARGET} *.o 31 | -------------------------------------------------------------------------------- /Chapter07/07_parallel_programming_pattern/07_radixsort/Makefile: -------------------------------------------------------------------------------- 1 | CUDA_PATH=/usr/local/cuda 2 | HOST_COMPILER ?= g++ 3 | NVCC=${CUDA_PATH}/bin/nvcc -ccbin ${HOST_COMPILER} 4 | TARGET=radix_warp_sort thrust_radix_sort 5 | 6 | INCLUDES = -I${CUDA_PATH}/samples/common/inc 7 | NVCC_FLAGS=-m64 -lineinfo #--resource-usage 8 | 9 | IS_CUDA_11:=${shell expr `$(NVCC) --version | grep compilation | grep -Eo -m 1 '[0-9]+.[0-9]' | head -1` \>= 11.0} 10 | 11 | # Gencode arguments 12 | SMS = 35 37 50 52 60 61 70 75 13 | ifeq "$(IS_CUDA_11)" "1" 14 | SMS = 52 60 61 70 75 80 15 | endif 16 | $(foreach sm, ${SMS}, $(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm))) 17 | 18 | ALL_CCFLAGS += -rdc=true $(NVCC_FLAGS) $(INCLUDES) 19 | 20 | all : ${TARGET} 21 | 22 | radix_warp_sort: radix_warp_sort.cu 23 | $(EXEC) $(NVCC) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES) 24 | 25 | thrust_radix_sort: thrust_radix_sort.cu 26 | $(EXEC) $(NVCC) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES) 27 | 28 | nvprof: radix_warp_sort 29 | nvprof -f -o $+.nvvp --profile-from-start off ./$+ 30 | nvprof -f -o $+-metrics.nvvp --analysis-metrics ./$+ 31 | 32 | clean: 33 | rm -f ${TARGET} *.o 34 | 
-------------------------------------------------------------------------------- /Chapter07/07_parallel_programming_pattern/07_radixsort/thrust_radix_sort.cu: -------------------------------------------------------------------------------- 1 | #include <thrust/device_vector.h> 2 | #include <thrust/sort.h> 3 | #include <thrust/random.h> 4 | #include <thrust/functional.h> 5 | #include <iostream> 6 | 7 | // Helper routines 8 | void initialize(thrust::device_vector<int>& v) 9 | { 10 | thrust::default_random_engine rng(123456); 11 | thrust::uniform_int_distribution<int> dist(10, 99); 12 | for(size_t i = 0; i < v.size(); i++) 13 | v[i] = dist(rng); 14 | } 15 | 16 | void print(const thrust::device_vector<int>& v) 17 | { 18 | for(size_t i = 0; i < v.size(); i++) 19 | std::cout << " " << v[i]; 20 | std::cout << "\n"; 21 | } 22 | 23 | 24 | int main(void) 25 | { 26 | size_t N = 16; 27 | 28 | std::cout << "sorting integers\n"; 29 | { 30 | thrust::device_vector<int> keys(N); 31 | initialize(keys); 32 | print(keys); 33 | thrust::sort(keys.begin(), keys.end()); 34 | print(keys); 35 | } 36 | 37 | std::cout << "\nsorting integers (descending)\n"; 38 | { 39 | thrust::device_vector<int> keys(N); 40 | initialize(keys); 41 | print(keys); 42 | thrust::sort(keys.begin(), keys.end(), thrust::greater<int>()); 43 | print(keys); 44 | } 45 | 46 | return 0; 47 | } 48 | -------------------------------------------------------------------------------- /Chapter07/07_parallel_programming_pattern/08_histogram/Makefile: -------------------------------------------------------------------------------- 1 | CUDA_PATH=/usr/local/cuda 2 | HOST_COMPILER ?= g++ 3 | NVCC=${CUDA_PATH}/bin/nvcc -ccbin ${HOST_COMPILER} 4 | TARGET=image_histogram 5 | 6 | INCLUDES = -I${CUDA_PATH}/samples/common/inc 7 | NVCC_FLAGS=-m64 -lineinfo #--resource-usage 8 | 9 | IS_CUDA_11:=${shell expr `$(NVCC) --version | grep compilation | grep -Eo -m 1 '[0-9]+.[0-9]' | head -1` \>= 11.0} 10 | 11 | # Gencode arguments 12 | SMS = 35 37 50 52 60 61 70 75 13 | ifeq "$(IS_CUDA_11)" "1" 14 | SMS = 52 60 61 70 75 80 15 | endif 16 | $(foreach sm, ${SMS}, $(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm))) 17 | 18 | ALL_CCFLAGS += -rdc=true $(NVCC_FLAGS) $(INCLUDES) 19 | 20 | all : ${TARGET} 21 | 22 | image_histogram: scrImagePgmPpmPackage.cpp image_histogram.cu 23 | $(EXEC) $(NVCC) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -c $+ $(LIBRARIES) 24 | $(EXEC) $(NVCC) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ $+ 25 | 26 | clean: 27 | rm -f ${TARGET} *.o 28 | -------------------------------------------------------------------------------- /Chapter07/07_parallel_programming_pattern/08_histogram/aerosmith-double.pgm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Learn-CUDA-Programming/10bd9b59bfcc680d608ea1417da5089f73d91b11/Chapter07/07_parallel_programming_pattern/08_histogram/aerosmith-double.pgm -------------------------------------------------------------------------------- /Chapter07/07_parallel_programming_pattern/08_histogram/scrImagePgmPpmPackage.h: -------------------------------------------------------------------------------- 1 | 2 | #include <stdio.h> 3 | 4 | 5 | int scr_read_pgm( char* name, unsigned char* image, int irows, int icols ); 6 | void scr_write_pgm( char* name, unsigned char* image, int rows, int cols, char* comment ); 7 | int scr_read_ppm( char* name, unsigned char* image, int irows, int icols ); 8 | void scr_write_ppm( char* name, unsigned char* image, int rows, int cols, char* comment ); 9 | void get_PgmPpmParams(char * , int *, int *); 10 | void getout_comment(FILE * ); 11 | 
-------------------------------------------------------------------------------- /Chapter07/07_parallel_programming_pattern/Makefile: -------------------------------------------------------------------------------- 1 | # Project folders that contain CUDA receipts 2 | PROJECTS ?= $(shell find \ 3 | $(shell ls -d */) \ 4 | -name Makefile) 5 | 6 | %.ph_build: 7 | +@$(MAKE) -C $(dir $*) $(MAKECMDGOALS) 8 | 9 | %.ph_clean: 10 | +@$(MAKE) -C $(dir $*) clean $(USE_DEVICE) 11 | 12 | all: $(addsuffix .ph_build,$(PROJECTS)) 13 | @echo "Finished building CUDA Receipts" 14 | 15 | build: $(addsuffix .ph_build,$(PROJECTS)) 16 | 17 | tidy: 18 | @find * | egrep "#" | xargs rm -f 19 | @find * | egrep "\~" | xargs rm -f 20 | @find * | egrep "nvvp" | xargs rm -f 21 | 22 | .PHONY: clean 23 | clean: tidy $(addsuffix .ph_clean,$(PROJECTS)) 24 | 25 | test: 26 | echo $(DIRECTORY) 27 | -------------------------------------------------------------------------------- /Chapter08/08_cuda_libs_and_other_languages/01_sgemm/Makefile: -------------------------------------------------------------------------------- 1 | CUDA_PATH=/usr/local/cuda 2 | HOST_COMPILER ?= g++ 3 | NVCC=${CUDA_PATH}/bin/nvcc -ccbin ${HOST_COMPILER} 4 | TARGET=cublasSgemm cublasXtSgemm cublasSgemm_async 5 | 6 | INCLUDES = -I${CUDA_PATH}/samples/common/inc 7 | NVCC_FLAGS=-m64 # --resource-usage 8 | 9 | IS_CUDA_11:=${shell expr `$(NVCC) --version | grep compilation | grep -Eo -m 1 '[0-9]+.[0-9]' | head -1` \>= 11.0} 10 | 11 | # Gencode argumentes 12 | SMS = 35 37 50 52 60 61 70 75 13 | ifeq "$(IS_CUDA_11)" "1" 14 | SMS = 52 60 61 70 75 80 15 | endif 16 | $(foreach sm, ${SMS}, $(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm))) 17 | 18 | # Openmp 19 | LIBRARIES += -lcublas 20 | ALL_CCFLAGS += -std=c++11 21 | 22 | all : ${TARGET} 23 | 24 | cublasSgemm: cublasSgemm.cpp 25 | $(EXEC) $(NVCC) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES) 26 | 27 | cublasXtSgemm: cublasXtSgemm.cpp 28 | $(EXEC) $(NVCC) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES) 29 | 30 | cublasSgemm_async: cublasSgemm_async.cpp 31 | $(EXEC) $(NVCC) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES) 32 | 33 | nvprof: cublas 34 | nvprof -f -o $+.nvvp ./$+ 35 | 36 | clean: 37 | rm -f ${TARGET} *.o -------------------------------------------------------------------------------- /Chapter08/08_cuda_libs_and_other_languages/02_sgemm_mixed_precision/Makefile: -------------------------------------------------------------------------------- 1 | CUDA_PATH=/usr/local/cuda 2 | HOST_COMPILER ?= g++ 3 | NVCC=${CUDA_PATH}/bin/nvcc -ccbin ${HOST_COMPILER} 4 | TARGET=cublasGemmEx 5 | 6 | INCLUDES = -I${CUDA_PATH}/samples/common/inc 7 | NVCC_FLAGS=-m64 # --resource-usage 8 | 9 | IS_CUDA_11:=${shell expr `$(NVCC) --version | grep compilation | grep -Eo -m 1 '[0-9]+.[0-9]' | head -1` \>= 11.0} 10 | 11 | # Gencode argumentes 12 | SMS = 35 37 50 52 60 61 70 75 13 | ifeq "$(IS_CUDA_11)" "1" 14 | SMS = 52 60 61 70 75 80 15 | endif 16 | $(foreach sm, ${SMS}, $(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm))) 17 | 18 | # Openmp 19 | LIBRARIES += -lcublas 20 | ALL_CCFLAGS += -std=c++11 $(INCLUDES) -L/usr/local/cuda/lib 21 | 22 | all : ${TARGET} 23 | 24 | cublasGemmEx: cublasGemmEx.cu 25 | $(EXEC) $(NVCC) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES) 26 | 27 | nvprof: cublas 28 | nvprof -f -o $+.nvvp ./$+ 29 | 30 | clean: 31 | rm -f ${TARGET} *.o 32 | -------------------------------------------------------------------------------- 
/Chapter08/08_cuda_libs_and_other_languages/03_curand/Makefile: -------------------------------------------------------------------------------- 1 | CUDA_PATH=/usr/local/cuda 2 | HOST_COMPILER ?= g++ 3 | NVCC=${CUDA_PATH}/bin/nvcc -ccbin ${HOST_COMPILER} 4 | TARGET=curand_host curand_device gemm_with_curand_host 5 | 6 | INCLUDES = -I${CUDA_PATH}/samples/common/inc 7 | NVCC_FLAGS=-m64 # --resource-usage 8 | 9 | IS_CUDA_11:=${shell expr `$(NVCC) --version | grep compilation | grep -Eo -m 1 '[0-9]+.[0-9]' | head -1` \>= 11.0} 10 | 11 | # Gencode argumentes 12 | SMS = 35 37 50 52 60 61 70 75 13 | ifeq "$(IS_CUDA_11)" "1" 14 | SMS = 52 60 61 70 75 80 15 | endif 16 | $(foreach sm, ${SMS}, $(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm))) 17 | 18 | # Openmp 19 | LIBRARIES += -lcublas -lcurand 20 | ALL_CCFLAGS += -std=c++11 $(INCLUDES) -L/usr/local/cuda/lib 21 | 22 | all : ${TARGET} 23 | 24 | curand_host: curand_host.cpp 25 | $(EXEC) $(NVCC) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES) 26 | 27 | curand_device: curand_device.cu 28 | $(EXEC) $(NVCC) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES) 29 | 30 | fp16.o: fp16.cu 31 | $(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) $(LIBRARIES) -o $@ -c $< 32 | 33 | gemm_with_curand_host.o: gemm_with_curand_host.cpp 34 | $(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $< 35 | 36 | gemm_with_curand_host: gemm_with_curand_host.o fp16.o 37 | $(EXEC) $(NVCC) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES) 38 | 39 | nvprof: curand_host 40 | nvprof -f -o $+.nvvp ./$+ 41 | 42 | clean: 43 | rm -f ${TARGET} *.o 44 | -------------------------------------------------------------------------------- /Chapter08/08_cuda_libs_and_other_languages/03_curand/fp16.cu: -------------------------------------------------------------------------------- 1 | #include "fp16.cuh" 2 | #include 3 | 4 | #define BLOCK_DIM 512 5 | 6 | namespace fp16 7 | { 8 | __global__ void float2half_kernel(half *out, float *in) 9 | { 10 | int idx = blockIdx.x * blockDim.x + threadIdx.x; 11 | 12 | out[idx] = __float2half(in[idx]); 13 | } 14 | 15 | __global__ void half2float_kernel(float *out, half *in) 16 | { 17 | int idx = blockIdx.x * blockDim.x + threadIdx.x; 18 | 19 | out[idx] = __half2float(in[idx]); 20 | } 21 | 22 | void float2half(half *out, float *in, size_t length) 23 | { 24 | float2half_kernel<<< (length + BLOCK_DIM - 1) / BLOCK_DIM, BLOCK_DIM >>>(out, in); 25 | } 26 | 27 | void half2float(float *out, half *in, size_t length) 28 | { 29 | half2float_kernel<<< (length + BLOCK_DIM - 1) / BLOCK_DIM, BLOCK_DIM >>>(out, in); 30 | } 31 | } // namespace fp16 -------------------------------------------------------------------------------- /Chapter08/08_cuda_libs_and_other_languages/03_curand/fp16.cuh: -------------------------------------------------------------------------------- 1 | #ifndef _FP16_CUH_ 2 | #define _FP16_CUH_ 3 | 4 | #include 5 | 6 | namespace fp16 7 | { 8 | void float2half(half *out, float *in, size_t length); 9 | void half2float(float *out, half *in, size_t lenght); 10 | } 11 | 12 | #endif // _FP16_CUH_ -------------------------------------------------------------------------------- /Chapter08/08_cuda_libs_and_other_languages/04_cufft/Makefile: -------------------------------------------------------------------------------- 1 | CUDA_PATH=/usr/local/cuda/ 2 | HOST_COMPILER ?= g++ 3 | NVCC=${CUDA_PATH}/bin/nvcc -ccbin ${HOST_COMPILER} 4 | TARGET=cufft.1d cufft.half cufft.mgpu 5 | INCLUDES = 
-I${CUDA_PATH}/samples/common/inc 6 | NVCC_FLAGS=-m64 # --resource-usage 7 | 8 | IS_CUDA_11:=${shell expr `$(NVCC) --version | grep compilation | grep -Eo -m 1 '[0-9]+.[0-9]' | head -1` \>= 11.0} 9 | 10 | # Gencode argumentes 11 | SMS = 35 37 50 52 60 61 70 75 12 | ifeq "$(IS_CUDA_11)" "1" 13 | SMS = 52 60 61 70 75 80 14 | endif 15 | $(foreach sm, ${SMS}, $(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm))) 16 | 17 | # Openmp 18 | LIBRARIES += -lcufft -lcurand 19 | ALL_CCFLAGS += -std=c++11 $(INCLUDES) -L/usr/local/cuda/lib 20 | 21 | all : ${TARGET} 22 | 23 | cufft.1d: cufft.1d.cpp 24 | $(EXEC) $(NVCC) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES) 25 | 26 | complex.o: complex.cu 27 | $(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $< 28 | 29 | fp16.o: fp16.cu 30 | $(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $< 31 | 32 | cufft.half: cufft.half.cpp complex.o fp16.o 33 | $(EXEC) $(NVCC) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES) 34 | 35 | cufft.mgpu: cufft.mgpu.cu complex.o 36 | $(EXEC) $(NVCC) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES) 37 | 38 | nvprof: cufft.1d 39 | nvprof -f -o $+.nvvp ./$+ 40 | 41 | clean: 42 | rm -f ${TARGET} *.o 43 | -------------------------------------------------------------------------------- /Chapter08/08_cuda_libs_and_other_languages/04_cufft/complex.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include "helper.cuh" 3 | 4 | namespace op 5 | { 6 | __global__ void FloatToComplex_kernel(cufftComplex *complex, const float *real, const float *imag) 7 | { 8 | int idx = blockDim.x * blockIdx.x + threadIdx.x; 9 | 10 | complex[idx].x = real[idx]; 11 | if (imag != nullptr) 12 | complex[idx].y = imag[idx]; 13 | } 14 | 15 | void FloatToComplex(cufftComplex *complex, const float *real, const float *imag, const size_t length) 16 | { 17 | dim3 dimBlock(512); 18 | dim3 dimGrid((length + dimBlock.x - 1) / dimBlock.x); 19 | 20 | FloatToComplex_kernel<<< dimGrid, dimBlock >>>(complex, real, imag); 21 | } 22 | } -------------------------------------------------------------------------------- /Chapter08/08_cuda_libs_and_other_languages/04_cufft/fp16.cu: -------------------------------------------------------------------------------- 1 | #include "helper.cuh" 2 | #include "fp16.cuh" 3 | #include 4 | 5 | #define BLOCK_DIM 512 6 | 7 | namespace fp16 8 | { 9 | __global__ void float2half_kernel(half *out, float *in) 10 | { 11 | int idx = blockIdx.x * blockDim.x + threadIdx.x; 12 | 13 | out[idx] = __float2half(in[idx]); 14 | } 15 | 16 | __global__ void half2float_kernel(float *out, half *in) 17 | { 18 | int idx = blockIdx.x * blockDim.x + threadIdx.x; 19 | 20 | out[idx] = __half2float(in[idx]); 21 | } 22 | 23 | void float2half(half *out, float *in, size_t length) 24 | { 25 | float2half_kernel<<< (length + BLOCK_DIM - 1) / BLOCK_DIM, BLOCK_DIM >>>(out, in); 26 | } 27 | 28 | void half2float(float *out, half *in, size_t length) 29 | { 30 | half2float_kernel<<< (length + BLOCK_DIM - 1) / BLOCK_DIM, BLOCK_DIM >>>(out, in); 31 | } 32 | } // namespace fp16 -------------------------------------------------------------------------------- /Chapter08/08_cuda_libs_and_other_languages/04_cufft/fp16.cuh: -------------------------------------------------------------------------------- 1 | #ifndef _FP16_CUH_ 2 | #define _FP16_CUH_ 3 | 4 | #include 5 | 6 | namespace fp16 7 | { 8 | void float2half(half *out, float *in, size_t length); 9 | void half2float(float 
*out, half *in, size_t length); 10 | } 11 | 12 | #endif // _FP16_CUH_ -------------------------------------------------------------------------------- /Chapter08/08_cuda_libs_and_other_languages/04_cufft/helper.cuh: -------------------------------------------------------------------------------- 1 | #ifndef _HELPER_CU_H_ 2 | #define _HELPER_CU_H_ 3 | 4 | #include <curand.h> 5 | #include <cufft.h> 6 | #include "fp16.cuh" 7 | #include <type_traits> 8 | namespace op { 9 | template <typename T> 10 | typename std::enable_if<std::is_same<T, float>::value>::type 11 | curand(curandGenerator_t generator, 12 | T *buffer, 13 | size_t length) 14 | { 15 | curandGenerateUniform(generator, buffer, length); 16 | } 17 | 18 | void FloatToComplex(cufftComplex *complex, const float *real, const float *imag, const size_t length); 19 | 20 | template <typename T> 21 | typename std::enable_if<std::is_same<T, cufftComplex>::value>::type 22 | curand(curandGenerator_t generator, 23 | T *buffer, 24 | size_t length) 25 | { 26 | float *buffer_fp32; 27 | 28 | cudaMalloc((void **)&buffer_fp32, length * sizeof(float)); 29 | curandGenerateUniform(generator, buffer_fp32, length); 30 | 31 | // convert generated real data into complex type 32 | FloatToComplex(buffer, buffer_fp32, nullptr, length); 33 | cudaFree(buffer_fp32); 34 | } 35 | 36 | template <typename T> 37 | typename std::enable_if<std::is_same<T, half>::value>::type 38 | curand(curandGenerator_t generator, 39 | T *buffer, 40 | size_t length) 41 | { 42 | float *buffer_fp32; 43 | 44 | cudaMalloc((void **)&buffer_fp32, length * sizeof(float)); 45 | curandGenerateUniform(generator, buffer_fp32, length); 46 | 47 | // convert generated single-precision values to half precision 48 | fp16::float2half(buffer, buffer_fp32, length); 49 | cudaFree(buffer_fp32); 50 | } 51 | } 52 | 53 | #endif // _HELPER_CU_H_ -------------------------------------------------------------------------------- /Chapter08/08_cuda_libs_and_other_languages/05_npp/Makefile: -------------------------------------------------------------------------------- 1 | CUDA_PATH=/usr/local/cuda 2 | HOST_COMPILER ?= g++ 3 | NVCC=${CUDA_PATH}/bin/nvcc -ccbin ${HOST_COMPILER} 4 | TARGET=imageFilter statisticsNPP 5 | 6 | INCLUDES = -I${CUDA_PATH}/samples/common/inc 7 | NVCC_FLAGS=-m64 # --resource-usage 8 | 9 | IS_CUDA_11:=${shell expr `$(NVCC) --version | grep compilation | grep -Eo -m 1 '[0-9]+.[0-9]' | head -1` \>= 11.0} 10 | 11 | # Gencode arguments 12 | SMS = 35 37 50 52 60 61 70 75 13 | ifeq "$(IS_CUDA_11)" "1" 14 | SMS = 52 60 61 70 75 80 15 | endif 16 | $(foreach sm, ${SMS}, $(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm))) 17 | 18 | # Libraries 19 | LIBRARIES += -lnppc -lnppif -lnppisu -lnppig -lnpps -lfreeimage 20 | ALL_CCFLAGS += -std=c++11 $(INCLUDES) -L/usr/local/cuda/lib 21 | 22 | all : ${TARGET} 23 | 24 | imageFilter: imageFilter.cpp 25 | $(EXEC) $(NVCC) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES) 26 | 27 | statisticsNPP: statisticsNPP.cpp 28 | $(EXEC) $(NVCC) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES) 29 | 30 | nvprof: imageFilter 31 | nvprof -f -o $+.nvvp ./$+ 32 | 33 | clean: 34 | rm -f ${TARGET} *.o 35 | -------------------------------------------------------------------------------- /Chapter08/08_cuda_libs_and_other_languages/05_npp/flower.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Learn-CUDA-Programming/10bd9b59bfcc680d608ea1417da5089f73d91b11/Chapter08/08_cuda_libs_and_other_languages/05_npp/flower.jpg -------------------------------------------------------------------------------- 
/Chapter08/08_cuda_libs_and_other_languages/05_npp/output.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Learn-CUDA-Programming/10bd9b59bfcc680d608ea1417da5089f73d91b11/Chapter08/08_cuda_libs_and_other_languages/05_npp/output.jpg -------------------------------------------------------------------------------- /Chapter08/08_cuda_libs_and_other_languages/06_opencv/Makefile: -------------------------------------------------------------------------------- 1 | CUDA_PATH=/usr/local/cuda 2 | HOST_COMPILER ?= g++ 3 | NVCC=${CUDA_PATH}/bin/nvcc -ccbin ${HOST_COMPILER} 4 | TARGET=test blur blur_stream 5 | 6 | INCLUDES = -I${CUDA_PATH}/samples/common/inc $(shell pkg-config opencv4 --cflags) 7 | NVCC_FLAGS=-m64 # --resource-usage 8 | 9 | IS_CUDA_11:=${shell expr `$(NVCC) --version | grep compilation | grep -Eo -m 1 '[0-9]+.[0-9]' | head -1` \>= 11.0} 10 | 11 | # Gencode argumentes 12 | SMS = 35 37 50 52 60 61 70 75 13 | ifeq "$(IS_CUDA_11)" "1" 14 | SMS = 52 60 61 70 75 80 15 | endif 16 | $(foreach sm, ${SMS}, $(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm))) 17 | 18 | LIBRARIES += -L/usr/local/cuda/lib -L/usr/local/lib $(shell pkg-config opencv4 --libs) 19 | ALL_CCFLAGS += -std=c++11 $(INCLUDES) $(LIBRARIES) 20 | 21 | all : ${TARGET} 22 | 23 | test: test.cpp 24 | $(EXEC) $(NVCC) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ $+ 25 | 26 | blur: blur.cpp 27 | $(EXEC) $(NVCC) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ $+ 28 | 29 | blur_stream: blur_stream.cpp 30 | $(EXEC) $(NVCC) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ $+ 31 | 32 | nvprof: blur 33 | nvprof -f -o $+.nvvp ./$+ 34 | 35 | clean: 36 | rm -f ${TARGET} *.o 37 | -------------------------------------------------------------------------------- /Chapter08/08_cuda_libs_and_other_languages/06_opencv/blur.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include "opencv2/opencv.hpp" 4 | 5 | using namespace cv; 6 | 7 | void BlurHost(std::string filename) 8 | { 9 | Mat src = imread(filename, 1); 10 | Mat dst; 11 | 12 | TickMeter tm; 13 | 14 | tm.start(); 15 | bilateralFilter(src, dst, 21, 150, 150); 16 | tm.stop(); 17 | std::cout << "CPU Time: " << tm.getTimeMilli() << " ms." << std::endl; 18 | 19 | imwrite("result_host.jpg", dst); 20 | } 21 | 22 | void BlurCuda(std::string filename) 23 | { 24 | TickMeter tm; 25 | 26 | Mat src = imread(filename, 1); 27 | Mat dst; 28 | cuda::GpuMat src_cuda(src); 29 | cuda::GpuMat dst_cuda; 30 | 31 | // warm-up 32 | cuda::bilateralFilter(src_cuda, dst_cuda, 21, 150.f, 150.f); 33 | 34 | tm.start(); 35 | src_cuda.upload(src); 36 | cuda::bilateralFilter(src_cuda, dst_cuda, 21, 150.f, 150.f); 37 | dst_cuda.download(dst); 38 | tm.stop(); 39 | std::cout << "GPU Time: " << tm.getTimeMilli() << " ms." 
<< std::endl; 40 | 41 | imwrite("result_cuda.jpg", dst); 42 | } 43 | 44 | int main(int argc, char *argv[]) 45 | { 46 | cuda::printCudaDeviceInfo(0); 47 | cuda::printShortCudaDeviceInfo(0); 48 | std::cout << "Device: " << cuda::getCudaEnabledDeviceCount() << std::endl; 49 | 50 | std::string filename("flower.jpg"); 51 | 52 | 53 | BlurHost(filename); 54 | BlurCuda(filename); 55 | 56 | return 0; 57 | } -------------------------------------------------------------------------------- /Chapter08/08_cuda_libs_and_other_languages/06_opencv/flower.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Learn-CUDA-Programming/10bd9b59bfcc680d608ea1417da5089f73d91b11/Chapter08/08_cuda_libs_and_other_languages/06_opencv/flower.JPG -------------------------------------------------------------------------------- /Chapter08/08_cuda_libs_and_other_languages/06_opencv/test.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include "opencv2/opencv.hpp" 3 | #include "opencv2/core/cuda.hpp" 4 | #include "opencv2/cudafilters.hpp" 5 | #include "opencv2/cudaimgproc.hpp" 6 | 7 | using namespace cv; 8 | 9 | int main( int argc, char* argv[] ) 10 | { 11 | const int64 start = getTickCount(); 12 | 13 | cv::Mat src = cv::imread( "flower.jpg", 0 ); 14 | 15 | if( !src.data ) exit( 1 ); 16 | 17 | cv::cuda::GpuMat d_src( src ); 18 | cv::cuda::GpuMat d_dst; 19 | 20 | cv::cuda::bilateralFilter( d_src, d_dst, -1, 50, 7 ); 21 | Ptr canny = cuda::createCannyEdgeDetector( 35.0, 200.0 ); 22 | canny->detect( d_src, d_dst ); 23 | 24 | cv::Mat dst( d_dst ); 25 | 26 | cv::imwrite( "cuda_canny.png", dst ); 27 | 28 | const double timeSec = (getTickCount() - start) / getTickFrequency(); 29 | std::cout << "Time : " << timeSec << " sec" << std::endl; 30 | 31 | return 0; 32 | } 33 | -------------------------------------------------------------------------------- /Chapter08/08_cuda_libs_and_other_languages/07_python_cuda/cupy_op.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import cupy as cp 3 | 4 | # cupy matmul 5 | a = cp.random.uniform(0, 1, (2, 4)).astype('float32') 6 | b = cp.random.uniform(0, 1, (4, 2)).astype('float32') 7 | c = cp.matmul(a, b) 8 | print("Matrix Multiplication") 9 | print("a::\n", a) 10 | print("b::\n", b) 11 | print("c = a' * b::", c) 12 | 13 | # custom kernel 14 | squared_diff = cp.ElementwiseKernel( 15 | 'float32 x, float32 y', 16 | 'float32 z', 17 | 'z = (x - y) * (x - y)', 18 | 'squared_diff') 19 | 20 | a = cp.random.uniform(0, 1, (2, 4)).astype('float32') 21 | b = cp.random.uniform(0, 1, (2, 4)).astype('float32') 22 | c = squared_diff(a, b) 23 | print("Elements Diff") 24 | print("a::\n", a) 25 | print("b::\n", b) 26 | print("c = (a-b)*(a-b)::", c) 27 | -------------------------------------------------------------------------------- /Chapter08/08_cuda_libs_and_other_languages/07_python_cuda/numba_matmul.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from numba import cuda 3 | from timeit import default_timer as timer 4 | 5 | @cuda.jit 6 | def matmul(d_c, d_a, d_b): 7 | x, y = cuda.grid(2) 8 | if (x < d_c.shape[0] and y < d_c.shape[1]): 9 | sum = 0 10 | for k in range(d_a.shape[1]): 11 | sum += d_a[x, k] * d_b[k, y] 12 | d_c[x, y] = sum 13 | 14 | # initialize input data 15 | N = 8192 16 | a = np.random.rand(N, N).astype(np.float32) 17 | b = 
np.random.rand(N, N).astype(np.float32) 18 | 19 | # copy matrices to the devices 20 | d_a = cuda.to_device(a) 21 | d_b = cuda.to_device(b) 22 | 23 | # create device memory for matrix c 24 | d_c = cuda.device_array((N, N)) 25 | 26 | # configure the blocks 27 | BLOCK_DIM = 16 28 | dimBlock = (BLOCK_DIM, BLOCK_DIM) 29 | dimGrid = (int((N + BLOCK_DIM - 1) / BLOCK_DIM), 30 | int((N +BLOCK_DIM - 1) / BLOCK_DIM)) 31 | 32 | # matrix multiplication (gpu) 33 | start = timer() 34 | matmul[dimGrid, dimBlock](d_c, d_a, d_b) 35 | elapsed_time_gpu = (timer() - start) * 1e3 36 | 37 | # copy the result back to the host 38 | c = d_c.copy_to_host() 39 | 40 | # matrix multiplication (cpu) 41 | start = timer() 42 | c_host = np.matmul(a, b) 43 | elapsed_time_cpu = (timer() - start) * 1e3 44 | 45 | # print elapse times 46 | print("Elapsed Time") 47 | print("GPU: %.3f ms" % elapsed_time_gpu) 48 | print("CPU: %.3f ms" % elapsed_time_cpu) 49 | 50 | if (np.allclose(c_host, c)): 51 | print("Done.") 52 | else: 53 | print("GPU and host results are mismatching.") 54 | 55 | -------------------------------------------------------------------------------- /Chapter08/08_cuda_libs_and_other_languages/07_python_cuda/numba_saxpy.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from numba import vectorize 3 | from timeit import default_timer as timer 4 | 5 | @vectorize(["float32(float32, float32, float32)"], target='cuda') 6 | def saxpy_cuda(scala, a, b): 7 | return scala * a + b 8 | 9 | 10 | #@vectorize(["float32(float32, float32, float32)"], target='cpu') 11 | @vectorize(["float32(float32, float32, float32)"], target='parallel') 12 | def saxpy_host(scala, a, b): 13 | return scala * a + b 14 | 15 | scala = 2.0 16 | np.random.seed(2019) 17 | 18 | print("size \t\t CUDA \t\t CPU") 19 | for i in range(16,20): 20 | N = 1 << i 21 | a = np.random.rand(N).astype(np.float32) 22 | b = np.random.rand(N).astype(np.float32) 23 | c = np.zeros(N, dtype=np.float32) 24 | 25 | # warm-up 26 | c = saxpy_cuda(scala, a, b) 27 | 28 | # measuring execution time 29 | start = timer() 30 | c = saxpy_host(scala, a, b) 31 | elapsed_time_host= (timer() - start) * 1e3 32 | 33 | start = timer() 34 | c = saxpy_cuda(scala, a, b) 35 | elapsed_time_cuda = (timer() - start) * 1e3 36 | 37 | print("[%d]: \t%.3f ms\t %.3f ms" % (N, elapsed_time_cuda, elapsed_time_host)) 38 | -------------------------------------------------------------------------------- /Chapter08/08_cuda_libs_and_other_languages/08_nvblas/exec_fft.m: -------------------------------------------------------------------------------- 1 | # FFT 2 | 3 | num_sample = 8192 4 | x = single(rand(num_sample)); 5 | n_fft = 2^nextpow2(num_sample); 6 | 7 | start = clock(); 8 | y = fft(x, n_fft); 9 | ix = ifft(y, n_fft); 10 | elapsedTime = etime(clock(), start); 11 | 12 | printf("Elapsed Time: %.3f ms\n", elapsedTime); 13 | -------------------------------------------------------------------------------- /Chapter08/08_cuda_libs_and_other_languages/08_nvblas/fft.R: -------------------------------------------------------------------------------- 1 | # FFT using R 2 | 3 | x <- 1:2^30 4 | elapsedTime = system.time({ 5 | fft(fft(x), inverse = TRUE)/length(x) 6 | })[3] 7 | print(sprintf("Elapsed Time: %3.3f ms", elapsedTime)) -------------------------------------------------------------------------------- /Chapter08/08_cuda_libs_and_other_languages/08_nvblas/nvblas.conf: -------------------------------------------------------------------------------- 1 | 
#Put here the CPU BLAS fallback Library of your choice 2 | NVBLAS_CPU_BLAS_LIB libopenblas.so 3 | 4 | # Specify which output log file (default is stderr) 5 | NVBLAS_LOGFILE nvblas.log 6 | 7 | # List of GPU devices Id to participate to the computation 8 | # By default if no GPU are listed, only device 0 will be used 9 | NVBLAS_GPU_LIST 0 10 | NVBLAS_AUTOPIN_MEM_ENABLED 11 | -------------------------------------------------------------------------------- /Chapter08/08_cuda_libs_and_other_languages/08_nvblas/sgemm.R: -------------------------------------------------------------------------------- 1 | # Matrix Multiplication using R 2 | for(i in seq(1:6)) { 3 | N = 512*(2^i) 4 | A = matrix(rnorm(N^2, mean=0, sd=1), nrow=N) 5 | B = matrix(rnorm(N^2, mean=0, sd=1), nrow=N) 6 | elapsedTime = system.time({C = A %*% B})[3] 7 | gFlops = 2*N*N*N/(elapsedTime * 1e+9); 8 | print(sprintf("Elapsed Time [%d]: %3.3f ms, %.3f GFlops", N, elapsedTime, gFlops)) 9 | } -------------------------------------------------------------------------------- /Chapter08/08_cuda_libs_and_other_languages/08_nvblas/sgemm.m: -------------------------------------------------------------------------------- 1 | # SGEMM 2 | 3 | for i = 1:6 4 | N = 512*(2^i); 5 | A = single(rand(N,N)); 6 | B = single(rand(N,N)); 7 | 8 | start = clock(); 9 | C = A * B; 10 | elapsedTime = etime(clock(), start); 11 | 12 | gFlops = 2*N*N*N/(elapsedTime * 1e+9); 13 | printf("Elapsed Time [%d]: %.3f ms, %.3f GFlops\n", N, elapsedTime, gFlops); 14 | end -------------------------------------------------------------------------------- /Chapter08/08_cuda_libs_and_other_languages/09_matlab/cuda.m: -------------------------------------------------------------------------------- 1 | N = 8192; 2 | A = single(rand(N,N)); 3 | B = single(rand(N,N)); 4 | 5 | d_A = gpuArray(A); 6 | d_B = gpuArray(B); 7 | 8 | start = clock(); 9 | % C = A * B; 10 | d_C = d_A * d_B; 11 | elapsedTime = etime(clock(), start); 12 | 13 | gFlops = 2*N*N*N/(elapsedTime * 1e+9); 14 | fprintf("Elapsed Time: %.3f ms, %.3f GFlops\n", elapsedTime, gFlops); -------------------------------------------------------------------------------- /Chapter08/08_cuda_libs_and_other_languages/09_matlab/host.m: -------------------------------------------------------------------------------- 1 | N = 8192; 2 | A = single(rand(N,N)); 3 | B = single(rand(N,N)); 4 | 5 | start = clock(); 6 | C = A * B; 7 | elapsedTime = etime(clock(), start); 8 | 9 | gFlops = 2*N*N*N/(elapsedTime * 1e+9); 10 | fprintf("Elapsed Time: %.3f ms, %.3f GFlops\n", elapsedTime, gFlops); -------------------------------------------------------------------------------- /Chapter08/08_cuda_libs_and_other_languages/Makefile: -------------------------------------------------------------------------------- 1 | # Project folders that contain CUDA receipts 2 | PROJECTS ?= $(shell find \ 3 | $(shell ls -d */) \ 4 | -name Makefile) 5 | 6 | %.ph_build: 7 | +@$(MAKE) -C $(dir $*) $(MAKECMDGOALS) 8 | 9 | %.ph_clean: 10 | +@$(MAKE) -C $(dir $*) clean $(USE_DEVICE) 11 | 12 | all: $(addsuffix .ph_build,$(PROJECTS)) 13 | @echo "Finished building CUDA Receipts" 14 | 15 | build: $(addsuffix .ph_build,$(PROJECTS)) 16 | 17 | tidy: 18 | @find * | egrep "#" | xargs rm -f 19 | @find * | egrep "\~" | xargs rm -f 20 | @find * | egrep "nvvp" | xargs rm -f 21 | 22 | .PHONY: clean 23 | clean: tidy $(addsuffix .ph_clean,$(PROJECTS)) 24 | 25 | test: 26 | echo $(DIRECTORY) 27 | -------------------------------------------------------------------------------- 
/Chapter09/09_openacc/Makefile: -------------------------------------------------------------------------------- 1 | CUDA_PATH=/usr/local/cuda 2 | PGCXX=pgc++ 3 | TARGET=merging 4 | 5 | PGCXX_FLAGS= -acc -ta=tesla:managed -Minfo=accel 6 | 7 | all: ${TARGET} 8 | 9 | merging: image_merging.cpp scrImagePgmPpmPackage.cpp 10 | $(EXEC) $(PGCXX) -o $@ $+ 11 | 12 | clean: 13 | rm -f ${TARGET} *.o 14 | -------------------------------------------------------------------------------- /Chapter09/09_openacc/cat.pgm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Learn-CUDA-Programming/10bd9b59bfcc680d608ea1417da5089f73d91b11/Chapter09/09_openacc/cat.pgm -------------------------------------------------------------------------------- /Chapter09/09_openacc/dog.pgm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Learn-CUDA-Programming/10bd9b59bfcc680d608ea1417da5089f73d91b11/Chapter09/09_openacc/dog.pgm -------------------------------------------------------------------------------- /Chapter09/09_openacc/scrImagePgmPpmPackage.h: -------------------------------------------------------------------------------- 1 | 2 | #include 3 | 4 | 5 | int scr_read_pgm( char* name, unsigned char* image, int irows, int icols ); 6 | void scr_write_pgm( char* name, unsigned char* image, int rows, int cols, char* comment ); 7 | int scr_read_ppm( char* name, unsigned char* image, int irows, int icols ); 8 | void scr_write_ppm( char* name, unsigned char* image, int rows, int cols, char* comment ); 9 | void get_PgmPpmParams(char * , int *, int *); 10 | void getout_comment(FILE * ); 11 | -------------------------------------------------------------------------------- /Chapter10/10_deep_learning/01_ann/Makefile: -------------------------------------------------------------------------------- 1 | CUDA_PATH=/usr/local/cuda 2 | HOST_COMPILER ?= g++ 3 | NVCC=${CUDA_PATH}/bin/nvcc -ccbin ${HOST_COMPILER} 4 | TARGET=train 5 | 6 | INCLUDES = -I${CUDA_PATH}/samples/common/inc -I$(CUDA_PATH)/include 7 | NVCC_FLAGS=-G --resource-usage -Xcompiler -rdynamic -Xcompiler -fopenmp -rdc=true -lnvToolsExt 8 | 9 | IS_CUDA_11:=${shell expr `$(NVCC) --version | grep compilation | grep -Eo -m 1 '[0-9]+.[0-9]' | head -1` \>= 11.0} 10 | 11 | # Gencode argumentes 12 | SMS = 35 37 50 52 60 61 70 75 13 | ifeq "$(IS_CUDA_11)" "1" 14 | SMS = 52 60 61 70 75 80 15 | endif 16 | $(foreach sm, ${SMS}, $(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm))) 17 | 18 | LIBRARIES += -L/usr/local/cuda/lib -lcublas -lcudnn -lgomp -lcurand 19 | ALL_CCFLAGS += -m64 -g -std=c++11 $(NVCC_FLAGS) $(INCLUDES) $(LIBRARIES) 20 | 21 | SRC_DIR = src 22 | OBJ_DIR = obj 23 | 24 | all : ${TARGET} 25 | 26 | INCS = ${SRC_DIR}/helper.h ${SRC_DIR}/blob.h ${SRC_DIR}/blob.h ${SRC_DIR}/layer.h 27 | 28 | ${OBJ_DIR}/%.o: ${SRC_DIR}/%.cpp ${INCS} 29 | $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -c $< -o $@ 30 | ${OBJ_DIR}/%.o: ${SRC_DIR}/%.cu ${INCS} 31 | $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -c $< -o $@ 32 | 33 | ${OBJ_DIR}/train.o: train.cpp ${INCS} 34 | @mkdir -p $(@D) 35 | $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -c $< -o $@ 36 | 37 | OBJS = ${OBJ_DIR}/train.o ${OBJ_DIR}/mnist.o ${OBJ_DIR}/loss.o ${OBJ_DIR}/layer.o ${OBJ_DIR}/network.o 38 | 39 | train: $(OBJS) 40 | $(EXEC) $(NVCC) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ $+ 41 | 42 | .PHONY: clean 43 | clean: 44 | rm -f ${TARGET} 
${OBJ_DIR}/*.o 45 | 46 | -------------------------------------------------------------------------------- /Chapter10/10_deep_learning/01_ann/download_mnist.bat: -------------------------------------------------------------------------------- 1 | echo off 2 | REM setting variables 3 | set zipPath="C:\Program Files\7-Zip\7z.exe" 4 | set train_images="train-images-idx3-ubyte.gz" 5 | set train_labels="train-labels-idx1-ubyte.gz" 6 | set test_images="t10k-images-idx3-ubyte.gz" 7 | set test_labels="t10k-labels-idx1-ubyte.gz" 8 | set url_base="http://yann.lecun.com/exdb/mnist" 9 | 10 | REM check if 7-zip installed 11 | IF NOT EXIST %zipPath% GOTO NO_7ZIP 12 | 13 | REM create dataset folder for the datasets 14 | mkdir dataset 15 | cd dataset 16 | 17 | REM download datasets 18 | curl -O %url_base%/%train_images% 19 | %zipPath% e .\train-images-idx3-ubyte.gz 20 | curl -O %url_base%/train-labels-idx1-ubyte.gz 21 | %zipPath% e .\train-labels-idx1-ubyte.gz 22 | curl -O %url_base%/t10k-images-idx3-ubyte.gz 23 | %zipPath% e .\t10k-images-idx3-ubyte.gz 24 | curl -O %url_base%/t10k-labels-idx1-ubyte.gz 25 | %zipPath% e .\t10k-labels-idx1-ubyte.gz 26 | 27 | exit 28 | 29 | REM exception: no 7-zip found 30 | :NO_7ZIP 31 | echo "Please install 7-zip to extract downloaded MNIST dataset" 32 | exit /b 1 -------------------------------------------------------------------------------- /Chapter10/10_deep_learning/01_ann/download_mnist.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | url_base=http://yann.lecun.com/exdb/mnist 4 | 5 | mkdir -p dataset 6 | cd dataset 7 | 8 | curl -O ${url_base}/train-images-idx3-ubyte.gz 9 | curl -O ${url_base}/train-labels-idx1-ubyte.gz 10 | curl -O ${url_base}/t10k-images-idx3-ubyte.gz 11 | curl -O ${url_base}/t10k-labels-idx1-ubyte.gz 12 | 13 | gunzip *.gz -------------------------------------------------------------------------------- /Chapter10/10_deep_learning/01_ann/src/loss.h: -------------------------------------------------------------------------------- 1 | #ifndef _LOSS_H_ 2 | #define _LOSS_H_ 3 | 4 | #include "blob.h" 5 | 6 | namespace cudl 7 | { 8 | 9 | class CrossEntropyLoss 10 | { 11 | public: 12 | CrossEntropyLoss(); 13 | ~CrossEntropyLoss(); 14 | 15 | float loss(Blob *predict, Blob *target); 16 | float accuracy(Blob *predict, Blob *target); 17 | 18 | private: 19 | // reduced loss 20 | float h_loss_ = 0.f; 21 | float *d_loss_ = nullptr; 22 | 23 | float *d_workspace_ = nullptr; 24 | void init_workspace(int batch_size); 25 | }; 26 | 27 | } // namespace cudl 28 | 29 | #endif // _LOSS_H_ -------------------------------------------------------------------------------- /Chapter10/10_deep_learning/01_ann/src/network.h: -------------------------------------------------------------------------------- 1 | #ifndef _NETWORK_H_ 2 | #define _NETWORK_H_ 3 | 4 | #include 5 | #include 6 | 7 | #include 8 | 9 | #include "helper.h" 10 | #include "loss.h" 11 | #include "layer.h" 12 | 13 | namespace cudl { 14 | 15 | typedef enum { 16 | training, 17 | inference 18 | } WorkloadType; 19 | 20 | class Network 21 | { 22 | public: 23 | Network(); 24 | ~Network(); 25 | 26 | void add_layer(Layer *layer); 27 | 28 | Blob *forward(Blob *input); 29 | void backward(Blob *input = nullptr); 30 | void update(float learning_rate = 0.02f); 31 | 32 | int load_pretrain(); 33 | int write_file(); 34 | 35 | float loss(Blob *target); 36 | int get_accuracy(Blob *target); 37 | 38 | void cuda(); 39 | void train(); 40 | void test(); 41 | 42 | Blob *output_; 43 | 
44 | std::vector layers(); 45 | 46 | 47 | private: 48 | std::vector layers_; 49 | 50 | CudaContext *cuda_ = nullptr; 51 | 52 | WorkloadType phase_ = inference; 53 | }; 54 | 55 | } // namespace cudl 56 | 57 | 58 | #endif // _NETWORK_H_ -------------------------------------------------------------------------------- /Chapter10/10_deep_learning/02_cnn/Makefile: -------------------------------------------------------------------------------- 1 | CUDA_PATH=/usr/local/cuda 2 | HOST_COMPILER ?= g++ 3 | NVCC=${CUDA_PATH}/bin/nvcc -ccbin ${HOST_COMPILER} 4 | TARGET=train 5 | 6 | INCLUDES = -I${CUDA_PATH}/samples/common/inc -I$(CUDA_PATH)/include 7 | NVCC_FLAGS=-G --resource-usage -Xcompiler -rdynamic -Xcompiler -fopenmp -rdc=true -lnvToolsExt 8 | 9 | IS_CUDA_11:=${shell expr `$(NVCC) --version | grep compilation | grep -Eo -m 1 '[0-9]+.[0-9]' | head -1` \>= 11.0} 10 | 11 | # Gencode argumentes 12 | SMS = 35 37 50 52 60 61 70 75 13 | ifeq "$(IS_CUDA_11)" "1" 14 | SMS = 52 60 61 70 75 80 15 | endif 16 | $(foreach sm, ${SMS}, $(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm))) 17 | 18 | LIBRARIES += -L/usr/local/cuda/lib -lcublas -lcudnn -lgomp -lcurand 19 | ALL_CCFLAGS += -m64 -g -std=c++11 $(NVCC_FLAGS) $(INCLUDES) $(LIBRARIES) 20 | 21 | SRC_DIR = src 22 | OBJ_DIR = obj 23 | 24 | all : ${TARGET} 25 | 26 | INCS = ${SRC_DIR}/helper.h ${SRC_DIR}/blob.h ${SRC_DIR}/blob.h ${SRC_DIR}/layer.h 27 | 28 | ${OBJ_DIR}/%.o: ${SRC_DIR}/%.cpp ${INCS} 29 | $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -c $< -o $@ 30 | ${OBJ_DIR}/%.o: ${SRC_DIR}/%.cu ${INCS} 31 | $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -c $< -o $@ 32 | 33 | ${OBJ_DIR}/train.o: train.cpp ${INCS} 34 | @mkdir -p $(@D) 35 | $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -c $< -o $@ 36 | 37 | OBJS = ${OBJ_DIR}/train.o ${OBJ_DIR}/mnist.o ${OBJ_DIR}/loss.o ${OBJ_DIR}/layer.o ${OBJ_DIR}/network.o 38 | 39 | train: $(OBJS) 40 | $(EXEC) $(NVCC) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ $+ 41 | 42 | .PHONY: clean 43 | clean: 44 | rm -f ${TARGET} ${OBJ_DIR}/*.o 45 | 46 | -------------------------------------------------------------------------------- /Chapter10/10_deep_learning/02_cnn/download_mnist.bat: -------------------------------------------------------------------------------- 1 | echo off 2 | REM setting variables 3 | set zipPath="C:\Program Files\7-Zip\7z.exe" 4 | set train_images="train-images-idx3-ubyte.gz" 5 | set train_labels="train-labels-idx1-ubyte.gz" 6 | set test_images="t10k-images-idx3-ubyte.gz" 7 | set test_labels="t10k-labels-idx1-ubyte.gz" 8 | set url_base="http://yann.lecun.com/exdb/mnist" 9 | 10 | REM check if 7-zip installed 11 | IF NOT EXIST %zipPath% GOTO NO_7ZIP 12 | 13 | REM create dataset folder for the datasets 14 | mkdir dataset 15 | cd dataset 16 | 17 | REM download datasets 18 | curl -O %url_base%/%train_images% 19 | %zipPath% e .\train-images-idx3-ubyte.gz 20 | curl -O %url_base%/train-labels-idx1-ubyte.gz 21 | %zipPath% e .\train-labels-idx1-ubyte.gz 22 | curl -O %url_base%/t10k-images-idx3-ubyte.gz 23 | %zipPath% e .\t10k-images-idx3-ubyte.gz 24 | curl -O %url_base%/t10k-labels-idx1-ubyte.gz 25 | %zipPath% e .\t10k-labels-idx1-ubyte.gz 26 | 27 | exit 28 | 29 | REM exception: no 7-zip found 30 | :NO_7ZIP 31 | echo "Please install 7-zip to extract downloaded MNIST dataset" 32 | exit /b 1 -------------------------------------------------------------------------------- /Chapter10/10_deep_learning/02_cnn/download_mnist.sh: 
-------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | url_base=http://yann.lecun.com/exdb/mnist 4 | 5 | mkdir -p dataset 6 | cd dataset 7 | 8 | curl -O ${url_base}/train-images-idx3-ubyte.gz 9 | curl -O ${url_base}/train-labels-idx1-ubyte.gz 10 | curl -O ${url_base}/t10k-images-idx3-ubyte.gz 11 | curl -O ${url_base}/t10k-labels-idx1-ubyte.gz 12 | 13 | gunzip *.gz -------------------------------------------------------------------------------- /Chapter10/10_deep_learning/02_cnn/src/loss.h: -------------------------------------------------------------------------------- 1 | #ifndef _LOSS_H_ 2 | #define _LOSS_H_ 3 | 4 | #include "blob.h" 5 | 6 | namespace cudl 7 | { 8 | 9 | class CrossEntropyLoss 10 | { 11 | public: 12 | CrossEntropyLoss(); 13 | ~CrossEntropyLoss(); 14 | 15 | float loss(Blob *predict, Blob *target); 16 | float accuracy(Blob *predict, Blob *target); 17 | 18 | private: 19 | // reduced loss 20 | float h_loss_ = 0.f; 21 | float *d_loss_ = nullptr; 22 | 23 | float *d_workspace_ = nullptr; 24 | void init_workspace(int batch_size); 25 | }; 26 | 27 | } // namespace cudl 28 | 29 | #endif // _LOSS_H_ -------------------------------------------------------------------------------- /Chapter10/10_deep_learning/02_cnn/src/network.h: -------------------------------------------------------------------------------- 1 | #ifndef _NETWORK_H_ 2 | #define _NETWORK_H_ 3 | 4 | #include 5 | #include 6 | 7 | #include 8 | 9 | #include "helper.h" 10 | #include "loss.h" 11 | #include "layer.h" 12 | 13 | namespace cudl { 14 | 15 | typedef enum { 16 | training, 17 | inference 18 | } WorkloadType; 19 | 20 | class Network 21 | { 22 | public: 23 | Network(); 24 | ~Network(); 25 | 26 | void add_layer(Layer *layer); 27 | 28 | Blob *forward(Blob *input); 29 | void backward(Blob *input = nullptr); 30 | void update(float learning_rate = 0.02f); 31 | 32 | int load_pretrain(); 33 | int write_file(); 34 | 35 | float loss(Blob *target); 36 | int get_accuracy(Blob *target); 37 | 38 | void cuda(); 39 | void train(); 40 | void test(); 41 | 42 | Blob *output_; 43 | 44 | std::vector layers(); 45 | 46 | 47 | private: 48 | std::vector layers_; 49 | 50 | CudaContext *cuda_ = nullptr; 51 | 52 | WorkloadType phase_ = inference; 53 | }; 54 | 55 | } // namespace cudl 56 | 57 | 58 | #endif // _NETWORK_H_ -------------------------------------------------------------------------------- /Chapter10/10_deep_learning/03_rnn/Makefile: -------------------------------------------------------------------------------- 1 | CUDA_PATH=/usr/local/cuda 2 | HOST_COMPILER ?= g++ 3 | NVCC=${CUDA_PATH}/bin/nvcc -ccbin ${HOST_COMPILER} 4 | TARGET=rnn 5 | 6 | INCLUDES = -I${CUDA_PATH}/samples/common/inc -I$(CUDA_PATH)/include 7 | NVCC_FLAGS=--resource-usage -Xcompiler -rdynamic -Xcompiler -fopenmp -rdc=true 8 | 9 | IS_CUDA_11:=${shell expr `$(NVCC) --version | grep compilation | grep -Eo -m 1 '[0-9]+.[0-9]' | head -1` \>= 11.0} 10 | 11 | # Gencode argumentes 12 | SMS = 35 37 50 52 60 61 70 75 13 | ifeq "$(IS_CUDA_11)" "1" 14 | SMS = 52 60 61 70 75 80 15 | endif 16 | $(foreach sm, ${SMS}, $(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm))) 17 | 18 | LIBRARIES += -L/usr/local/cuda/lib -lcublas -lcudnn -lgomp -lcurand 19 | ALL_CCFLAGS += -m64 -g -std=c++11 $(NVCC_FLAGS) $(INCLUDES) $(LIBRARIES) 20 | 21 | all : ${TARGET} 22 | 23 | rnn: rnn.cpp 24 | $(EXEC) $(NVCC) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ $+ 25 | 26 | clean: 27 | rm -f ${TARGET} ${OBJ_DIR}/*.o 28 | 29 | 
-------------------------------------------------------------------------------- /Chapter10/10_deep_learning/04_framework_profile/pytorch/README.md: -------------------------------------------------------------------------------- 1 | PyTorch Training profile with nvprof and NVTX. 2 | 3 | To simplify the working environment configuration, the profile commands depend on NGC PyTorch container. If your working environment is not ready to use NGC, please visit https://ngc.nvidia.com and configure your working environment following the NGC user guide. -------------------------------------------------------------------------------- /Chapter10/10_deep_learning/04_framework_profile/pytorch/RN50v1.5/GPU_1.log: -------------------------------------------------------------------------------- 1 | => creating model '('resnet50', 'classic')' 2 | Version: {'net': , 'block': , 'layers': [3, 4, 6, 3], 'num_classes': 1000} 3 | Config: {'conv': , 'conv_init': 'fan_out', 'nonlinearity': 'relu', 'last_bn_0_init': False, 'activation': at 0x7f5e241dc378>} 4 | WARNING: `HostDecoderRandomCrop` is now deprecated. Use `ImageDecoderRandomCrop` instead 5 | read 1281167 files from 1000 directories 6 | WARNING: `nvJPEGDecoder` is now deprecated. Use `ImageDecoder` instead 7 | read 50000 files from 1000 directories 8 | ! Weight decay NOT applied to BN parameters 9 | 98 10 | 63 11 | -------------------------------------------------------------------------------- /Chapter10/10_deep_learning/04_framework_profile/pytorch/RN50v1.5/examples/RN50_FP16_1GPU.sh: -------------------------------------------------------------------------------- 1 | # This script launches ResNet50 training in FP16 on 1 GPUs using 256 batch size (256 per GPU) 2 | # Usage ./RN50_FP16_1GPU.sh 3 | 4 | python $1/main.py -j5 -p 500 --arch resnet50 -c fanin --label-smoothing 0.1 -b 256 --lr 0.1 --epochs 90 --fp16 --static-loss-scale 256 $2 /data/imagenet 5 | -------------------------------------------------------------------------------- /Chapter10/10_deep_learning/04_framework_profile/pytorch/RN50v1.5/examples/RN50_FP16_4GPU.sh: -------------------------------------------------------------------------------- 1 | # This script launches ResNet50 training in FP16 on 4 GPUs using 1024 batch size (256 per GPU) 2 | # Usage ./RN50_FP16_4GPU.sh 3 | 4 | python $1/multiproc.py --nproc_per_node 4 $1/main.py -j5 -p 500 --arch resnet50 -c fanin --label-smoothing 0.1 -b 256 --lr 0.4 --warmup 5 --epochs 90 --fp16 --static-loss-scale 256 $2 /data/imagenet 5 | -------------------------------------------------------------------------------- /Chapter10/10_deep_learning/04_framework_profile/pytorch/RN50v1.5/examples/RN50_FP16_8GPU.sh: -------------------------------------------------------------------------------- 1 | # This script launches ResNet50 training in FP16 on 8 GPUs using 2048 batch size (256 per GPU) 2 | # Usage ./RN50_FP16_8GPU.sh 3 | 4 | python $1/multiproc.py --nproc_per_node 8 $1/main.py -j5 -p 500 --arch resnet50 -c fanin --label-smoothing 0.1 -b 256 --lr 0.8 --warmup 5 --epochs 90 --fp16 --static-loss-scale 256 $2 /data/imagenet 5 | -------------------------------------------------------------------------------- /Chapter10/10_deep_learning/04_framework_profile/pytorch/RN50v1.5/examples/RN50_FP16_EVAL.sh: -------------------------------------------------------------------------------- 1 | # This script evaluates ResNet50 model in FP16 using 64 batch size on 1 GPU 2 | # Usage: ./RN50_FP16_EVAL.sh 3 | 4 | python $1/main.py -j5 p 100 --arch resnet50 -b 256 
--resume $2 --evaluate --fp16 /data/imagenet 5 | -------------------------------------------------------------------------------- /Chapter10/10_deep_learning/04_framework_profile/pytorch/RN50v1.5/examples/RN50_FP16_INFERENCE_BENCHMARK.sh: -------------------------------------------------------------------------------- 1 | # This script launches ResNet50 inference benchmark in FP16 on 1 GPU with 256 batch size 2 | 3 | python ./main.py -j5 --arch resnet50 -b 256 --fp16 --benchmark-inference /data/imagenet 4 | -------------------------------------------------------------------------------- /Chapter10/10_deep_learning/04_framework_profile/pytorch/RN50v1.5/examples/RN50_FP32_1GPU.sh: -------------------------------------------------------------------------------- 1 | # This script launches ResNet50 training in FP32 on 1 GPUs using 128 batch size (128 per GPU) 2 | # Usage ./RN50_FP32_1GPU.sh 3 | 4 | python $1/main.py -j5 -p 500 --arch resnet50 -c fanin --label-smoothing 0.1 -b 128 --lr 0.05 --epochs 90 $2 /data/imagenet 5 | -------------------------------------------------------------------------------- /Chapter10/10_deep_learning/04_framework_profile/pytorch/RN50v1.5/examples/RN50_FP32_4GPU.sh: -------------------------------------------------------------------------------- 1 | # This script launches ResNet50 training in FP32 on 4 GPUs using 512 batch size (128 per GPU) 2 | # Usage ./RN50_FP32_4GPU.sh 3 | 4 | python $1/multiproc.py --nproc_per_node 4 $1/main.py -j5 -p 500 --arch resnet50 -c fanin --label-smoothing 0.1 -b 128 --lr 0.2 --warmup 5 --epochs 90 $2 /data/imagenet 5 | -------------------------------------------------------------------------------- /Chapter10/10_deep_learning/04_framework_profile/pytorch/RN50v1.5/examples/RN50_FP32_8GPU.sh: -------------------------------------------------------------------------------- 1 | # This script launches ResNet50 training in FP32 on 8 GPUs using 1024 batch size (128 per GPU) 2 | # Usage ./RN50_FP32_8GPU.sh 3 | 4 | python $1/multiproc.py --nproc_per_node 8 $1/main.py -j5 -p 500 --arch resnet50 -c fanin --label-smoothing 0.1 -b 128 --lr 0.4 --warmup 5 --epochs 90 $2 /data/imagenet 5 | -------------------------------------------------------------------------------- /Chapter10/10_deep_learning/04_framework_profile/pytorch/RN50v1.5/examples/RN50_FP32_EVAL.sh: -------------------------------------------------------------------------------- 1 | # This script evaluates ResNet50 model in FP32 using 64 batch size on 1 GPU 2 | # Usage: ./RN50_FP32_EVAL.sh 3 | 4 | python $1/main.py -j5 p 100 --arch resnet50 -b 128 --resume $2 --evaluate /data/imagenet 5 | -------------------------------------------------------------------------------- /Chapter10/10_deep_learning/04_framework_profile/pytorch/RN50v1.5/examples/RN50_FP32_INFERENCE_BENCHMARK.sh: -------------------------------------------------------------------------------- 1 | # This script launches ResNet50 inference benchmark in FP32 on 1 GPU with 128 batch size 2 | 3 | python ./main.py -j5 --arch resnet50 -b 128 --benchmark-inference /data/imagenet 4 | -------------------------------------------------------------------------------- /Chapter10/10_deep_learning/04_framework_profile/pytorch/RN50v1.5/image_classification/__init__.py: -------------------------------------------------------------------------------- 1 | from . import logger 2 | from . import dataloaders 3 | from . import training 4 | from . import utils 5 | from . import mixup 6 | from . import resnet 7 | from . 
import smoothing 8 | -------------------------------------------------------------------------------- /Chapter10/10_deep_learning/04_framework_profile/pytorch/RN50v1.5/image_classification/smoothing.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | class LabelSmoothing(nn.Module): 5 | """ 6 | NLL loss with label smoothing. 7 | """ 8 | def __init__(self, smoothing=0.0): 9 | """ 10 | Constructor for the LabelSmoothing module. 11 | 12 | :param smoothing: label smoothing factor 13 | """ 14 | super(LabelSmoothing, self).__init__() 15 | self.confidence = 1.0 - smoothing 16 | self.smoothing = smoothing 17 | 18 | def forward(self, x, target): 19 | logprobs = torch.nn.functional.log_softmax(x, dim=-1) 20 | 21 | nll_loss = -logprobs.gather(dim=-1, index=target.unsqueeze(1)) 22 | nll_loss = nll_loss.squeeze(1) 23 | smooth_loss = -logprobs.mean(dim=-1) 24 | loss = self.confidence * nll_loss + self.smoothing * smooth_loss 25 | return loss.mean() 26 | 27 | -------------------------------------------------------------------------------- /Chapter10/10_deep_learning/04_framework_profile/pytorch/RN50v1.5/img/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Learn-CUDA-Programming/10bd9b59bfcc680d608ea1417da5089f73d91b11/Chapter10/10_deep_learning/04_framework_profile/pytorch/RN50v1.5/img/.gitkeep -------------------------------------------------------------------------------- /Chapter10/10_deep_learning/04_framework_profile/pytorch/RN50v1.5/img/DGX2_250_loss.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Learn-CUDA-Programming/10bd9b59bfcc680d608ea1417da5089f73d91b11/Chapter10/10_deep_learning/04_framework_profile/pytorch/RN50v1.5/img/DGX2_250_loss.png -------------------------------------------------------------------------------- /Chapter10/10_deep_learning/04_framework_profile/pytorch/RN50v1.5/img/DGX2_250_top1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Learn-CUDA-Programming/10bd9b59bfcc680d608ea1417da5089f73d91b11/Chapter10/10_deep_learning/04_framework_profile/pytorch/RN50v1.5/img/DGX2_250_top1.png -------------------------------------------------------------------------------- /Chapter10/10_deep_learning/04_framework_profile/pytorch/RN50v1.5/img/DGX2_250_top5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Learn-CUDA-Programming/10bd9b59bfcc680d608ea1417da5089f73d91b11/Chapter10/10_deep_learning/04_framework_profile/pytorch/RN50v1.5/img/DGX2_250_top5.png -------------------------------------------------------------------------------- /Chapter10/10_deep_learning/04_framework_profile/pytorch/RN50v1.5/img/training_accuracy.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Learn-CUDA-Programming/10bd9b59bfcc680d608ea1417da5089f73d91b11/Chapter10/10_deep_learning/04_framework_profile/pytorch/RN50v1.5/img/training_accuracy.png -------------------------------------------------------------------------------- /Chapter10/10_deep_learning/04_framework_profile/pytorch/RN50v1.5/img/training_loss.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/PacktPublishing/Learn-CUDA-Programming/10bd9b59bfcc680d608ea1417da5089f73d91b11/Chapter10/10_deep_learning/04_framework_profile/pytorch/RN50v1.5/img/training_loss.png -------------------------------------------------------------------------------- /Chapter10/10_deep_learning/04_framework_profile/pytorch/RN50v1.5/img/validation_accuracy.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Learn-CUDA-Programming/10bd9b59bfcc680d608ea1417da5089f73d91b11/Chapter10/10_deep_learning/04_framework_profile/pytorch/RN50v1.5/img/validation_accuracy.png -------------------------------------------------------------------------------- /Chapter10/10_deep_learning/04_framework_profile/pytorch/RN50v1.5/resnet50_pyt.qdrep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Learn-CUDA-Programming/10bd9b59bfcc680d608ea1417da5089f73d91b11/Chapter10/10_deep_learning/04_framework_profile/pytorch/RN50v1.5/resnet50_pyt.qdrep -------------------------------------------------------------------------------- /Chapter10/10_deep_learning/04_framework_profile/pytorch/RN50v1.5/resnet50_pyt_2g.qdrep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Learn-CUDA-Programming/10bd9b59bfcc680d608ea1417da5089f73d91b11/Chapter10/10_deep_learning/04_framework_profile/pytorch/RN50v1.5/resnet50_pyt_2g.qdrep -------------------------------------------------------------------------------- /Chapter10/10_deep_learning/04_framework_profile/pytorch/RN50v1.5/resnet50v1.5/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Learn-CUDA-Programming/10bd9b59bfcc680d608ea1417da5089f73d91b11/Chapter10/10_deep_learning/04_framework_profile/pytorch/RN50v1.5/resnet50v1.5/README.md -------------------------------------------------------------------------------- /Chapter10/10_deep_learning/04_framework_profile/pytorch/RN50v1.5/resnet50v1.5/training/DGX1_RN50_FP16_250E.sh: -------------------------------------------------------------------------------- 1 | python ./multiproc.py --nproc_per_node 8 ./main.py --raport-file raport.json -j5 -p 100 --lr 2.048 --optimizer-batch-size 2048 --warmup 8 --arch resnet50 -c fanin --label-smoothing 0.1 --data-backend pytorch --lr-schedule cosine --mom 0.875 --wd 3.0517578125e-05 --workspace $1 -b 256 --fp16 --static-loss-scale 128 --epochs 250 --mixup 0.2 /data/imagenet -------------------------------------------------------------------------------- /Chapter10/10_deep_learning/04_framework_profile/pytorch/RN50v1.5/resnet50v1.5/training/DGX1_RN50_FP16_50E.sh: -------------------------------------------------------------------------------- 1 | python ./multiproc.py --nproc_per_node 8 ./main.py --raport-file raport.json -j5 -p 100 --lr 2.048 --optimizer-batch-size 2048 --warmup 8 --arch resnet50 -c fanin --label-smoothing 0.1 --data-backend pytorch --lr-schedule cosine --mom 0.875 --wd 3.0517578125e-05 --workspace $1 -b 256 --fp16 --static-loss-scale 128 --epochs 50 /data/imagenet -------------------------------------------------------------------------------- /Chapter10/10_deep_learning/04_framework_profile/pytorch/RN50v1.5/resnet50v1.5/training/DGX1_RN50_FP16_90E.sh: -------------------------------------------------------------------------------- 1 | python 
./multiproc.py --nproc_per_node 8 ./main.py --raport-file raport.json -j5 -p 100 --lr 2.048 --optimizer-batch-size 2048 --warmup 8 --arch resnet50 -c fanin --label-smoothing 0.1 --data-backend pytorch --lr-schedule cosine --mom 0.875 --wd 3.0517578125e-05 --workspace $1 -b 256 --fp16 --static-loss-scale 128 --epochs 90 /data/imagenet -------------------------------------------------------------------------------- /Chapter10/10_deep_learning/04_framework_profile/pytorch/RN50v1.5/resnet50v1.5/training/DGX1_RN50_FP32_250E.sh: -------------------------------------------------------------------------------- 1 | python ./multiproc.py --nproc_per_node 8 ./main.py --raport-file raport.json -j5 -p 100 --lr 2.048 --optimizer-batch-size 2048 --warmup 8 --arch resnet50 -c fanin --label-smoothing 0.1 --data-backend pytorch --lr-schedule cosine --mom 0.875 --wd 3.0517578125e-05 --workspace $1 -b 128 --epochs 250 --mixup 0.2 /data/imagenet -------------------------------------------------------------------------------- /Chapter10/10_deep_learning/04_framework_profile/pytorch/RN50v1.5/resnet50v1.5/training/DGX1_RN50_FP32_50E.sh: -------------------------------------------------------------------------------- 1 | python ./multiproc.py --nproc_per_node 8 ./main.py --raport-file raport.json -j5 -p 100 --lr 2.048 --optimizer-batch-size 2048 --warmup 8 --arch resnet50 -c fanin --label-smoothing 0.1 --data-backend pytorch --lr-schedule cosine --mom 0.875 --wd 3.0517578125e-05 --workspace $1 -b 128 --epochs 50 /data/imagenet -------------------------------------------------------------------------------- /Chapter10/10_deep_learning/04_framework_profile/pytorch/RN50v1.5/resnet50v1.5/training/DGX1_RN50_FP32_90E.sh: -------------------------------------------------------------------------------- 1 | python ./multiproc.py --nproc_per_node 8 ./main.py --raport-file raport.json -j5 -p 100 --lr 2.048 --optimizer-batch-size 2048 --warmup 8 --arch resnet50 -c fanin --label-smoothing 0.1 --data-backend pytorch --lr-schedule cosine --mom 0.875 --wd 3.0517578125e-05 --workspace $1 -b 128 --epochs 90 /data/imagenet -------------------------------------------------------------------------------- /Chapter10/10_deep_learning/04_framework_profile/pytorch/RN50v1.5/resnet50v1.5/training/DGX2_RN50_FP16_250E.sh: -------------------------------------------------------------------------------- 1 | python ./multiproc.py --nproc_per_node 16 ./main.py --raport-file raport.json -j5 -p 100 --lr 4.096 --optimizer-batch-size 4096 --warmup 16 --arch resnet50 -c fanin --label-smoothing 0.1 --data-backend pytorch --lr-schedule cosine --mom 0.875 --wd 3.0517578125e-05 --workspace $1 -b 256 --fp16 --static-loss-scale 128 --epochs 250 --mixup 0.2 /data/imagenet -------------------------------------------------------------------------------- /Chapter10/10_deep_learning/04_framework_profile/pytorch/RN50v1.5/resnet50v1.5/training/DGX2_RN50_FP16_50E.sh: -------------------------------------------------------------------------------- 1 | python ./multiproc.py --nproc_per_node 16 ./main.py --raport-file raport.json -j5 -p 100 --lr 4.096 --optimizer-batch-size 4096 --warmup 16 --arch resnet50 -c fanin --label-smoothing 0.1 --data-backend pytorch --lr-schedule cosine --mom 0.875 --wd 3.0517578125e-05 --workspace $1 -b 256 --fp16 --static-loss-scale 128 --epochs 50 /data/imagenet -------------------------------------------------------------------------------- 
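The DGX-1 and DGX-2 training scripts in this directory (above and below) all derive --lr from --optimizer-batch-size with the linear-scaling rule of 0.256 per 256 images in the optimizer batch (2.048 for an optimizer batch of 2048, 4.096 for 4096); the FP32 variants keep the same rule and rely on gradient accumulation to reach the optimizer batch from the smaller per-GPU batch. As a minimal Python sketch of that calculation only, assuming the 0.256/256 base used in these scripts (the helper name is illustrative and not part of the repository):

def linear_scaled_lr(optimizer_batch_size, base_lr=0.256, base_batch=256):
    # Linear learning-rate scaling: base_lr per base_batch images in the optimizer batch.
    return base_lr * optimizer_batch_size / base_batch

# Reproduces the values hard-coded in the launch scripts:
print(linear_scaled_lr(2048))  # 2.048 -> DGX-1 scripts (--optimizer-batch-size 2048)
print(linear_scaled_lr(4096))  # 4.096 -> DGX-2 scripts (--optimizer-batch-size 4096)

Keeping the per-image learning rate constant while the optimizer batch grows is the usual justification for the linear-scaling rule, which is why the 8-GPU and 16-GPU recipes change only the learning rate and warmup while keeping the epoch budget the same.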
/Chapter10/10_deep_learning/04_framework_profile/pytorch/RN50v1.5/resnet50v1.5/training/DGX2_RN50_FP16_90E.sh: -------------------------------------------------------------------------------- 1 | python ./multiproc.py --nproc_per_node 16 ./main.py --raport-file raport.json -j5 -p 100 --lr 4.096 --optimizer-batch-size 4096 --warmup 16 --arch resnet50 -c fanin --label-smoothing 0.1 --data-backend pytorch --lr-schedule cosine --mom 0.875 --wd 3.0517578125e-05 --workspace $1 -b 256 --fp16 --static-loss-scale 128 --epochs 90 /data/imagenet -------------------------------------------------------------------------------- /Chapter10/10_deep_learning/04_framework_profile/pytorch/RN50v1.5/resnet50v1.5/training/DGX2_RN50_FP32_250E.sh: -------------------------------------------------------------------------------- 1 | python ./multiproc.py --nproc_per_node 16 ./main.py --raport-file raport.json -j5 -p 100 --lr 4.096 --optimizer-batch-size 4096 --warmup 16 --arch resnet50 -c fanin --label-smoothing 0.1 --data-backend pytorch --lr-schedule cosine --mom 0.875 --wd 3.0517578125e-05 --workspace $1 -b 128 --epochs 250 --mixup 0.2 /data/imagenet -------------------------------------------------------------------------------- /Chapter10/10_deep_learning/04_framework_profile/pytorch/RN50v1.5/resnet50v1.5/training/DGX2_RN50_FP32_50E.sh: -------------------------------------------------------------------------------- 1 | python ./multiproc.py --nproc_per_node 16 ./main.py --raport-file raport.json -j5 -p 100 --lr 4.096 --optimizer-batch-size 4096 --warmup 16 --arch resnet50 -c fanin --label-smoothing 0.1 --data-backend pytorch --lr-schedule cosine --mom 0.875 --wd 3.0517578125e-05 --workspace $1 -b 128 --epochs 50 /data/imagenet -------------------------------------------------------------------------------- /Chapter10/10_deep_learning/04_framework_profile/pytorch/RN50v1.5/resnet50v1.5/training/DGX2_RN50_FP32_90E.sh: -------------------------------------------------------------------------------- 1 | python ./multiproc.py --nproc_per_node 16 ./main.py --raport-file raport.json -j5 -p 100 --lr 4.096 --optimizer-batch-size 4096 --warmup 16 --arch resnet50 -c fanin --label-smoothing 0.1 --data-backend pytorch --lr-schedule cosine --mom 0.875 --wd 3.0517578125e-05 --workspace $1 -b 128 --epochs 90 /data/imagenet -------------------------------------------------------------------------------- /Chapter10/10_deep_learning/04_framework_profile/pytorch/RN50v1.5/test.qdrep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Learn-CUDA-Programming/10bd9b59bfcc680d608ea1417da5089f73d91b11/Chapter10/10_deep_learning/04_framework_profile/pytorch/RN50v1.5/test.qdrep -------------------------------------------------------------------------------- /Chapter10/10_deep_learning/04_framework_profile/pytorch/nsys-nvtx.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | CODE_PATH="RN50v1.5" 4 | DATASET_PATH="/raid/datasets/imagenet/raw-data/" 5 | OUTPUT_NAME="resnet50_pyt" 6 | 7 | # default profile 8 | docker run --rm -ti --runtime=nvidia \ 9 | -v $(pwd)/${CODE_PATH}:/workspace \ 10 | -v ${DATASET_PATH}:/imagenet \ 11 | nvcr.io/nvidia/pytorch:19.08-py3 \ 12 | nsys profile -t cuda,nvtx,cudnn,cublas -o ${OUTPUT_NAME} -f true -w true -y 60 -d 20 \ 13 | python /workspace/main.py --arch resnet50 -b 64 --fp16 /imagenet 14 |
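The nsys-nvtx.sh launcher above collects NVTX ranges together with the CUDA, cuDNN, and cuBLAS traces (-t cuda,nvtx,cudnn,cublas), so any NVTX annotations emitted by the training script appear as named ranges on the Nsight Systems timeline. As a hedged, standalone Python sketch of how such annotations look in PyTorch (an illustration only, not the repository's main.py):

import torch
import torch.cuda.nvtx as nvtx

# Annotate one training step with NVTX ranges so that nsys can group the
# kernels of each phase (forward / backward / optimizer) on the timeline.
model = torch.nn.Linear(1024, 1000).cuda()
criterion = torch.nn.CrossEntropyLoss().cuda()
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)

data = torch.randn(64, 1024, device="cuda")
target = torch.randint(0, 1000, (64,), device="cuda")

for step in range(10):
    nvtx.range_push("iteration {}".format(step))

    nvtx.range_push("forward")
    loss = criterion(model(data), target)
    nvtx.range_pop()

    nvtx.range_push("backward")
    optimizer.zero_grad()
    loss.backward()
    nvtx.range_pop()

    nvtx.range_push("optimizer step")
    optimizer.step()
    nvtx.range_pop()

    nvtx.range_pop()  # iteration

Profiling a script annotated this way with the nsys command above makes each iteration and phase visible in the NVTX row of the generated report; the .qdrep files checked in alongside (for example resnet50_pyt.qdrep) are such Nsight Systems reports.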
/Chapter10/10_deep_learning/04_framework_profile/tensorflow/RN50v1.5/baseline.qdrep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Learn-CUDA-Programming/10bd9b59bfcc680d608ea1417da5089f73d91b11/Chapter10/10_deep_learning/04_framework_profile/tensorflow/RN50v1.5/baseline.qdrep -------------------------------------------------------------------------------- /Chapter10/10_deep_learning/04_framework_profile/tensorflow/RN50v1.5/dllogger/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Learn-CUDA-Programming/10bd9b59bfcc680d608ea1417da5089f73d91b11/Chapter10/10_deep_learning/04_framework_profile/tensorflow/RN50v1.5/dllogger/__init__.py -------------------------------------------------------------------------------- /Chapter10/10_deep_learning/04_framework_profile/tensorflow/RN50v1.5/model/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from model import layers 16 | from model import blocks 17 | from model import resnet_v1_5 -------------------------------------------------------------------------------- /Chapter10/10_deep_learning/04_framework_profile/tensorflow/RN50v1.5/model/blocks/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | 18 | 19 | from model.blocks.conv2d_block import conv2d_block 20 | from model.blocks.resnet_bottleneck_block import bottleneck_block 21 | 22 | __all__ = [ 23 | 24 | # conv + bn + act block 25 | 'conv2d_block', 26 | 27 | # resnet blocks 28 | 'bottleneck_block' 29 | ] 30 | -------------------------------------------------------------------------------- /Chapter10/10_deep_learning/04_framework_profile/tensorflow/RN50v1.5/model/layers/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 
2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | #!/usr/bin/env python 16 | # -*- coding: utf-8 -*- 17 | 18 | from model.layers.activation import relu 19 | from model.layers.activation import softmax 20 | from model.layers.activation import tanh 21 | 22 | from model.layers.conv2d import conv2d 23 | 24 | from model.layers.dense import dense 25 | 26 | from model.layers.math_ops import reduce_mean 27 | 28 | from model.layers.normalization import batch_norm 29 | 30 | from model.layers.padding import pad 31 | 32 | from model.layers.pooling import average_pooling2d 33 | from model.layers.pooling import max_pooling2d 34 | 35 | __all__ = [ 36 | 37 | # activation layers 38 | 'relu', 39 | 'softmax', 40 | 'tanh', 41 | 42 | # conv layers 43 | 'conv2d', 44 | 45 | # dense layers 46 | 'dense', 47 | 48 | # math_ops layers 49 | 'reduce_mean', 50 | 51 | # normalization layers 52 | 'batch_norm', 53 | 54 | # padding layers 55 | 'pad', 56 | 57 | # pooling layers 58 | 'average_pooling2d', 59 | 'max_pooling2d' 60 | ] 61 | -------------------------------------------------------------------------------- /Chapter10/10_deep_learning/04_framework_profile/tensorflow/RN50v1.5/model/layers/activation.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | #!/usr/bin/env python 16 | # -*- coding: utf-8 -*- 17 | 18 | import tensorflow as tf 19 | 20 | __all__ = ['relu', 'softmax', 'tanh'] 21 | 22 | 23 | def relu(inputs, name='relu'): 24 | 25 | net = tf.nn.relu(inputs, name=name) 26 | 27 | return net 28 | 29 | 30 | def softmax(inputs, axis=None, name="softmax"): 31 | 32 | net = tf.nn.softmax( 33 | inputs, 34 | axis=axis, 35 | name=name, 36 | ) 37 | 38 | return net 39 | 40 | 41 | def tanh(inputs, name='tanh'): 42 | 43 | net = tf.math.tanh(inputs, name=name) 44 | 45 | return net 46 | -------------------------------------------------------------------------------- /Chapter10/10_deep_learning/04_framework_profile/tensorflow/RN50v1.5/model/layers/dense.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import tensorflow as tf 16 | 17 | __all__ = ['dense'] 18 | 19 | 20 | def dense( 21 | inputs, 22 | units, 23 | use_bias=True, 24 | trainable=True, 25 | kernel_initializer=tf.variance_scaling_initializer(), 26 | bias_initializer=tf.zeros_initializer() 27 | ): 28 | 29 | net = tf.layers.dense( 30 | inputs, 31 | units=units, 32 | activation=None, 33 | use_bias=use_bias, 34 | kernel_initializer=kernel_initializer, 35 | bias_initializer=bias_initializer, 36 | trainable=trainable 37 | ) 38 | 39 | return net 40 | -------------------------------------------------------------------------------- /Chapter10/10_deep_learning/04_framework_profile/tensorflow/RN50v1.5/model/layers/math_ops.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | #!/usr/bin/env python 16 | # -*- coding: utf-8 -*- 17 | 18 | import tensorflow as tf 19 | 20 | __all__ = ['reduce_mean'] 21 | 22 | 23 | def reduce_mean(inputs, keepdims=None, data_format='channels_last', name='spatial_mean'): 24 | 25 | if data_format not in ['NHWC', 'NCHW']: 26 | raise ValueError("Unknown data format: `%s` (accepted: ['NHWC', 'NCHW'])" % data_format) 27 | 28 | axes = [1, 2] if data_format == 'NHWC' else [2, 3] 29 | 30 | net = tf.math.reduce_mean(inputs, axis=axes, keepdims=keepdims, name=name) 31 | 32 | return net 33 | -------------------------------------------------------------------------------- /Chapter10/10_deep_learning/04_framework_profile/tensorflow/RN50v1.5/model/layers/padding.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | #!/usr/bin/env python 16 | # -*- coding: utf-8 -*- 17 | 18 | import tensorflow as tf 19 | 20 | __all__ = ['pad'] 21 | 22 | 23 | def pad(inputs, paddings, mode='CONSTANT', name='padding', constant_values=0): 24 | 25 | if mode.upper() not in ['CONSTANT', 'REFLECT', 'SYMMETRIC']: 26 | raise ValueError("Unknown padding mode: `%s` (accepted: ['CONSTANT', 'REFLECT', 'SYMMETRIC'])" % mode) 27 | 28 | net = tf.pad(inputs, paddings=paddings, mode=mode, name=name, constant_values=constant_values) 29 | 30 | return net 31 | -------------------------------------------------------------------------------- /Chapter10/10_deep_learning/04_framework_profile/tensorflow/RN50v1.5/requirements.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Learn-CUDA-Programming/10bd9b59bfcc680d608ea1417da5089f73d91b11/Chapter10/10_deep_learning/04_framework_profile/tensorflow/RN50v1.5/requirements.txt -------------------------------------------------------------------------------- /Chapter10/10_deep_learning/04_framework_profile/tensorflow/RN50v1.5/results/events.out.tfevents.1566195554.5b8c84c05f4e: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Learn-CUDA-Programming/10bd9b59bfcc680d608ea1417da5089f73d91b11/Chapter10/10_deep_learning/04_framework_profile/tensorflow/RN50v1.5/results/events.out.tfevents.1566195554.5b8c84c05f4e -------------------------------------------------------------------------------- /Chapter10/10_deep_learning/04_framework_profile/tensorflow/RN50v1.5/results/model.ckpt-1000.index: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Learn-CUDA-Programming/10bd9b59bfcc680d608ea1417da5089f73d91b11/Chapter10/10_deep_learning/04_framework_profile/tensorflow/RN50v1.5/results/model.ckpt-1000.index -------------------------------------------------------------------------------- /Chapter10/10_deep_learning/04_framework_profile/tensorflow/RN50v1.5/results/model.ckpt-2000.data-00001-of-00002: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Learn-CUDA-Programming/10bd9b59bfcc680d608ea1417da5089f73d91b11/Chapter10/10_deep_learning/04_framework_profile/tensorflow/RN50v1.5/results/model.ckpt-2000.data-00001-of-00002 -------------------------------------------------------------------------------- /Chapter10/10_deep_learning/04_framework_profile/tensorflow/RN50v1.5/runtime/__init__.py: -------------------------------------------------------------------------------- 1 | from runtime.runner import Runner -------------------------------------------------------------------------------- /Chapter10/10_deep_learning/04_framework_profile/tensorflow/RN50v1.5/scripts/RN50_FP16_16GPU.sh: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | # This script launches ResNet50 training in FP16 on 16 GPUs using 4096 batch size (256 per GPU) 16 | # Usage ./RN50_FP16_16GPU.sh 17 | 18 | mpiexec --allow-run-as-root --bind-to socket -np 16 \ 19 | python $1/main.py --mode=train_and_evaluate --iter_unit=epoch --num_iter=50 --batch_size=256 --warmup_steps=100 --use_cosine_lr --label_smoothing 0.1 --lr_init=0.256 --lr_warmup_epochs=8 --momentum=0.875 --weight_decay=3.0517578125e-05 --use_tf_amp --data_dir=$2 --results_dir=$3 -------------------------------------------------------------------------------- /Chapter10/10_deep_learning/04_framework_profile/tensorflow/RN50v1.5/scripts/RN50_FP16_1GPU.sh: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | # This script launches ResNet50 training in FP16 on 1 GPUs using 256 batch size (256 per GPU) 16 | # Usage ./RN50_FP16_1GPU.sh 17 | 18 | python $1/main.py --mode=train_and_evaluate --iter_unit=epoch --num_iter=50 --batch_size=256 --warmup_steps=100 --use_cosine_lr --label_smoothing 0.1 --lr_init=0.256 --lr_warmup_epochs=8 --momentum=0.875 --weight_decay=3.0517578125e-05 --use_tf_amp --data_dir=$2 --results_dir=$3 -------------------------------------------------------------------------------- /Chapter10/10_deep_learning/04_framework_profile/tensorflow/RN50v1.5/scripts/RN50_FP16_4GPU.sh: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | # This script launches ResNet50 training in FP16 on 4 GPUs using 1024 batch size (256 per GPU) 16 | # Usage ./RN50_FP16_4GPU.sh 17 | 18 | mpiexec --allow-run-as-root --bind-to socket -np 4 \ 19 | python $1/main.py --mode=train_and_evaluate --iter_unit=epoch --num_iter=50 --batch_size=256 --warmup_steps=100 --use_cosine_lr --label_smoothing 0.1 --lr_init=0.256 --lr_warmup_epochs=8 --momentum=0.875 --weight_decay=3.0517578125e-05 --use_tf_amp --data_dir=$2 --results_dir=$3 -------------------------------------------------------------------------------- /Chapter10/10_deep_learning/04_framework_profile/tensorflow/RN50v1.5/scripts/RN50_FP16_8GPU.sh: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | # This script launches ResNet50 training in FP16 on 8 GPUs using 2048 batch size (256 per GPU) 16 | # Usage ./RN50_FP16_8GPU.sh 17 | 18 | mpiexec --allow-run-as-root --bind-to socket -np 8 \ 19 | python $1/main.py --mode=train_and_evaluate --iter_unit=epoch --num_iter=50 --batch_size=256 --warmup_steps=100 --use_cosine_lr --label_smoothing 0.1 --lr_init=0.256 --lr_warmup_epochs=8 --momentum=0.875 --weight_decay=3.0517578125e-05 --use_tf_amp --data_dir=$2 --results_dir=$3 -------------------------------------------------------------------------------- /Chapter10/10_deep_learning/04_framework_profile/tensorflow/RN50v1.5/scripts/RN50_FP16_EVAL.sh: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | # This script evaluates ResNet50 model in FP16 using 256 batch size on 1 GPU 16 | # Usage: ./RN50_FP16_EVAL.sh 17 | 18 | python $1/main.py --mode=evaluate --data_dir=$2 --batch_size=256 --num_iter=1 --iter_unit=epoch --use_tf_amp --results_dir=$3 19 | -------------------------------------------------------------------------------- /Chapter10/10_deep_learning/04_framework_profile/tensorflow/RN50v1.5/scripts/RN50_FP32_16GPU.sh: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | # This script launches ResNet50 training in FP32 on 16 GPUs using 2048 batch size (128 per GPU) 16 | ## Usage ./RN50_FP32_16GPU.sh 17 | 18 | mpiexec --allow-run-as-root --bind-to socket -np 16 \ 19 | python $1/main.py --mode=train_and_evaluate --iter_unit=epoch --num_iter=50 --batch_size=128 --warmup_steps=100 --use_cosine_lr --label_smoothing 0.1 --lr_init=0.256 --lr_warmup_epochs=8 --momentum=0.875 --weight_decay=3.0517578125e-05 --data_dir=$2 --results_dir=$3 -------------------------------------------------------------------------------- /Chapter10/10_deep_learning/04_framework_profile/tensorflow/RN50v1.5/scripts/RN50_FP32_1GPU.sh: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | # This script launches ResNet50 training in FP32 on 1 GPUs using 128 batch size (128 per GPU) 16 | # Usage ./RN50_FP32_1GPU.sh 17 | 18 | python $1/main.py --mode=train_and_evaluate --iter_unit=epoch --num_iter=50 --batch_size=128 --warmup_steps=100 --use_cosine_lr --label_smoothing 0.1 --lr_init=0.256 --lr_warmup_epochs=8 --momentum=0.875 --weight_decay=3.0517578125e-05 --data_dir=$2 --results_dir=$3 -------------------------------------------------------------------------------- /Chapter10/10_deep_learning/04_framework_profile/tensorflow/RN50v1.5/scripts/RN50_FP32_4GPU.sh: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | # This script launches ResNet50 training in FP32 on 4 GPUs using 512 batch size (128 per GPU) 16 | # Usage ./RN50_FP32_4GPU.sh 17 | 18 | mpiexec --allow-run-as-root --bind-to socket -np 4 \ 19 | python $1/main.py --mode=train_and_evaluate --iter_unit=epoch --num_iter=50 --batch_size=128 --warmup_steps=100 --use_cosine_lr --label_smoothing 0.1 --lr_init=0.256 --lr_warmup_epochs=8 --momentum=0.875 --weight_decay=3.0517578125e-05 --data_dir=$2 --results_dir=$3 -------------------------------------------------------------------------------- /Chapter10/10_deep_learning/04_framework_profile/tensorflow/RN50v1.5/scripts/RN50_FP32_8GPU.sh: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | # This script launches ResNet50 training in FP32 on 8 GPUs using 1024 batch size (128 per GPU) 16 | ## Usage ./RN50_FP32_8GPU.sh 17 | 18 | mpiexec --allow-run-as-root --bind-to socket -np 8 \ 19 | python $1/main.py --mode=train_and_evaluate --iter_unit=epoch --num_iter=50 --batch_size=128 --warmup_steps=100 --use_cosine_lr --label_smoothing 0.1 --lr_init=0.256 --lr_warmup_epochs=8 --momentum=0.875 --weight_decay=3.0517578125e-05 --data_dir=$2 --results_dir=$3 -------------------------------------------------------------------------------- /Chapter10/10_deep_learning/04_framework_profile/tensorflow/RN50v1.5/scripts/RN50_FP32_EVAL.sh: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | # This script evaluates ResNet50 model in FP32 using 128 batch size on 1 GPU 16 | # Usage: ./RN50_FP32_EVAL.sh 17 | 18 | python $1/main.py --mode=evaluate --data_dir=$2 --batch_size=128 --num_iter=1 --iter_unit=epoch --results_dir=$3 19 | -------------------------------------------------------------------------------- /Chapter10/10_deep_learning/04_framework_profile/tensorflow/RN50v1.5/scripts/benchmarking/DGX1V_inferbench_fp16.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | mkdir -p /tmp/results 4 | 5 | python ./scripts/benchmarking/benchmark.py --mode inference --bench-warmup 100 --bench-iterations 200 --ngpus 1 --bs 1 2 4 8 16 32 64 128 256 --baseline ./scripts/benchmarking/baselines/DGX1V_RN50_tensorflow_infer_fp16.json --perf_args "use_tf_amp" --data_dir $1 --results_dir $2 6 | 7 | python ./scripts/benchmarking/benchmark.py --mode inference --bench-warmup 100 --bench-iterations 200 --ngpus 1 --bs 1 2 4 8 16 32 64 128 192 --baseline ./scripts/benchmarking/baselines/DGX1V_RN50_tensorflow_infer_fp16.json --perf_args "use_tf_amp" "use_xla" --data_dir $1 --results_dir $2/xla 8 | -------------------------------------------------------------------------------- /Chapter10/10_deep_learning/04_framework_profile/tensorflow/RN50v1.5/scripts/benchmarking/DGX1V_inferbench_fp32.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | mkdir -p /tmp/results 4 | 5 | python ./scripts/benchmarking/benchmark.py --mode inference --bench-warmup 100 --bench-iterations 200 --ngpus 1 --bs 1 2 4 8 16 32 64 128 --baseline ./scripts/benchmarking/baselines/DGX1V_RN50_tensorflow_infer_fp32.json --data_dir $1 --results_dir $2 6 | 7 | python ./scripts/benchmarking/benchmark.py --mode inference --bench-warmup 100 --bench-iterations 200 --ngpus 1 --bs 1 2 4 8 16 32 64 96 --baseline ./scripts/benchmarking/baselines/DGX1V_RN50_tensorflow_infer_fp32.json --perf_args "use_xla" --data_dir $1 --results_dir $2/xla -------------------------------------------------------------------------------- /Chapter10/10_deep_learning/04_framework_profile/tensorflow/RN50v1.5/scripts/benchmarking/DGX1V_trainbench_fp16.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | mkdir -p /tmp/results 4 | 5 | python ./scripts/benchmarking/benchmark.py --mode training --bench-warmup 200 --bench-iterations 500 --ngpus 1 4 8 --bs 64 128 256 --baseline ./scripts/benchmarking/baselines/DGX1V_RN50_tensorflow_train_fp16.json --data_dir $1 --perf_args "use_tf_amp" --results_dir $2 6 | 7 | python ./scripts/benchmarking/benchmark.py --mode training --bench-warmup 200 --bench-iterations 500 --ngpus 1 4 8 --bs 32 64 128 192 --baseline ./scripts/benchmarking/baselines/DGX1V_RN50_tensorflow_train_fp16.json --perf_args "use_xla" "use_tf_amp" --data_dir $1 --results_dir $2/xla 8 | 9 | -------------------------------------------------------------------------------- /Chapter10/10_deep_learning/04_framework_profile/tensorflow/RN50v1.5/scripts/benchmarking/DGX1V_trainbench_fp32.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | mkdir -p /tmp/results 4 | 5 | python ./scripts/benchmarking/benchmark.py --mode training --bench-warmup 200 --bench-iterations 500 --ngpus 1 4 8 --bs 32 64 128 --baseline ./scripts/benchmarking/baselines/DGX1V_RN50_tensorflow_train_fp32.json --data_dir $1 --results_dir $2 6 | 7 | python 
./scripts/benchmarking/benchmark.py --mode training --bench-warmup 200 --bench-iterations 500 --ngpus 1 4 8 --bs 32 64 96 --baseline ./scripts/benchmarking/baselines/DGX1V_RN50_tensorflow_train_fp32.json --perf_args "use_xla" --data_dir $1 --results_dir $2/xla 8 | -------------------------------------------------------------------------------- /Chapter10/10_deep_learning/04_framework_profile/tensorflow/RN50v1.5/scripts/benchmarking/DGX2_inferbench_fp16.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | mkdir -p /tmp/results 4 | 5 | python ./scripts/benchmarking/benchmark.py --mode inference --bench-warmup 100 --bench-iterations 200 --ngpus 1 --bs 1 2 4 8 16 32 64 128 256 --baseline ./scripts/benchmarking/baselines/DGX2_RN50_tensorflow_infer_fp16.json --perf_args "use_tf_amp" --data_dir $1 --results_dir $2 6 | 7 | python ./scripts/benchmarking/benchmark.py --mode inference --bench-warmup 100 --bench-iterations 200 --ngpus 1 --bs 1 2 4 8 16 32 64 128 256 --baseline ./scripts/benchmarking/baselines/DGX2_RN50_tensorflow_infer_fp16.json --perf_args "use_xla" "use_tf_amp" --data_dir $1 --results_dir $2/xla -------------------------------------------------------------------------------- /Chapter10/10_deep_learning/04_framework_profile/tensorflow/RN50v1.5/scripts/benchmarking/DGX2_inferbench_fp32.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | mkdir -p /tmp/results 4 | 5 | python ./scripts/benchmarking/benchmark.py --mode inference --bench-warmup 100 --bench-iterations 200 --ngpus 1 --bs 1 2 4 8 16 32 64 128 --baseline ./scripts/benchmarking/baselines/DGX2_RN50_tensorflow_infer_fp32.json --data_dir $1 --results_dir $2 6 | 7 | python ./scripts/benchmarking/benchmark.py --mode inference --bench-warmup 100 --bench-iterations 200 --ngpus 1 --bs 1 2 4 8 16 32 64 128 --baseline ./scripts/benchmarking/baselines/DGX2_RN50_tensorflow_infer_fp32.json --perf_args "use_xla" --data_dir $1 --results_dir $2/xla -------------------------------------------------------------------------------- /Chapter10/10_deep_learning/04_framework_profile/tensorflow/RN50v1.5/scripts/benchmarking/DGX2_trainbench_fp16.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | mkdir -p /tmp/results 4 | 5 | python ./scripts/benchmarking/benchmark.py --mode training --bench-warmup 200 --bench-iterations 500 --ngpus 1 4 8 16 --bs 64 128 256 --baseline ./scripts/benchmarking/baselines/DGX2_RN50_tensorflow_train_fp16.json --perf_args "use_tf_amp" --data_dir $1 --results_dir $2 6 | 7 | python ./scripts/benchmarking/benchmark.py --mode training --bench-warmup 200 --bench-iterations 500 --ngpus 1 4 8 16 --bs 64 128 256 --baseline ./scripts/benchmarking/baselines/DGX2_RN50_tensorflow_train_fp16.json --perf_args "use_xla" "use_tf_amp" --data_dir $1 --results_dir $2/xla -------------------------------------------------------------------------------- /Chapter10/10_deep_learning/04_framework_profile/tensorflow/RN50v1.5/scripts/benchmarking/DGX2_trainbench_fp32.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | mkdir -p /tmp/results 4 | 5 | python ./scripts/benchmarking/benchmark.py --mode training --bench-warmup 200 --bench-iterations 500 --ngpus 1 4 8 16 --bs 32 64 128 --baseline ./scripts/benchmarking/baselines/DGX2_RN50_tensorflow_train_fp32.json --data_dir $1 --results_dir $2 6 | 7 | python 
./scripts/benchmarking/benchmark.py --mode training --bench-warmup 200 --bench-iterations 500 --ngpus 1 4 8 16 --bs 32 64 128 --baseline ./scripts/benchmarking/baselines/DGX2_RN50_tensorflow_train_fp32.json --perf_args "use_xla" --data_dir $1 --results_dir $2/xla -------------------------------------------------------------------------------- /Chapter10/10_deep_learning/04_framework_profile/tensorflow/RN50v1.5/scripts/benchmarking/baselines/DGX1V_RN50_tensorflow_infer_fp16.json: -------------------------------------------------------------------------------- 1 | { 2 | "metric_keys": [ 3 | "total_ips" 4 | ], 5 | "metrics": { 6 | "1": { 7 | "16": { 8 | "total_ips": 1300.0 9 | }, 10 | "32": { 11 | "total_ips": 1600.0 12 | }, 13 | "1": { 14 | "total_ips": 160.0 15 | }, 16 | "2": { 17 | "total_ips": 320.0 18 | }, 19 | "64": { 20 | "total_ips": 1800.0 21 | }, 22 | "4": { 23 | "total_ips": 550.0 24 | }, 25 | "128": { 26 | "total_ips": 1950.0 27 | }, 28 | "8": { 29 | "total_ips": 950.0 30 | }, 31 | "256": { 32 | "total_ips": 2050.0 33 | } 34 | } 35 | }, 36 | "model": "", 37 | "ngpus": [ 38 | 1 39 | ], 40 | "bs": [ 41 | 1, 42 | 2, 43 | 4, 44 | 8, 45 | 16, 46 | 32, 47 | 64, 48 | 128, 49 | 256 50 | ] 51 | } 52 | -------------------------------------------------------------------------------- /Chapter10/10_deep_learning/04_framework_profile/tensorflow/RN50v1.5/scripts/benchmarking/baselines/DGX1V_RN50_tensorflow_infer_fp32.json: -------------------------------------------------------------------------------- 1 | { 2 | "metric_keys": [ 3 | "total_ips" 4 | ], 5 | "metrics": { 6 | "1": { 7 | "16": { 8 | "total_ips": 800.0 9 | }, 10 | "32": { 11 | "total_ips": 920.0 12 | }, 13 | "1": { 14 | "total_ips": 150.0 15 | }, 16 | "2": { 17 | "total_ips": 270.0 18 | }, 19 | "64": { 20 | "total_ips": 1000.0 21 | }, 22 | "4": { 23 | "total_ips": 450.0 24 | }, 25 | "128": { 26 | "total_ips": 1075.0 27 | }, 28 | "8": { 29 | "total_ips": 650.0 30 | } 31 | } 32 | }, 33 | "model": "", 34 | "ngpus": [ 35 | 1 36 | ], 37 | "bs": [ 38 | 1, 39 | 2, 40 | 4, 41 | 8, 42 | 16, 43 | 32, 44 | 64, 45 | 128 46 | ] 47 | } 48 | -------------------------------------------------------------------------------- /Chapter10/10_deep_learning/04_framework_profile/tensorflow/RN50v1.5/scripts/benchmarking/baselines/DGX1V_RN50_tensorflow_train_fp16.json: -------------------------------------------------------------------------------- 1 | { 2 | "metric_keys": [ 3 | "total_ips" 4 | ], 5 | "metrics": { 6 | "1": { 7 | "64": { 8 | "total_ips": 630.0 9 | }, 10 | "128": { 11 | "total_ips": 710.0 12 | }, 13 | "256": { 14 | "total_ips": 750.0 15 | } 16 | }, 17 | "4": { 18 | "64": { 19 | "total_ips": 2250.0 20 | }, 21 | "128": { 22 | "total_ips": 2600.0 23 | }, 24 | "256": { 25 | "total_ips": 2900.0 26 | } 27 | }, 28 | "8": { 29 | "64": { 30 | "total_ips": 4500.0 31 | }, 32 | "128": { 33 | "total_ips": 5300.0 34 | }, 35 | "256": { 36 | "total_ips": 5800.0 37 | } 38 | } 39 | }, 40 | "model": "", 41 | "ngpus": [ 42 | 1, 43 | 4, 44 | 8 45 | ], 46 | "bs": [ 47 | 64, 48 | 128, 49 | 256 50 | ] 51 | } 52 | -------------------------------------------------------------------------------- /Chapter10/10_deep_learning/04_framework_profile/tensorflow/RN50v1.5/scripts/benchmarking/baselines/DGX1V_RN50_tensorflow_train_fp32.json: -------------------------------------------------------------------------------- 1 | { 2 | "metric_keys": [ 3 | "total_ips" 4 | ], 5 | "metrics": { 6 | "1": { 7 | "32": { 8 | "total_ips": 300.0 9 | }, 10 | "64": { 11 | "total_ips": 330.0 12 | }, 
13 | "128": { 14 | "total_ips": 350.0 15 | } 16 | }, 17 | "4": { 18 | "32": { 19 | "total_ips": 1050.0 20 | }, 21 | "64": { 22 | "total_ips": 1250.0 23 | }, 24 | "128": { 25 | "total_ips": 1350.0 26 | } 27 | }, 28 | "8": { 29 | "32": { 30 | "total_ips": 2100.0 31 | }, 32 | "64": { 33 | "total_ips": 2500.0 34 | }, 35 | "128": { 36 | "total_ips": 2700.0 37 | } 38 | } 39 | }, 40 | "model": "", 41 | "ngpus": [ 42 | 1, 43 | 4, 44 | 8 45 | ], 46 | "bs": [ 47 | 32, 48 | 64, 49 | 128 50 | ] 51 | } 52 | -------------------------------------------------------------------------------- /Chapter10/10_deep_learning/04_framework_profile/tensorflow/RN50v1.5/scripts/benchmarking/baselines/DGX2_RN50_tensorflow_infer_fp16.json: -------------------------------------------------------------------------------- 1 | { 2 | "metric_keys": [ 3 | "total_ips" 4 | ], 5 | "metrics": { 6 | "1": { 7 | "16": { 8 | "total_ips": 1300.0 9 | }, 10 | "32": { 11 | "total_ips": 1600.0 12 | }, 13 | "1": { 14 | "total_ips": 160.0 15 | }, 16 | "2": { 17 | "total_ips": 320.0 18 | }, 19 | "64": { 20 | "total_ips": 1800.0 21 | }, 22 | "4": { 23 | "total_ips": 550.0 24 | }, 25 | "128": { 26 | "total_ips": 1950.0 27 | }, 28 | "8": { 29 | "total_ips": 950.0 30 | }, 31 | "256": { 32 | "total_ips": 2050.0 33 | } 34 | } 35 | }, 36 | "model": "", 37 | "ngpus": [ 38 | 1 39 | ], 40 | "bs": [ 41 | 1, 42 | 2, 43 | 4, 44 | 8, 45 | 16, 46 | 32, 47 | 64, 48 | 128, 49 | 256 50 | ] 51 | } 52 | -------------------------------------------------------------------------------- /Chapter10/10_deep_learning/04_framework_profile/tensorflow/RN50v1.5/scripts/benchmarking/baselines/DGX2_RN50_tensorflow_infer_fp32.json: -------------------------------------------------------------------------------- 1 | { 2 | "metric_keys": [ 3 | "total_ips" 4 | ], 5 | "metrics": { 6 | "1": { 7 | "16": { 8 | "total_ips": 800.0 9 | }, 10 | "32": { 11 | "total_ips": 920.0 12 | }, 13 | "1": { 14 | "total_ips": 150.0 15 | }, 16 | "2": { 17 | "total_ips": 270.0 18 | }, 19 | "64": { 20 | "total_ips": 1000.0 21 | }, 22 | "4": { 23 | "total_ips": 450.0 24 | }, 25 | "128": { 26 | "total_ips": 1075.0 27 | }, 28 | "8": { 29 | "total_ips": 650.0 30 | } 31 | } 32 | }, 33 | "model": "", 34 | "ngpus": [ 35 | 1 36 | ], 37 | "bs": [ 38 | 1, 39 | 2, 40 | 4, 41 | 8, 42 | 16, 43 | 32, 44 | 64, 45 | 128 46 | ] 47 | } 48 | -------------------------------------------------------------------------------- /Chapter10/10_deep_learning/04_framework_profile/tensorflow/RN50v1.5/scripts/benchmarking/baselines/DGX2_RN50_tensorflow_train_fp16.json: -------------------------------------------------------------------------------- 1 | { 2 | "metric_keys": [ 3 | "total_ips" 4 | ], 5 | "metrics": { 6 | "1": { 7 | "64": { 8 | "total_ips": 630.0 9 | }, 10 | "128": { 11 | "total_ips": 710.0 12 | }, 13 | "256": { 14 | "total_ips": 750.0 15 | } 16 | }, 17 | "4": { 18 | "64": { 19 | "total_ips": 2250.0 20 | }, 21 | "128": { 22 | "total_ips": 2600.0 23 | }, 24 | "256": { 25 | "total_ips": 2900.0 26 | } 27 | }, 28 | "8": { 29 | "64": { 30 | "total_ips": 4650.0 31 | }, 32 | "128": { 33 | "total_ips": 5500.0 34 | }, 35 | "256": { 36 | "total_ips": 6000.0 37 | } 38 | }, 39 | "16": { 40 | "64": { 41 | "total_ips": 9000.0 42 | }, 43 | "128": { 44 | "total_ips": 10500.0 45 | }, 46 | "256": { 47 | "total_ips": 11500.0 48 | } 49 | } 50 | }, 51 | "model": "", 52 | "ngpus": [ 53 | 1, 54 | 4, 55 | 8, 56 | 16 57 | ], 58 | "bs": [ 59 | 64, 60 | 128, 61 | 256 62 | ] 63 | } 64 | 
-------------------------------------------------------------------------------- /Chapter10/10_deep_learning/04_framework_profile/tensorflow/RN50v1.5/scripts/benchmarking/baselines/DGX2_RN50_tensorflow_train_fp32.json: -------------------------------------------------------------------------------- 1 | { 2 | "metric_keys": [ 3 | "total_ips" 4 | ], 5 | "metrics": { 6 | "1": { 7 | "32": { 8 | "total_ips": 300.0 9 | }, 10 | "64": { 11 | "total_ips": 330.0 12 | }, 13 | "128": { 14 | "total_ips": 350.0 15 | } 16 | }, 17 | "4": { 18 | "32": { 19 | "total_ips": 1050.0 20 | }, 21 | "64": { 22 | "total_ips": 1250.0 23 | }, 24 | "128": { 25 | "total_ips": 1350.0 26 | } 27 | }, 28 | "8": { 29 | "32": { 30 | "total_ips": 2100.0 31 | }, 32 | "64": { 33 | "total_ips": 2500.0 34 | }, 35 | "128": { 36 | "total_ips": 2700.0 37 | } 38 | }, 39 | "16": { 40 | "32": { 41 | "total_ips": 4100.0 42 | }, 43 | "64": { 44 | "total_ips": 5100.0 45 | }, 46 | "128": { 47 | "total_ips": 5500.0 48 | } 49 | } 50 | }, 51 | "model": "", 52 | "ngpus": [ 53 | 1, 54 | 4, 55 | 8, 56 | 16 57 | ], 58 | "bs": [ 59 | 32, 60 | 64, 61 | 128 62 | ] 63 | } 64 | -------------------------------------------------------------------------------- /Chapter10/10_deep_learning/04_framework_profile/tensorflow/RN50v1.5/scripts/docker/build.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | docker build . --rm -t rn50v15_tf 4 | -------------------------------------------------------------------------------- /Chapter10/10_deep_learning/04_framework_profile/tensorflow/RN50v1.5/scripts/docker/interactive.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | nvidia-docker run -it --rm --ipc=host --shm-size=1g --ulimit memlock=-1 --ulimit stack=67108864 -v $PWD:/workspace/rn50v15_tf/ rn50v15_tf bash 4 | -------------------------------------------------------------------------------- /Chapter10/10_deep_learning/04_framework_profile/tensorflow/RN50v1.5/utils/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | # Copyright 2018 The TensorFlow Authors. All Rights Reserved. 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 
17 | # ============================================================================== 18 | 19 | from utils import hooks 20 | 21 | from utils import var_storage 22 | 23 | from utils import cmdline_helper 24 | 25 | from utils import data_utils 26 | from utils import image_processing 27 | 28 | from utils import learning_rate 29 | 30 | from utils import dali_utils 31 | -------------------------------------------------------------------------------- /Chapter10/10_deep_learning/04_framework_profile/tensorflow/RN50v1.5/utils/hooks/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | from utils.hooks.training_hooks import * 5 | from utils.hooks.benchmark_hooks import * 6 | from utils.hooks.prefill_hook import * 7 | -------------------------------------------------------------------------------- /Chapter10/10_deep_learning/04_framework_profile/tensorflow/RN50v1.5/utils/hooks/prefill_hook.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | 4 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | 18 | import time 19 | 20 | import numpy as np 21 | import tensorflow as tf 22 | 23 | __all__ = ['PrefillStagingAreasHook'] 24 | 25 | 26 | class PrefillStagingAreasHook(tf.train.SessionRunHook): 27 | 28 | def after_create_session(self, session, coord): 29 | # TODO: This assumes TF collections are ordered; is this safe? 30 | enqueue_ops = tf.get_collection('STAGING_AREA_PUTS') 31 | for i in range(len(enqueue_ops)): 32 | session.run(enqueue_ops[:i + 1]) 33 | -------------------------------------------------------------------------------- /Chapter10/10_deep_learning/04_framework_profile/tensorflow/RN50v1.5/utils/hvd_utils.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 
17 | 18 | import os 19 | 20 | __all__ = ["is_using_hvd"] 21 | 22 | 23 | def is_using_hvd(): 24 | env_vars = ["OMPI_COMM_WORLD_RANK", "OMPI_COMM_WORLD_SIZE"] 25 | 26 | if all([var in os.environ for var in env_vars]): 27 | return True 28 | else: 29 | return False 30 | -------------------------------------------------------------------------------- /Chapter10/10_deep_learning/04_framework_profile/tensorflow/nsys-nvtx-2g.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | CODE_PATH="RN50v1.5" 4 | DATASET_PATH="/raid/datasets/imagenet/tfrecord" 5 | OUTPUT_NAME="resnet50_tf" 6 | 7 | # default profile: trace CUDA, NVTX, cuDNN, and cuBLAS for a 2-process (mpiexec -np 2) run; delay capture 40 s (-y), record 20 s (-d) 8 | docker run --rm -ti --runtime=nvidia \ 9 | -v $(pwd):/result \ 10 | -v $(pwd)/${CODE_PATH}:/workspace \ 11 | -v ${DATASET_PATH}:/imagenet \ 12 | --ipc host --net host \ 13 | nvcr.io/nvidia/tensorflow:19.08-py3 \ 14 | nsys profile -t cuda,nvtx,cudnn,cublas -o ${OUTPUT_NAME} -f true -w true -y 40 -d 20 \ 15 | mpiexec --allow-run-as-root --bind-to socket -np 2 \ 16 | python /workspace/main.py --data_dir=/imagenet --mode=training_benchmark --warmup_steps 200 \ 17 | --num_iter 500 --iter_unit batch --results_dir=results --batch_size 64 18 | -------------------------------------------------------------------------------- /Chapter10/10_deep_learning/04_framework_profile/tensorflow/nsys-nvtx.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | CODE_PATH="RN50v1.5" 4 | DATASET_PATH="/raid/datasets/imagenet/tfrecord" 5 | OUTPUT_NAME="resnet50_tf" 6 | 7 | # default profile: trace CUDA, NVTX, cuDNN, and cuBLAS for a single-process run; delay capture 40 s (-y), record 20 s (-d) 8 | docker run --rm -ti --runtime=nvidia \ 9 | -v $(pwd):/result \ 10 | -v $(pwd)/${CODE_PATH}:/workspace \ 11 | -v ${DATASET_PATH}:/imagenet \ 12 | nvcr.io/nvidia/tensorflow:19.08-py3 \ 13 | nsys profile -t cuda,nvtx,cudnn,cublas -o ${OUTPUT_NAME} -f true -w true -y 40 -d 20 \ 14 | python /workspace/main.py --data_dir=/imagenet --mode=training_benchmark --warmup_steps 200 \ 15 | --num_iter 500 --iter_unit batch --results_dir=results --batch_size 64 16 | -------------------------------------------------------------------------------- /Chapter10/10_deep_learning/Makefile: -------------------------------------------------------------------------------- 1 | # Project folders that contain CUDA recipes 2 | PROJECTS ?= $(shell find \ 3 | $(shell ls -d */) \ 4 | -name Makefile) 5 | 6 | %.ph_build: 7 | +@$(MAKE) -C $(dir $*) $(MAKECMDGOALS) 8 | 9 | %.ph_clean: 10 | +@$(MAKE) -C $(dir $*) clean $(USE_DEVICE) 11 | 12 | all: $(addsuffix .ph_build,$(PROJECTS)) 13 | @echo "Finished building CUDA Recipes" 14 | 15 | build: $(addsuffix .ph_build,$(PROJECTS)) 16 | 17 | tidy: 18 | @find * | egrep "#" | xargs rm -f 19 | @find * | egrep "\~" | xargs rm -f 20 | @find * | egrep "nvvp" | xargs rm -f 21 | 22 | .PHONY: clean 23 | clean: tidy $(addsuffix .ph_clean,$(PROJECTS)) 24 | 25 | test: 26 | echo $(DIRECTORY) 27 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 Packt 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do
so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | --------------------------------------------------------------------------------