├── .gitignore ├── Chapter01 └── 01_cuda_introduction │ ├── 01_hello_world │ ├── Makefile │ └── hello_world.cu │ ├── 02_vector_addition │ ├── Makefile │ ├── vector_addition.cu │ ├── vector_addition_gpu_block_only.cu │ ├── vector_addition_gpu_thread_block.cu │ └── vector_addition_gpu_thread_only.cu │ └── Makefile ├── Chapter02 └── 02_memory_overview │ ├── 01_sgemm │ ├── Makefile │ └── sgemm.cu │ ├── 02_vector_addition │ ├── Makefile │ └── vector_addition_gpu_thread_block.cu │ ├── 03_aos_soa │ ├── Makefile │ ├── aos.cu │ └── soa.cu │ ├── 04_matrix_transpose │ ├── Makefile │ ├── conflict_solved.cu │ └── matrix_transpose.cu │ ├── 05_image_scaling │ ├── Makefile │ ├── aerosmith-double.pgm │ ├── image_scaling.cu │ ├── scrImagePgmPpmPackage.cpp │ ├── scrImagePgmPpmPackage.h │ └── voyager2.pgm │ ├── 06_unified_memory │ ├── Makefile │ ├── unified_memory.cu │ ├── unified_memory_64align.cu │ ├── unified_memory_initialized.cu │ └── unified_memory_prefetch.cu │ └── Makefile ├── Chapter03 └── 03_cuda_thread_programming │ ├── 01_warp_and_thread_block │ ├── Makefile │ └── cuda_thread_block.cu │ ├── 02_cuda_occupancy │ ├── Makefile │ └── sgemm.cu │ ├── 03_threadsync_and_reduction │ ├── Makefile │ ├── reduction.h │ ├── reduction_global.cpp │ ├── reduction_global_kernel.cu │ ├── reduction_shared.cpp │ └── reduction_shared_kernel.cu │ ├── 04_performance_limiter │ ├── Makefile │ ├── reduction.h │ ├── reduction_shared.cpp │ ├── reduction_shared_kernel.cu │ └── sgemm.cu │ ├── 05_warp_divergence │ ├── Makefile │ ├── reduction.cpp │ ├── reduction.h │ ├── reduction_kernel_interleaving.cu │ └── reduction_kernel_sequential.cu │ ├── 06_limiter_balancing │ ├── Makefile │ ├── reduction.cpp │ ├── reduction.h │ ├── reduction_kernel.cu │ └── reduction_kernel_opt.cu │ ├── 07_warp_synchronous_programming │ ├── Makefile │ ├── reduction.cpp │ ├── reduction.h │ └── reduction_wp_kernel.cu │ ├── 08_cooperative_group │ ├── Makefile │ ├── reduction.cpp │ ├── reduction.h │ ├── reduction_cg_kernel.cu │ └── reduction_cg_shift_kernel.cu │ ├── 09_loop_unrolling │ ├── Makefile │ ├── reduction.cpp │ ├── reduction.h │ ├── reduction_cg_kernel.cu │ └── reduction_wp_kernel.cu │ ├── 10_atomic_operation │ ├── Makefile │ ├── reduction.cpp │ ├── reduction.h │ ├── reduction_blk_atmc_kernel.cu │ ├── reduction_kernel.cu │ └── reduction_wrp_atmc_kernel.cu │ ├── 11_mixed_precision_operation │ ├── Makefile │ ├── mixed_precision.cu │ ├── mixed_precision_half.cu │ ├── mixed_precision_int.cu │ └── util.cuh │ └── Makefile ├── Chapter04 └── 04_kernel_execution │ ├── 01_cuda_stream │ ├── 1_cuda_default_stream.cu │ ├── 2_cuda_multi_stream.cu │ ├── 3_cuda_multi_stream_with_sync.cu │ ├── 4_cuda_multi_stream_with_default.cu │ └── Makefile │ ├── 02_pipelining │ ├── Makefile │ └── cuda_pipelining.cu │ ├── 03_cuda_callback │ ├── Makefile │ └── cuda_callback.cu │ ├── 04_stream_priority │ ├── Makefile │ └── prioritized_cuda_stream.cu │ ├── 05_cuda_event │ ├── Makefile │ ├── cuda_event.cu │ └── cuda_event_with_streams.cu │ ├── 06_dynamic_parallelism │ ├── Makefile │ ├── dynamic_parallelism.cu │ └── recursion.cu │ ├── 07_grid_level_cg │ ├── Makefile │ ├── reduction.cpp │ ├── reduction.h │ └── reduction_kernel.cu │ ├── 08_openmp_cuda │ ├── Makefile │ ├── openmp.cu │ ├── openmp_default_stream.cu │ └── openmp_gpus.cu │ ├── 09_mps │ ├── Makefile │ ├── install_mpi.sh │ └── simpleMPI.cu │ ├── 10_kernel_execution_overhead │ ├── Makefile │ └── cuda_kernel.cu │ └── Makefile ├── Chapter05 └── 05_debug_profiling │ ├── .gitignore │ ├── 01_focused_profile │ ├── Makefile │ └── 
sgemm.cu │ ├── 02_nvtx │ ├── Makefile │ └── sgemm.cu │ ├── 03_cuda_error │ ├── Makefile │ └── sgemm.cu │ ├── 04_cuda_assert │ ├── Makefile │ └── sgemm.cu │ ├── 05_debug_with_vs │ ├── debug_vs.vcxproj │ └── simple_sgemm.cu │ ├── 06_debug_with_eclipse │ ├── .cproject │ ├── .gitignore │ ├── .project │ ├── .settings │ │ └── org.eclipse.ltk.core.refactoring.prefs │ ├── Debug │ │ ├── makefile │ │ ├── objects.mk │ │ ├── sources.mk │ │ └── src │ │ │ ├── simple_sgemm.d │ │ │ └── subdir.mk │ ├── java-7-install.md │ └── src │ │ └── simple_sgemm.cu │ ├── 07_debug_with_gdb │ ├── Makefile │ └── simple_sgemm.cu │ ├── 08_memcheck │ ├── Makefile │ ├── simple_sgemm.cu │ ├── simple_sgemm_mem_leak.cu │ └── simple_sgemm_oob.cu │ └── Makefile ├── Chapter06 └── 06_multigpu │ ├── 01_gaussian_single_gpu │ ├── config.h │ ├── cpuSolver.h │ ├── gaussian_sequential.cu │ ├── gaussian_single_gpu.cu │ ├── gaussian_single_gpu.h │ ├── gpuSolver.cu │ ├── gpuSolverFunctions.cu │ ├── linearSystemOps.cu │ └── linearSystemOps.h │ ├── 02_gaussian_multi_gpu │ ├── Makefile │ ├── config.h │ ├── gaussian_multi_gpu_p2p.cu │ ├── gaussian_multi_gpu_p2p.h │ ├── gpuSolver.cu │ ├── gpuSolverFunctions.cu │ ├── linearSystemOps.cu │ └── utilities.cu │ ├── 03_helloWorldMPI │ └── helloWorldMPI.c │ ├── 04_gaussian_multi_node │ ├── Makefile │ ├── config.h │ ├── elementUtilities.cu │ ├── gaussian_multi_gpu_rdma.c │ ├── gaussian_multi_gpu_rdma.h │ ├── gpuSolver.cu │ ├── gpuSolverFunctions.cu │ ├── linearSystemOps.c │ ├── mpiUtils.h │ └── utilities.cu │ ├── 05_streams │ ├── Makefile │ ├── cat.pgm │ ├── dog.pgm │ ├── image_merging.cu │ ├── scrImagePgmPpmPackage.cu │ ├── scrImagePgmPpmPackage.h │ └── vector_addition.cu │ ├── 06_nccl │ ├── Makefile │ └── nccl.cu │ └── Makefile ├── Chapter07 └── 07_parallel_programming_pattern │ ├── 01_sgemm_optimization │ ├── Makefile │ └── sgemm.cu │ ├── 02_convolution │ ├── Makefile │ └── convolution.cu │ ├── 03_scan │ ├── Makefile │ ├── scan.cu │ ├── scan.h │ ├── scan_v1.cu │ ├── scan_v2.cu │ └── utils.h │ ├── 04_pack_n_split │ ├── Makefile │ └── pack_n_split.cu │ ├── 05_n-body │ ├── Makefile │ ├── n_body.cu │ └── n_body.h │ ├── 06_quicksort │ ├── Makefile │ └── quick_sort.cu │ ├── 07_radixsort │ ├── Makefile │ ├── radix_warp_sort.cu │ └── thrust_radix_sort.cu │ ├── 08_histogram │ ├── Makefile │ ├── aerosmith-double.pgm │ ├── image_histogram.cu │ ├── scrImagePgmPpmPackage.cpp │ └── scrImagePgmPpmPackage.h │ └── Makefile ├── Chapter08 └── 08_cuda_libs_and_other_languages │ ├── 01_sgemm │ ├── Makefile │ ├── cublasSgemm.cpp │ ├── cublasSgemm_async.cpp │ └── cublasXtSgemm.cpp │ ├── 02_sgemm_mixed_precision │ ├── Makefile │ ├── cublasGemmEx.cu │ └── helper.cuh │ ├── 03_curand │ ├── Makefile │ ├── curand_device.cu │ ├── curand_host.cpp │ ├── fp16.cu │ ├── fp16.cuh │ └── gemm_with_curand_host.cpp │ ├── 04_cufft │ ├── Makefile │ ├── complex.cu │ ├── cufft.1d.cpp │ ├── cufft.half.cpp │ ├── cufft.mgpu.cu │ ├── fp16.cu │ ├── fp16.cuh │ └── helper.cuh │ ├── 05_npp │ ├── Makefile │ ├── flower.jpg │ ├── imageFilter.cpp │ ├── output.jpg │ └── statisticsNPP.cpp │ ├── 06_opencv │ ├── Makefile │ ├── blur.cpp │ ├── blur_cuvid.cpp │ ├── blur_stream.cpp │ ├── flower.JPG │ ├── install_opencv.sh │ └── test.cpp │ ├── 07_python_cuda │ ├── cupy_op.py │ ├── numba_matmul.py │ ├── numba_saxpy.py │ ├── pycuda_matmul.py │ └── pycuda_matmul_simple.py │ ├── 08_nvblas │ ├── exec_fft.m │ ├── fft.R │ ├── nvblas.conf │ ├── sgemm.R │ └── sgemm.m │ ├── 09_matlab │ ├── cuda.m │ └── host.m │ └── Makefile ├── Chapter09 └── 09_openacc │ ├── Makefile │ ├── 
cat.pgm │ ├── dog.pgm │ ├── image_merging.cpp │ ├── scrImagePgmPpmPackage.cpp │ └── scrImagePgmPpmPackage.h ├── Chapter10 └── 10_deep_learning │ ├── 01_ann │ ├── Makefile │ ├── ann.vcxproj │ ├── download_mnist.bat │ ├── download_mnist.sh │ ├── src │ │ ├── blob.h │ │ ├── helper.h │ │ ├── layer.cu │ │ ├── layer.h │ │ ├── loss.cu │ │ ├── loss.h │ │ ├── mnist.cpp │ │ ├── mnist.h │ │ ├── network.cpp │ │ └── network.h │ └── train.cpp │ ├── 02_cnn │ ├── Makefile │ ├── cnn.vcxproj │ ├── download_mnist.bat │ ├── download_mnist.sh │ ├── src │ │ ├── blob.h │ │ ├── helper.h │ │ ├── layer.cu │ │ ├── layer.h │ │ ├── loss.cu │ │ ├── loss.h │ │ ├── mnist.cpp │ │ ├── mnist.h │ │ ├── network.cpp │ │ └── network.h │ └── train.cpp │ ├── 03_rnn │ ├── Makefile │ └── rnn.cpp │ ├── 04_framework_profile │ ├── pytorch │ │ ├── README.md │ │ ├── RN50v1.5 │ │ │ ├── GPU_1.log │ │ │ ├── README.md │ │ │ ├── examples │ │ │ │ ├── RN50_FP16_1GPU.sh │ │ │ │ ├── RN50_FP16_4GPU.sh │ │ │ │ ├── RN50_FP16_8GPU.sh │ │ │ │ ├── RN50_FP16_EVAL.sh │ │ │ │ ├── RN50_FP16_INFERENCE_BENCHMARK.sh │ │ │ │ ├── RN50_FP32_1GPU.sh │ │ │ │ ├── RN50_FP32_4GPU.sh │ │ │ │ ├── RN50_FP32_8GPU.sh │ │ │ │ ├── RN50_FP32_EVAL.sh │ │ │ │ └── RN50_FP32_INFERENCE_BENCHMARK.sh │ │ │ ├── image_classification │ │ │ │ ├── __init__.py │ │ │ │ ├── dataloaders.py │ │ │ │ ├── logger.py │ │ │ │ ├── mixup.py │ │ │ │ ├── resnet.py │ │ │ │ ├── smoothing.py │ │ │ │ ├── training.py │ │ │ │ └── utils.py │ │ │ ├── img │ │ │ │ ├── .gitkeep │ │ │ │ ├── DGX2_250_loss.png │ │ │ │ ├── DGX2_250_top1.png │ │ │ │ ├── DGX2_250_top5.png │ │ │ │ ├── training_accuracy.png │ │ │ │ ├── training_loss.png │ │ │ │ └── validation_accuracy.png │ │ │ ├── main.py │ │ │ ├── multiproc.py │ │ │ ├── resnet50_pyt.qdrep │ │ │ ├── resnet50_pyt_2g.qdrep │ │ │ ├── resnet50v1.5 │ │ │ │ ├── README.md │ │ │ │ └── training │ │ │ │ │ ├── DGX1_RN50_FP16_250E.sh │ │ │ │ │ ├── DGX1_RN50_FP16_50E.sh │ │ │ │ │ ├── DGX1_RN50_FP16_90E.sh │ │ │ │ │ ├── DGX1_RN50_FP32_250E.sh │ │ │ │ │ ├── DGX1_RN50_FP32_50E.sh │ │ │ │ │ ├── DGX1_RN50_FP32_90E.sh │ │ │ │ │ ├── DGX2_RN50_FP16_250E.sh │ │ │ │ │ ├── DGX2_RN50_FP16_50E.sh │ │ │ │ │ ├── DGX2_RN50_FP16_90E.sh │ │ │ │ │ ├── DGX2_RN50_FP32_250E.sh │ │ │ │ │ ├── DGX2_RN50_FP32_50E.sh │ │ │ │ │ └── DGX2_RN50_FP32_90E.sh │ │ │ └── test.qdrep │ │ └── nsys-nvtx.sh │ └── tensorflow │ │ ├── RN50v1.5 │ │ ├── .gitignore │ │ ├── .style.yapf │ │ ├── README.md │ │ ├── baseline.qdrep │ │ ├── dllogger │ │ │ ├── __init__.py │ │ │ ├── autologging.py │ │ │ ├── logger.py │ │ │ └── tags.py │ │ ├── main.py │ │ ├── model │ │ │ ├── __init__.py │ │ │ ├── blocks │ │ │ │ ├── __init__.py │ │ │ │ ├── conv2d_block.py │ │ │ │ └── resnet_bottleneck_block.py │ │ │ ├── layers │ │ │ │ ├── __init__.py │ │ │ │ ├── activation.py │ │ │ │ ├── conv2d.py │ │ │ │ ├── dense.py │ │ │ │ ├── math_ops.py │ │ │ │ ├── normalization.py │ │ │ │ ├── padding.py │ │ │ │ └── pooling.py │ │ │ └── resnet_v1_5.py │ │ ├── requirements.txt │ │ ├── results │ │ │ ├── events.out.tfevents.1566195554.5b8c84c05f4e │ │ │ ├── model.ckpt-1000.index │ │ │ └── model.ckpt-2000.data-00001-of-00002 │ │ ├── runtime │ │ │ ├── __init__.py │ │ │ ├── runner.py │ │ │ └── runner_utils.py │ │ ├── scripts │ │ │ ├── RN50_FP16_16GPU.sh │ │ │ ├── RN50_FP16_1GPU.sh │ │ │ ├── RN50_FP16_4GPU.sh │ │ │ ├── RN50_FP16_8GPU.sh │ │ │ ├── RN50_FP16_EVAL.sh │ │ │ ├── RN50_FP32_16GPU.sh │ │ │ ├── RN50_FP32_1GPU.sh │ │ │ ├── RN50_FP32_4GPU.sh │ │ │ ├── RN50_FP32_8GPU.sh │ │ │ ├── RN50_FP32_EVAL.sh │ │ │ ├── benchmarking │ │ │ │ ├── DGX1V_inferbench_fp16.sh │ │ │ │ ├── 
DGX1V_inferbench_fp32.sh │ │ │ │ ├── DGX1V_trainbench_fp16.sh │ │ │ │ ├── DGX1V_trainbench_fp32.sh │ │ │ │ ├── DGX2_inferbench_fp16.sh │ │ │ │ ├── DGX2_inferbench_fp32.sh │ │ │ │ ├── DGX2_trainbench_fp16.sh │ │ │ │ ├── DGX2_trainbench_fp32.sh │ │ │ │ ├── baselines │ │ │ │ │ ├── DGX1V_RN50_tensorflow_infer_fp16.json │ │ │ │ │ ├── DGX1V_RN50_tensorflow_infer_fp32.json │ │ │ │ │ ├── DGX1V_RN50_tensorflow_train_fp16.json │ │ │ │ │ ├── DGX1V_RN50_tensorflow_train_fp32.json │ │ │ │ │ ├── DGX2_RN50_tensorflow_infer_fp16.json │ │ │ │ │ ├── DGX2_RN50_tensorflow_infer_fp32.json │ │ │ │ │ ├── DGX2_RN50_tensorflow_train_fp16.json │ │ │ │ │ └── DGX2_RN50_tensorflow_train_fp32.json │ │ │ │ └── benchmark.py │ │ │ └── docker │ │ │ │ ├── build.sh │ │ │ │ └── interactive.sh │ │ └── utils │ │ │ ├── __init__.py │ │ │ ├── cmdline_helper.py │ │ │ ├── dali_utils.py │ │ │ ├── data_utils.py │ │ │ ├── hooks │ │ │ ├── __init__.py │ │ │ ├── benchmark_hooks.py │ │ │ ├── prefill_hook.py │ │ │ └── training_hooks.py │ │ │ ├── hvd_utils.py │ │ │ ├── image_processing.py │ │ │ ├── learning_rate.py │ │ │ ├── optimizers.py │ │ │ └── var_storage.py │ │ ├── nsys-nvtx-2g.sh │ │ └── nsys-nvtx.sh │ └── Makefile ├── LICENSE └── README.md /.gitignore: -------------------------------------------------------------------------------- 1 | # Ignore all 2 | * 3 | 4 | # Unignore all with extensions 5 | !*.* 6 | 7 | # Unignore all dirs 8 | !*/ 9 | 10 | ### Above combination will ignore all files without extension ### 11 | 12 | *.o 13 | *.nvvp 14 | *.pyc 15 | .DS_Store 16 | __pycache__ 17 | 18 | !Makefile 19 | -------------------------------------------------------------------------------- /Chapter01/01_cuda_introduction/01_hello_world/Makefile: -------------------------------------------------------------------------------- 1 | CUDA_PATH=/usr/local/cuda 2 | HOST_COMPILER ?= g++ 3 | NVCC=${CUDA_PATH}/bin/nvcc -ccbin ${HOST_COMPILER} 4 | TARGET=hello_world 5 | 6 | INCLUDES= -I${CUDA_PATH}/samples/common/inc 7 | NVCC_FLAGS=-m64 -lineinfo 8 | 9 | IS_CUDA_11:=${shell expr `$(NVCC) --version | grep compilation | grep -Eo -m 1 '[0-9]+.[0-9]' | head -1` \>= 11.0} 10 | 11 | # Gencode argumentes 12 | SMS = 35 37 50 52 60 61 70 75 13 | ifeq "$(IS_CUDA_11)" "1" 14 | SMS = 52 60 61 70 75 80 15 | endif 16 | $(foreach sm, ${SMS}, $(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm))) 17 | 18 | hello_world: hello_world.cu 19 | ${NVCC} ${INCLUDES} ${ALL_CCFLAGS} ${GENCODE_FLAGS} -o $@ $< 20 | 21 | clean: 22 | rm -f ${TARGET} 23 | -------------------------------------------------------------------------------- /Chapter01/01_cuda_introduction/01_hello_world/hello_world.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | __global__ void print_from_gpu(void) { 5 | printf("Hello World! 
from thread [%d,%d] \ 6 | From device\n", threadIdx.x,blockIdx.x); 7 | } 8 | 9 | int main(void) { 10 | printf("Hello World from host!\n"); 11 | print_from_gpu<<<1,1>>>(); 12 | cudaDeviceSynchronize(); 13 | return 0; 14 | } 15 | 16 | -------------------------------------------------------------------------------- /Chapter01/01_cuda_introduction/02_vector_addition/Makefile: -------------------------------------------------------------------------------- 1 | CUDA_PATH=/usr/local/cuda 2 | HOST_COMPILER ?= g++ 3 | NVCC=${CUDA_PATH}/bin/nvcc -ccbin ${HOST_COMPILER} 4 | TARGET=vector_addition vector_addition_blocks vector_addition_threads vector_addition_threads_blocks 5 | 6 | INCLUDES= -I${CUDA_PATH}/samples/common/inc 7 | NVCC_FLAGS=-m64 -lineinfo 8 | 9 | IS_CUDA_11:=${shell expr `$(NVCC) --version | grep compilation | grep -Eo -m 1 '[0-9]+.[0-9]' | head -1` \>= 11.0} 10 | 11 | # Gencode argumentes 12 | SMS = 35 37 50 52 60 61 70 75 13 | ifeq "$(IS_CUDA_11)" "1" 14 | SMS = 52 60 61 70 75 80 15 | endif 16 | $(foreach sm, ${SMS}, $(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm))) 17 | 18 | all : ${TARGET} 19 | 20 | vector_addition: vector_addition.cu 21 | ${NVCC} ${INCLUDES} ${ALL_CCFLAGS} ${GENCODE_FLAGS} -o $@ $< 22 | 23 | vector_addition_blocks: vector_addition_gpu_block_only.cu 24 | ${NVCC} ${INCLUDES} ${ALL_CCFLAGS} ${GENCODE_FLAGS} -o $@ $< 25 | 26 | vector_addition_threads: vector_addition_gpu_thread_only.cu 27 | ${NVCC} ${INCLUDES} ${ALL_CCFLAGS} ${GENCODE_FLAGS} -o $@ $< 28 | 29 | vector_addition_threads_blocks: vector_addition_gpu_thread_block.cu 30 | ${NVCC} ${INCLUDES} ${ALL_CCFLAGS} ${GENCODE_FLAGS} -o $@ $< 31 | 32 | clean: 33 | rm -f ${TARGET} 34 | -------------------------------------------------------------------------------- /Chapter01/01_cuda_introduction/02_vector_addition/vector_addition.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #define N 512 5 | 6 | void host_add(int *a, int *b, int *c) { 7 | for(int idx=0;idx 2 | #include 3 | 4 | #define N 512 5 | 6 | void host_add(int *a, int *b, int *c) { 7 | for(int idx=0;idx>>(d_a,d_b,d_c); 49 | 50 | // Copy result back to host 51 | cudaMemcpy(c, d_c, size, cudaMemcpyDeviceToHost); 52 | 53 | print_output(a,b,c); 54 | 55 | free(a); free(b); free(c); 56 | cudaFree(d_a); cudaFree(d_b); cudaFree(d_c); 57 | 58 | 59 | 60 | return 0; 61 | } 62 | -------------------------------------------------------------------------------- /Chapter01/01_cuda_introduction/02_vector_addition/vector_addition_gpu_thread_only.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #define N 512 5 | 6 | void host_add(int *a, int *b, int *c) { 7 | for(int idx=0;idx>>(d_a,d_b,d_c); 49 | 50 | // Copy result back to host 51 | cudaMemcpy(c, d_c, size, cudaMemcpyDeviceToHost); 52 | 53 | print_output(a,b,c); 54 | 55 | free(a); free(b); free(c); 56 | cudaFree(d_a); cudaFree(d_b); cudaFree(d_c); 57 | 58 | 59 | 60 | return 0; 61 | } 62 | -------------------------------------------------------------------------------- /Chapter01/01_cuda_introduction/Makefile: -------------------------------------------------------------------------------- 1 | # Project folders that contain CUDA receipts 2 | PROJECTS ?= $(shell find \ 3 | $(shell ls -d */) \ 4 | -name Makefile) 5 | 6 | %.ph_build: 7 | +@$(MAKE) -C $(dir $*) $(MAKECMDGOALS) 8 | 9 | %.ph_clean: 10 | +@$(MAKE) -C $(dir $*) clean $(USE_DEVICE) 11 | 12 | all: $(addsuffix 
.ph_build,$(PROJECTS)) 13 | @echo "Finished building CUDA Receipts" 14 | 15 | build: $(addsuffix .ph_build,$(PROJECTS)) 16 | 17 | tidy: 18 | @find * | egrep "#" | xargs rm -f 19 | @find * | egrep "\~" | xargs rm -f 20 | @find * | egrep "nvvp" | xargs rm -f 21 | 22 | .PHONY: clean 23 | clean: tidy $(addsuffix .ph_clean,$(PROJECTS)) 24 | 25 | test: 26 | echo $(DIRECTORY) 27 | -------------------------------------------------------------------------------- /Chapter02/02_memory_overview/01_sgemm/Makefile: -------------------------------------------------------------------------------- 1 | CUDA_PATH=/usr/local/cuda 2 | HOST_COMPILER ?= g++ 3 | NVCC=${CUDA_PATH}/bin/nvcc -ccbin ${HOST_COMPILER} 4 | TARGET=sgemm 5 | 6 | INCLUDES= -I${CUDA_PATH}/samples/common/inc 7 | NVCC_FLAGS=-m64 -lineinfo 8 | 9 | IS_CUDA_11:=${shell expr `$(NVCC) --version | grep compilation | grep -Eo -m 1 '[0-9]+.[0-9]' | head -1` \>= 11.0} 10 | 11 | # Gencode argumentes 12 | SMS = 35 37 50 52 60 61 70 75 13 | ifeq "$(IS_CUDA_11)" "1" 14 | SMS = 52 60 61 70 75 80 15 | endif 16 | $(foreach sm, ${SMS}, $(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm))) 17 | 18 | all : ${TARGET} 19 | 20 | sgemm: sgemm.cu 21 | ${NVCC} ${INCLUDES} ${ALL_CCFLAGS} ${GENCODE_FLAGS} -o $@ $< 22 | 23 | clean: 24 | rm -f ${TARGET} 25 | -------------------------------------------------------------------------------- /Chapter02/02_memory_overview/02_vector_addition/Makefile: -------------------------------------------------------------------------------- 1 | CUDA_PATH=/usr/local/cuda 2 | HOST_COMPILER ?= g++ 3 | NVCC=${CUDA_PATH}/bin/nvcc -ccbin ${HOST_COMPILER} 4 | TARGET=vector_addition_gpu_thread_block 5 | 6 | INCLUDES= -I${CUDA_PATH}/samples/common/inc 7 | NVCC_FLAGS=-m64 -lineinfo 8 | 9 | IS_CUDA_11:=${shell expr `$(NVCC) --version | grep compilation | grep -Eo -m 1 '[0-9]+.[0-9]' | head -1` \>= 11.0} 10 | 11 | # Gencode argumentes 12 | SMS = 35 37 50 52 60 61 70 75 13 | ifeq "$(IS_CUDA_11)" "1" 14 | SMS = 52 60 61 70 75 80 15 | endif 16 | $(foreach sm, ${SMS}, $(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm))) 17 | 18 | all : ${TARGET} 19 | 20 | vector_addition_gpu_thread_block: vector_addition_gpu_thread_block.cu 21 | ${NVCC} ${INCLUDES} ${ALL_CCFLAGS} ${GENCODE_FLAGS} -o $@ $< 22 | 23 | clean: 24 | rm -f ${TARGET} 25 | -------------------------------------------------------------------------------- /Chapter02/02_memory_overview/03_aos_soa/Makefile: -------------------------------------------------------------------------------- 1 | CUDA_PATH=/usr/local/cuda 2 | HOST_COMPILER ?= g++ 3 | NVCC=${CUDA_PATH}/bin/nvcc -ccbin ${HOST_COMPILER} 4 | TARGET=aos soa 5 | 6 | INCLUDES= -I${CUDA_PATH}/samples/common/inc 7 | NVCC_FLAGS=-m64 -lineinfo 8 | 9 | IS_CUDA_11:=${shell expr `$(NVCC) --version | grep compilation | grep -Eo -m 1 '[0-9]+.[0-9]' | head -1` \>= 11.0} 10 | 11 | # Gencode argumentes 12 | SMS = 35 37 50 52 60 61 70 75 13 | ifeq "$(IS_CUDA_11)" "1" 14 | SMS = 52 60 61 70 75 80 15 | endif 16 | $(foreach sm, ${SMS}, $(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm))) 17 | 18 | all : ${TARGET} 19 | 20 | aos: aos.cu 21 | ${NVCC} ${INCLUDES} ${ALL_CCFLAGS} ${GENCODE_FLAGS} -o $@ $< 22 | 23 | soa: soa.cu 24 | ${NVCC} ${INCLUDES} ${ALL_CCFLAGS} ${GENCODE_FLAGS} -o $@ $< 25 | 26 | clean: 27 | rm -f ${TARGET} 28 | -------------------------------------------------------------------------------- /Chapter02/02_memory_overview/03_aos_soa/aos.cu: 
-------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | 7 | #define NUM_THREADS 256 8 | 9 | #define IMG_SIZE 1048576 10 | 11 | // Coefficients with Array of Structure 12 | struct Coefficients_AOS { 13 | int r; 14 | int b; 15 | int g; 16 | int hue; 17 | int saturation; 18 | int maxVal; 19 | int minVal; 20 | int finalVal; 21 | }; 22 | 23 | 24 | __global__ 25 | void complicatedCalculation(Coefficients_AOS* data) 26 | { 27 | int i = blockIdx.x*blockDim.x + threadIdx.x; 28 | 29 | 30 | int grayscale = (data[i].r + data[i].g + data[i].b)/data[i].maxVal; 31 | int hue_sat = data[i].hue * data[i].saturation / data[i].minVal; 32 | data[i].finalVal = grayscale*hue_sat; 33 | } 34 | 35 | void complicatedCalculation() 36 | { 37 | 38 | Coefficients_AOS* d_x; 39 | 40 | cudaMalloc(&d_x, IMG_SIZE*sizeof(Coefficients_AOS)); 41 | 42 | int num_blocks = IMG_SIZE/NUM_THREADS; 43 | 44 | complicatedCalculation<<>>(d_x); 45 | 46 | cudaFree(d_x); 47 | } 48 | 49 | 50 | 51 | int main(int argc, char*argv[]) 52 | { 53 | 54 | complicatedCalculation(); 55 | return 0; 56 | } 57 | 58 | 59 | 60 | 61 | 62 | 63 | -------------------------------------------------------------------------------- /Chapter02/02_memory_overview/03_aos_soa/soa.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | 7 | #define NUM_THREADS 256 8 | 9 | #define IMG_SIZE 1048576 10 | 11 | // Coefficients with Structure of Array 12 | struct Coefficients_SOA { 13 | int* r; 14 | int* b; 15 | int* g; 16 | int* hue; 17 | int* saturation; 18 | int* maxVal; 19 | int* minVal; 20 | int* finalVal; 21 | }; 22 | 23 | 24 | __global__ 25 | void complicatedCalculation(Coefficients_SOA data) 26 | { 27 | int i = blockIdx.x*blockDim.x + threadIdx.x; 28 | int grayscale = (data.r[i] + data.g[i] + data.b[i])/data.maxVal[i]; 29 | int hue_sat = data.hue[i] * data.saturation[i] / data.minVal[i]; 30 | 31 | data.finalVal[i] = grayscale*hue_sat; 32 | } 33 | 34 | void complicatedCalculation() 35 | { 36 | 37 | Coefficients_SOA d_x; 38 | 39 | cudaMalloc(&d_x.r, IMG_SIZE*sizeof(int)); 40 | cudaMalloc(&d_x.g, IMG_SIZE*sizeof(int)); 41 | cudaMalloc(&d_x.b, IMG_SIZE*sizeof(int)); 42 | cudaMalloc(&d_x.hue, IMG_SIZE*sizeof(int)); 43 | cudaMalloc(&d_x.saturation, IMG_SIZE*sizeof(int)); 44 | cudaMalloc(&d_x.maxVal, IMG_SIZE*sizeof(int)); 45 | cudaMalloc(&d_x.minVal, IMG_SIZE*sizeof(int)); 46 | cudaMalloc(&d_x.finalVal, IMG_SIZE*sizeof(int)); 47 | 48 | int num_blocks = IMG_SIZE/NUM_THREADS; 49 | 50 | complicatedCalculation<<>>(d_x); 51 | 52 | cudaFree(d_x.r); 53 | cudaFree(d_x.g); 54 | cudaFree(d_x.b); 55 | cudaFree(d_x.hue); 56 | cudaFree(d_x.saturation); 57 | cudaFree(d_x.maxVal); 58 | cudaFree(d_x.maxVal); 59 | cudaFree(d_x.minVal); 60 | cudaFree(d_x.finalVal); 61 | } 62 | 63 | 64 | 65 | int main(int argc, char*argv[]) 66 | { 67 | 68 | complicatedCalculation(); 69 | return 0; 70 | } 71 | 72 | 73 | 74 | 75 | 76 | 77 | -------------------------------------------------------------------------------- /Chapter02/02_memory_overview/04_matrix_transpose/Makefile: -------------------------------------------------------------------------------- 1 | CUDA_PATH=/usr/local/cuda 2 | HOST_COMPILER ?= g++ 3 | NVCC=${CUDA_PATH}/bin/nvcc -ccbin ${HOST_COMPILER} 4 | TARGET=matrix_transpose conflict_solved 5 | 6 | INCLUDES= -I${CUDA_PATH}/samples/common/inc 7 | NVCC_FLAGS=-m64 -lineinfo 8 | 9 | IS_CUDA_11:=${shell expr `$(NVCC) --version | 
grep compilation | grep -Eo -m 1 '[0-9]+.[0-9]' | head -1` \>= 11.0} 10 | 11 | # Gencode argumentes 12 | SMS = 35 37 50 52 60 61 70 75 13 | ifeq "$(IS_CUDA_11)" "1" 14 | SMS = 52 60 61 70 75 80 15 | endif 16 | $(foreach sm, ${SMS}, $(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm))) 17 | 18 | all : ${TARGET} 19 | 20 | matrix_transpose: matrix_transpose.cu 21 | ${NVCC} ${INCLUDES} ${ALL_CCFLAGS} ${GENCODE_FLAGS} -o $@ $< 22 | 23 | conflict_solved: conflict_solved.cu 24 | ${NVCC} ${INCLUDES} ${ALL_CCFLAGS} ${GENCODE_FLAGS} -o $@ $< 25 | 26 | clean: 27 | rm -f ${TARGET} 28 | -------------------------------------------------------------------------------- /Chapter02/02_memory_overview/05_image_scaling/Makefile: -------------------------------------------------------------------------------- 1 | CUDA_PATH=/usr/local/cuda 2 | HOST_COMPILER ?= g++ 3 | NVCC=${CUDA_PATH}/bin/nvcc -ccbin ${HOST_COMPILER} 4 | TARGET=image_scaling 5 | 6 | INCLUDES= -I${CUDA_PATH}/samples/common/inc 7 | NVCC_FLAGS=-m64 -lineinfo 8 | 9 | IS_CUDA_11:=${shell expr `$(NVCC) --version | grep compilation | grep -Eo -m 1 '[0-9]+.[0-9]' | head -1` \>= 11.0} 10 | 11 | # Gencode argumentes 12 | SMS = 35 37 50 52 60 61 70 75 13 | ifeq "$(IS_CUDA_11)" "1" 14 | SMS = 52 60 61 70 75 80 15 | endif 16 | $(foreach sm, ${SMS}, $(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm))) 17 | 18 | all : ${TARGET} 19 | 20 | INCS = scrImagePgmPpmPackage.h 21 | 22 | scrImagePgmPpmPackage.o: scrImagePgmPpmPackage.cpp ${INCS} 23 | $(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $< 24 | 25 | image_scaling.o: image_scaling.cu ${INCS} 26 | $(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $< 27 | 28 | image_scaling: scrImagePgmPpmPackage.o image_scaling.o 29 | $(EXEC) $(NVCC) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ $+ 30 | 31 | clean: 32 | rm -f ${TARGET} 33 | -------------------------------------------------------------------------------- /Chapter02/02_memory_overview/05_image_scaling/aerosmith-double.pgm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Learn-CUDA-Programming/10bd9b59bfcc680d608ea1417da5089f73d91b11/Chapter02/02_memory_overview/05_image_scaling/aerosmith-double.pgm -------------------------------------------------------------------------------- /Chapter02/02_memory_overview/05_image_scaling/scrImagePgmPpmPackage.h: -------------------------------------------------------------------------------- 1 | 2 | #include 3 | 4 | 5 | int scr_read_pgm( char* name, unsigned char* image, int irows, int icols ); 6 | void scr_write_pgm( char* name, unsigned char* image, int rows, int cols, char* comment ); 7 | int scr_read_ppm( char* name, unsigned char* image, int irows, int icols ); 8 | void scr_write_ppm( char* name, unsigned char* image, int rows, int cols, char* comment ); 9 | void get_PgmPpmParams(char * , int *, int *); 10 | void getout_comment(FILE * ); 11 | -------------------------------------------------------------------------------- /Chapter02/02_memory_overview/05_image_scaling/voyager2.pgm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Learn-CUDA-Programming/10bd9b59bfcc680d608ea1417da5089f73d91b11/Chapter02/02_memory_overview/05_image_scaling/voyager2.pgm -------------------------------------------------------------------------------- /Chapter02/02_memory_overview/06_unified_memory/Makefile: 
-------------------------------------------------------------------------------- 1 | CUDA_PATH=/usr/local/cuda 2 | HOST_COMPILER ?= g++ 3 | NVCC=${CUDA_PATH}/bin/nvcc -ccbin ${HOST_COMPILER} 4 | TARGET=unified_simple unified_initialized unified_prefetch unified_64align 5 | 6 | INCLUDES= -I${CUDA_PATH}/samples/common/inc 7 | NVCC_FLAGS=-m64 -lineinfo 8 | 9 | IS_CUDA_11:=${shell expr `$(NVCC) --version | grep compilation | grep -Eo -m 1 '[0-9]+.[0-9]' | head -1` \>= 11.0} 10 | 11 | # Gencode argumentes 12 | SMS = 35 37 50 52 60 61 70 75 13 | ifeq "$(IS_CUDA_11)" "1" 14 | SMS = 52 60 61 70 75 80 15 | endif 16 | $(foreach sm, ${SMS}, $(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm))) 17 | 18 | all : ${TARGET} 19 | 20 | unified_simple: unified_memory.cu 21 | ${NVCC} ${INCLUDES} ${ALL_CCFLAGS} ${GENCODE_FLAGS} -o $@.out $< 22 | 23 | unified_initialized: unified_memory_initialized.cu 24 | ${NVCC} ${INCLUDES} ${ALL_CCFLAGS} ${GENCODE_FLAGS} -o $@.out $< 25 | 26 | unified_prefetch: unified_memory_prefetch.cu 27 | ${NVCC} ${INCLUDES} ${ALL_CCFLAGS} ${GENCODE_FLAGS} -o $@.out $< 28 | 29 | unified_64align: unified_memory_64align.cu 30 | ${NVCC} ${INCLUDES} ${ALL_CCFLAGS} ${GENCODE_FLAGS} -o $@.out $< 31 | 32 | clean: 33 | rm -f *.out 34 | -------------------------------------------------------------------------------- /Chapter02/02_memory_overview/06_unified_memory/unified_memory.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | // CUDA kernel to add elements of two arrays 5 | __global__ 6 | void add(int n, float *x, float *y) 7 | { 8 | int index = blockIdx.x * blockDim.x + threadIdx.x; 9 | int stride = blockDim.x * gridDim.x; 10 | for (int i = index; i < n; i += stride) 11 | y[i] = x[i] + y[i]; 12 | } 13 | 14 | int main(void) 15 | { 16 | int N = 1<<20; 17 | float *x, *y; 18 | 19 | // Allocate Unified Memory -- accessible from CPU or GPU 20 | cudaMallocManaged(&x, N*sizeof(float)); 21 | cudaMallocManaged(&y, N*sizeof(float)); 22 | 23 | // initialize x and y arrays on the host 24 | for (int i = 0; i < N; i++) { 25 | x[i] = 1.0f; 26 | y[i] = 2.0f; 27 | } 28 | 29 | // Launch kernel on 1M elements on the GPU 30 | int blockSize = 256; 31 | int numBlocks = (N + blockSize - 1) / blockSize; 32 | add<<>>(N, x, y); 33 | 34 | // Wait for GPU to finish before accessing on host 35 | cudaDeviceSynchronize(); 36 | 37 | // Check for errors (all values should be 3.0f) 38 | float maxError = 0.0f; 39 | for (int i = 0; i < N; i++) 40 | maxError = fmax(maxError, fabs(y[i]-3.0f)); 41 | std::cout << "Max error: " << maxError << std::endl; 42 | 43 | // Free memory 44 | cudaFree(x); 45 | cudaFree(y); 46 | 47 | return 0; 48 | } 49 | -------------------------------------------------------------------------------- /Chapter02/02_memory_overview/06_unified_memory/unified_memory_initialized.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | __global__ void init(int n, float *x, float *y) { 5 | int index = threadIdx.x + blockIdx.x * blockDim.x; 6 | int stride = blockDim.x * gridDim.x; 7 | for (int i = index; i < n; i += stride) { 8 | x[i] = 1.0f; 9 | y[i] = 2.0f; 10 | } 11 | } 12 | 13 | // CUDA kernel to add elements of two arrays 14 | __global__ 15 | void add(int n, float *x, float *y) 16 | { 17 | int index = blockIdx.x * blockDim.x + threadIdx.x; 18 | int stride = blockDim.x * gridDim.x; 19 | for (int i = index; i < n; i += stride) 20 | y[i] = x[i] + y[i]; 21 | } 22 | 23 | int 
main(void) 24 | { 25 | int N = 1<<20; 26 | float *x, *y; 27 | 28 | // Allocate Unified Memory -- accessible from CPU or GPU 29 | cudaMallocManaged(&x, N*sizeof(float)); 30 | cudaMallocManaged(&y, N*sizeof(float)); 31 | 32 | // Launch kernel on 1M elements on the GPU 33 | int blockSize = 256; 34 | int numBlocks = (N + blockSize - 1) / blockSize; 35 | init<<>>(N, x, y); 36 | add<<>>(N, x, y); 37 | 38 | // Wait for GPU to finish before accessing on host 39 | cudaDeviceSynchronize(); 40 | 41 | // Check for errors (all values should be 3.0f) 42 | float maxError = 0.0f; 43 | for (int i = 0; i < N; i++) 44 | maxError = fmax(maxError, fabs(y[i]-3.0f)); 45 | std::cout << "Max error: " << maxError << std::endl; 46 | 47 | // Free memory 48 | cudaFree(x); 49 | cudaFree(y); 50 | 51 | return 0; 52 | } 53 | -------------------------------------------------------------------------------- /Chapter02/02_memory_overview/06_unified_memory/unified_memory_prefetch.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | // CUDA kernel to add elements of two arrays 5 | __global__ 6 | void add(int n, float *x, float *y) 7 | { 8 | int index = blockIdx.x * blockDim.x + threadIdx.x; 9 | int stride = blockDim.x * gridDim.x; 10 | for (int i = index; i < n; i += stride) 11 | y[i] = x[i] + y[i]; 12 | } 13 | 14 | int main(void) 15 | { 16 | int N = 1<<20; 17 | float *x, *y; 18 | int device = -1; 19 | 20 | // Allocate Unified Memory -- accessible from CPU or GPU 21 | cudaMallocManaged(&x, N*sizeof(float)); 22 | cudaMallocManaged(&y, N*sizeof(float)); 23 | 24 | // initialize x and y arrays on the host 25 | for (int i = 0; i < N; i++) { 26 | x[i] = 1.0f; 27 | y[i] = 2.0f; 28 | } 29 | 30 | cudaGetDevice(&device); 31 | // GPU prefetches unified memory memory 32 | cudaMemPrefetchAsync(x, N*sizeof(float), device, NULL); 33 | cudaMemPrefetchAsync(y, N*sizeof(float), device, NULL); 34 | 35 | // Launch kernel on 1M elements on the GPU 36 | int blockSize = 256; 37 | int numBlocks = (N + blockSize - 1) / blockSize; 38 | add<<>>(N, x, y); 39 | // Host prefecthes Memory 40 | cudaMemPrefetchAsync(y, N*sizeof(float), cudaCpuDeviceId, NULL); 41 | // Wait for GPU to finish before accessing on host 42 | cudaDeviceSynchronize(); 43 | 44 | // Check for errors (all values should be 3.0f) 45 | float maxError = 0.0f; 46 | for (int i = 0; i < N; i++) 47 | maxError = fmax(maxError, fabs(y[i]-3.0f)); 48 | std::cout << "Max error: " << maxError << std::endl; 49 | 50 | // Free memory 51 | cudaFree(x); 52 | cudaFree(y); 53 | 54 | return 0; 55 | } 56 | -------------------------------------------------------------------------------- /Chapter02/02_memory_overview/Makefile: -------------------------------------------------------------------------------- 1 | # Project folders that contain CUDA receipts 2 | PROJECTS ?= $(shell find \ 3 | $(shell ls -d */) \ 4 | -name Makefile) 5 | 6 | %.ph_build: 7 | +@$(MAKE) -C $(dir $*) $(MAKECMDGOALS) 8 | 9 | %.ph_clean: 10 | +@$(MAKE) -C $(dir $*) clean $(USE_DEVICE) 11 | 12 | all: $(addsuffix .ph_build,$(PROJECTS)) 13 | @echo "Finished building CUDA Receipts" 14 | 15 | build: $(addsuffix .ph_build,$(PROJECTS)) 16 | 17 | tidy: 18 | @find * | egrep "#" | xargs rm -f 19 | @find * | egrep "\~" | xargs rm -f 20 | @find * | egrep "nvvp" | xargs rm -f 21 | 22 | .PHONY: clean 23 | clean: tidy $(addsuffix .ph_clean,$(PROJECTS)) 24 | 25 | test: 26 | echo $(DIRECTORY) 27 | -------------------------------------------------------------------------------- 
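
The prefetch recipe above (06_unified_memory/unified_memory_prefetch.cu) moves the managed arrays to the GPU with cudaMemPrefetchAsync before the kernel launch, but it never measures what the prefetch buys. A minimal sketch of how that could be timed with CUDA events follows; it is not part of the repository, and it assumes the same 1M-element arrays, 256-thread blocks, and add kernel as the recipe it accompanies.

#include <cstdio>
#include <cuda_runtime.h>

// Same element-wise add as the unified-memory recipes (assumed signature).
__global__ void add(int n, float *x, float *y)
{
    int index = blockIdx.x * blockDim.x + threadIdx.x;
    int stride = blockDim.x * gridDim.x;
    for (int i = index; i < n; i += stride)
        y[i] = x[i] + y[i];
}

int main(void)
{
    const int N = 1 << 20;
    float *x, *y;
    cudaMallocManaged(&x, N * sizeof(float));
    cudaMallocManaged(&y, N * sizeof(float));

    // Touch the managed memory on the host first, so the pages start CPU-resident.
    for (int i = 0; i < N; i++) { x[i] = 1.0f; y[i] = 2.0f; }

    int device = 0;
    cudaGetDevice(&device);

    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    int blockSize = 256;
    int numBlocks = (N + blockSize - 1) / blockSize;

    // Prefetch the managed arrays to the GPU before the launch; comment these
    // two calls out to compare against on-demand page migration.
    cudaMemPrefetchAsync(x, N * sizeof(float), device, 0);
    cudaMemPrefetchAsync(y, N * sizeof(float), device, 0);

    cudaEventRecord(start);
    add<<<numBlocks, blockSize>>>(N, x, y);
    cudaEventRecord(stop);
    cudaEventSynchronize(stop);

    float ms = 0.0f;
    cudaEventElapsedTime(&ms, start, stop);
    printf("add kernel: %.3f ms\n", ms);

    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    cudaFree(x);
    cudaFree(y);
    return 0;
}

Running the sketch twice, once with the two cudaMemPrefetchAsync calls commented out, makes the cost of on-demand page migration visible in the reported kernel time.
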
/Chapter03/03_cuda_thread_programming/01_warp_and_thread_block/Makefile: -------------------------------------------------------------------------------- 1 | CUDA_PATH=/usr/local/cuda 2 | HOST_COMPILER ?= g++ 3 | NVCC=${CUDA_PATH}/bin/nvcc -ccbin ${HOST_COMPILER} 4 | 5 | INCLUDES= -I${CUDA_PATH}/samples/common/inc 6 | NVCC_FLAGS=-m64 -lineinfo 7 | 8 | IS_CUDA_11:=${shell expr `$(NVCC) --version | grep compilation | grep -Eo -m 1 '[0-9]+.[0-9]' | head -1` \>= 11.0} 9 | 10 | # Gencode argumentes 11 | SMS = 35 37 50 52 60 61 70 75 12 | ifeq "$(IS_CUDA_11)" "1" 13 | SMS = 52 60 61 70 75 80 14 | endif 15 | $(foreach sm, ${SMS}, $(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm))) 16 | 17 | cuda_thread_block: cuda_thread_block.cu 18 | ${NVCC} ${INCLUDES} ${ALL_CCFLAGS} ${GENCODE_FLAGS} -o $@ $< 19 | 20 | clean: 21 | rm -f cuda_thread_block 22 | -------------------------------------------------------------------------------- /Chapter03/03_cuda_thread_programming/01_warp_and_thread_block/cuda_thread_block.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | /** 5 | * In this section, we will discover concurrent operation in CUDA 6 | * 1) blocks in grid: concurrent tasks, no gurantee their order of execution (no synchronization) 7 | * 2) warp in blocks: concurrent threads, explicitly synchronizable (it will be discussed in next section) 8 | * 3) thread in warp: implicitly synchronized 9 | */ 10 | 11 | __global__ void idx_print() 12 | { 13 | int idx = blockIdx.x * blockDim.x + threadIdx.x; 14 | int warp_idx = threadIdx.x / warpSize; 15 | int lane_idx = threadIdx.x & (warpSize - 1); 16 | 17 | if ((lane_idx & (warpSize/2 - 1)) == 0) 18 | // thread, block, warp, lane" 19 | printf(" %5d\t%5d\t %2d\t%2d\n", idx, blockIdx.x, warp_idx, lane_idx); 20 | } 21 | 22 | int main(int argc, char* argv[]) 23 | { 24 | if (argc == 1) { 25 | puts("Please put Block Size and Thread Block Size.."); 26 | puts("./cuda_thread_block [grid size] [block size]"); 27 | puts("e.g.) 
./cuda_thread_block 4 128"); 28 | 29 | exit(1); 30 | } 31 | 32 | int gridSize = atoi(argv[1]); 33 | int blockSize = atoi(argv[2]); 34 | 35 | puts("thread, block, warp, lane"); 36 | idx_print<<>>(); 37 | cudaDeviceSynchronize(); 38 | } 39 | -------------------------------------------------------------------------------- /Chapter03/03_cuda_thread_programming/02_cuda_occupancy/Makefile: -------------------------------------------------------------------------------- 1 | CUDA_PATH=/usr/local/cuda 2 | HOST_COMPILER ?= g++ 3 | NVCC=${CUDA_PATH}/bin/nvcc -ccbin ${HOST_COMPILER} 4 | TARGET=sgemm 5 | 6 | INCLUDES = -I${CUDA_PATH}/samples/common/inc 7 | NVCC_FLAGS=-m64 -lineinfo --resource-usage 8 | 9 | IS_CUDA_11:=${shell expr `$(NVCC) --version | grep compilation | grep -Eo -m 1 '[0-9]+.[0-9]' | head -1` \>= 11.0} 10 | 11 | # Gencode argumentes 12 | SMS = 35 37 50 52 60 61 70 75 13 | ifeq "$(IS_CUDA_11)" "1" 14 | SMS = 52 60 61 70 75 80 15 | endif 16 | $(foreach sm, ${SMS}, $(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm))) 17 | 18 | all: ${TARGET} 19 | 20 | # SEGMM 21 | sgemm: sgemm.cu 22 | $(EXEC) $(NVCC) $(INCLUDES) $(NVCC_FLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES) 23 | 24 | clean: 25 | rm -f ${TARGET} *.o 26 | -------------------------------------------------------------------------------- /Chapter03/03_cuda_thread_programming/03_threadsync_and_reduction/Makefile: -------------------------------------------------------------------------------- 1 | CUDA_PATH=/usr/local/cuda 2 | HOST_COMPILER ?= g++ 3 | NVCC=${CUDA_PATH}/bin/nvcc -ccbin ${HOST_COMPILER} 4 | TARGET=reduction_global reduction_shared 5 | 6 | INCLUDES = -I${CUDA_PATH}/samples/common/inc 7 | NVCC_FLAGS=-m64 # --resource-usage 8 | 9 | IS_CUDA_11:=${shell expr `$(NVCC) --version | grep compilation | grep -Eo -m 1 '[0-9]+.[0-9]' | head -1` \>= 11.0} 10 | 11 | # Gencode argumentes 12 | SMS = 35 37 50 52 60 61 70 75 13 | ifeq "$(IS_CUDA_11)" "1" 14 | SMS = 52 60 61 70 75 80 15 | endif 16 | $(foreach sm, ${SMS}, $(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm))) 17 | 18 | # Openmp 19 | LIBRARIES += 20 | ALL_CCFLAGS += 21 | 22 | all : ${TARGET} 23 | 24 | 25 | reduction_global_kernel.o: reduction_global_kernel.cu 26 | $(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $< 27 | 28 | reduction_global.o: reduction_global.cpp 29 | $(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $< 30 | 31 | reduction_global: reduction_global.o reduction_global_kernel.o 32 | $(EXEC) $(NVCC) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES) 33 | 34 | reduction_shared_kernel.o: reduction_shared_kernel.cu 35 | $(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $< 36 | 37 | reduction_shared.o: reduction_shared.cpp 38 | $(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $< 39 | 40 | reduction_shared: reduction_shared.o reduction_shared_kernel.o 41 | $(EXEC) $(NVCC) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES) 42 | 43 | clean: 44 | rm -f ${TARGET} *.o *.nvvp 45 | -------------------------------------------------------------------------------- /Chapter03/03_cuda_thread_programming/03_threadsync_and_reduction/reduction.h: -------------------------------------------------------------------------------- 1 | #ifndef _REDUCTION_H_ 2 | #define _REDUCTION_H_ 3 | 4 | // @reduction_kernel.cu 5 | void reduction(float *d_out, float *d_in, int n_threads, int size); 6 | 7 | // @naive_reduction_kernel.cu 8 | void global_reduction(float *d_out, float *d_in, int 
n_threads, int size); 9 | // void atomic_reduction(float *d_out, float *d_in, int n_threads, int size); 10 | 11 | #endif // _REDUCTION_H_ 12 | -------------------------------------------------------------------------------- /Chapter03/03_cuda_thread_programming/03_threadsync_and_reduction/reduction_global_kernel.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | __global__ void 5 | global_reduction_kernel(float *data_out, float *data_in, int stride, int size) 6 | { 7 | int idx_x = blockIdx.x * blockDim.x + threadIdx.x; 8 | 9 | if (idx_x + stride < size) { 10 | data_out[idx_x] += data_in[idx_x + stride]; 11 | } 12 | } 13 | 14 | void global_reduction(float *d_out, float *d_in, int n_threads, int size) 15 | { 16 | int n_blocks = (size + n_threads - 1) / n_threads; 17 | for (int stride = 1; stride < size; stride *= 2) { 18 | global_reduction_kernel<<>>(d_out, d_in, stride, size); 19 | } 20 | } 21 | -------------------------------------------------------------------------------- /Chapter03/03_cuda_thread_programming/03_threadsync_and_reduction/reduction_shared_kernel.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include "reduction.h" 3 | 4 | /* 5 | Parallel sum reduction using shared memory 6 | - takes log(n) steps for n input elements 7 | - uses n threads 8 | - only works for power-of-2 arrays 9 | */ 10 | 11 | // cuda thread synchronization 12 | __global__ void 13 | reduction_kernel(float* d_out, float* d_in, unsigned int size) 14 | { 15 | unsigned int idx_x = blockIdx.x * blockDim.x + threadIdx.x; 16 | 17 | extern __shared__ float s_data[]; 18 | 19 | s_data[threadIdx.x] = (idx_x < size) ? d_in[idx_x] : 0.f; 20 | 21 | __syncthreads(); 22 | 23 | // do reduction 24 | for (unsigned int stride = 1; stride < blockDim.x; stride *= 2) 25 | { 26 | // thread synchronous reduction 27 | if ( (idx_x % (stride * 2)) == 0 ) 28 | s_data[threadIdx.x] += s_data[threadIdx.x + stride]; 29 | 30 | __syncthreads(); 31 | } 32 | 33 | if (threadIdx.x == 0) 34 | d_out[blockIdx.x] = s_data[0]; 35 | } 36 | 37 | void reduction(float *d_out, float *d_in, int n_threads, int size) 38 | { 39 | cudaMemcpy(d_out, d_in, size * sizeof(float), cudaMemcpyDeviceToDevice); 40 | while(size > 1) 41 | { 42 | int n_blocks = (size + n_threads - 1) / n_threads; 43 | reduction_kernel<<< n_blocks, n_threads, n_threads * sizeof(float), 0 >>>(d_out, d_out, size); 44 | size = n_blocks; 45 | } 46 | } -------------------------------------------------------------------------------- /Chapter03/03_cuda_thread_programming/04_performance_limiter/Makefile: -------------------------------------------------------------------------------- 1 | # CUDA_PATH=${CUDA_ROOT} 2 | CUDA_PATH=/usr/local/cuda 3 | HOST_COMPILER ?= g++ 4 | NVCC=${CUDA_PATH}/bin/nvcc -ccbin ${HOST_COMPILER} 5 | TARGET=sgemm 6 | 7 | INCLUDES = -I${CUDA_PATH}/samples/common/inc 8 | NVCC_FLAGS=-m64 -lineinfo 9 | 10 | IS_CUDA_11:=${shell expr `$(NVCC) --version | grep compilation | grep -Eo -m 1 '[0-9]+.[0-9]' | head -1` \>= 11.0} 11 | 12 | # Gencode argumentes 13 | SMS = 35 37 50 52 60 61 70 75 14 | ifeq "$(IS_CUDA_11)" "1" 15 | SMS = 52 60 61 70 75 80 16 | endif 17 | $(foreach sm, ${SMS}, $(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm))) 18 | 19 | ALL_CCFLAGS += $(NVCC_FLAGS) 20 | 21 | all: ${TARGET} 22 | 23 | # SEGMM 24 | sgemm: sgemm.cu 25 | $(EXEC) $(NVCC) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES) 26 | 27 | clean: 28 | rm -f ${TARGET} 
*.o 29 | -------------------------------------------------------------------------------- /Chapter03/03_cuda_thread_programming/04_performance_limiter/reduction.h: -------------------------------------------------------------------------------- 1 | #ifndef _REDUCTION_H_ 2 | #define _REDUCTION_H_ 3 | 4 | // @reduction_kernel.cu 5 | void reduction(float *d_out, float *d_in, int n_threads, int size); 6 | 7 | // @naive_reduction_kernel.cu 8 | void global_reduction(float *d_out, float *d_in, int n_threads, int size); 9 | // void atomic_reduction(float *d_out, float *d_in, int n_threads, int size); 10 | 11 | #endif // _REDUCTION_H_ 12 | -------------------------------------------------------------------------------- /Chapter03/03_cuda_thread_programming/04_performance_limiter/reduction_shared_kernel.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include "reduction.h" 3 | 4 | /* 5 | Parallel sum reduction using shared memory 6 | - takes log(n) steps for n input elements 7 | - uses n threads 8 | - only works for power-of-2 arrays 9 | */ 10 | 11 | // cuda thread synchronization 12 | __global__ void 13 | reduction_kernel(float* d_out, float* d_in, unsigned int size) 14 | { 15 | unsigned int idx_x = blockIdx.x * blockDim.x + threadIdx.x; 16 | 17 | extern __shared__ float s_data[]; 18 | 19 | s_data[threadIdx.x] = (idx_x < size) ? d_in[idx_x] : 0.f; 20 | 21 | __syncthreads(); 22 | 23 | // do reduction 24 | for (unsigned int stride = 1; stride < blockDim.x; stride *= 2) 25 | { 26 | // thread synchronous reduction 27 | // to reduce the compute utilization, we can switch the operation 28 | // if ( (idx_x % (stride * 2)) == 0 ) // 0.433 ms 29 | if ( (idx_x & (stride * 2 - 1)) == 0 ) // 0.399 ms 30 | s_data[threadIdx.x] += s_data[threadIdx.x + stride]; 31 | 32 | __syncthreads(); 33 | } 34 | 35 | if (threadIdx.x == 0) 36 | d_out[blockIdx.x] = s_data[0]; 37 | } 38 | 39 | void reduction(float *d_out, float *d_in, int n_threads, int size) 40 | { 41 | cudaMemcpy(d_out, d_in, size * sizeof(float), cudaMemcpyDeviceToDevice); 42 | while(size > 1) 43 | { 44 | int n_blocks = (size + n_threads - 1) / n_threads; 45 | reduction_kernel<<< n_blocks, n_threads, n_threads * sizeof(float), 0 >>>(d_out, d_out, size); 46 | size = n_blocks; 47 | } 48 | } -------------------------------------------------------------------------------- /Chapter03/03_cuda_thread_programming/05_warp_divergence/Makefile: -------------------------------------------------------------------------------- 1 | CUDA_PATH=/usr/local/cuda 2 | HOST_COMPILER ?= g++ 3 | NVCC=${CUDA_PATH}/bin/nvcc -ccbin ${HOST_COMPILER} 4 | TARGET=reduction_sequential reduction_interleaving 5 | 6 | INCLUDES = -I${CUDA_PATH}/samples/common/inc 7 | NVCC_FLAGS=-m64 -lineinfo # --resource-usage 8 | 9 | IS_CUDA_11:=${shell expr `$(NVCC) --version | grep compilation | grep -Eo -m 1 '[0-9]+.[0-9]' | head -1` \>= 11.0} 10 | 11 | # Gencode argumentes 12 | SMS = 35 37 50 52 60 61 70 75 13 | ifeq "$(IS_CUDA_11)" "1" 14 | SMS = 52 60 61 70 75 80 15 | endif 16 | $(foreach sm, ${SMS}, $(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm))) 17 | 18 | # Openmp 19 | LIBRARIES += -lgomp 20 | ALL_CCFLAGS += -g -Xcompiler -fopenmp -rdc=true ${NVCC_FLAGS} 21 | 22 | all : ${TARGET} 23 | 24 | reduction_kernel_interleaving.o: reduction_kernel_interleaving.cu 25 | $(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $< 26 | 27 | reduction_kernel_sequential.o: reduction_kernel_sequential.cu 28 | $(EXEC) $(NVCC) 
$(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $< 29 | 30 | reduction.o: reduction.cpp 31 | $(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $< 32 | 33 | reduction_sequential: reduction.o reduction_kernel_sequential.o 34 | $(EXEC) $(NVCC) ${INCLUDES} -o $@ $(ALL_CCFLAGS) $(GENCODE_FLAGS) $(LIBRARIES) $+ 35 | 36 | reduction_interleaving: reduction.o reduction_kernel_interleaving.o 37 | $(EXEC) $(NVCC) ${INCLUDES} -o $@ $(ALL_CCFLAGS) $(GENCODE_FLAGS) $(LIBRARIES) $+ 38 | 39 | clean: 40 | rm -f ${TARGET} *.o 41 | -------------------------------------------------------------------------------- /Chapter03/03_cuda_thread_programming/05_warp_divergence/reduction.h: -------------------------------------------------------------------------------- 1 | #ifndef _REDUCTION_H_ 2 | #define _REDUCTION_H_ 3 | 4 | // @ calling the reduction kernel 5 | int reduction(float *g_outPtr, float *g_inPtr, int size, int n_threads); 6 | 7 | #define max(a, b) (a) > (b) ? (a) : (b) 8 | 9 | #endif // _REDUCTION_H_ -------------------------------------------------------------------------------- /Chapter03/03_cuda_thread_programming/05_warp_divergence/reduction_kernel_interleaving.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include "reduction.h" 3 | 4 | /* 5 | Parallel sum reduction using shared memory 6 | - takes log(n) steps for n input elements 7 | - uses n threads 8 | - only works for power-of-2 arrays 9 | */ 10 | 11 | // cuda thread synchronization 12 | __global__ void 13 | reduction_kernel_1(float* g_out, float* g_in, unsigned int size) 14 | { 15 | unsigned int idx_x = blockIdx.x * blockDim.x + threadIdx.x; 16 | 17 | extern __shared__ float s_data[]; 18 | 19 | s_data[threadIdx.x] = (idx_x < size) ? g_in[idx_x] : 0.f; 20 | 21 | __syncthreads(); 22 | 23 | // do reduction 24 | // interleaved addressing 25 | for (unsigned int stride = 1; stride < blockDim.x; stride *= 2) 26 | { 27 | int index = 2 * stride * threadIdx.x; 28 | 29 | if (index < blockDim.x) 30 | s_data[index] += s_data[index + stride]; 31 | 32 | __syncthreads(); 33 | } 34 | 35 | if (threadIdx.x == 0) 36 | g_out[blockIdx.x] = s_data[0]; 37 | } 38 | 39 | int reduction(float *g_outPtr, float *g_inPtr, int size, int n_threads) 40 | { 41 | int n_blocks = (size + n_threads - 1) / n_threads; 42 | reduction_kernel_1<<< n_blocks, n_threads, n_threads * sizeof(float), 0 >>>(g_outPtr, g_inPtr, size); 43 | return n_blocks; 44 | } 45 | -------------------------------------------------------------------------------- /Chapter03/03_cuda_thread_programming/05_warp_divergence/reduction_kernel_sequential.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include "reduction.h" 3 | 4 | /* 5 | Parallel sum reduction using shared memory 6 | - takes log(n) steps for n input elements 7 | - uses n threads 8 | - only works for power-of-2 arrays 9 | */ 10 | __global__ void 11 | reduction_kernel_2(float *g_out, float *g_in, unsigned int size) 12 | { 13 | unsigned int idx_x = blockIdx.x * blockDim.x + threadIdx.x; 14 | 15 | extern __shared__ float s_data[]; 16 | 17 | s_data[threadIdx.x] = (idx_x < size) ? 
g_in[idx_x] : 0.f; 18 | 19 | __syncthreads(); 20 | 21 | // do reduction 22 | // sequential addressing 23 | for (unsigned int stride = blockDim.x / 2; stride > 0; stride >>= 1) 24 | { 25 | if (threadIdx.x < stride) 26 | s_data[threadIdx.x] += s_data[threadIdx.x + stride]; 27 | 28 | __syncthreads(); 29 | } 30 | 31 | if (threadIdx.x == 0) 32 | g_out[blockIdx.x] = s_data[0]; 33 | } 34 | 35 | int reduction(float *g_outPtr, float *g_inPtr, int size, int n_threads) 36 | { 37 | int n_blocks = (size + n_threads - 1) / n_threads; 38 | reduction_kernel_2<<< n_blocks, n_threads, n_threads * sizeof(float), 0 >>>(g_outPtr, g_inPtr, size); 39 | return n_blocks; 40 | } -------------------------------------------------------------------------------- /Chapter03/03_cuda_thread_programming/06_limiter_balancing/Makefile: -------------------------------------------------------------------------------- 1 | CUDA_PATH=/usr/local/cuda 2 | HOST_COMPILER ?= g++ 3 | NVCC=${CUDA_PATH}/bin/nvcc -ccbin ${HOST_COMPILER} 4 | TARGET=reduction 5 | 6 | INCLUDES = -I${CUDA_PATH}/samples/common/inc 7 | NVCC_FLAGS=-m64 -lineinfo # --resource-usage 8 | 9 | IS_CUDA_11:=${shell expr `$(NVCC) --version | grep compilation | grep -Eo -m 1 '[0-9]+.[0-9]' | head -1` \>= 11.0} 10 | 11 | # Gencode argumentes 12 | SMS = 35 37 50 52 60 61 70 75 13 | ifeq "$(IS_CUDA_11)" "1" 14 | SMS = 52 60 61 70 75 80 15 | endif 16 | $(foreach sm, ${SMS}, $(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm))) 17 | 18 | # Openmp 19 | LIBRARIES += -lgomp 20 | ALL_CCFLAGS += -g -Xcompiler -fopenmp -rdc=true 21 | 22 | all : ${TARGET} 23 | 24 | reduction_kernel.o: reduction_kernel.cu 25 | $(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $< 26 | 27 | reduction.o: reduction.cpp 28 | $(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $< 29 | 30 | reduction: reduction.o reduction_kernel.o 31 | $(EXEC) $(NVCC) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES) 32 | 33 | clean: 34 | rm -f ${TARGET} *.o 35 | -------------------------------------------------------------------------------- /Chapter03/03_cuda_thread_programming/06_limiter_balancing/reduction.h: -------------------------------------------------------------------------------- 1 | #ifndef _REDUCTION_H_ 2 | #define _REDUCTION_H_ 3 | 4 | // @reduction_kernel.cu 5 | int reduction(float *g_outPtr, float *g_inPtr, int size, int n_threads); 6 | 7 | #endif // _REDUCTION_H_ -------------------------------------------------------------------------------- /Chapter03/03_cuda_thread_programming/06_limiter_balancing/reduction_kernel.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include "reduction.h" 3 | 4 | /* 5 | Parallel sum reduction using shared memory 6 | - takes log(n) steps for n input elements 7 | - uses n threads 8 | - only works for power-of-2 arrays 9 | */ 10 | __global__ void 11 | reduction_kernel(float *g_out, float *g_in, unsigned int size) 12 | { 13 | unsigned int idx_x = blockIdx.x * blockDim.x + threadIdx.x; 14 | 15 | extern __shared__ float s_data[]; 16 | 17 | // cumulates input with grid-stride loop and save to share memory 18 | float input = 0.f; 19 | for (int i = idx_x; i < size; i += blockDim.x * gridDim.x) 20 | input += g_in[i]; 21 | s_data[threadIdx.x] = input; 22 | 23 | __syncthreads(); 24 | 25 | // do reduction 26 | for (unsigned int stride = blockDim.x / 2; stride > 0; stride >>= 1) 27 | { 28 | if (threadIdx.x < stride) 29 | s_data[threadIdx.x] += s_data[threadIdx.x + stride]; 30 | 31 
| __syncthreads(); 32 | } 33 | 34 | if (threadIdx.x == 0) { 35 | g_out[blockIdx.x] = s_data[0]; 36 | } 37 | } 38 | 39 | int reduction(float *g_outPtr, float *g_inPtr, int size, int n_threads) 40 | { 41 | int num_sms; 42 | int num_blocks_per_sm; 43 | cudaDeviceGetAttribute(&num_sms, cudaDevAttrMultiProcessorCount, 0); 44 | cudaOccupancyMaxActiveBlocksPerMultiprocessor(&num_blocks_per_sm, reduction_kernel, n_threads, n_threads*sizeof(float)); 45 | int n_blocks = min(num_blocks_per_sm * num_sms, (size + n_threads - 1) / n_threads); 46 | 47 | reduction_kernel<<>>(g_outPtr, g_inPtr, size); 48 | reduction_kernel<<<1, n_threads, n_threads * sizeof(float), 0>>>(g_outPtr, g_outPtr, n_blocks); 49 | 50 | return 1; 51 | } 52 | -------------------------------------------------------------------------------- /Chapter03/03_cuda_thread_programming/07_warp_synchronous_programming/Makefile: -------------------------------------------------------------------------------- 1 | CUDA_PATH=/usr/local/cuda 2 | HOST_COMPILER ?= g++ 3 | NVCC=${CUDA_PATH}/bin/nvcc -ccbin ${HOST_COMPILER} 4 | TARGET=reduction_wp 5 | 6 | INCLUDES = -I${CUDA_PATH}/samples/common/inc 7 | NVCC_FLAGS=-m64 -lineinfo # --resource-usage 8 | 9 | IS_CUDA_11:=${shell expr `$(NVCC) --version | grep compilation | grep -Eo -m 1 '[0-9]+.[0-9]' | head -1` \>= 11.0} 10 | 11 | # Gencode argumentes 12 | SMS = 35 37 50 52 60 61 70 75 13 | ifeq "$(IS_CUDA_11)" "1" 14 | SMS = 52 60 61 70 75 80 15 | endif 16 | $(foreach sm, ${SMS}, $(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm))) 17 | 18 | # Openmp 19 | LIBRARIES += -lgomp 20 | ALL_CCFLAGS += -g -Xcompiler -fopenmp -rdc=true $(NVCC_FLAGS) 21 | 22 | all : ${TARGET} 23 | 24 | reduction_wp_kernel.o: reduction_wp_kernel.cu 25 | $(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $< 26 | 27 | reduction.o: reduction.cpp 28 | $(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $< 29 | 30 | reduction_wp: reduction.o reduction_wp_kernel.o 31 | $(EXEC) $(NVCC) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES) 32 | 33 | clean: 34 | rm -f ${TARGET} *.o 35 | -------------------------------------------------------------------------------- /Chapter03/03_cuda_thread_programming/07_warp_synchronous_programming/reduction.h: -------------------------------------------------------------------------------- 1 | #ifndef _REDUCTION_H_ 2 | #define _REDUCTION_H_ 3 | 4 | // @reduction_kernel.cu 5 | void reduction(float *g_outPtr, float *g_inPtr, int size, int n_threads); 6 | 7 | #endif // _REDUCTION_H_ -------------------------------------------------------------------------------- /Chapter03/03_cuda_thread_programming/08_cooperative_group/Makefile: -------------------------------------------------------------------------------- 1 | CUDA_PATH=/usr/local/cuda 2 | HOST_COMPILER ?= g++ 3 | NVCC=${CUDA_PATH}/bin/nvcc -ccbin ${HOST_COMPILER} 4 | TARGET=reduction_cg reduction_cg_shift 5 | 6 | INCLUDES = -I${CUDA_PATH}/samples/common/inc 7 | NVCC_FLAGS=-m64 -lineinfo # --resource-usage 8 | 9 | IS_CUDA_11:=${shell expr `$(NVCC) --version | grep compilation | grep -Eo -m 1 '[0-9]+.[0-9]' | head -1` \>= 11.0} 10 | 11 | # Gencode argumentes 12 | SMS = 35 37 50 52 60 61 70 75 13 | ifeq "$(IS_CUDA_11)" "1" 14 | SMS = 52 60 61 70 75 80 15 | endif 16 | $(foreach sm, ${SMS}, $(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm))) 17 | 18 | # Openmp 19 | LIBRARIES += -lgomp 20 | ALL_CCFLAGS += -g -Xcompiler -fopenmp -rdc=true 21 | 22 | all : ${TARGET} 23 | 24 | reduction.o: 
reduction.cpp 25 | $(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $< 26 | 27 | reduction_cg_kernel.o: reduction_cg_kernel.cu 28 | $(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $< 29 | 30 | reduction_cg: reduction.o reduction_cg_kernel.o 31 | $(EXEC) $(NVCC) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES) 32 | 33 | reduction_cg_shift_kernel.o: reduction_cg_shift_kernel.cu 34 | $(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $< 35 | 36 | reduction_cg_shift: reduction.o reduction_cg_shift_kernel.o 37 | $(EXEC) $(NVCC) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES) 38 | 39 | clean: 40 | rm -f ${TARGET} *.o 41 | -------------------------------------------------------------------------------- /Chapter03/03_cuda_thread_programming/08_cooperative_group/reduction.h: -------------------------------------------------------------------------------- 1 | #ifndef _REDUCTION_H_ 2 | #define _REDUCTION_H_ 3 | 4 | // @reduction_kernel.cu 5 | void reduction(float *g_outPtr, float *g_inPtr, int size, int n_threads); 6 | 7 | #endif // _REDUCTION_H_ -------------------------------------------------------------------------------- /Chapter03/03_cuda_thread_programming/09_loop_unrolling/Makefile: -------------------------------------------------------------------------------- 1 | CUDA_PATH=/usr/local/cuda 2 | HOST_COMPILER ?= g++ 3 | NVCC=${CUDA_PATH}/bin/nvcc -ccbin ${HOST_COMPILER} 4 | TARGET=reduction_wp reduction_cg 5 | 6 | INCLUDES = -I${CUDA_PATH}/samples/common/inc 7 | NVCC_FLAGS=-m64 -lineinfo# --resource-usage 8 | 9 | IS_CUDA_11:=${shell expr `$(NVCC) --version | grep compilation | grep -Eo -m 1 '[0-9]+.[0-9]' | head -1` \>= 11.0} 10 | 11 | # Gencode argumentes 12 | SMS = 35 37 50 52 60 61 70 75 13 | ifeq "$(IS_CUDA_11)" "1" 14 | SMS = 52 60 61 70 75 80 15 | endif 16 | $(foreach sm, ${SMS}, $(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm))) 17 | 18 | # Openmp 19 | LIBRARIES += -lgomp 20 | ALL_CCFLAGS += -g -Xcompiler -fopenmp -rdc=true $(NVCC_FLAGS) 21 | 22 | all : ${TARGET} 23 | 24 | reduction_wp_kernel.o: reduction_wp_kernel.cu 25 | $(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $< 26 | 27 | reduction_cg_kernel.o: reduction_cg_kernel.cu 28 | $(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $< 29 | 30 | reduction.o: reduction.cpp 31 | $(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $< 32 | 33 | reduction_wp: reduction.o reduction_wp_kernel.o 34 | $(EXEC) $(NVCC) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES) 35 | 36 | reduction_cg: reduction.o reduction_cg_kernel.o 37 | $(EXEC) $(NVCC) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES) 38 | 39 | clean: 40 | rm -f ${TARGET} *.o 41 | -------------------------------------------------------------------------------- /Chapter03/03_cuda_thread_programming/09_loop_unrolling/reduction.h: -------------------------------------------------------------------------------- 1 | #ifndef _REDUCTION_H_ 2 | #define _REDUCTION_H_ 3 | 4 | // @reduction_kernel.cu 5 | void reduction(float *g_outPtr, float *g_inPtr, int size, int n_threads); 6 | 7 | #endif // _REDUCTION_H_ -------------------------------------------------------------------------------- /Chapter03/03_cuda_thread_programming/10_atomic_operation/reduction.h: -------------------------------------------------------------------------------- 1 | #ifndef _REDUCTION_H_ 2 | #define _REDUCTION_H_ 3 | 4 | // @reduction_wrp_atmc_kernel.cu 5 | // 
@reduction_blk_atmc_kernel.cu 6 | // @reduction_kernel.cu 7 | void atomic_reduction(float *g_outPtr, float *g_inPtr, int size, int n_threads); 8 | 9 | #endif // _REDUCTION_H_ -------------------------------------------------------------------------------- /Chapter03/03_cuda_thread_programming/10_atomic_operation/reduction_kernel.cu: -------------------------------------------------------------------------------- 1 | #include <stdio.h> 2 | #include <cooperative_groups.h> 3 | #include "reduction.h" 4 | 5 | using namespace cooperative_groups; 6 | 7 | /* 8 | Parallel sum reduction using shared memory 9 | - takes log(n) steps for n input elements 10 | - uses n threads 11 | - only works for power-of-2 arrays 12 | */ 13 | 14 | /** 15 | Two warp level primitives are used here for this example 16 | https://devblogs.nvidia.com/faster-parallel-reductions-kepler/ 17 | https://devblogs.nvidia.com/using-cuda-warp-level-primitives/ 18 | */ 19 | 20 | __global__ void 21 | atomic_reduction_kernel(float *data_out, float *data_in, int size) 22 | { 23 | int idx_x = blockIdx.x * blockDim.x + threadIdx.x; 24 | 25 | atomicAdd(&data_out[0], data_in[idx_x]); 26 | } 27 | 28 | void atomic_reduction(float *g_outPtr, float *g_inPtr, int size, int n_threads) 29 | { 30 | int n_blocks = (size + n_threads - 1) / n_threads; 31 | atomic_reduction_kernel<<<n_blocks, n_threads>>>(g_outPtr, g_inPtr, size); 32 | } 33 | -------------------------------------------------------------------------------- /Chapter03/03_cuda_thread_programming/11_mixed_precision_operation/Makefile: -------------------------------------------------------------------------------- 1 | CUDA_PATH=/usr/local/cuda 2 | HOST_COMPILER ?= g++ 3 | NVCC=${CUDA_PATH}/bin/nvcc -ccbin ${HOST_COMPILER} 4 | TARGET=mixed_precision_single mixed_precision_half mixed_precision_int 5 | 6 | INCLUDES = -I${CUDA_PATH}/samples/common/inc 7 | NVCC_FLAGS=-m64 -lineinfo # --resource-usage 8 | 9 | IS_CUDA_11:=${shell expr `$(NVCC) --version | grep compilation | grep -Eo -m 1 '[0-9]+.[0-9]' | head -1` \>= 11.0} 10 | 11 | # Gencode arguments 12 | SMS = 35 37 50 52 60 61 70 75 13 | ifeq "$(IS_CUDA_11)" "1" 14 | SMS = 52 60 61 70 75 80 15 | endif 16 | $(foreach sm, ${SMS}, $(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm))) 17 | 18 | # Openmp 19 | LIBRARIES += -lgomp 20 | 21 | ALL_CCFLAGS += -g -Xcompiler -fopenmp -rdc=true $(NVCC_FLAGS) $(INCLUDES) 22 | 23 | all : ${TARGET} 24 | 25 | mixed_precision_half: mixed_precision_half.cu 26 | $(EXEC) $(NVCC) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES) 27 | 28 | mixed_precision_single: mixed_precision.cu 29 | $(EXEC) $(NVCC) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES) 30 | 31 | mixed_precision_int: mixed_precision_int.cu 32 | $(EXEC) $(NVCC) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES) 33 | 34 | clean: 35 | rm -f ${TARGET} *.o 36 | -------------------------------------------------------------------------------- /Chapter03/03_cuda_thread_programming/Makefile: -------------------------------------------------------------------------------- 1 | # Project folders that contain CUDA recipes 2 | PROJECTS ?= $(shell find \ 3 | $(shell ls -d */) \ 4 | -name Makefile) 5 | 6 | %.ph_build: 7 | +@$(MAKE) -C $(dir $*) $(MAKECMDGOALS) 8 | 9 | %.ph_clean: 10 | +@$(MAKE) -C $(dir $*) clean $(USE_DEVICE) 11 | 12 | all: $(addsuffix .ph_build,$(PROJECTS)) 13 | @echo "Finished building CUDA Recipes" 14 | 15 | build: $(addsuffix .ph_build,$(PROJECTS)) 16 | 17 | tidy: 18 | @find * | egrep "#" | xargs rm -f 19 | @find * | egrep "\~" | xargs rm -f 20 | @find * | egrep "nvvp" |
xargs rm -f 21 | 22 | .PHONY: clean 23 | clean: tidy $(addsuffix .ph_clean,$(PROJECTS)) 24 | 25 | test: 26 | echo $(DIRECTORY) 27 | -------------------------------------------------------------------------------- /Chapter04/04_kernel_execution/01_cuda_stream/1_cuda_default_stream.cu: -------------------------------------------------------------------------------- 1 | #include <cstdio> 2 | 3 | using namespace std; 4 | 5 | __global__ void 6 | foo_kernel(int step) 7 | { 8 | printf("loop: %d\n", step); 9 | } 10 | 11 | int main() 12 | { 13 | int n_loop = 5; 14 | 15 | // execute kernels with the default stream 16 | for (int i = 0; i < n_loop; i++) 17 | foo_kernel<<< 1, 1, 0, 0 >>>(i); 18 | 19 | cudaDeviceSynchronize(); 20 | 21 | return 0; 22 | } -------------------------------------------------------------------------------- /Chapter04/04_kernel_execution/01_cuda_stream/2_cuda_multi_stream.cu: -------------------------------------------------------------------------------- 1 | #include <cstdio> 2 | 3 | using namespace std; 4 | 5 | __global__ void 6 | foo_kernel(int step) 7 | { 8 | printf("loop: %d\n", step); 9 | } 10 | 11 | int main() 12 | { 13 | int n_stream = 5; 14 | cudaStream_t *ls_stream; 15 | ls_stream = (cudaStream_t*) new cudaStream_t[n_stream]; 16 | 17 | // create multiple streams 18 | for (int i = 0; i < n_stream; i++) 19 | cudaStreamCreate(&ls_stream[i]); 20 | 21 | // execute kernels with each CUDA stream 22 | for (int i = 0; i < n_stream; i++) 23 | foo_kernel<<< 1, 1, 0, ls_stream[i] >>>(i); 24 | 25 | // synchronize the host and GPU 26 | cudaDeviceSynchronize(); 27 | 28 | // terminates all the created CUDA streams 29 | for (int i = 0; i < n_stream; i++) 30 | cudaStreamDestroy(ls_stream[i]); 31 | delete [] ls_stream; 32 | 33 | return 0; 34 | } -------------------------------------------------------------------------------- /Chapter04/04_kernel_execution/01_cuda_stream/3_cuda_multi_stream_with_sync.cu: -------------------------------------------------------------------------------- 1 | #include <cstdio> 2 | 3 | using namespace std; 4 | 5 | __global__ void 6 | foo_kernel(int step) 7 | { 8 | printf("loop: %d\n", step); 9 | } 10 | 11 | int main() 12 | { 13 | int n_stream = 5; 14 | cudaStream_t *ls_stream; 15 | ls_stream = (cudaStream_t*) new cudaStream_t[n_stream]; 16 | 17 | // create multiple streams 18 | for (int i = 0; i < n_stream; i++) 19 | cudaStreamCreate(&ls_stream[i]); 20 | 21 | // execute kernels with each CUDA stream 22 | for (int i = 0; i < n_stream; i++) { 23 | foo_kernel<<< 1, 1, 0, ls_stream[i] >>>(i); 24 | cudaStreamSynchronize(ls_stream[i]); 25 | } 26 | 27 | // synchronize the host and GPU 28 | cudaDeviceSynchronize(); 29 | 30 | // terminates all the created CUDA streams 31 | for (int i = 0; i < n_stream; i++) 32 | cudaStreamDestroy(ls_stream[i]); 33 | delete [] ls_stream; 34 | 35 | return 0; 36 | } -------------------------------------------------------------------------------- /Chapter04/04_kernel_execution/01_cuda_stream/4_cuda_multi_stream_with_default.cu: -------------------------------------------------------------------------------- 1 | #include <cstdio> 2 | 3 | using namespace std; 4 | 5 | __global__ void 6 | foo_kernel(int step) 7 | { 8 | printf("loop: %d\n", step); 9 | } 10 | 11 | int main() 12 | { 13 | int n_stream = 5; 14 | cudaStream_t *ls_stream; 15 | ls_stream = (cudaStream_t*) new cudaStream_t[n_stream]; 16 | 17 | // create multiple streams 18 | for (int i = 0; i < n_stream; i++) 19 | cudaStreamCreate(&ls_stream[i]); 20 | 21 | // execute kernels with each CUDA stream 22 | for
(int i = 0; i < n_stream; i++) 23 | if (i == 3) 24 | foo_kernel<<< 1, 1, 0, 0 >>>(i); 25 | else 26 | foo_kernel<<< 1, 1, 0, ls_stream[i] >>>(i); 27 | 28 | // synchronize the host and GPU 29 | cudaDeviceSynchronize(); 30 | 31 | // terminates all the created CUDA streams 32 | for (int i = 0; i < n_stream; i++) 33 | cudaStreamDestroy(ls_stream[i]); 34 | delete [] ls_stream; 35 | 36 | return 0; 37 | } -------------------------------------------------------------------------------- /Chapter04/04_kernel_execution/01_cuda_stream/Makefile: -------------------------------------------------------------------------------- 1 | CUDA_PATH=/usr/local/cuda 2 | HOST_COMPILER ?= g++ 3 | NVCC=${CUDA_PATH}/bin/nvcc -ccbin ${HOST_COMPILER} 4 | TARGET=cuda_default_stream cuda_multi_stream cuda_multi_stream_with_sync cuda_multi_stream_with_default 5 | 6 | INCLUDES = -I${CUDA_PATH}/samples/common/inc 7 | NVCC_FLAGS=-m64 # --resource-usage 8 | 9 | IS_CUDA_11:=${shell expr `$(NVCC) --version | grep compilation | grep -Eo -m 1 '[0-9]+.[0-9]' | head -1` \>= 11.0} 10 | 11 | # Gencode argumentes 12 | SMS = 35 37 50 52 60 61 70 75 13 | ifeq "$(IS_CUDA_11)" "1" 14 | SMS = 52 60 61 70 75 80 15 | endif 16 | $(foreach sm, ${SMS}, $(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm))) 17 | 18 | # Openmp 19 | LIBRARIES += 20 | ALL_CCFLAGS += 21 | 22 | all : ${TARGET} 23 | 24 | cuda_default_stream: 1_cuda_default_stream.cu 25 | $(EXEC) $(NVCC) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ $< $(LIBRARIES) 26 | 27 | cuda_multi_stream: 2_cuda_multi_stream.cu 28 | $(EXEC) $(NVCC) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ $< $(LIBRARIES) 29 | 30 | cuda_multi_stream_with_sync: 3_cuda_multi_stream_with_sync.cu 31 | $(EXEC) $(NVCC) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ $< $(LIBRARIES) 32 | 33 | cuda_multi_stream_with_default: 4_cuda_multi_stream_with_default.cu 34 | $(EXEC) $(NVCC) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ $< $(LIBRARIES) 35 | 36 | clean: 37 | rm -f ${TARGET} *.o *.nvvp 38 | -------------------------------------------------------------------------------- /Chapter04/04_kernel_execution/02_pipelining/Makefile: -------------------------------------------------------------------------------- 1 | CUDA_PATH=/usr/local/cuda 2 | HOST_COMPILER ?= g++ 3 | NVCC=${CUDA_PATH}/bin/nvcc -ccbin ${HOST_COMPILER} 4 | TARGET=cuda_pipelining 5 | 6 | INCLUDES = -I${CUDA_PATH}/samples/common/inc 7 | NVCC_FLAGS=-m64 # --default-stream per-thread # --resource-usage 8 | 9 | IS_CUDA_11:=${shell expr `$(NVCC) --version | grep compilation | grep -Eo -m 1 '[0-9]+.[0-9]' | head -1` \>= 11.0} 10 | 11 | # Gencode argumentes 12 | SMS = 35 37 50 52 60 61 70 75 13 | ifeq "$(IS_CUDA_11)" "1" 14 | SMS = 52 60 61 70 75 80 15 | endif 16 | $(foreach sm, ${SMS}, $(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm))) 17 | 18 | # Openmp 19 | LIBRARIES += 20 | ALL_CCFLAGS += $(NVCC_FLAGS) 21 | 22 | all : ${TARGET} 23 | 24 | cuda_pipelining: cuda_pipelining.cu 25 | $(EXEC) $(NVCC) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES) $(INCLUDES) 26 | 27 | nvprof: cuda_pipelining 28 | nvprof -f -o $+_${STREAMS}.nvvp ./$+ ${STREAMS} 29 | 30 | clean: 31 | rm -f ${TARGET} *.o *.nvvp 32 | -------------------------------------------------------------------------------- /Chapter04/04_kernel_execution/03_cuda_callback/Makefile: -------------------------------------------------------------------------------- 1 | CUDA_PATH=/usr/local/cuda 2 | HOST_COMPILER ?= g++ 3 | NVCC=${CUDA_PATH}/bin/nvcc -ccbin ${HOST_COMPILER} 4 | TARGET=cuda_callback 5 | 6 | INCLUDES = 
-I${CUDA_PATH}/samples/common/inc 7 | NVCC_FLAGS=-m64 --default-stream per-thread # --resource-usage 8 | 9 | IS_CUDA_11:=${shell expr `$(NVCC) --version | grep compilation | grep -Eo -m 1 '[0-9]+.[0-9]' | head -1` \>= 11.0} 10 | 11 | # Gencode argumentes 12 | SMS = 35 37 50 52 60 61 70 75 13 | ifeq "$(IS_CUDA_11)" "1" 14 | SMS = 52 60 61 70 75 80 15 | endif 16 | $(foreach sm, ${SMS}, $(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm))) 17 | 18 | # Openmp 19 | LIBRARIES += 20 | ALL_CCFLAGS += $(NVCC_FLAGS) 21 | 22 | all : ${TARGET} 23 | 24 | cuda_callback: cuda_callback.cu 25 | $(EXEC) $(NVCC) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES) $(INCLUDES) 26 | 27 | nvprof: cuda_callback 28 | nvprof -f -o $+_${STREAMS}.nvvp --cpu-thread-tracing on ./$+ ${STREAMS} 29 | 30 | clean: 31 | rm -f ${TARGET} *.o *.nvvp 32 | -------------------------------------------------------------------------------- /Chapter04/04_kernel_execution/04_stream_priority/Makefile: -------------------------------------------------------------------------------- 1 | CUDA_PATH=/usr/local/cuda 2 | HOST_COMPILER ?= g++ 3 | NVCC=${CUDA_PATH}/bin/nvcc -ccbin ${HOST_COMPILER} 4 | TARGET=prioritized_cuda_stream 5 | 6 | INCLUDES = -I${CUDA_PATH}/samples/common/inc 7 | NVCC_FLAGS=-m64 --default-stream per-thread # --resource-usage 8 | 9 | IS_CUDA_11:=${shell expr `$(NVCC) --version | grep compilation | grep -Eo -m 1 '[0-9]+.[0-9]' | head -1` \>= 11.0} 10 | 11 | # Gencode argumentes 12 | SMS = 35 37 50 52 60 61 70 75 13 | ifeq "$(IS_CUDA_11)" "1" 14 | SMS = 52 60 61 70 75 80 15 | endif 16 | $(foreach sm, ${SMS}, $(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm))) 17 | 18 | # Openmp 19 | LIBRARIES += 20 | ALL_CCFLAGS += $(NVCC_FLAGS) 21 | 22 | all : ${TARGET} 23 | 24 | prioritized_cuda_stream: prioritized_cuda_stream.cu 25 | $(EXEC) $(NVCC) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES) $(INCLUDES) 26 | 27 | nvprof: prioritized_cuda_stream 28 | nvprof -f -o $+_${STREAMS}.nvvp --cpu-thread-tracing on ./$+ ${STREAMS} 29 | 30 | clean: 31 | rm -f ${TARGET} *.o *.nvvp 32 | -------------------------------------------------------------------------------- /Chapter04/04_kernel_execution/05_cuda_event/Makefile: -------------------------------------------------------------------------------- 1 | CUDA_PATH=/usr/local/cuda 2 | HOST_COMPILER ?= g++ 3 | NVCC=${CUDA_PATH}/bin/nvcc -ccbin ${HOST_COMPILER} 4 | TARGET=cuda_event cuda_event_with_streams 5 | 6 | INCLUDES = -I${CUDA_PATH}/samples/common/inc 7 | NVCC_FLAGS= #-m64 --default-stream per-thread # --resource-usage 8 | 9 | IS_CUDA_11:=${shell expr `$(NVCC) --version | grep compilation | grep -Eo -m 1 '[0-9]+.[0-9]' | head -1` \>= 11.0} 10 | 11 | # Gencode argumentes 12 | SMS = 35 37 50 52 60 61 70 75 13 | ifeq "$(IS_CUDA_11)" "1" 14 | SMS = 52 60 61 70 75 80 15 | endif 16 | $(foreach sm, ${SMS}, $(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm))) 17 | 18 | # Openmp 19 | LIBRARIES += -lgomp 20 | ALL_CCFLAGS += $(NVCC_FLAGS) -g -Xcompiler -fopenmp -rdc=true 21 | 22 | # Openmp 23 | # LIBRARIES += -lgomp 24 | # ALL_CCFLAGS += -g -Xcompiler -fopenmp -rdc=true 25 | 26 | all : ${TARGET} 27 | 28 | cuda_event: cuda_event.cu 29 | $(EXEC) $(NVCC) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES) $(INCLUDES) 30 | 31 | cuda_event_with_streams: cuda_event_with_streams.cu 32 | $(EXEC) $(NVCC) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES) $(INCLUDES) 33 | 34 | clean: 35 | rm -f ${TARGET} *.o *.nvvp 36 | 
-------------------------------------------------------------------------------- /Chapter04/04_kernel_execution/06_dynamic_parallelism/Makefile: -------------------------------------------------------------------------------- 1 | CUDA_PATH=/usr/local/cuda 2 | HOST_COMPILER ?= g++ 3 | NVCC=${CUDA_PATH}/bin/nvcc -ccbin ${HOST_COMPILER} 4 | TARGET=dynamic_parallelism recursion 5 | 6 | INCLUDES = -I${CUDA_PATH}/samples/common/inc 7 | NVCC_FLAGS=-m64 -rdc=true -lcudadevrt # --resource-usage 8 | 9 | IS_CUDA_11:=${shell expr `$(NVCC) --version | grep compilation | grep -Eo -m 1 '[0-9]+.[0-9]' | head -1` \>= 11.0} 10 | 11 | # Gencode arguments 12 | SMS = 35 37 50 52 60 61 70 75 13 | ifeq "$(IS_CUDA_11)" "1" 14 | SMS = 52 60 61 70 75 80 15 | endif 16 | $(foreach sm, ${SMS}, $(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm))) 17 | 18 | LIBRARIES += 19 | ALL_CCFLAGS += ${NVCC_FLAGS} 20 | 21 | all : ${TARGET} 22 | 23 | dynamic_parallelism: dynamic_parallelism.cu 24 | $(EXEC) $(NVCC) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES) $(INCLUDES) 25 | 26 | recursion: recursion.cu 27 | $(EXEC) $(NVCC) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES) $(INCLUDES) 28 | 29 | nvprof: dynamic_parallelism 30 | nvprof -f -o $+.nvvp ./$+ 31 | 32 | clean: 33 | rm -f ${TARGET} *.o *.nvvp 34 | -------------------------------------------------------------------------------- /Chapter04/04_kernel_execution/06_dynamic_parallelism/dynamic_parallelism.cu: -------------------------------------------------------------------------------- 1 | #include <cstdio> 2 | #include <cstdlib> 3 | #include <cuda_runtime.h> 4 | 5 | using namespace std; 6 | 7 | #define BUF_SIZE (1 << 10) 8 | #define BLOCKDIM 256 9 | 10 | __global__ void child_kernel(int *data, int seed) 11 | { 12 | int idx = blockIdx.x * blockDim.x + threadIdx.x; 13 | 14 | atomicAdd(&data[idx], seed); 15 | } 16 | 17 | __global__ void parent_kernel(int *data) 18 | { 19 | if (threadIdx.x == 0) 20 | { 21 | int child_size = BUF_SIZE/gridDim.x; 22 | child_kernel<<< child_size/BLOCKDIM, BLOCKDIM >>>(&data[child_size*blockIdx.x], blockIdx.x+1); 23 | } 24 | // synchronization for other parent's kernel output 25 | cudaDeviceSynchronize(); 26 | } 27 | 28 | int main() 29 | { 30 | int *data; 31 | int num_child = 2; 32 | 33 | cudaMallocManaged((void**)&data, BUF_SIZE * sizeof(int)); 34 | cudaMemset(data, 0, BUF_SIZE * sizeof(int)); 35 | 36 | parent_kernel<<<num_child, 1>>>(data); 37 | 38 | cudaDeviceSynchronize(); 39 | 40 | // Count elements value 41 | int counter = 0; 42 | for (int i = 0; i < BUF_SIZE; i++) { 43 | counter += data[i]; 44 | } 45 | 46 | // getting answer 47 | int counter_h = 0; 48 | for (int i = 0; i < num_child; i++) { 49 | counter_h += (i+1); 50 | } 51 | counter_h *= BUF_SIZE / num_child; 52 | 53 | if (counter_h == counter) 54 | printf("Correct!!\n"); 55 | else 56 | printf("Error!! Obtained %d.
It should be %d\n", counter, counter_h); 57 | 58 | cudaFree(data); 59 | 60 | return 0; 61 | } 62 | 63 | -------------------------------------------------------------------------------- /Chapter04/04_kernel_execution/07_grid_level_cg/Makefile: -------------------------------------------------------------------------------- 1 | CUDA_PATH=/usr/local/cuda 2 | HOST_COMPILER ?= g++ 3 | NVCC=${CUDA_PATH}/bin/nvcc -ccbin ${HOST_COMPILER} 4 | TARGET=reduction 5 | 6 | INCLUDES = -I${CUDA_PATH}/samples/common/inc 7 | NVCC_FLAGS=-m64 -lineinfo # --resource-usage 8 | 9 | IS_CUDA_11:=${shell expr `$(NVCC) --version | grep compilation | grep -Eo -m 1 '[0-9]+.[0-9]' | head -1` \>= 11.0} 10 | 11 | # Gencode argumentes 12 | SMS = 35 37 50 52 60 61 70 75 13 | ifeq "$(IS_CUDA_11)" "1" 14 | SMS = 52 60 61 70 75 80 15 | endif 16 | $(foreach sm, ${SMS}, $(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm))) 17 | 18 | # Openmp 19 | LIBRARIES += -lgomp 20 | ALL_CCFLAGS += -g -Xcompiler -fopenmp -rdc=true 21 | 22 | all : ${TARGET} 23 | 24 | reduction_kernel.o: reduction_kernel.cu 25 | $(EXEC) $(NVCC) $(NVCC_FLAGS) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $< 26 | 27 | reduction.o: reduction.cpp 28 | $(EXEC) $(NVCC) $(NVCC_FLAGS) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $< 29 | 30 | reduction: reduction.o reduction_kernel.o 31 | $(EXEC) $(NVCC) $(NVCC_FLAGS) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES) 32 | 33 | nvprof: reduction 34 | nvprof -f -o $+.nvvp --cpu-thread-tracing on ./$+ 35 | nvprof -f -o $+-metric.nvvp --analysis-metrics ./$+ 36 | 37 | clean: 38 | rm -f ${TARGET} *.o 39 | -------------------------------------------------------------------------------- /Chapter04/04_kernel_execution/07_grid_level_cg/reduction.h: -------------------------------------------------------------------------------- 1 | #ifndef _REDUCTION_H_ 2 | #define _REDUCTION_H_ 3 | 4 | // @reduction_loop_kernel.cu 5 | int reduction_grid_sync(float *g_outPtr, float *g_inPtr, int size, int n_threads); 6 | 7 | #endif // _REDUCTION_H_ -------------------------------------------------------------------------------- /Chapter04/04_kernel_execution/08_openmp_cuda/Makefile: -------------------------------------------------------------------------------- 1 | CUDA_PATH=/usr/local/cuda 2 | HOST_COMPILER ?= g++ 3 | NVCC=${CUDA_PATH}/bin/nvcc -ccbin ${HOST_COMPILER} 4 | TARGET=openmp openmp_default_stream openmp_gpus 5 | 6 | INCLUDES = -I${CUDA_PATH}/samples/common/inc 7 | NVCC_FLAGS=-m64 -lineinfo # --resource-usage 8 | 9 | IS_CUDA_11:=${shell expr `$(NVCC) --version | grep compilation | grep -Eo -m 1 '[0-9]+.[0-9]' | head -1` \>= 11.0} 10 | 11 | # Gencode argumentes 12 | SMS = 35 37 50 52 60 61 70 75 13 | ifeq "$(IS_CUDA_11)" "1" 14 | SMS = 52 60 61 70 75 80 15 | endif 16 | $(foreach sm, ${SMS}, $(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm))) 17 | 18 | # Openmp 19 | LIBRARIES += -lgomp 20 | ALL_CCFLAGS += -g -Xcompiler -fopenmp -rdc=true 21 | 22 | all : ${TARGET} 23 | 24 | openmp: openmp.cu 25 | $(EXEC) $(NVCC) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES) $(INCLUDES) 26 | 27 | openmp_gpus: openmp_gpus.cu 28 | $(EXEC) $(NVCC) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES) $(INCLUDES) 29 | 30 | openmp_default_stream: openmp_default_stream.cu 31 | $(EXEC) $(NVCC) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES) $(INCLUDES) 32 | 33 | nvprof: openmp 34 | nvprof -f -o $+.nvvp --cpu-thread-tracing on ./$+ 35 | 36 | n_ops: openmp_gpus 37 | nvprof -f -o $+_${STREAMS}.nvvp 
--cpu-thread-tracing on ./$+ ${STREAMS} 38 | 39 | clean: 40 | rm -f ${TARGET} *.o *.nvvp 41 | -------------------------------------------------------------------------------- /Chapter04/04_kernel_execution/09_mps/Makefile: -------------------------------------------------------------------------------- 1 | CUDA_PATH=/usr/local/cuda 2 | HOST_COMPILER ?= g++ 3 | NVCC=${CUDA_PATH}/bin/nvcc -ccbin ${HOST_COMPILER} 4 | MPICC ?= mpicc 5 | TARGET=simpleMPI 6 | 7 | INCLUDES = -I${CUDA_PATH}/samples/common/inc -I/usr/local/include/ 8 | NVCC_FLAGS=-m64 -Xcompiler -fopenmp -rdc=true -lcudadevrt -lmpi # --resource-usage 9 | 10 | IS_CUDA_11:=${shell expr `$(NVCC) --version | grep compilation | grep -Eo -m 1 '[0-9]+.[0-9]' | head -1` \>= 11.0} 11 | 12 | # Gencode argumentes 13 | SMS = 35 37 50 52 60 61 70 75 14 | ifeq "$(IS_CUDA_11)" "1" 15 | SMS = 52 60 61 70 75 80 16 | endif 17 | $(foreach sm, ${SMS}, $(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm))) 18 | 19 | LIBRARIES += -lgomp 20 | ALL_CCFLAGS += ${NVCC_FLAGS} 21 | 22 | all : ${TARGET} 23 | 24 | simpleMPI: simpleMPI.cu 25 | $(EXEC) $(NVCC) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES) $(INCLUDES) 26 | 27 | enable_mps: 28 | export CUDA_VISIBLE_DEVICES=0 29 | sudo nvidia-smi -c 3 -i 0 30 | sudo nvidia-cuda-mps-control -d 31 | 32 | disable_mps: 33 | echo "quit" | sudo nvidia-cuda-mps-control 34 | sudo nvidia-smi -c 0 -i 0 35 | 36 | nvprof: simpleMPI 37 | mpirun -np ${PROCS} nvprof -f -o $+.%q{OMPI_COMM_WORLD_RANK}_${STREAMS}.nvvp ./$+ ${STREAMS} 38 | 39 | clean: 40 | rm -f ${TARGET} *.o *.nvvp 41 | -------------------------------------------------------------------------------- /Chapter04/04_kernel_execution/09_mps/install_mpi.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | MPI_VERSION="3.0.4" 3 | 4 | wget -O /tmp/openmpi-${MPI_VERSION}.tar.gz https://www.open-mpi.org/software/ompi/v3.0/downloads/openmpi-${MPI_VERSION}.tar.gz 5 | tar xzf /tmp/openmpi-${MPI_VERSION}.tar.gz -C /tmp 6 | cd /tmp/openmpi-${MPI_VERSION} 7 | ./configure --enable-orterun-prefix-by-default 8 | make -j $(nproc) all && sudo make install 9 | sudo ldconfig 10 | mpirun --version -------------------------------------------------------------------------------- /Chapter04/04_kernel_execution/10_kernel_execution_overhead/Makefile: -------------------------------------------------------------------------------- 1 | CUDA_PATH=/usr/local/cuda 2 | HOST_COMPILER ?= g++ 3 | NVCC=${CUDA_PATH}/bin/nvcc -ccbin ${HOST_COMPILER} 4 | MPICC ?= mpicc 5 | TARGET=cuda_kernel 6 | 7 | INCLUDES = -I${CUDA_PATH}/samples/common/inc 8 | NVCC_FLAGS=-m64 -rdc=true # --resource-usage 9 | 10 | IS_CUDA_11:=${shell expr `$(NVCC) --version | grep compilation | grep -Eo -m 1 '[0-9]+.[0-9]' | head -1` \>= 11.0} 11 | 12 | # Gencode argumentes 13 | SMS = 35 37 50 52 60 61 70 75 14 | ifeq "$(IS_CUDA_11)" "1" 15 | SMS = 52 60 61 70 75 80 16 | endif 17 | $(foreach sm, ${SMS}, $(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm))) 18 | 19 | LIBRARIES += 20 | ALL_CCFLAGS += ${NVCC_FLAGS} 21 | 22 | all : ${TARGET} 23 | 24 | cuda_kernel: cuda_kernel.cu 25 | $(EXEC) $(NVCC) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES) $(INCLUDES) 26 | 27 | nvprof: cuda_kernel 28 | nvprof -f -o $+.nvvp ./$+ 29 | 30 | clean: 31 | rm -f ${TARGET} *.o *.nvvp 32 | -------------------------------------------------------------------------------- /Chapter04/04_kernel_execution/Makefile: 
-------------------------------------------------------------------------------- 1 | # Project folders that contain CUDA receipts 2 | PROJECTS ?= $(shell find \ 3 | $(shell ls -d */) \ 4 | -name Makefile) 5 | 6 | %.ph_build: 7 | +@$(MAKE) -C $(dir $*) $(MAKECMDGOALS) 8 | 9 | %.ph_clean: 10 | +@$(MAKE) -C $(dir $*) clean $(USE_DEVICE) 11 | 12 | all: $(addsuffix .ph_build,$(PROJECTS)) 13 | @echo "Finished building CUDA Receipts" 14 | 15 | build: $(addsuffix .ph_build,$(PROJECTS)) 16 | 17 | tidy: 18 | @find * | egrep "#" | xargs rm -f 19 | @find * | egrep "\~" | xargs rm -f 20 | @find * | egrep "nvvp" | xargs rm -f 21 | 22 | clean: tidy $(addsuffix .ph_clean,$(PROJECTS)) 23 | 24 | test: 25 | echo $(DIRECTORY) -------------------------------------------------------------------------------- /Chapter05/05_debug_profiling/.gitignore: -------------------------------------------------------------------------------- 1 | /Debug/ 2 | -------------------------------------------------------------------------------- /Chapter05/05_debug_profiling/01_focused_profile/Makefile: -------------------------------------------------------------------------------- 1 | CUDA_PATH=/usr/local/cuda 2 | HOST_COMPILER ?= g++ 3 | NVCC=${CUDA_PATH}/bin/nvcc -ccbin ${HOST_COMPILER} 4 | TARGET=sgemm 5 | 6 | INCLUDES = -I${CUDA_PATH}/samples/common/inc 7 | NVCC_FLAGS=-G 8 | 9 | IS_CUDA_11:=${shell expr `$(NVCC) --version | grep compilation | grep -Eo -m 1 '[0-9]+.[0-9]' | head -1` \>= 11.0} 10 | 11 | # Gencode argumentes 12 | SMS = 35 37 50 52 60 61 70 75 13 | ifeq "$(IS_CUDA_11)" "1" 14 | SMS = 52 60 61 70 75 80 15 | endif 16 | $(foreach sm, ${SMS}, $(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm))) 17 | 18 | ALL_CCFLAGS +=-m64 -g $(NVCC_FLAGS) $(INCLUDES) 19 | 20 | all : ${TARGET} 21 | 22 | sgemm: sgemm.cu 23 | $(EXEC) $(NVCC) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES) 24 | 25 | clean: 26 | rm -f ${TARGET} *.o -------------------------------------------------------------------------------- /Chapter05/05_debug_profiling/02_nvtx/Makefile: -------------------------------------------------------------------------------- 1 | CUDA_PATH=/usr/local/cuda 2 | HOST_COMPILER ?= g++ 3 | NVCC=${CUDA_PATH}/bin/nvcc -ccbin ${HOST_COMPILER} 4 | TARGET=sgemm 5 | 6 | INCLUDES = -I${CUDA_PATH}/samples/common/inc 7 | NVCC_FLAGS= -lnvToolsExt 8 | 9 | IS_CUDA_11:=${shell expr `$(NVCC) --version | grep compilation | grep -Eo -m 1 '[0-9]+.[0-9]' | head -1` \>= 11.0} 10 | 11 | # Gencode argumentes 12 | SMS = 35 37 50 52 60 61 70 75 13 | ifeq "$(IS_CUDA_11)" "1" 14 | SMS = 52 60 61 70 75 80 15 | endif 16 | $(foreach sm, ${SMS}, $(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm))) 17 | 18 | ALL_CCFLAGS +=-m64 -g $(NVCC_FLAGS) $(INCLUDES) 19 | 20 | all : ${TARGET} 21 | 22 | sgemm: sgemm.cu 23 | $(EXEC) $(NVCC) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES) 24 | 25 | nvprof: sgemm 26 | nvprof -f --profile-from-start off -o sgemm.nvvp ./sgemm.nvvp 27 | 28 | clean: 29 | rm -f ${TARGET} *.o *.nvvp -------------------------------------------------------------------------------- /Chapter05/05_debug_profiling/03_cuda_error/Makefile: -------------------------------------------------------------------------------- 1 | CUDA_PATH=/usr/local/cuda 2 | HOST_COMPILER ?= g++ 3 | NVCC=${CUDA_PATH}/bin/nvcc -ccbin ${HOST_COMPILER} 4 | TARGET=sgemm 5 | 6 | INCLUDES = -I${CUDA_PATH}/samples/common/inc 7 | NVCC_FLAGS=-m64 -lineinfo # --resource-usage 8 | 9 | IS_CUDA_11:=${shell expr `$(NVCC) --version | grep compilation | 
grep -Eo -m 1 '[0-9]+.[0-9]' | head -1` \>= 11.0} 10 | 11 | # Gencode arguments 12 | SMS = 35 37 50 52 60 61 70 75 13 | ifeq "$(IS_CUDA_11)" "1" 14 | SMS = 52 60 61 70 75 80 15 | endif 16 | $(foreach sm, ${SMS}, $(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm))) 17 | 18 | # Openmp 19 | LIBRARIES += -lgomp 20 | 21 | ALL_CCFLAGS += -g -Xcompiler -fopenmp -rdc=true $(NVCC_FLAGS) $(INCLUDES) 22 | 23 | all : ${TARGET} 24 | 25 | sgemm: sgemm.cu 26 | $(EXEC) $(NVCC) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES) 27 | 28 | clean: 29 | rm -f ${TARGET} *.o 30 | -------------------------------------------------------------------------------- /Chapter05/05_debug_profiling/04_cuda_assert/Makefile: -------------------------------------------------------------------------------- 1 | CUDA_PATH=/usr/local/cuda 2 | HOST_COMPILER ?= g++ 3 | NVCC=${CUDA_PATH}/bin/nvcc -ccbin ${HOST_COMPILER} 4 | TARGET=sgemm 5 | 6 | INCLUDES = -I${CUDA_PATH}/samples/common/inc 7 | NVCC_FLAGS=-m64 -lineinfo -G # --resource-usage 8 | 9 | IS_CUDA_11:=${shell expr `$(NVCC) --version | grep compilation | grep -Eo -m 1 '[0-9]+.[0-9]' | head -1` \>= 11.0} 10 | 11 | # Gencode arguments 12 | SMS = 35 37 50 52 60 61 70 75 13 | ifeq "$(IS_CUDA_11)" "1" 14 | SMS = 52 60 61 70 75 80 15 | endif 16 | $(foreach sm, ${SMS}, $(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm))) 17 | 18 | # Openmp 19 | LIBRARIES += -lgomp 20 | 21 | ALL_CCFLAGS += -g -Xcompiler -fopenmp -rdc=true $(NVCC_FLAGS) $(INCLUDES) 22 | 23 | all : ${TARGET} 24 | 25 | sgemm: sgemm.cu 26 | $(EXEC) $(NVCC) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES) 27 | 28 | clean: 29 | rm -f ${TARGET} *.o 30 | -------------------------------------------------------------------------------- /Chapter05/05_debug_profiling/06_debug_with_eclipse/.gitignore: -------------------------------------------------------------------------------- 1 | /Release/ 2 | -------------------------------------------------------------------------------- /Chapter05/05_debug_profiling/06_debug_with_eclipse/.project: -------------------------------------------------------------------------------- 1 | <?xml version="1.0" encoding="UTF-8"?> 2 | <projectDescription> 3 | <name>06_debug_with_eclipse</name> 4 | <comment></comment> 5 | <projects> 6 | </projects> 7 | <buildSpec> 8 | <buildCommand> 9 | <name>org.eclipse.cdt.managedbuilder.core.genmakebuilder</name> 10 | <triggers>clean,full,incremental,</triggers> 11 | <arguments> 12 | </arguments> 13 | </buildCommand> 14 | <buildCommand> 15 | <name>org.eclipse.cdt.managedbuilder.core.ScannerConfigBuilder</name> 16 | <triggers>full,incremental,</triggers> 17 | <arguments> 18 | </arguments> 19 | </buildCommand> 20 | </buildSpec> 21 | <natures> 22 | <nature>org.eclipse.cdt.core.cnature</nature> 23 | <nature>org.eclipse.cdt.core.ccnature</nature> 24 | <nature>org.eclipse.cdt.managedbuilder.core.managedBuildNature</nature> 25 | <nature>org.eclipse.cdt.managedbuilder.core.ScannerConfigNature</nature> 26 | </natures> 27 | </projectDescription> 28 | -------------------------------------------------------------------------------- /Chapter05/05_debug_profiling/06_debug_with_eclipse/.settings/org.eclipse.ltk.core.refactoring.prefs: -------------------------------------------------------------------------------- 1 | eclipse.preferences.version=1 2 | org.eclipse.ltk.core.refactoring.enable.project.refactoring.history=false 3 | -------------------------------------------------------------------------------- /Chapter05/05_debug_profiling/06_debug_with_eclipse/Debug/objects.mk: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # Automatically-generated file. Do not edit!
3 | ################################################################################ 4 | 5 | USER_OBJS := 6 | 7 | LIBS := 8 | 9 | -------------------------------------------------------------------------------- /Chapter05/05_debug_profiling/06_debug_with_eclipse/Debug/sources.mk: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # Automatically-generated file. Do not edit! 3 | ################################################################################ 4 | 5 | O_SRCS := 6 | CPP_SRCS := 7 | C_UPPER_SRCS := 8 | C_SRCS := 9 | S_UPPER_SRCS := 10 | OBJ_SRCS := 11 | CU_SRCS := 12 | ASM_SRCS := 13 | CXX_SRCS := 14 | C++_SRCS := 15 | CC_SRCS := 16 | CU_DEPS := 17 | OBJS := 18 | C++_DEPS := 19 | C_DEPS := 20 | CC_DEPS := 21 | CPP_DEPS := 22 | EXECUTABLES := 23 | CXX_DEPS := 24 | C_UPPER_DEPS := 25 | 26 | # Every subdirectory with source files must be described here 27 | SUBDIRS := \ 28 | src \ 29 | 30 | -------------------------------------------------------------------------------- /Chapter05/05_debug_profiling/06_debug_with_eclipse/Debug/src/subdir.mk: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # Automatically-generated file. Do not edit! 3 | ################################################################################ 4 | 5 | # Add inputs and outputs from these tool invocations to the build variables 6 | CU_SRCS += \ 7 | ../src/simple_sgemm.cu 8 | 9 | CU_DEPS += \ 10 | ./src/simple_sgemm.d 11 | 12 | OBJS += \ 13 | ./src/simple_sgemm.o 14 | 15 | 16 | # Each subdirectory must supply rules for building sources it contributes 17 | src/%.o: ../src/%.cu 18 | @echo 'Building file: $<' 19 | @echo 'Invoking: NVCC Compiler' 20 | /usr/local/cuda-10.0/bin/nvcc -G -g -O0 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_61,code=sm_61 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -odir "src" -M -o "$(@:%.o=%.d)" "$<" 21 | /usr/local/cuda-10.0/bin/nvcc -G -g -O0 --compile --relocatable-device-code=false -gencode arch=compute_60,code=compute_60 -gencode arch=compute_61,code=compute_61 -gencode arch=compute_70,code=compute_70 -gencode arch=compute_75,code=compute_75 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_61,code=sm_61 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -x cu -o "$@" "$<" 22 | @echo 'Finished building: $<' 23 | @echo ' ' 24 | 25 | 26 | -------------------------------------------------------------------------------- /Chapter05/05_debug_profiling/06_debug_with_eclipse/java-7-install.md: -------------------------------------------------------------------------------- 1 | Nsight Eclipse Edition requires Java 1.7 for its Java runtime engine. 2 | 3 | However, the latest Linux platforms no longer provide Java 7, so it has to be installed manually. A JRE is installed along with the CUDA Toolkit; however, it is the latest version and does not resolve this dependency. 4 | 5 | 6 | 7 | First, download the JRE from Oracle's [site](https://www.oracle.com/technetwork/java/javase/downloads/java-archive-downloads-javase7-521261.html). 8 | 9 | Untar the file and move the files into the proper path.
10 | ```bash 11 | $ tar xzf jdk-7u80-linux-x64.tar.gz 12 | $ sudo mkdir /usr/lib/jvm 13 | $ sudo mv jdk1.7.0_80 /usr/lib/jvm 14 | ``` 15 | 16 | In general, the system will use the latest Java version. To use the older Java version, select it with the following command. 17 | 18 | ```bash 19 | $ sudo update-alternatives --config java 20 | 21 | ``` 22 | 23 | For example, update-alternatives lists the installed Java versions. 24 | 25 | ``` 26 | There are 2 choices for the alternative java (providing /usr/bin/java). 27 | 28 | Selection Path Priority Status 29 | ------------------------------------------------------------ 30 | * 0 /usr/lib/jvm/java-11-openjdk-amd64/bin/java 1111 auto mode 31 | 1 /usr/lib/jvm/java-11-openjdk-amd64/bin/java 1111 manual mode 32 | 2 /usr/lib/jvm/jre1.7.0_80/bin/java 1 manual mode 33 | 34 | Press <enter> to keep the current choice[*], or type selection number: 35 | ``` 36 | 37 | 38 | Enter 2 in this case to use Java 1.7.0. 39 | 40 | Do the same for the rest of the JRE runtime files. 41 | ```bash 42 | $ sudo update-alternatives --config javaws 43 | ``` 44 | -------------------------------------------------------------------------------- /Chapter05/05_debug_profiling/07_debug_with_gdb/Makefile: -------------------------------------------------------------------------------- 1 | CUDA_PATH=/usr/local/cuda 2 | HOST_COMPILER ?= g++ 3 | NVCC=${CUDA_PATH}/bin/nvcc -ccbin ${HOST_COMPILER} 4 | TARGET=simple_sgemm 5 | 6 | INCLUDES = -I${CUDA_PATH}/samples/common/inc 7 | NVCC_FLAGS=-m64 -G -Xcompiler -rdynamic 8 | 9 | IS_CUDA_11:=${shell expr `$(NVCC) --version | grep compilation | grep -Eo -m 1 '[0-9]+.[0-9]' | head -1` \>= 11.0} 10 | 11 | # Gencode arguments 12 | SMS = 35 37 50 52 60 61 70 75 13 | ifeq "$(IS_CUDA_11)" "1" 14 | SMS = 52 60 61 70 75 80 15 | endif 16 | $(foreach sm, ${SMS}, $(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm))) 17 | 18 | # Openmp 19 | LIBRARIES += -lgomp 20 | 21 | ALL_CCFLAGS += -g $(NVCC_FLAGS) $(INCLUDES) 22 | 23 | all : ${TARGET} 24 | 25 | simple_sgemm: simple_sgemm.cu 26 | $(EXEC) $(NVCC) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES) 27 | 28 | clean: 29 | rm -f ${TARGET} *.o 30 | -------------------------------------------------------------------------------- /Chapter05/05_debug_profiling/08_memcheck/Makefile: -------------------------------------------------------------------------------- 1 | CUDA_PATH=/usr/local/cuda 2 | HOST_COMPILER ?= g++ 3 | NVCC=${CUDA_PATH}/bin/nvcc -ccbin ${HOST_COMPILER} 4 | TARGET=simple_sgemm simple_sgemm_oob simple_sgemm_mem_leak 5 | 6 | INCLUDES = -I${CUDA_PATH}/samples/common/inc 7 | NVCC_FLAGS=-m64 8 | 9 | IS_CUDA_11:=${shell expr `$(NVCC) --version | grep compilation | grep -Eo -m 1 '[0-9]+.[0-9]' | head -1` \>= 11.0} 10 | 11 | # Gencode arguments 12 | SMS = 35 37 50 52 60 61 70 75 13 | ifeq "$(IS_CUDA_11)" "1" 14 | SMS = 52 60 61 70 75 80 15 | endif 16 | $(foreach sm, ${SMS}, $(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm))) 17 | 18 | # Openmp 19 | LIBRARIES += -lgomp 20 | 21 | ALL_CCFLAGS += -g -G -Xcompiler -rdynamic $(NVCC_FLAGS) $(INCLUDES) 22 | 23 | all : ${TARGET} 24 | 25 | simple_sgemm: simple_sgemm.cu 26 | $(EXEC) $(NVCC) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES) 27 | 28 | simple_sgemm_oob: simple_sgemm_oob.cu 29 | $(EXEC) $(NVCC) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES) 30 | 31 | simple_sgemm_mem_leak: simple_sgemm_mem_leak.cu 32 | $(EXEC) $(NVCC) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES) 33 | 34 | clean: 35 | rm -f
${TARGET} *.o 36 | -------------------------------------------------------------------------------- /Chapter05/05_debug_profiling/Makefile: -------------------------------------------------------------------------------- 1 | # Project folders that contain CUDA receipts 2 | PROJECTS ?= $(shell find \ 3 | $(shell ls -d */) \ 4 | -name Makefile) 5 | 6 | %.ph_build: 7 | +@$(MAKE) -C $(dir $*) $(MAKECMDGOALS) 8 | 9 | %.ph_clean: 10 | +@$(MAKE) -C $(dir $*) clean $(USE_DEVICE) 11 | 12 | all: $(addsuffix .ph_build,$(PROJECTS)) 13 | @echo "Finished building CUDA Receipts" 14 | 15 | build: $(addsuffix .ph_build,$(PROJECTS)) 16 | 17 | tidy: 18 | @find * | egrep "#" | xargs rm -f 19 | @find * | egrep "\~" | xargs rm -f 20 | @find * | egrep "nvvp" | xargs rm -f 21 | 22 | clean: tidy $(addsuffix .ph_clean,$(PROJECTS)) 23 | 24 | test: 25 | echo $(DIRECTORY) -------------------------------------------------------------------------------- /Chapter06/06_multigpu/01_gaussian_single_gpu/config.h: -------------------------------------------------------------------------------- 1 | #ifndef CONFIG_H 2 | #define CONFIG_H 3 | 4 | #define INPUT_TYPE RANDOM 5 | 6 | // Linear system parameters 7 | #define ROWS 300 // Number of rows in the system. 8 | #define COLS 256 // Number of columns in the system 9 | #define PERCENTAGE 50 // Density of coefficient matrix 10 | 11 | #define REFERENCE_SOLUTION "original-matrix" 12 | #define COMPUTED_SOLUTION "computed-solution" 13 | 14 | #define PACK_SIZE (8*sizeof(unsigned int)) 15 | // Ceil of numerator divided by denominator. 16 | #define intCeilDiv(numerator, denominator) (((numerator) + (denominator) - 1)/ (denominator)) 17 | #endif 18 | -------------------------------------------------------------------------------- /Chapter06/06_multigpu/02_gaussian_multi_gpu/Makefile: -------------------------------------------------------------------------------- 1 | CC = g++ 2 | NVCC = nvcc 3 | 4 | ROWS ?= 11000 5 | COLS ?= 10000 6 | 7 | PARAMS = -DROWS=$(ROWS) -DCOLS=$(COLS) 8 | 9 | CFLAGS = -O3 $(PARAMS) -I$(CUDA_PATH)/include/ 10 | #NV_CFLAGS = -gencode arch=compute_20,code=sm_20 -gencode arch=compute_35,code=sm_35 -gencode arch=compute_60,code=sm_60 -O3 --ptxas-options=-v -lineinfo $(PARAMS) -I$(CUDA_PATH)/include/ -Wno-deprecated-gpu-targets 11 | NV_CFLAGS = -gencode arch=compute_60,code=sm_60 -gencode arch=compute_70,code=sm_70 -O3 -lineinfo $(PARAMS) -I$(CUDA_PATH)/include/ -Wno-deprecated-gpu-targets 12 | 13 | BINARY = gaussian_multi_gpu_p2p.out 14 | all: $(BINARY) 15 | 16 | OBJECTS = gaussian_multi_gpu_p2p.o 17 | 18 | $(BINARY): $(OBJECTS) 19 | $(NVCC) $(NV_CFLAGS) -dlink $(OBJECTS) -o gpuObjectCode.o 20 | $(NVCC) $(NV_CFLAGS) gpuObjectCode.o $(OBJECTS) -o $(BINARY) 21 | 22 | %.o : %.c #default rule for making .o files from .c 23 | $(info --- Building '$@' from '$<' using default rule 1) 24 | $(CC) $(CFLAGS) -c -o $@ $< 25 | 26 | %.o : %.cu #default rule for making .o files from .cu 27 | $(info --- Building '$@' from '$<' using default rule 2) 28 | $(NVCC) $(NV_CFLAGS) -dc -o $@ $< 29 | 30 | clean: 31 | rm -rf *o $(BINARY) 32 | 33 | run: $(BINARY) 34 | ./$(BINARY) 35 | -------------------------------------------------------------------------------- /Chapter06/06_multigpu/02_gaussian_multi_gpu/config.h: -------------------------------------------------------------------------------- 1 | #ifndef CONFIG_H 2 | #define CONFIG_H 3 | 4 | #define INPUT_TYPE RANDOM 5 | 6 | // Linear system parameters 7 | #define ROWS 300 // Number of rows in the system. 
8 | #define COLS 256 // Number of columns in the system 9 | #define PERCENTAGE 50 // Density of coefficient matrix 10 | 11 | #define REFERENCE_SOLUTION "original-matrix" 12 | #define COMPUTED_SOLUTION "computed-solution" 13 | 14 | #define PACK_SIZE (8*sizeof(unsigned int)) 15 | // Ceil of numerator divided by denominator. 16 | #define intCeilDiv(numerator, denominator) (((numerator) + (denominator) - 1)/ (denominator)) 17 | 18 | // How many GPUs should be used by solver. Effective only with MULTI_GPU 19 | #define NUMBER_OF_GPU 2 20 | 21 | #endif 22 | -------------------------------------------------------------------------------- /Chapter06/06_multigpu/03_helloWorldMPI/helloWorldMPI.c: -------------------------------------------------------------------------------- 1 | #include <mpi.h> 2 | #include <stdio.h> 3 | int main(int argc, char *argv[]) { 4 | int rank,size; 5 | /* Initialize the MPI library */ 6 | MPI_Init(&argc,&argv); 7 | /* Determine the calling process rank and total number of ranks */ 8 | MPI_Comm_rank(MPI_COMM_WORLD,&rank); 9 | MPI_Comm_size(MPI_COMM_WORLD,&size); 10 | /* Compute based on process rank */ 11 | /* Call MPI routines like MPI_Send, MPI_Recv, ... */ 12 | printf("\n Rank %d, Size %d", rank,size); 13 | /* Shutdown MPI library */ 14 | MPI_Finalize(); 15 | return 0; 16 | } 17 | 18 | -------------------------------------------------------------------------------- /Chapter06/06_multigpu/04_gaussian_multi_node/Makefile: -------------------------------------------------------------------------------- 1 | CC = g++ 2 | NVCC = nvcc 3 | MPICC = mpiCC 4 | 5 | MPIRUN = mpirun 6 | CUDA_PATH = /usr/local/cuda 7 | 8 | ROWS ?= 11000 9 | COLS ?= 10000 10 | 11 | PARAMS = -DROWS=$(ROWS) -DCOLS=$(COLS) 12 | 13 | HOSTFILE ?= myHosts 14 | 15 | CFLAGS = -O3 $(PARAMS) -I$(CUDA_PATH)/include/ -I$(MPI_PATH)/include 16 | INCLUDES = -I$(CUDA_PATH)/include/ -I$(CUDA_PATH)/samples/inc -I$(CUDA_PATH)/targets/x86_64-linux/include 17 | 18 | IS_CUDA_11:=${shell expr `$(NVCC) --version | grep compilation | grep -Eo -m 1 '[0-9]+.[0-9]' | head -1` \>= 11.0} 19 | 20 | # Gencode arguments 21 | SMS = 35 37 50 52 60 61 70 75 22 | ifeq "$(IS_CUDA_11)" "1" 23 | SMS = 52 60 61 70 75 80 24 | endif 25 | $(foreach sm, ${SMS}, $(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm))) 26 | 27 | NV_CFLAGS = $(GENCODE_FLAGS) -O3 -lineinfo $(PARAMS) $(INCLUDES) -Wno-deprecated-gpu-targets 28 | 29 | BINARY = gaussian_multi_gpu_rdma.out 30 | all: $(BINARY) 31 | 32 | OBJECTS := gaussian_multi_gpu_rdma.o gpuSolver.o gpuSolverFunctions.o linearSystemOps.o utilities.o elementUtilities.o 33 | 34 | $(BINARY): $(OBJECTS) 35 | $(NVCC) $(NV_CFLAGS) -dlink $(OBJECTS) -o gpuObjectCode.o 36 | $(MPICC) gpuObjectCode.o $(OBJECTS) -o $(BINARY) -lcudart -L $(CUDA_PATH)/lib64/ 37 | 38 | %.o : %.c #default rule for making .o files from .c 39 | $(info --- Building '$@' from '$<' using default rule 1) 40 | $(MPICC) $(CFLAGS) -c -o $@ $< 41 | 42 | %.o : %.cu #default rule for making .o files from .cu 43 | $(info --- Building '$@' from '$<' using default rule 2) 44 | $(NVCC) $(NV_CFLAGS) -dc -o $@ $< 45 | 46 | clean: 47 | rm -rf *o $(BINARY) 48 | 49 | run: $(BINARY) 50 | $(MPIRUN) --hostfile $(HOSTFILE) ./$(BINARY) $(ROWS) $(COLS) 51 | -------------------------------------------------------------------------------- /Chapter06/06_multigpu/04_gaussian_multi_node/config.h: -------------------------------------------------------------------------------- 1 | #ifndef CONFIG_H 2 | #define CONFIG_H 3 | 4 | #define INPUT_TYPE RANDOM 5 | #define PIVOTPACK 6
| #define ROWS 33000 // Number of rows in the system 7 | #define COLS 30000 // Number of columns in the system 8 | #define PERCENTAGE 50 // Density of coefficient matrix. Useful only with INPUT_TYPE set to RANDOM 9 | 10 | 11 | 12 | #define REFERENCE_SOLUTION "original-matrix" 13 | #define COMPUTED_SOLUTION "computed-solution" 14 | 15 | // Choose one of the two 16 | // 32 consecutive matrix elements are packed together in an unsigned int 17 | // #define ELEMENT_TYPE_UINT 18 | // 128 consecutive matrix elements are packed together in an uint4 19 | #define ELEMENT_TYPE_UINT4 20 | #endif 21 | -------------------------------------------------------------------------------- /Chapter06/06_multigpu/04_gaussian_multi_node/mpiUtils.h: -------------------------------------------------------------------------------- 1 | #ifndef MPIUTILS_H 2 | #define MPIUTILS_H 3 | 4 | #include <mpi.h> 5 | 6 | #define MPI_CHECK(call) \ 7 | if((call) != MPI_SUCCESS) { \ 8 | printf("MPI error calling \"%s\"\n", #call); \ 9 | MPI_Abort(MPI_COMM_WORLD, -1); } 10 | 11 | #endif 12 | 13 | 14 | -------------------------------------------------------------------------------- /Chapter06/06_multigpu/05_streams/Makefile: -------------------------------------------------------------------------------- 1 | CUDA_PATH=/usr/local/cuda 2 | HOST_COMPILER ?= g++ 3 | NVCC=${CUDA_PATH}/bin/nvcc -ccbin ${HOST_COMPILER} 4 | TARGET=vector_addition merging_multi_gpu 5 | 6 | INCLUDES = -I${CUDA_PATH}/samples/common/inc 7 | NVCC_FLAGS=-m64 -lineinfo --default-stream per-thread -Xcompiler -fopenmp # --resource-usage 8 | 9 | LIBRARIES= -lgomp 10 | 11 | IS_CUDA_11:=${shell expr `$(NVCC) --version | grep compilation | grep -Eo -m 1 '[0-9]+.[0-9]' | head -1` \>= 11.0} 12 | 13 | # Gencode arguments 14 | SMS = 35 37 50 52 60 61 70 75 15 | ifeq "$(IS_CUDA_11)" "1" 16 | SMS = 52 60 61 70 75 80 17 | endif 18 | $(foreach sm, ${SMS}, $(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm))) 19 | 20 | all: ${TARGET} 21 | 22 | vector_addition: vector_addition.cu 23 | $(EXEC) $(NVCC) $(INCLUDES) $(NVCC_FLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES) 24 | 25 | merging_multi_gpu: image_merging.cu scrImagePgmPpmPackage.cu 26 | $(EXEC) $(NVCC) $(INCLUDES) $(NVCC_FLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES) 27 | 28 | clean: 29 | rm -f ${TARGET} *.o 30 | -------------------------------------------------------------------------------- /Chapter06/06_multigpu/05_streams/cat.pgm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Learn-CUDA-Programming/10bd9b59bfcc680d608ea1417da5089f73d91b11/Chapter06/06_multigpu/05_streams/cat.pgm -------------------------------------------------------------------------------- /Chapter06/06_multigpu/05_streams/dog.pgm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Learn-CUDA-Programming/10bd9b59bfcc680d608ea1417da5089f73d91b11/Chapter06/06_multigpu/05_streams/dog.pgm -------------------------------------------------------------------------------- /Chapter06/06_multigpu/05_streams/scrImagePgmPpmPackage.h: -------------------------------------------------------------------------------- 1 | #ifndef _Xiang_Gao_PGM_PPM_Header_ 2 | #define _Xiang_Gao_PGM_PPM_Header_ 3 | 4 | #include <stdio.h> 5 | 6 | 7 | int scr_read_pgm( char* name, unsigned char* image, int irows, int icols ); 8 | void scr_write_pgm( char* name, unsigned char* image, int rows, int cols, char* comment ); 9 | int
scr_read_ppm( char* name, unsigned char* image, int irows, int icols ); 10 | void scr_write_ppm( char* name, unsigned char* image, int rows, int cols, char* comment ); 11 | void get_PgmPpmParams(char * , int *, int *); 12 | void getout_comment(FILE * ); 13 | #endif 14 | -------------------------------------------------------------------------------- /Chapter06/06_multigpu/06_nccl/Makefile: -------------------------------------------------------------------------------- 1 | CUDA_PATH=/usr/local/cuda 2 | HOST_COMPILER ?= g++ 3 | NVCC=${CUDA_PATH}/bin/nvcc -ccbin ${HOST_COMPILER} 4 | TARGET=nccl 5 | 6 | INCLUDES = -I${CUDA_PATH}/samples/common/inc 7 | NVCC_FLAGS=-m64 -lineinfo -lnccl # --resource-usage 8 | 9 | IS_CUDA_11:=${shell expr `$(NVCC) --version | grep compilation | grep -Eo -m 1 '[0-9]+.[0-9]' | head -1` \>= 11.0} 10 | 11 | # Gencode argumentes 12 | SMS = 35 37 50 52 60 61 70 75 13 | ifeq "$(IS_CUDA_11)" "1" 14 | SMS = 52 60 61 70 75 80 15 | endif 16 | $(foreach sm, ${SMS}, $(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm))) 17 | 18 | all: ${TARGET} 19 | 20 | nccl: nccl.cu 21 | $(EXEC) $(NVCC) $(INCLUDES) $(NVCC_FLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES) 22 | 23 | clean: 24 | rm -f ${TARGET} *.o 25 | -------------------------------------------------------------------------------- /Chapter06/06_multigpu/Makefile: -------------------------------------------------------------------------------- 1 | # Project folders that contain CUDA receipts 2 | PROJECTS ?= $(shell find \ 3 | $(shell ls -d */) \ 4 | -name Makefile) 5 | 6 | %.ph_build: 7 | +@$(MAKE) -C $(dir $*) $(MAKECMDGOALS) 8 | 9 | %.ph_clean: 10 | +@$(MAKE) -C $(dir $*) clean $(USE_DEVICE) 11 | 12 | all: $(addsuffix .ph_build,$(PROJECTS)) 13 | @echo "Finished building CUDA Receipts" 14 | 15 | build: $(addsuffix .ph_build,$(PROJECTS)) 16 | 17 | tidy: 18 | @find * | egrep "#" | xargs rm -f 19 | @find * | egrep "\~" | xargs rm -f 20 | @find * | egrep "nvvp" | xargs rm -f 21 | 22 | clean: tidy $(addsuffix .ph_clean,$(PROJECTS)) 23 | 24 | test: 25 | echo $(DIRECTORY) -------------------------------------------------------------------------------- /Chapter07/07_parallel_programming_pattern/01_sgemm_optimization/Makefile: -------------------------------------------------------------------------------- 1 | CUDA_PATH=/usr/local/cuda 2 | HOST_COMPILER ?= g++ 3 | NVCC=${CUDA_PATH}/bin/nvcc -ccbin ${HOST_COMPILER} 4 | TARGET=sgemm 5 | 6 | INCLUDES = -I${CUDA_PATH}/samples/common/inc 7 | NVCC_FLAGS=-m64 -lineinfo #--resource-usage 8 | 9 | IS_CUDA_11:=${shell expr `$(NVCC) --version | grep compilation | grep -Eo -m 1 '[0-9]+.[0-9]' | head -1` \>= 11.0} 10 | 11 | # Gencode argumentes 12 | SMS = 35 37 50 52 60 61 70 75 13 | ifeq "$(IS_CUDA_11)" "1" 14 | SMS = 52 60 61 70 75 80 15 | endif 16 | $(foreach sm, ${SMS}, $(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm))) 17 | 18 | LIBRARIES += -lgomp 19 | ALL_CCFLAGS += -std=c++11 -Xcompiler -fopenmp -rdc=true $(NVCC_FLAGS) $(INCLUDES) 20 | 21 | all : ${TARGET} 22 | 23 | sgemm: sgemm.cu 24 | $(EXEC) $(NVCC) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES) 25 | 26 | nvprof: sgemm 27 | nvprof -f -o $+.nvvp --profile-from-start off ./$+ 28 | nvprof -f -o $+-metrics.nvvp --analysis-metrics ./$+ 29 | 30 | clean: 31 | rm -f ${TARGET} *.o 32 | -------------------------------------------------------------------------------- /Chapter07/07_parallel_programming_pattern/02_convolution/Makefile: -------------------------------------------------------------------------------- 1 
| CUDA_PATH=/usr/local/cuda 2 | HOST_COMPILER ?= g++ 3 | NVCC=${CUDA_PATH}/bin/nvcc -ccbin ${HOST_COMPILER} 4 | TARGET=convolution 5 | 6 | INCLUDES = -I${CUDA_PATH}/samples/common/inc 7 | NVCC_FLAGS= -lineinfo --maxrregcount=48 --resource-usage -Xcompiler -rdynamic -Xcompiler -fopenmp -rdc=true 8 | 9 | IS_CUDA_11:=${shell expr `$(NVCC) --version | grep compilation | grep -Eo -m 1 '[0-9]+.[0-9]' | head -1` \>= 11.0} 10 | 11 | # Gencode argumentes 12 | SMS = 35 37 50 52 60 61 70 75 13 | ifeq "$(IS_CUDA_11)" "1" 14 | SMS = 52 60 61 70 75 80 15 | endif 16 | $(foreach sm, ${SMS}, $(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm))) 17 | 18 | LIBRARIES += -L/usr/local/cuda/lib -lgomp 19 | ALL_CCFLAGS += -m64 -g -std=c++11 $(NVCC_FLAGS) $(INCLUDES) $(LIBRARIES) 20 | 21 | all : ${TARGET} 22 | 23 | convolution: convolution.cu 24 | $(EXEC) $(NVCC) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ $+ 25 | 26 | nvprof: convolution 27 | nvprof -f -o $+.nvvp --profile-from-start off ./$+ 28 | nvprof -f -o $+-metrics.nvvp --analysis-metrics ./$+ 29 | 30 | clean: 31 | rm -f ${TARGET} *.o *.nvvp 32 | -------------------------------------------------------------------------------- /Chapter07/07_parallel_programming_pattern/03_scan/Makefile: -------------------------------------------------------------------------------- 1 | CUDA_PATH=/usr/local/cuda 2 | HOST_COMPILER ?= g++ 3 | NVCC=${CUDA_PATH}/bin/nvcc -ccbin ${HOST_COMPILER} 4 | TARGET=scan 5 | 6 | INCLUDES = -I${CUDA_PATH}/samples/common/inc 7 | NVCC_FLAGS=-m64 --resource-usage -lineinfo 8 | 9 | IS_CUDA_11:=${shell expr `$(NVCC) --version | grep compilation | grep -Eo -m 1 '[0-9]+.[0-9]' | head -1` \>= 11.0} 10 | 11 | # Gencode argumentes 12 | SMS = 35 37 50 52 60 61 70 75 13 | ifeq "$(IS_CUDA_11)" "1" 14 | SMS = 52 60 61 70 75 80 15 | endif 16 | $(foreach sm, ${SMS}, $(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm))) 17 | 18 | LIBRARIES += -L/usr/local/cuda/lib 19 | ALL_CCFLAGS += -std=c++11 $(NVCC_FLAGS) $(INCLUDES) $(LIBRARIES) 20 | 21 | all : ${TARGET} 22 | 23 | scan_v1.o: scan_v1.cu 24 | $(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $< 25 | 26 | scan_v2.o: scan_v2.cu 27 | $(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $< 28 | 29 | scan: scan.cu scan_v1.o scan_v2.o 30 | $(EXEC) $(NVCC) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ $+ 31 | 32 | nvprof: scan 33 | nvprof -f -o $+.nvvp --profile-from-start off ./$+ 34 | nvprof -f -o $+-metrics.nvvp --analysis-metrics ./$+ 35 | 36 | clean: 37 | rm -f ${TARGET} *.o *.nvvp 38 | -------------------------------------------------------------------------------- /Chapter07/07_parallel_programming_pattern/03_scan/scan.h: -------------------------------------------------------------------------------- 1 | #ifndef _SCAN_H_ 2 | #define _SCAN_H_ 3 | 4 | #define BLOCK_DIM 512 5 | 6 | #define DEBUG_INDEX 0 7 | #define DEBUG_OUTPUT_NUM 16 8 | 9 | void scan_v1(float *d_output, float *d_input, int length); 10 | void scan_v2(float *d_output, float *d_input, int length); 11 | 12 | #endif // _SCAN_H_ -------------------------------------------------------------------------------- /Chapter07/07_parallel_programming_pattern/03_scan/scan_v1.cu: -------------------------------------------------------------------------------- 1 | #include "scan.h" 2 | 3 | __global__ void 4 | scan_v1_kernel(float *d_output, float *d_input, int length) 5 | { 6 | int idx = blockDim.x * blockIdx.x + threadIdx.x; 7 | 8 | float element = 0.f; 9 | for (int offset = 0; offset < length; offset++) { 
10 | if (idx - offset >= 0) 11 | element += d_input[idx - offset]; 12 | } 13 | d_output[idx] = element; 14 | } 15 | 16 | void scan_v1(float *d_output, float *d_input, int length) 17 | { 18 | dim3 dimBlock(BLOCK_DIM); 19 | dim3 dimGrid((length + BLOCK_DIM - 1) / BLOCK_DIM); 20 | scan_v1_kernel<<< dimGrid, dimBlock >>>(d_output, d_input, length); 21 | } -------------------------------------------------------------------------------- /Chapter07/07_parallel_programming_pattern/03_scan/utils.h: -------------------------------------------------------------------------------- 1 | #ifndef _UTILS_H_ 2 | #define _UTILS_H_ 3 | 4 | #include <stdio.h> 5 | #include <stdlib.h> 6 | #include <stdarg.h> 7 | #include <math.h> 8 | // generate input data 9 | void generate_data(float *ptr, int length) 10 | { 11 | // fill the buffer with randomly generated floats in [-0.5, 0.5) 12 | for (int i = 0; i < length; i++) 13 | ptr[i] = (rand() - RAND_MAX/2) / (float)RAND_MAX; 14 | } 15 | 16 | bool validation(float *a, float *b, int length) 17 | { 18 | float epsilon = 0.000001f; 19 | bool result = true; 20 | for (int i = 0; i < length; i++) { 21 | if (fabs(a[i] - b[i]) >= epsilon) { 22 | result = false; 23 | printf("result mismatch on %d th item. (%f) \n", i, fabs(a[i] - b[i])); 24 | } 25 | } 26 | return result; 27 | } 28 | 29 | void print_val(float *h_list, int length, ...) 30 | { 31 | va_list argptr; 32 | va_start(argptr, length); 33 | 34 | printf("%s\t", va_arg(argptr, char *)); 35 | for (int i = 0; i < length; i++) 36 | printf("%7.4f\t", h_list[i]); 37 | printf("\n"); 38 | va_end(argptr); 39 | } 40 | #endif // _UTILS_H_ -------------------------------------------------------------------------------- /Chapter07/07_parallel_programming_pattern/04_pack_n_split/Makefile: -------------------------------------------------------------------------------- 1 | CUDA_PATH=/usr/local/cuda 2 | HOST_COMPILER ?= g++ 3 | NVCC=${CUDA_PATH}/bin/nvcc -ccbin ${HOST_COMPILER} 4 | TARGET=pack_n_split 5 | 6 | INCLUDES = -I${CUDA_PATH}/samples/common/inc 7 | NVCC_FLAGS=-m64 -G # --resource-usage 8 | 9 | IS_CUDA_11:=${shell expr `$(NVCC) --version | grep compilation | grep -Eo -m 1 '[0-9]+.[0-9]' | head -1` \>= 11.0} 10 | 11 | # Gencode arguments 12 | SMS = 35 37 50 52 60 61 70 75 13 | ifeq "$(IS_CUDA_11)" "1" 14 | SMS = 52 60 61 70 75 80 15 | endif 16 | $(foreach sm, ${SMS}, $(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm))) 17 | 18 | LIBRARIES += -L/usr/local/cuda/lib 19 | ALL_CCFLAGS += -std=c++11 $(INCLUDES) $(LIBRARIES) 20 | 21 | all : ${TARGET} 22 | 23 | pack_n_split: pack_n_split.cu 24 | $(EXEC) $(NVCC) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ $+ 25 | 26 | nvprof: pack_n_split 27 | nvprof -f -o $+.nvvp --profile-from-start off ./$+ 28 | nvprof -f -o $+-metrics.nvvp --analysis-metrics ./$+ 29 | 30 | clean: 31 | rm -f ${TARGET} *.o *.nvvp 32 | -------------------------------------------------------------------------------- /Chapter07/07_parallel_programming_pattern/05_n-body/Makefile: -------------------------------------------------------------------------------- 1 | CUDA_PATH=/usr/local/cuda 2 | HOST_COMPILER ?= g++ 3 | NVCC=${CUDA_PATH}/bin/nvcc -ccbin ${HOST_COMPILER} 4 | TARGET=n-body 5 | 6 | INCLUDES = -I${CUDA_PATH}/samples/common/inc 7 | NVCC_FLAGS=-m64 # --resource-usage 8 | 9 | IS_CUDA_11:=${shell expr `$(NVCC) --version | grep compilation | grep -Eo -m 1 '[0-9]+.[0-9]' | head -1` \>= 11.0} 10 | 11 | # Gencode arguments 12 | SMS = 35 37 50 52 60 61 70 75 13 | ifeq "$(IS_CUDA_11)" "1" 14 | SMS = 52 60 61 70 75 80 15 | endif 16 | $(foreach sm, ${SMS}, $(eval GENCODE_FLAGS += -gencode 
arch=compute_$(sm),code=sm_$(sm))) 17 | 18 | LIBRARIES += -L/usr/local/cuda/lib 19 | ALL_CCFLAGS += -std=c++11 $(INCLUDES) $(LIBRARIES) 20 | 21 | all : ${TARGET} 22 | 23 | n-body: n_body.cu 24 | $(EXEC) $(NVCC) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ $+ 25 | 26 | nvprof: n-body 27 | nvprof -f -o $+.nvvp --profile-from-start off ./$+ 28 | nvprof -f -o $+-metrics.nvvp --analysis-metrics ./$+ 29 | 30 | clean: 31 | rm -f ${TARGET} *.o *.nvvp 32 | -------------------------------------------------------------------------------- /Chapter07/07_parallel_programming_pattern/05_n-body/n_body.h: -------------------------------------------------------------------------------- 1 | 2 | #define BLOCK_SIZE 128 3 | #define SOFTENING 1e-9f 4 | 5 | 6 | typedef struct { 7 | float4 *pos, *vel; 8 | } NBodySystem; 9 | 10 | void generateRandomizeBodies(float *data, int n); 11 | __global__ void calculateBodyForce(float4 *p, float4 *v, float dt, int n); 12 | -------------------------------------------------------------------------------- /Chapter07/07_parallel_programming_pattern/06_quicksort/Makefile: -------------------------------------------------------------------------------- 1 | CUDA_PATH=/usr/local/cuda 2 | HOST_COMPILER ?= g++ 3 | NVCC=${CUDA_PATH}/bin/nvcc -ccbin ${HOST_COMPILER} 4 | TARGET=quick_sort 5 | 6 | INCLUDES = -I${CUDA_PATH}/samples/common/inc 7 | NVCC_FLAGS=-m64 -lineinfo #--resource-usage 8 | 9 | IS_CUDA_11:=${shell expr `$(NVCC) --version | grep compilation | grep -Eo -m 1 '[0-9]+.[0-9]' | head -1` \>= 11.0} 10 | 11 | # Gencode arguments 12 | SMS = 35 37 50 52 60 61 70 75 13 | ifeq "$(IS_CUDA_11)" "1" 14 | SMS = 52 60 61 70 75 80 15 | endif 16 | $(foreach sm, ${SMS}, $(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm))) 17 | 18 | ALL_CCFLAGS += -rdc=true $(NVCC_FLAGS) $(INCLUDES) 19 | 20 | all : ${TARGET} 21 | 22 | quick_sort: quick_sort.cu 23 | $(EXEC) $(NVCC) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES) 24 | 25 | nvprof: quick_sort 26 | nvprof -f -o $+.nvvp --profile-from-start off ./$+ 27 | nvprof -f -o $+-metrics.nvvp --analysis-metrics ./$+ 28 | 29 | clean: 30 | rm -f ${TARGET} *.o 31 | -------------------------------------------------------------------------------- /Chapter07/07_parallel_programming_pattern/07_radixsort/Makefile: -------------------------------------------------------------------------------- 1 | CUDA_PATH=/usr/local/cuda 2 | HOST_COMPILER ?= g++ 3 | NVCC=${CUDA_PATH}/bin/nvcc -ccbin ${HOST_COMPILER} 4 | TARGET=radix_warp_sort thrust_radix_sort 5 | 6 | INCLUDES = -I${CUDA_PATH}/samples/common/inc 7 | NVCC_FLAGS=-m64 -lineinfo #--resource-usage 8 | 9 | IS_CUDA_11:=${shell expr `$(NVCC) --version | grep compilation | grep -Eo -m 1 '[0-9]+.[0-9]' | head -1` \>= 11.0} 10 | 11 | # Gencode arguments 12 | SMS = 35 37 50 52 60 61 70 75 13 | ifeq "$(IS_CUDA_11)" "1" 14 | SMS = 52 60 61 70 75 80 15 | endif 16 | $(foreach sm, ${SMS}, $(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm))) 17 | 18 | ALL_CCFLAGS += -rdc=true $(NVCC_FLAGS) $(INCLUDES) 19 | 20 | all : ${TARGET} 21 | 22 | radix_warp_sort: radix_warp_sort.cu 23 | $(EXEC) $(NVCC) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES) 24 | 25 | thrust_radix_sort: thrust_radix_sort.cu 26 | $(EXEC) $(NVCC) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES) 27 | 28 | nvprof: radix_warp_sort 29 | nvprof -f -o $+.nvvp --profile-from-start off ./$+ 30 | nvprof -f -o $+-metrics.nvvp --analysis-metrics ./$+ 31 | 32 | clean: 33 | rm -f ${TARGET} *.o 34 | 
-------------------------------------------------------------------------------- /Chapter07/07_parallel_programming_pattern/07_radixsort/thrust_radix_sort.cu: -------------------------------------------------------------------------------- 1 | #include <thrust/device_vector.h> 2 | #include <thrust/sort.h> 3 | #include <thrust/random.h> 4 | #include <thrust/functional.h> 5 | #include <iostream> 6 | 7 | // Helper routines 8 | void initialize(thrust::device_vector<int>& v) 9 | { 10 | thrust::default_random_engine rng(123456); 11 | thrust::uniform_int_distribution<int> dist(10, 99); 12 | for(size_t i = 0; i < v.size(); i++) 13 | v[i] = dist(rng); 14 | } 15 | 16 | void print(const thrust::device_vector<int>& v) 17 | { 18 | for(size_t i = 0; i < v.size(); i++) 19 | std::cout << " " << v[i]; 20 | std::cout << "\n"; 21 | } 22 | 23 | 24 | int main(void) 25 | { 26 | size_t N = 16; 27 | 28 | std::cout << "sorting integers\n"; 29 | { 30 | thrust::device_vector<int> keys(N); 31 | initialize(keys); 32 | print(keys); 33 | thrust::sort(keys.begin(), keys.end()); 34 | print(keys); 35 | } 36 | 37 | std::cout << "\nsorting integers (descending)\n"; 38 | { 39 | thrust::device_vector<int> keys(N); 40 | initialize(keys); 41 | print(keys); 42 | thrust::sort(keys.begin(), keys.end(), thrust::greater<int>()); 43 | print(keys); 44 | } 45 | 46 | return 0; 47 | } 48 | -------------------------------------------------------------------------------- /Chapter07/07_parallel_programming_pattern/08_histogram/Makefile: -------------------------------------------------------------------------------- 1 | CUDA_PATH=/usr/local/cuda 2 | HOST_COMPILER ?= g++ 3 | NVCC=${CUDA_PATH}/bin/nvcc -ccbin ${HOST_COMPILER} 4 | TARGET=image_histogram 5 | 6 | INCLUDES = -I${CUDA_PATH}/samples/common/inc 7 | NVCC_FLAGS=-m64 -lineinfo #--resource-usage 8 | 9 | IS_CUDA_11:=${shell expr `$(NVCC) --version | grep compilation | grep -Eo -m 1 '[0-9]+.[0-9]' | head -1` \>= 11.0} 10 | 11 | # Gencode arguments 12 | SMS = 35 37 50 52 60 61 70 75 13 | ifeq "$(IS_CUDA_11)" "1" 14 | SMS = 52 60 61 70 75 80 15 | endif 16 | $(foreach sm, ${SMS}, $(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm))) 17 | 18 | ALL_CCFLAGS += -rdc=true $(NVCC_FLAGS) $(INCLUDES) 19 | 20 | all : ${TARGET} 21 | 22 | image_histogram: scrImagePgmPpmPackage.cpp image_histogram.cu 23 | $(EXEC) $(NVCC) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -c $+ $(LIBRARIES) 24 | $(EXEC) $(NVCC) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ $+ 25 | 26 | clean: 27 | rm -f ${TARGET} *.o 28 | -------------------------------------------------------------------------------- /Chapter07/07_parallel_programming_pattern/08_histogram/aerosmith-double.pgm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Learn-CUDA-Programming/10bd9b59bfcc680d608ea1417da5089f73d91b11/Chapter07/07_parallel_programming_pattern/08_histogram/aerosmith-double.pgm -------------------------------------------------------------------------------- /Chapter07/07_parallel_programming_pattern/08_histogram/scrImagePgmPpmPackage.h: -------------------------------------------------------------------------------- 1 | 2 | #include <stdio.h> 3 | 4 | 5 | int scr_read_pgm( char* name, unsigned char* image, int irows, int icols ); 6 | void scr_write_pgm( char* name, unsigned char* image, int rows, int cols, char* comment ); 7 | int scr_read_ppm( char* name, unsigned char* image, int irows, int icols ); 8 | void scr_write_ppm( char* name, unsigned char* image, int rows, int cols, char* comment ); 9 | void get_PgmPpmParams(char * , int *, int *); 10 | void getout_comment(FILE * ); 11 | 
-------------------------------------------------------------------------------- /Chapter07/07_parallel_programming_pattern/Makefile: -------------------------------------------------------------------------------- 1 | # Project folders that contain CUDA receipts 2 | PROJECTS ?= $(shell find \ 3 | $(shell ls -d */) \ 4 | -name Makefile) 5 | 6 | %.ph_build: 7 | +@$(MAKE) -C $(dir $*) $(MAKECMDGOALS) 8 | 9 | %.ph_clean: 10 | +@$(MAKE) -C $(dir $*) clean $(USE_DEVICE) 11 | 12 | all: $(addsuffix .ph_build,$(PROJECTS)) 13 | @echo "Finished building CUDA Receipts" 14 | 15 | build: $(addsuffix .ph_build,$(PROJECTS)) 16 | 17 | tidy: 18 | @find * | egrep "#" | xargs rm -f 19 | @find * | egrep "\~" | xargs rm -f 20 | @find * | egrep "nvvp" | xargs rm -f 21 | 22 | .PHONY: clean 23 | clean: tidy $(addsuffix .ph_clean,$(PROJECTS)) 24 | 25 | test: 26 | echo $(DIRECTORY) 27 | -------------------------------------------------------------------------------- /Chapter08/08_cuda_libs_and_other_languages/01_sgemm/Makefile: -------------------------------------------------------------------------------- 1 | CUDA_PATH=/usr/local/cuda 2 | HOST_COMPILER ?= g++ 3 | NVCC=${CUDA_PATH}/bin/nvcc -ccbin ${HOST_COMPILER} 4 | TARGET=cublasSgemm cublasXtSgemm cublasSgemm_async 5 | 6 | INCLUDES = -I${CUDA_PATH}/samples/common/inc 7 | NVCC_FLAGS=-m64 # --resource-usage 8 | 9 | IS_CUDA_11:=${shell expr `$(NVCC) --version | grep compilation | grep -Eo -m 1 '[0-9]+.[0-9]' | head -1` \>= 11.0} 10 | 11 | # Gencode argumentes 12 | SMS = 35 37 50 52 60 61 70 75 13 | ifeq "$(IS_CUDA_11)" "1" 14 | SMS = 52 60 61 70 75 80 15 | endif 16 | $(foreach sm, ${SMS}, $(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm))) 17 | 18 | # Openmp 19 | LIBRARIES += -lcublas 20 | ALL_CCFLAGS += -std=c++11 21 | 22 | all : ${TARGET} 23 | 24 | cublasSgemm: cublasSgemm.cpp 25 | $(EXEC) $(NVCC) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES) 26 | 27 | cublasXtSgemm: cublasXtSgemm.cpp 28 | $(EXEC) $(NVCC) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES) 29 | 30 | cublasSgemm_async: cublasSgemm_async.cpp 31 | $(EXEC) $(NVCC) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES) 32 | 33 | nvprof: cublas 34 | nvprof -f -o $+.nvvp ./$+ 35 | 36 | clean: 37 | rm -f ${TARGET} *.o -------------------------------------------------------------------------------- /Chapter08/08_cuda_libs_and_other_languages/02_sgemm_mixed_precision/Makefile: -------------------------------------------------------------------------------- 1 | CUDA_PATH=/usr/local/cuda 2 | HOST_COMPILER ?= g++ 3 | NVCC=${CUDA_PATH}/bin/nvcc -ccbin ${HOST_COMPILER} 4 | TARGET=cublasGemmEx 5 | 6 | INCLUDES = -I${CUDA_PATH}/samples/common/inc 7 | NVCC_FLAGS=-m64 # --resource-usage 8 | 9 | IS_CUDA_11:=${shell expr `$(NVCC) --version | grep compilation | grep -Eo -m 1 '[0-9]+.[0-9]' | head -1` \>= 11.0} 10 | 11 | # Gencode argumentes 12 | SMS = 35 37 50 52 60 61 70 75 13 | ifeq "$(IS_CUDA_11)" "1" 14 | SMS = 52 60 61 70 75 80 15 | endif 16 | $(foreach sm, ${SMS}, $(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm))) 17 | 18 | # Openmp 19 | LIBRARIES += -lcublas 20 | ALL_CCFLAGS += -std=c++11 $(INCLUDES) -L/usr/local/cuda/lib 21 | 22 | all : ${TARGET} 23 | 24 | cublasGemmEx: cublasGemmEx.cu 25 | $(EXEC) $(NVCC) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES) 26 | 27 | nvprof: cublas 28 | nvprof -f -o $+.nvvp ./$+ 29 | 30 | clean: 31 | rm -f ${TARGET} *.o 32 | -------------------------------------------------------------------------------- 
/Chapter08/08_cuda_libs_and_other_languages/03_curand/Makefile: -------------------------------------------------------------------------------- 1 | CUDA_PATH=/usr/local/cuda 2 | HOST_COMPILER ?= g++ 3 | NVCC=${CUDA_PATH}/bin/nvcc -ccbin ${HOST_COMPILER} 4 | TARGET=curand_host curand_device gemm_with_curand_host 5 | 6 | INCLUDES = -I${CUDA_PATH}/samples/common/inc 7 | NVCC_FLAGS=-m64 # --resource-usage 8 | 9 | IS_CUDA_11:=${shell expr `$(NVCC) --version | grep compilation | grep -Eo -m 1 '[0-9]+.[0-9]' | head -1` \>= 11.0} 10 | 11 | # Gencode argumentes 12 | SMS = 35 37 50 52 60 61 70 75 13 | ifeq "$(IS_CUDA_11)" "1" 14 | SMS = 52 60 61 70 75 80 15 | endif 16 | $(foreach sm, ${SMS}, $(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm))) 17 | 18 | # Openmp 19 | LIBRARIES += -lcublas -lcurand 20 | ALL_CCFLAGS += -std=c++11 $(INCLUDES) -L/usr/local/cuda/lib 21 | 22 | all : ${TARGET} 23 | 24 | curand_host: curand_host.cpp 25 | $(EXEC) $(NVCC) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES) 26 | 27 | curand_device: curand_device.cu 28 | $(EXEC) $(NVCC) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES) 29 | 30 | fp16.o: fp16.cu 31 | $(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) $(LIBRARIES) -o $@ -c $< 32 | 33 | gemm_with_curand_host.o: gemm_with_curand_host.cpp 34 | $(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $< 35 | 36 | gemm_with_curand_host: gemm_with_curand_host.o fp16.o 37 | $(EXEC) $(NVCC) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES) 38 | 39 | nvprof: curand_host 40 | nvprof -f -o $+.nvvp ./$+ 41 | 42 | clean: 43 | rm -f ${TARGET} *.o 44 | -------------------------------------------------------------------------------- /Chapter08/08_cuda_libs_and_other_languages/03_curand/fp16.cu: -------------------------------------------------------------------------------- 1 | #include "fp16.cuh" 2 | #include 3 | 4 | #define BLOCK_DIM 512 5 | 6 | namespace fp16 7 | { 8 | __global__ void float2half_kernel(half *out, float *in) 9 | { 10 | int idx = blockIdx.x * blockDim.x + threadIdx.x; 11 | 12 | out[idx] = __float2half(in[idx]); 13 | } 14 | 15 | __global__ void half2float_kernel(float *out, half *in) 16 | { 17 | int idx = blockIdx.x * blockDim.x + threadIdx.x; 18 | 19 | out[idx] = __half2float(in[idx]); 20 | } 21 | 22 | void float2half(half *out, float *in, size_t length) 23 | { 24 | float2half_kernel<<< (length + BLOCK_DIM - 1) / BLOCK_DIM, BLOCK_DIM >>>(out, in); 25 | } 26 | 27 | void half2float(float *out, half *in, size_t length) 28 | { 29 | half2float_kernel<<< (length + BLOCK_DIM - 1) / BLOCK_DIM, BLOCK_DIM >>>(out, in); 30 | } 31 | } // namespace fp16 -------------------------------------------------------------------------------- /Chapter08/08_cuda_libs_and_other_languages/03_curand/fp16.cuh: -------------------------------------------------------------------------------- 1 | #ifndef _FP16_CUH_ 2 | #define _FP16_CUH_ 3 | 4 | #include 5 | 6 | namespace fp16 7 | { 8 | void float2half(half *out, float *in, size_t length); 9 | void half2float(float *out, half *in, size_t lenght); 10 | } 11 | 12 | #endif // _FP16_CUH_ -------------------------------------------------------------------------------- /Chapter08/08_cuda_libs_and_other_languages/04_cufft/Makefile: -------------------------------------------------------------------------------- 1 | CUDA_PATH=/usr/local/cuda/ 2 | HOST_COMPILER ?= g++ 3 | NVCC=${CUDA_PATH}/bin/nvcc -ccbin ${HOST_COMPILER} 4 | TARGET=cufft.1d cufft.half cufft.mgpu 5 | INCLUDES = 
-I${CUDA_PATH}/samples/common/inc 6 | NVCC_FLAGS=-m64 # --resource-usage 7 | 8 | IS_CUDA_11:=${shell expr `$(NVCC) --version | grep compilation | grep -Eo -m 1 '[0-9]+.[0-9]' | head -1` \>= 11.0} 9 | 10 | # Gencode argumentes 11 | SMS = 35 37 50 52 60 61 70 75 12 | ifeq "$(IS_CUDA_11)" "1" 13 | SMS = 52 60 61 70 75 80 14 | endif 15 | $(foreach sm, ${SMS}, $(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm))) 16 | 17 | # Openmp 18 | LIBRARIES += -lcufft -lcurand 19 | ALL_CCFLAGS += -std=c++11 $(INCLUDES) -L/usr/local/cuda/lib 20 | 21 | all : ${TARGET} 22 | 23 | cufft.1d: cufft.1d.cpp 24 | $(EXEC) $(NVCC) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES) 25 | 26 | complex.o: complex.cu 27 | $(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $< 28 | 29 | fp16.o: fp16.cu 30 | $(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $< 31 | 32 | cufft.half: cufft.half.cpp complex.o fp16.o 33 | $(EXEC) $(NVCC) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES) 34 | 35 | cufft.mgpu: cufft.mgpu.cu complex.o 36 | $(EXEC) $(NVCC) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES) 37 | 38 | nvprof: cufft.1d 39 | nvprof -f -o $+.nvvp ./$+ 40 | 41 | clean: 42 | rm -f ${TARGET} *.o 43 | -------------------------------------------------------------------------------- /Chapter08/08_cuda_libs_and_other_languages/04_cufft/complex.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include "helper.cuh" 3 | 4 | namespace op 5 | { 6 | __global__ void FloatToComplex_kernel(cufftComplex *complex, const float *real, const float *imag) 7 | { 8 | int idx = blockDim.x * blockIdx.x + threadIdx.x; 9 | 10 | complex[idx].x = real[idx]; 11 | if (imag != nullptr) 12 | complex[idx].y = imag[idx]; 13 | } 14 | 15 | void FloatToComplex(cufftComplex *complex, const float *real, const float *imag, const size_t length) 16 | { 17 | dim3 dimBlock(512); 18 | dim3 dimGrid((length + dimBlock.x - 1) / dimBlock.x); 19 | 20 | FloatToComplex_kernel<<< dimGrid, dimBlock >>>(complex, real, imag); 21 | } 22 | } -------------------------------------------------------------------------------- /Chapter08/08_cuda_libs_and_other_languages/04_cufft/fp16.cu: -------------------------------------------------------------------------------- 1 | #include "helper.cuh" 2 | #include "fp16.cuh" 3 | #include 4 | 5 | #define BLOCK_DIM 512 6 | 7 | namespace fp16 8 | { 9 | __global__ void float2half_kernel(half *out, float *in) 10 | { 11 | int idx = blockIdx.x * blockDim.x + threadIdx.x; 12 | 13 | out[idx] = __float2half(in[idx]); 14 | } 15 | 16 | __global__ void half2float_kernel(float *out, half *in) 17 | { 18 | int idx = blockIdx.x * blockDim.x + threadIdx.x; 19 | 20 | out[idx] = __half2float(in[idx]); 21 | } 22 | 23 | void float2half(half *out, float *in, size_t length) 24 | { 25 | float2half_kernel<<< (length + BLOCK_DIM - 1) / BLOCK_DIM, BLOCK_DIM >>>(out, in); 26 | } 27 | 28 | void half2float(float *out, half *in, size_t length) 29 | { 30 | half2float_kernel<<< (length + BLOCK_DIM - 1) / BLOCK_DIM, BLOCK_DIM >>>(out, in); 31 | } 32 | } // namespace fp16 -------------------------------------------------------------------------------- /Chapter08/08_cuda_libs_and_other_languages/04_cufft/fp16.cuh: -------------------------------------------------------------------------------- 1 | #ifndef _FP16_CUH_ 2 | #define _FP16_CUH_ 3 | 4 | #include 5 | 6 | namespace fp16 7 | { 8 | void float2half(half *out, float *in, size_t length); 9 | void half2float(float 
*out, half *in, size_t length); 10 | } 11 | 12 | #endif // _FP16_CUH_ -------------------------------------------------------------------------------- /Chapter08/08_cuda_libs_and_other_languages/04_cufft/helper.cuh: -------------------------------------------------------------------------------- 1 | #ifndef _HELPER_CU_H_ 2 | #define _HELPER_CU_H_ 3 | 4 | #include <curand.h> 5 | #include <cufft.h> 6 | #include "fp16.cuh" 7 | #include <type_traits> 8 | namespace op { 9 | template <typename T> 10 | typename std::enable_if<std::is_same<T, float>::value>::type 11 | curand(curandGenerator_t generator, 12 | T *buffer, 13 | size_t length) 14 | { 15 | curandGenerateUniform(generator, buffer, length); 16 | } 17 | 18 | void FloatToComplex(cufftComplex *complex, const float *real, const float *imag, const size_t length); 19 | 20 | template <typename T> 21 | typename std::enable_if<std::is_same<T, cufftComplex>::value>::type 22 | curand(curandGenerator_t generator, 23 | T *buffer, 24 | size_t length) 25 | { 26 | float *buffer_fp32; 27 | 28 | cudaMalloc((void **)&buffer_fp32, length * sizeof(float)); 29 | curandGenerateUniform(generator, buffer_fp32, length); 30 | 31 | // convert generated real data into complex type 32 | FloatToComplex(buffer, buffer_fp32, nullptr, length); 33 | cudaFree(buffer_fp32); 34 | } 35 | 36 | template <typename T> 37 | typename std::enable_if<std::is_same<T, half>::value>::type 38 | curand(curandGenerator_t generator, 39 | T *buffer, 40 | size_t length) 41 | { 42 | float *buffer_fp32; 43 | 44 | cudaMalloc((void **)&buffer_fp32, length * sizeof(float)); 45 | curandGenerateUniform(generator, buffer_fp32, length); 46 | 47 | // convert generated single-precision values to half precision 48 | fp16::float2half(buffer, buffer_fp32, length); 49 | cudaFree(buffer_fp32); 50 | } 51 | } 52 | 53 | #endif // _HELPER_CU_H_ -------------------------------------------------------------------------------- /Chapter08/08_cuda_libs_and_other_languages/05_npp/Makefile: -------------------------------------------------------------------------------- 1 | CUDA_PATH=/usr/local/cuda 2 | HOST_COMPILER ?= g++ 3 | NVCC=${CUDA_PATH}/bin/nvcc -ccbin ${HOST_COMPILER} 4 | TARGET=imageFilter statisticsNPP 5 | 6 | INCLUDES = -I${CUDA_PATH}/samples/common/inc 7 | NVCC_FLAGS=-m64 # --resource-usage 8 | 9 | IS_CUDA_11:=${shell expr `$(NVCC) --version | grep compilation | grep -Eo -m 1 '[0-9]+.[0-9]' | head -1` \>= 11.0} 10 | 11 | # Gencode arguments 12 | SMS = 35 37 50 52 60 61 70 75 13 | ifeq "$(IS_CUDA_11)" "1" 14 | SMS = 52 60 61 70 75 80 15 | endif 16 | $(foreach sm, ${SMS}, $(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm))) 17 | 18 | # Libraries 19 | LIBRARIES += -lnppc -lnppif -lnppisu -lnppig -lnpps -lfreeimage 20 | ALL_CCFLAGS += -std=c++11 $(INCLUDES) -L/usr/local/cuda/lib 21 | 22 | all : ${TARGET} 23 | 24 | imageFilter: imageFilter.cpp 25 | $(EXEC) $(NVCC) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES) 26 | 27 | statisticsNPP: statisticsNPP.cpp 28 | $(EXEC) $(NVCC) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES) 29 | 30 | nvprof: imageFilter 31 | nvprof -f -o $+.nvvp ./$+ 32 | 33 | clean: 34 | rm -f ${TARGET} *.o 35 | -------------------------------------------------------------------------------- /Chapter08/08_cuda_libs_and_other_languages/05_npp/flower.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Learn-CUDA-Programming/10bd9b59bfcc680d608ea1417da5089f73d91b11/Chapter08/08_cuda_libs_and_other_languages/05_npp/flower.jpg -------------------------------------------------------------------------------- 
/Chapter08/08_cuda_libs_and_other_languages/05_npp/output.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Learn-CUDA-Programming/10bd9b59bfcc680d608ea1417da5089f73d91b11/Chapter08/08_cuda_libs_and_other_languages/05_npp/output.jpg -------------------------------------------------------------------------------- /Chapter08/08_cuda_libs_and_other_languages/06_opencv/Makefile: -------------------------------------------------------------------------------- 1 | CUDA_PATH=/usr/local/cuda 2 | HOST_COMPILER ?= g++ 3 | NVCC=${CUDA_PATH}/bin/nvcc -ccbin ${HOST_COMPILER} 4 | TARGET=test blur blur_stream 5 | 6 | INCLUDES = -I${CUDA_PATH}/samples/common/inc $(shell pkg-config opencv4 --cflags) 7 | NVCC_FLAGS=-m64 # --resource-usage 8 | 9 | IS_CUDA_11:=${shell expr `$(NVCC) --version | grep compilation | grep -Eo -m 1 '[0-9]+.[0-9]' | head -1` \>= 11.0} 10 | 11 | # Gencode argumentes 12 | SMS = 35 37 50 52 60 61 70 75 13 | ifeq "$(IS_CUDA_11)" "1" 14 | SMS = 52 60 61 70 75 80 15 | endif 16 | $(foreach sm, ${SMS}, $(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm))) 17 | 18 | LIBRARIES += -L/usr/local/cuda/lib -L/usr/local/lib $(shell pkg-config opencv4 --libs) 19 | ALL_CCFLAGS += -std=c++11 $(INCLUDES) $(LIBRARIES) 20 | 21 | all : ${TARGET} 22 | 23 | test: test.cpp 24 | $(EXEC) $(NVCC) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ $+ 25 | 26 | blur: blur.cpp 27 | $(EXEC) $(NVCC) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ $+ 28 | 29 | blur_stream: blur_stream.cpp 30 | $(EXEC) $(NVCC) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ $+ 31 | 32 | nvprof: blur 33 | nvprof -f -o $+.nvvp ./$+ 34 | 35 | clean: 36 | rm -f ${TARGET} *.o 37 | -------------------------------------------------------------------------------- /Chapter08/08_cuda_libs_and_other_languages/06_opencv/blur.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include "opencv2/opencv.hpp" 4 | 5 | using namespace cv; 6 | 7 | void BlurHost(std::string filename) 8 | { 9 | Mat src = imread(filename, 1); 10 | Mat dst; 11 | 12 | TickMeter tm; 13 | 14 | tm.start(); 15 | bilateralFilter(src, dst, 21, 150, 150); 16 | tm.stop(); 17 | std::cout << "CPU Time: " << tm.getTimeMilli() << " ms." << std::endl; 18 | 19 | imwrite("result_host.jpg", dst); 20 | } 21 | 22 | void BlurCuda(std::string filename) 23 | { 24 | TickMeter tm; 25 | 26 | Mat src = imread(filename, 1); 27 | Mat dst; 28 | cuda::GpuMat src_cuda(src); 29 | cuda::GpuMat dst_cuda; 30 | 31 | // warm-up 32 | cuda::bilateralFilter(src_cuda, dst_cuda, 21, 150.f, 150.f); 33 | 34 | tm.start(); 35 | src_cuda.upload(src); 36 | cuda::bilateralFilter(src_cuda, dst_cuda, 21, 150.f, 150.f); 37 | dst_cuda.download(dst); 38 | tm.stop(); 39 | std::cout << "GPU Time: " << tm.getTimeMilli() << " ms." 
<< std::endl; 40 | 41 | imwrite("result_cuda.jpg", dst); 42 | } 43 | 44 | int main(int argc, char *argv[]) 45 | { 46 | cuda::printCudaDeviceInfo(0); 47 | cuda::printShortCudaDeviceInfo(0); 48 | std::cout << "Device: " << cuda::getCudaEnabledDeviceCount() << std::endl; 49 | 50 | std::string filename("flower.jpg"); 51 | 52 | 53 | BlurHost(filename); 54 | BlurCuda(filename); 55 | 56 | return 0; 57 | } -------------------------------------------------------------------------------- /Chapter08/08_cuda_libs_and_other_languages/06_opencv/flower.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Learn-CUDA-Programming/10bd9b59bfcc680d608ea1417da5089f73d91b11/Chapter08/08_cuda_libs_and_other_languages/06_opencv/flower.JPG -------------------------------------------------------------------------------- /Chapter08/08_cuda_libs_and_other_languages/06_opencv/test.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include "opencv2/opencv.hpp" 3 | #include "opencv2/core/cuda.hpp" 4 | #include "opencv2/cudafilters.hpp" 5 | #include "opencv2/cudaimgproc.hpp" 6 | 7 | using namespace cv; 8 | 9 | int main( int argc, char* argv[] ) 10 | { 11 | const int64 start = getTickCount(); 12 | 13 | cv::Mat src = cv::imread( "flower.jpg", 0 ); 14 | 15 | if( !src.data ) exit( 1 ); 16 | 17 | cv::cuda::GpuMat d_src( src ); 18 | cv::cuda::GpuMat d_dst; 19 | 20 | cv::cuda::bilateralFilter( d_src, d_dst, -1, 50, 7 ); 21 | Ptr canny = cuda::createCannyEdgeDetector( 35.0, 200.0 ); 22 | canny->detect( d_src, d_dst ); 23 | 24 | cv::Mat dst( d_dst ); 25 | 26 | cv::imwrite( "cuda_canny.png", dst ); 27 | 28 | const double timeSec = (getTickCount() - start) / getTickFrequency(); 29 | std::cout << "Time : " << timeSec << " sec" << std::endl; 30 | 31 | return 0; 32 | } 33 | -------------------------------------------------------------------------------- /Chapter08/08_cuda_libs_and_other_languages/07_python_cuda/cupy_op.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import cupy as cp 3 | 4 | # cupy matmul 5 | a = cp.random.uniform(0, 1, (2, 4)).astype('float32') 6 | b = cp.random.uniform(0, 1, (4, 2)).astype('float32') 7 | c = cp.matmul(a, b) 8 | print("Matrix Multiplication") 9 | print("a::\n", a) 10 | print("b::\n", b) 11 | print("c = a' * b::", c) 12 | 13 | # custom kernel 14 | squared_diff = cp.ElementwiseKernel( 15 | 'float32 x, float32 y', 16 | 'float32 z', 17 | 'z = (x - y) * (x - y)', 18 | 'squared_diff') 19 | 20 | a = cp.random.uniform(0, 1, (2, 4)).astype('float32') 21 | b = cp.random.uniform(0, 1, (2, 4)).astype('float32') 22 | c = squared_diff(a, b) 23 | print("Elements Diff") 24 | print("a::\n", a) 25 | print("b::\n", b) 26 | print("c = (a-b)*(a-b)::", c) 27 | -------------------------------------------------------------------------------- /Chapter08/08_cuda_libs_and_other_languages/07_python_cuda/numba_matmul.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from numba import cuda 3 | from timeit import default_timer as timer 4 | 5 | @cuda.jit 6 | def matmul(d_c, d_a, d_b): 7 | x, y = cuda.grid(2) 8 | if (x < d_c.shape[0] and y < d_c.shape[1]): 9 | sum = 0 10 | for k in range(d_a.shape[1]): 11 | sum += d_a[x, k] * d_b[k, y] 12 | d_c[x, y] = sum 13 | 14 | # initialize input data 15 | N = 8192 16 | a = np.random.rand(N, N).astype(np.float32) 17 | b = 
np.random.rand(N, N).astype(np.float32) 18 | 19 | # copy matrices to the devices 20 | d_a = cuda.to_device(a) 21 | d_b = cuda.to_device(b) 22 | 23 | # create device memory for matrix c 24 | d_c = cuda.device_array((N, N)) 25 | 26 | # configure the blocks 27 | BLOCK_DIM = 16 28 | dimBlock = (BLOCK_DIM, BLOCK_DIM) 29 | dimGrid = (int((N + BLOCK_DIM - 1) / BLOCK_DIM), 30 | int((N +BLOCK_DIM - 1) / BLOCK_DIM)) 31 | 32 | # matrix multiplication (gpu) 33 | start = timer() 34 | matmul[dimGrid, dimBlock](d_c, d_a, d_b) 35 | elapsed_time_gpu = (timer() - start) * 1e3 36 | 37 | # copy the result back to the host 38 | c = d_c.copy_to_host() 39 | 40 | # matrix multiplication (cpu) 41 | start = timer() 42 | c_host = np.matmul(a, b) 43 | elapsed_time_cpu = (timer() - start) * 1e3 44 | 45 | # print elapse times 46 | print("Elapsed Time") 47 | print("GPU: %.3f ms" % elapsed_time_gpu) 48 | print("CPU: %.3f ms" % elapsed_time_cpu) 49 | 50 | if (np.allclose(c_host, c)): 51 | print("Done.") 52 | else: 53 | print("GPU and host results are mismatching.") 54 | 55 | -------------------------------------------------------------------------------- /Chapter08/08_cuda_libs_and_other_languages/07_python_cuda/numba_saxpy.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from numba import vectorize 3 | from timeit import default_timer as timer 4 | 5 | @vectorize(["float32(float32, float32, float32)"], target='cuda') 6 | def saxpy_cuda(scala, a, b): 7 | return scala * a + b 8 | 9 | 10 | #@vectorize(["float32(float32, float32, float32)"], target='cpu') 11 | @vectorize(["float32(float32, float32, float32)"], target='parallel') 12 | def saxpy_host(scala, a, b): 13 | return scala * a + b 14 | 15 | scala = 2.0 16 | np.random.seed(2019) 17 | 18 | print("size \t\t CUDA \t\t CPU") 19 | for i in range(16,20): 20 | N = 1 << i 21 | a = np.random.rand(N).astype(np.float32) 22 | b = np.random.rand(N).astype(np.float32) 23 | c = np.zeros(N, dtype=np.float32) 24 | 25 | # warm-up 26 | c = saxpy_cuda(scala, a, b) 27 | 28 | # measuring execution time 29 | start = timer() 30 | c = saxpy_host(scala, a, b) 31 | elapsed_time_host= (timer() - start) * 1e3 32 | 33 | start = timer() 34 | c = saxpy_cuda(scala, a, b) 35 | elapsed_time_cuda = (timer() - start) * 1e3 36 | 37 | print("[%d]: \t%.3f ms\t %.3f ms" % (N, elapsed_time_cuda, elapsed_time_host)) 38 | -------------------------------------------------------------------------------- /Chapter08/08_cuda_libs_and_other_languages/08_nvblas/exec_fft.m: -------------------------------------------------------------------------------- 1 | # FFT 2 | 3 | num_sample = 8192 4 | x = single(rand(num_sample)); 5 | n_fft = 2^nextpow2(num_sample); 6 | 7 | start = clock(); 8 | y = fft(x, n_fft); 9 | ix = ifft(y, n_fft); 10 | elapsedTime = etime(clock(), start); 11 | 12 | printf("Elapsed Time: %.3f ms\n", elapsedTime); 13 | -------------------------------------------------------------------------------- /Chapter08/08_cuda_libs_and_other_languages/08_nvblas/fft.R: -------------------------------------------------------------------------------- 1 | # FFT using R 2 | 3 | x <- 1:2^30 4 | elapsedTime = system.time({ 5 | fft(fft(x), inverse = TRUE)/length(x) 6 | })[3] 7 | print(sprintf("Elapsed Time: %3.3f ms", elapsedTime)) -------------------------------------------------------------------------------- /Chapter08/08_cuda_libs_and_other_languages/08_nvblas/nvblas.conf: -------------------------------------------------------------------------------- 1 | 
#Put here the CPU BLAS fallback Library of your choice 2 | NVBLAS_CPU_BLAS_LIB libopenblas.so 3 | 4 | # Specify which output log file (default is stderr) 5 | NVBLAS_LOGFILE nvblas.log 6 | 7 | # List of GPU devices Id to participate to the computation 8 | # By default if no GPU are listed, only device 0 will be used 9 | NVBLAS_GPU_LIST 0 10 | NVBLAS_AUTOPIN_MEM_ENABLED 11 | -------------------------------------------------------------------------------- /Chapter08/08_cuda_libs_and_other_languages/08_nvblas/sgemm.R: -------------------------------------------------------------------------------- 1 | # Matrix Multiplication using R 2 | for(i in seq(1:6)) { 3 | N = 512*(2^i) 4 | A = matrix(rnorm(N^2, mean=0, sd=1), nrow=N) 5 | B = matrix(rnorm(N^2, mean=0, sd=1), nrow=N) 6 | elapsedTime = system.time({C = A %*% B})[3] 7 | gFlops = 2*N*N*N/(elapsedTime * 1e+9); 8 | print(sprintf("Elapsed Time [%d]: %3.3f ms, %.3f GFlops", N, elapsedTime, gFlops)) 9 | } -------------------------------------------------------------------------------- /Chapter08/08_cuda_libs_and_other_languages/08_nvblas/sgemm.m: -------------------------------------------------------------------------------- 1 | # SGEMM 2 | 3 | for i = 1:6 4 | N = 512*(2^i); 5 | A = single(rand(N,N)); 6 | B = single(rand(N,N)); 7 | 8 | start = clock(); 9 | C = A * B; 10 | elapsedTime = etime(clock(), start); 11 | 12 | gFlops = 2*N*N*N/(elapsedTime * 1e+9); 13 | printf("Elapsed Time [%d]: %.3f ms, %.3f GFlops\n", N, elapsedTime, gFlops); 14 | end -------------------------------------------------------------------------------- /Chapter08/08_cuda_libs_and_other_languages/09_matlab/cuda.m: -------------------------------------------------------------------------------- 1 | N = 8192; 2 | A = single(rand(N,N)); 3 | B = single(rand(N,N)); 4 | 5 | d_A = gpuArray(A); 6 | d_B = gpuArray(B); 7 | 8 | start = clock(); 9 | % C = A * B; 10 | d_C = d_A * d_B; 11 | elapsedTime = etime(clock(), start); 12 | 13 | gFlops = 2*N*N*N/(elapsedTime * 1e+9); 14 | fprintf("Elapsed Time: %.3f ms, %.3f GFlops\n", elapsedTime, gFlops); -------------------------------------------------------------------------------- /Chapter08/08_cuda_libs_and_other_languages/09_matlab/host.m: -------------------------------------------------------------------------------- 1 | N = 8192; 2 | A = single(rand(N,N)); 3 | B = single(rand(N,N)); 4 | 5 | start = clock(); 6 | C = A * B; 7 | elapsedTime = etime(clock(), start); 8 | 9 | gFlops = 2*N*N*N/(elapsedTime * 1e+9); 10 | fprintf("Elapsed Time: %.3f ms, %.3f GFlops\n", elapsedTime, gFlops); -------------------------------------------------------------------------------- /Chapter08/08_cuda_libs_and_other_languages/Makefile: -------------------------------------------------------------------------------- 1 | # Project folders that contain CUDA receipts 2 | PROJECTS ?= $(shell find \ 3 | $(shell ls -d */) \ 4 | -name Makefile) 5 | 6 | %.ph_build: 7 | +@$(MAKE) -C $(dir $*) $(MAKECMDGOALS) 8 | 9 | %.ph_clean: 10 | +@$(MAKE) -C $(dir $*) clean $(USE_DEVICE) 11 | 12 | all: $(addsuffix .ph_build,$(PROJECTS)) 13 | @echo "Finished building CUDA Receipts" 14 | 15 | build: $(addsuffix .ph_build,$(PROJECTS)) 16 | 17 | tidy: 18 | @find * | egrep "#" | xargs rm -f 19 | @find * | egrep "\~" | xargs rm -f 20 | @find * | egrep "nvvp" | xargs rm -f 21 | 22 | .PHONY: clean 23 | clean: tidy $(addsuffix .ph_clean,$(PROJECTS)) 24 | 25 | test: 26 | echo $(DIRECTORY) 27 | -------------------------------------------------------------------------------- 
/Chapter09/09_openacc/Makefile: -------------------------------------------------------------------------------- 1 | CUDA_PATH=/usr/local/cuda 2 | PGCXX=pgc++ 3 | TARGET=merging 4 | 5 | PGCXX_FLAGS= -acc -ta=tesla:managed -Minfo=accel 6 | 7 | all: ${TARGET} 8 | 9 | merging: image_merging.cpp scrImagePgmPpmPackage.cpp 10 | $(EXEC) $(PGCXX) -o $@ $+ 11 | 12 | clean: 13 | rm -f ${TARGET} *.o 14 | -------------------------------------------------------------------------------- /Chapter09/09_openacc/cat.pgm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Learn-CUDA-Programming/10bd9b59bfcc680d608ea1417da5089f73d91b11/Chapter09/09_openacc/cat.pgm -------------------------------------------------------------------------------- /Chapter09/09_openacc/dog.pgm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Learn-CUDA-Programming/10bd9b59bfcc680d608ea1417da5089f73d91b11/Chapter09/09_openacc/dog.pgm -------------------------------------------------------------------------------- /Chapter09/09_openacc/scrImagePgmPpmPackage.h: -------------------------------------------------------------------------------- 1 | 2 | #include 3 | 4 | 5 | int scr_read_pgm( char* name, unsigned char* image, int irows, int icols ); 6 | void scr_write_pgm( char* name, unsigned char* image, int rows, int cols, char* comment ); 7 | int scr_read_ppm( char* name, unsigned char* image, int irows, int icols ); 8 | void scr_write_ppm( char* name, unsigned char* image, int rows, int cols, char* comment ); 9 | void get_PgmPpmParams(char * , int *, int *); 10 | void getout_comment(FILE * ); 11 | -------------------------------------------------------------------------------- /Chapter10/10_deep_learning/01_ann/Makefile: -------------------------------------------------------------------------------- 1 | CUDA_PATH=/usr/local/cuda 2 | HOST_COMPILER ?= g++ 3 | NVCC=${CUDA_PATH}/bin/nvcc -ccbin ${HOST_COMPILER} 4 | TARGET=train 5 | 6 | INCLUDES = -I${CUDA_PATH}/samples/common/inc -I$(CUDA_PATH)/include 7 | NVCC_FLAGS=-G --resource-usage -Xcompiler -rdynamic -Xcompiler -fopenmp -rdc=true -lnvToolsExt 8 | 9 | IS_CUDA_11:=${shell expr `$(NVCC) --version | grep compilation | grep -Eo -m 1 '[0-9]+.[0-9]' | head -1` \>= 11.0} 10 | 11 | # Gencode argumentes 12 | SMS = 35 37 50 52 60 61 70 75 13 | ifeq "$(IS_CUDA_11)" "1" 14 | SMS = 52 60 61 70 75 80 15 | endif 16 | $(foreach sm, ${SMS}, $(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm))) 17 | 18 | LIBRARIES += -L/usr/local/cuda/lib -lcublas -lcudnn -lgomp -lcurand 19 | ALL_CCFLAGS += -m64 -g -std=c++11 $(NVCC_FLAGS) $(INCLUDES) $(LIBRARIES) 20 | 21 | SRC_DIR = src 22 | OBJ_DIR = obj 23 | 24 | all : ${TARGET} 25 | 26 | INCS = ${SRC_DIR}/helper.h ${SRC_DIR}/blob.h ${SRC_DIR}/blob.h ${SRC_DIR}/layer.h 27 | 28 | ${OBJ_DIR}/%.o: ${SRC_DIR}/%.cpp ${INCS} 29 | $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -c $< -o $@ 30 | ${OBJ_DIR}/%.o: ${SRC_DIR}/%.cu ${INCS} 31 | $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -c $< -o $@ 32 | 33 | ${OBJ_DIR}/train.o: train.cpp ${INCS} 34 | @mkdir -p $(@D) 35 | $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -c $< -o $@ 36 | 37 | OBJS = ${OBJ_DIR}/train.o ${OBJ_DIR}/mnist.o ${OBJ_DIR}/loss.o ${OBJ_DIR}/layer.o ${OBJ_DIR}/network.o 38 | 39 | train: $(OBJS) 40 | $(EXEC) $(NVCC) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ $+ 41 | 42 | .PHONY: clean 43 | clean: 44 | rm -f ${TARGET} 
${OBJ_DIR}/*.o 45 | 46 | -------------------------------------------------------------------------------- /Chapter10/10_deep_learning/01_ann/download_mnist.bat: -------------------------------------------------------------------------------- 1 | echo off 2 | REM setting variables 3 | set zipPath="C:\Program Files\7-Zip\7z.exe" 4 | set train_images="train-images-idx3-ubyte.gz" 5 | set train_labels="train-labels-idx1-ubyte.gz" 6 | set test_images="t10k-images-idx3-ubyte.gz" 7 | set test_labels="t10k-labels-idx1-ubyte.gz" 8 | set url_base="http://yann.lecun.com/exdb/mnist" 9 | 10 | REM check if 7-zip installed 11 | IF NOT EXIST %zipPath% GOTO NO_7ZIP 12 | 13 | REM create dataset folder for the datasets 14 | mkdir dataset 15 | cd dataset 16 | 17 | REM download datasets 18 | curl -O %url_base%/%train_images% 19 | %zipPath% e .\train-images-idx3-ubyte.gz 20 | curl -O %url_base%/train-labels-idx1-ubyte.gz 21 | %zipPath% e .\train-labels-idx1-ubyte.gz 22 | curl -O %url_base%/t10k-images-idx3-ubyte.gz 23 | %zipPath% e .\t10k-images-idx3-ubyte.gz 24 | curl -O %url_base%/t10k-labels-idx1-ubyte.gz 25 | %zipPath% e .\t10k-labels-idx1-ubyte.gz 26 | 27 | exit 28 | 29 | REM exception: no 7-zip found 30 | :NO_7ZIP 31 | echo "Please install 7-zip to extract downloaded MNIST dataset" 32 | exit /b 1 -------------------------------------------------------------------------------- /Chapter10/10_deep_learning/01_ann/download_mnist.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | url_base=http://yann.lecun.com/exdb/mnist 4 | 5 | mkdir -p dataset 6 | cd dataset 7 | 8 | curl -O ${url_base}/train-images-idx3-ubyte.gz 9 | curl -O ${url_base}/train-labels-idx1-ubyte.gz 10 | curl -O ${url_base}/t10k-images-idx3-ubyte.gz 11 | curl -O ${url_base}/t10k-labels-idx1-ubyte.gz 12 | 13 | gunzip *.gz -------------------------------------------------------------------------------- /Chapter10/10_deep_learning/01_ann/src/loss.h: -------------------------------------------------------------------------------- 1 | #ifndef _LOSS_H_ 2 | #define _LOSS_H_ 3 | 4 | #include "blob.h" 5 | 6 | namespace cudl 7 | { 8 | 9 | class CrossEntropyLoss 10 | { 11 | public: 12 | CrossEntropyLoss(); 13 | ~CrossEntropyLoss(); 14 | 15 | float loss(Blob *predict, Blob *target); 16 | float accuracy(Blob *predict, Blob *target); 17 | 18 | private: 19 | // reduced loss 20 | float h_loss_ = 0.f; 21 | float *d_loss_ = nullptr; 22 | 23 | float *d_workspace_ = nullptr; 24 | void init_workspace(int batch_size); 25 | }; 26 | 27 | } // namespace cudl 28 | 29 | #endif // _LOSS_H_ -------------------------------------------------------------------------------- /Chapter10/10_deep_learning/01_ann/src/network.h: -------------------------------------------------------------------------------- 1 | #ifndef _NETWORK_H_ 2 | #define _NETWORK_H_ 3 | 4 | #include 5 | #include 6 | 7 | #include 8 | 9 | #include "helper.h" 10 | #include "loss.h" 11 | #include "layer.h" 12 | 13 | namespace cudl { 14 | 15 | typedef enum { 16 | training, 17 | inference 18 | } WorkloadType; 19 | 20 | class Network 21 | { 22 | public: 23 | Network(); 24 | ~Network(); 25 | 26 | void add_layer(Layer *layer); 27 | 28 | Blob *forward(Blob *input); 29 | void backward(Blob *input = nullptr); 30 | void update(float learning_rate = 0.02f); 31 | 32 | int load_pretrain(); 33 | int write_file(); 34 | 35 | float loss(Blob *target); 36 | int get_accuracy(Blob *target); 37 | 38 | void cuda(); 39 | void train(); 40 | void test(); 41 | 42 | Blob *output_; 43 | 
44 | std::vector layers(); 45 | 46 | 47 | private: 48 | std::vector layers_; 49 | 50 | CudaContext *cuda_ = nullptr; 51 | 52 | WorkloadType phase_ = inference; 53 | }; 54 | 55 | } // namespace cudl 56 | 57 | 58 | #endif // _NETWORK_H_ -------------------------------------------------------------------------------- /Chapter10/10_deep_learning/02_cnn/Makefile: -------------------------------------------------------------------------------- 1 | CUDA_PATH=/usr/local/cuda 2 | HOST_COMPILER ?= g++ 3 | NVCC=${CUDA_PATH}/bin/nvcc -ccbin ${HOST_COMPILER} 4 | TARGET=train 5 | 6 | INCLUDES = -I${CUDA_PATH}/samples/common/inc -I$(CUDA_PATH)/include 7 | NVCC_FLAGS=-G --resource-usage -Xcompiler -rdynamic -Xcompiler -fopenmp -rdc=true -lnvToolsExt 8 | 9 | IS_CUDA_11:=${shell expr `$(NVCC) --version | grep compilation | grep -Eo -m 1 '[0-9]+.[0-9]' | head -1` \>= 11.0} 10 | 11 | # Gencode argumentes 12 | SMS = 35 37 50 52 60 61 70 75 13 | ifeq "$(IS_CUDA_11)" "1" 14 | SMS = 52 60 61 70 75 80 15 | endif 16 | $(foreach sm, ${SMS}, $(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm))) 17 | 18 | LIBRARIES += -L/usr/local/cuda/lib -lcublas -lcudnn -lgomp -lcurand 19 | ALL_CCFLAGS += -m64 -g -std=c++11 $(NVCC_FLAGS) $(INCLUDES) $(LIBRARIES) 20 | 21 | SRC_DIR = src 22 | OBJ_DIR = obj 23 | 24 | all : ${TARGET} 25 | 26 | INCS = ${SRC_DIR}/helper.h ${SRC_DIR}/blob.h ${SRC_DIR}/blob.h ${SRC_DIR}/layer.h 27 | 28 | ${OBJ_DIR}/%.o: ${SRC_DIR}/%.cpp ${INCS} 29 | $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -c $< -o $@ 30 | ${OBJ_DIR}/%.o: ${SRC_DIR}/%.cu ${INCS} 31 | $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -c $< -o $@ 32 | 33 | ${OBJ_DIR}/train.o: train.cpp ${INCS} 34 | @mkdir -p $(@D) 35 | $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -c $< -o $@ 36 | 37 | OBJS = ${OBJ_DIR}/train.o ${OBJ_DIR}/mnist.o ${OBJ_DIR}/loss.o ${OBJ_DIR}/layer.o ${OBJ_DIR}/network.o 38 | 39 | train: $(OBJS) 40 | $(EXEC) $(NVCC) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ $+ 41 | 42 | .PHONY: clean 43 | clean: 44 | rm -f ${TARGET} ${OBJ_DIR}/*.o 45 | 46 | -------------------------------------------------------------------------------- /Chapter10/10_deep_learning/02_cnn/download_mnist.bat: -------------------------------------------------------------------------------- 1 | echo off 2 | REM setting variables 3 | set zipPath="C:\Program Files\7-Zip\7z.exe" 4 | set train_images="train-images-idx3-ubyte.gz" 5 | set train_labels="train-labels-idx1-ubyte.gz" 6 | set test_images="t10k-images-idx3-ubyte.gz" 7 | set test_labels="t10k-labels-idx1-ubyte.gz" 8 | set url_base="http://yann.lecun.com/exdb/mnist" 9 | 10 | REM check if 7-zip installed 11 | IF NOT EXIST %zipPath% GOTO NO_7ZIP 12 | 13 | REM create dataset folder for the datasets 14 | mkdir dataset 15 | cd dataset 16 | 17 | REM download datasets 18 | curl -O %url_base%/%train_images% 19 | %zipPath% e .\train-images-idx3-ubyte.gz 20 | curl -O %url_base%/train-labels-idx1-ubyte.gz 21 | %zipPath% e .\train-labels-idx1-ubyte.gz 22 | curl -O %url_base%/t10k-images-idx3-ubyte.gz 23 | %zipPath% e .\t10k-images-idx3-ubyte.gz 24 | curl -O %url_base%/t10k-labels-idx1-ubyte.gz 25 | %zipPath% e .\t10k-labels-idx1-ubyte.gz 26 | 27 | exit 28 | 29 | REM exception: no 7-zip found 30 | :NO_7ZIP 31 | echo "Please install 7-zip to extract downloaded MNIST dataset" 32 | exit /b 1 -------------------------------------------------------------------------------- /Chapter10/10_deep_learning/02_cnn/download_mnist.sh: 
-------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | url_base=http://yann.lecun.com/exdb/mnist 4 | 5 | mkdir -p dataset 6 | cd dataset 7 | 8 | curl -O ${url_base}/train-images-idx3-ubyte.gz 9 | curl -O ${url_base}/train-labels-idx1-ubyte.gz 10 | curl -O ${url_base}/t10k-images-idx3-ubyte.gz 11 | curl -O ${url_base}/t10k-labels-idx1-ubyte.gz 12 | 13 | gunzip *.gz -------------------------------------------------------------------------------- /Chapter10/10_deep_learning/02_cnn/src/loss.h: -------------------------------------------------------------------------------- 1 | #ifndef _LOSS_H_ 2 | #define _LOSS_H_ 3 | 4 | #include "blob.h" 5 | 6 | namespace cudl 7 | { 8 | 9 | class CrossEntropyLoss 10 | { 11 | public: 12 | CrossEntropyLoss(); 13 | ~CrossEntropyLoss(); 14 | 15 | float loss(Blob *predict, Blob *target); 16 | float accuracy(Blob *predict, Blob *target); 17 | 18 | private: 19 | // reduced loss 20 | float h_loss_ = 0.f; 21 | float *d_loss_ = nullptr; 22 | 23 | float *d_workspace_ = nullptr; 24 | void init_workspace(int batch_size); 25 | }; 26 | 27 | } // namespace cudl 28 | 29 | #endif // _LOSS_H_ -------------------------------------------------------------------------------- /Chapter10/10_deep_learning/02_cnn/src/network.h: -------------------------------------------------------------------------------- 1 | #ifndef _NETWORK_H_ 2 | #define _NETWORK_H_ 3 | 4 | #include 5 | #include 6 | 7 | #include 8 | 9 | #include "helper.h" 10 | #include "loss.h" 11 | #include "layer.h" 12 | 13 | namespace cudl { 14 | 15 | typedef enum { 16 | training, 17 | inference 18 | } WorkloadType; 19 | 20 | class Network 21 | { 22 | public: 23 | Network(); 24 | ~Network(); 25 | 26 | void add_layer(Layer *layer); 27 | 28 | Blob *forward(Blob *input); 29 | void backward(Blob *input = nullptr); 30 | void update(float learning_rate = 0.02f); 31 | 32 | int load_pretrain(); 33 | int write_file(); 34 | 35 | float loss(Blob *target); 36 | int get_accuracy(Blob *target); 37 | 38 | void cuda(); 39 | void train(); 40 | void test(); 41 | 42 | Blob *output_; 43 | 44 | std::vector layers(); 45 | 46 | 47 | private: 48 | std::vector layers_; 49 | 50 | CudaContext *cuda_ = nullptr; 51 | 52 | WorkloadType phase_ = inference; 53 | }; 54 | 55 | } // namespace cudl 56 | 57 | 58 | #endif // _NETWORK_H_ -------------------------------------------------------------------------------- /Chapter10/10_deep_learning/03_rnn/Makefile: -------------------------------------------------------------------------------- 1 | CUDA_PATH=/usr/local/cuda 2 | HOST_COMPILER ?= g++ 3 | NVCC=${CUDA_PATH}/bin/nvcc -ccbin ${HOST_COMPILER} 4 | TARGET=rnn 5 | 6 | INCLUDES = -I${CUDA_PATH}/samples/common/inc -I$(CUDA_PATH)/include 7 | NVCC_FLAGS=--resource-usage -Xcompiler -rdynamic -Xcompiler -fopenmp -rdc=true 8 | 9 | IS_CUDA_11:=${shell expr `$(NVCC) --version | grep compilation | grep -Eo -m 1 '[0-9]+.[0-9]' | head -1` \>= 11.0} 10 | 11 | # Gencode argumentes 12 | SMS = 35 37 50 52 60 61 70 75 13 | ifeq "$(IS_CUDA_11)" "1" 14 | SMS = 52 60 61 70 75 80 15 | endif 16 | $(foreach sm, ${SMS}, $(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm))) 17 | 18 | LIBRARIES += -L/usr/local/cuda/lib -lcublas -lcudnn -lgomp -lcurand 19 | ALL_CCFLAGS += -m64 -g -std=c++11 $(NVCC_FLAGS) $(INCLUDES) $(LIBRARIES) 20 | 21 | all : ${TARGET} 22 | 23 | rnn: rnn.cpp 24 | $(EXEC) $(NVCC) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ $+ 25 | 26 | clean: 27 | rm -f ${TARGET} ${OBJ_DIR}/*.o 28 | 29 | 
-------------------------------------------------------------------------------- /Chapter10/10_deep_learning/04_framework_profile/pytorch/README.md: -------------------------------------------------------------------------------- 1 | PyTorch Training profile with nvprof and NVTX. 2 | 3 | To simplify the working environment configuration, the profile commands depend on NGC PyTorch container. If your working environment is not ready to use NGC, please visit https://ngc.nvidia.com and configure your working environment following the NGC user guide. -------------------------------------------------------------------------------- /Chapter10/10_deep_learning/04_framework_profile/pytorch/RN50v1.5/GPU_1.log: -------------------------------------------------------------------------------- 1 | => creating model '('resnet50', 'classic')' 2 | Version: {'net': , 'block': , 'layers': [3, 4, 6, 3], 'num_classes': 1000} 3 | Config: {'conv': , 'conv_init': 'fan_out', 'nonlinearity': 'relu', 'last_bn_0_init': False, 'activation': at 0x7f5e241dc378>} 4 | WARNING: `HostDecoderRandomCrop` is now deprecated. Use `ImageDecoderRandomCrop` instead 5 | read 1281167 files from 1000 directories 6 | WARNING: `nvJPEGDecoder` is now deprecated. Use `ImageDecoder` instead 7 | read 50000 files from 1000 directories 8 | ! Weight decay NOT applied to BN parameters 9 | 98 10 | 63 11 | -------------------------------------------------------------------------------- /Chapter10/10_deep_learning/04_framework_profile/pytorch/RN50v1.5/examples/RN50_FP16_1GPU.sh: -------------------------------------------------------------------------------- 1 | # This script launches ResNet50 training in FP16 on 1 GPUs using 256 batch size (256 per GPU) 2 | # Usage ./RN50_FP16_1GPU.sh 3 | 4 | python $1/main.py -j5 -p 500 --arch resnet50 -c fanin --label-smoothing 0.1 -b 256 --lr 0.1 --epochs 90 --fp16 --static-loss-scale 256 $2 /data/imagenet 5 | -------------------------------------------------------------------------------- /Chapter10/10_deep_learning/04_framework_profile/pytorch/RN50v1.5/examples/RN50_FP16_4GPU.sh: -------------------------------------------------------------------------------- 1 | # This script launches ResNet50 training in FP16 on 4 GPUs using 1024 batch size (256 per GPU) 2 | # Usage ./RN50_FP16_4GPU.sh 3 | 4 | python $1/multiproc.py --nproc_per_node 4 $1/main.py -j5 -p 500 --arch resnet50 -c fanin --label-smoothing 0.1 -b 256 --lr 0.4 --warmup 5 --epochs 90 --fp16 --static-loss-scale 256 $2 /data/imagenet 5 | -------------------------------------------------------------------------------- /Chapter10/10_deep_learning/04_framework_profile/pytorch/RN50v1.5/examples/RN50_FP16_8GPU.sh: -------------------------------------------------------------------------------- 1 | # This script launches ResNet50 training in FP16 on 8 GPUs using 2048 batch size (256 per GPU) 2 | # Usage ./RN50_FP16_8GPU.sh 3 | 4 | python $1/multiproc.py --nproc_per_node 8 $1/main.py -j5 -p 500 --arch resnet50 -c fanin --label-smoothing 0.1 -b 256 --lr 0.8 --warmup 5 --epochs 90 --fp16 --static-loss-scale 256 $2 /data/imagenet 5 | -------------------------------------------------------------------------------- /Chapter10/10_deep_learning/04_framework_profile/pytorch/RN50v1.5/examples/RN50_FP16_EVAL.sh: -------------------------------------------------------------------------------- 1 | # This script evaluates ResNet50 model in FP16 using 64 batch size on 1 GPU 2 | # Usage: ./RN50_FP16_EVAL.sh 3 | 4 | python $1/main.py -j5 p 100 --arch resnet50 -b 256 
--resume $2 --evaluate --fp16 /data/imagenet 5 | -------------------------------------------------------------------------------- /Chapter10/10_deep_learning/04_framework_profile/pytorch/RN50v1.5/examples/RN50_FP16_INFERENCE_BENCHMARK.sh: -------------------------------------------------------------------------------- 1 | # This script launches ResNet50 inference benchmark in FP16 on 1 GPU with 256 batch size 2 | 3 | python ./main.py -j5 --arch resnet50 -b 256 --fp16 --benchmark-inference /data/imagenet 4 | -------------------------------------------------------------------------------- /Chapter10/10_deep_learning/04_framework_profile/pytorch/RN50v1.5/examples/RN50_FP32_1GPU.sh: -------------------------------------------------------------------------------- 1 | # This script launches ResNet50 training in FP32 on 1 GPUs using 128 batch size (128 per GPU) 2 | # Usage ./RN50_FP32_1GPU.sh 3 | 4 | python $1/main.py -j5 -p 500 --arch resnet50 -c fanin --label-smoothing 0.1 -b 128 --lr 0.05 --epochs 90 $2 /data/imagenet 5 | -------------------------------------------------------------------------------- /Chapter10/10_deep_learning/04_framework_profile/pytorch/RN50v1.5/examples/RN50_FP32_4GPU.sh: -------------------------------------------------------------------------------- 1 | # This script launches ResNet50 training in FP32 on 4 GPUs using 512 batch size (128 per GPU) 2 | # Usage ./RN50_FP32_4GPU.sh 3 | 4 | python $1/multiproc.py --nproc_per_node 4 $1/main.py -j5 -p 500 --arch resnet50 -c fanin --label-smoothing 0.1 -b 128 --lr 0.2 --warmup 5 --epochs 90 $2 /data/imagenet 5 | -------------------------------------------------------------------------------- /Chapter10/10_deep_learning/04_framework_profile/pytorch/RN50v1.5/examples/RN50_FP32_8GPU.sh: -------------------------------------------------------------------------------- 1 | # This script launches ResNet50 training in FP32 on 8 GPUs using 1024 batch size (128 per GPU) 2 | # Usage ./RN50_FP32_8GPU.sh 3 | 4 | python $1/multiproc.py --nproc_per_node 8 $1/main.py -j5 -p 500 --arch resnet50 -c fanin --label-smoothing 0.1 -b 128 --lr 0.4 --warmup 5 --epochs 90 $2 /data/imagenet 5 | -------------------------------------------------------------------------------- /Chapter10/10_deep_learning/04_framework_profile/pytorch/RN50v1.5/examples/RN50_FP32_EVAL.sh: -------------------------------------------------------------------------------- 1 | # This script evaluates ResNet50 model in FP32 using 64 batch size on 1 GPU 2 | # Usage: ./RN50_FP32_EVAL.sh 3 | 4 | python $1/main.py -j5 p 100 --arch resnet50 -b 128 --resume $2 --evaluate /data/imagenet 5 | -------------------------------------------------------------------------------- /Chapter10/10_deep_learning/04_framework_profile/pytorch/RN50v1.5/examples/RN50_FP32_INFERENCE_BENCHMARK.sh: -------------------------------------------------------------------------------- 1 | # This script launches ResNet50 inference benchmark in FP32 on 1 GPU with 128 batch size 2 | 3 | python ./main.py -j5 --arch resnet50 -b 128 --benchmark-inference /data/imagenet 4 | -------------------------------------------------------------------------------- /Chapter10/10_deep_learning/04_framework_profile/pytorch/RN50v1.5/image_classification/__init__.py: -------------------------------------------------------------------------------- 1 | from . import logger 2 | from . import dataloaders 3 | from . import training 4 | from . import utils 5 | from . import mixup 6 | from . import resnet 7 | from . 
import smoothing 8 | -------------------------------------------------------------------------------- /Chapter10/10_deep_learning/04_framework_profile/pytorch/RN50v1.5/image_classification/smoothing.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | class LabelSmoothing(nn.Module): 5 | """ 6 | NLL loss with label smoothing. 7 | """ 8 | def __init__(self, smoothing=0.0): 9 | """ 10 | Constructor for the LabelSmoothing module. 11 | 12 | :param smoothing: label smoothing factor 13 | """ 14 | super(LabelSmoothing, self).__init__() 15 | self.confidence = 1.0 - smoothing 16 | self.smoothing = smoothing 17 | 18 | def forward(self, x, target): 19 | logprobs = torch.nn.functional.log_softmax(x, dim=-1) 20 | 21 | nll_loss = -logprobs.gather(dim=-1, index=target.unsqueeze(1)) 22 | nll_loss = nll_loss.squeeze(1) 23 | smooth_loss = -logprobs.mean(dim=-1) 24 | loss = self.confidence * nll_loss + self.smoothing * smooth_loss 25 | return loss.mean() 26 | 27 | -------------------------------------------------------------------------------- /Chapter10/10_deep_learning/04_framework_profile/pytorch/RN50v1.5/img/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Learn-CUDA-Programming/10bd9b59bfcc680d608ea1417da5089f73d91b11/Chapter10/10_deep_learning/04_framework_profile/pytorch/RN50v1.5/img/.gitkeep -------------------------------------------------------------------------------- /Chapter10/10_deep_learning/04_framework_profile/pytorch/RN50v1.5/img/DGX2_250_loss.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Learn-CUDA-Programming/10bd9b59bfcc680d608ea1417da5089f73d91b11/Chapter10/10_deep_learning/04_framework_profile/pytorch/RN50v1.5/img/DGX2_250_loss.png -------------------------------------------------------------------------------- /Chapter10/10_deep_learning/04_framework_profile/pytorch/RN50v1.5/img/DGX2_250_top1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Learn-CUDA-Programming/10bd9b59bfcc680d608ea1417da5089f73d91b11/Chapter10/10_deep_learning/04_framework_profile/pytorch/RN50v1.5/img/DGX2_250_top1.png -------------------------------------------------------------------------------- /Chapter10/10_deep_learning/04_framework_profile/pytorch/RN50v1.5/img/DGX2_250_top5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Learn-CUDA-Programming/10bd9b59bfcc680d608ea1417da5089f73d91b11/Chapter10/10_deep_learning/04_framework_profile/pytorch/RN50v1.5/img/DGX2_250_top5.png -------------------------------------------------------------------------------- /Chapter10/10_deep_learning/04_framework_profile/pytorch/RN50v1.5/img/training_accuracy.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Learn-CUDA-Programming/10bd9b59bfcc680d608ea1417da5089f73d91b11/Chapter10/10_deep_learning/04_framework_profile/pytorch/RN50v1.5/img/training_accuracy.png -------------------------------------------------------------------------------- /Chapter10/10_deep_learning/04_framework_profile/pytorch/RN50v1.5/img/training_loss.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/PacktPublishing/Learn-CUDA-Programming/10bd9b59bfcc680d608ea1417da5089f73d91b11/Chapter10/10_deep_learning/04_framework_profile/pytorch/RN50v1.5/img/training_loss.png -------------------------------------------------------------------------------- /Chapter10/10_deep_learning/04_framework_profile/pytorch/RN50v1.5/img/validation_accuracy.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Learn-CUDA-Programming/10bd9b59bfcc680d608ea1417da5089f73d91b11/Chapter10/10_deep_learning/04_framework_profile/pytorch/RN50v1.5/img/validation_accuracy.png -------------------------------------------------------------------------------- /Chapter10/10_deep_learning/04_framework_profile/pytorch/RN50v1.5/resnet50_pyt.qdrep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Learn-CUDA-Programming/10bd9b59bfcc680d608ea1417da5089f73d91b11/Chapter10/10_deep_learning/04_framework_profile/pytorch/RN50v1.5/resnet50_pyt.qdrep -------------------------------------------------------------------------------- /Chapter10/10_deep_learning/04_framework_profile/pytorch/RN50v1.5/resnet50_pyt_2g.qdrep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Learn-CUDA-Programming/10bd9b59bfcc680d608ea1417da5089f73d91b11/Chapter10/10_deep_learning/04_framework_profile/pytorch/RN50v1.5/resnet50_pyt_2g.qdrep -------------------------------------------------------------------------------- /Chapter10/10_deep_learning/04_framework_profile/pytorch/RN50v1.5/resnet50v1.5/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Learn-CUDA-Programming/10bd9b59bfcc680d608ea1417da5089f73d91b11/Chapter10/10_deep_learning/04_framework_profile/pytorch/RN50v1.5/resnet50v1.5/README.md -------------------------------------------------------------------------------- /Chapter10/10_deep_learning/04_framework_profile/pytorch/RN50v1.5/resnet50v1.5/training/DGX1_RN50_FP16_250E.sh: -------------------------------------------------------------------------------- 1 | python ./multiproc.py --nproc_per_node 8 ./main.py --raport-file raport.json -j5 -p 100 --lr 2.048 --optimizer-batch-size 2048 --warmup 8 --arch resnet50 -c fanin --label-smoothing 0.1 --data-backend pytorch --lr-schedule cosine --mom 0.875 --wd 3.0517578125e-05 --workspace $1 -b 256 --fp16 --static-loss-scale 128 --epochs 250 --mixup 0.2 /data/imagenet -------------------------------------------------------------------------------- /Chapter10/10_deep_learning/04_framework_profile/pytorch/RN50v1.5/resnet50v1.5/training/DGX1_RN50_FP16_50E.sh: -------------------------------------------------------------------------------- 1 | python ./multiproc.py --nproc_per_node 8 ./main.py --raport-file raport.json -j5 -p 100 --lr 2.048 --optimizer-batch-size 2048 --warmup 8 --arch resnet50 -c fanin --label-smoothing 0.1 --data-backend pytorch --lr-schedule cosine --mom 0.875 --wd 3.0517578125e-05 --workspace $1 -b 256 --fp16 --static-loss-scale 128 --epochs 50 /data/imagenet -------------------------------------------------------------------------------- /Chapter10/10_deep_learning/04_framework_profile/pytorch/RN50v1.5/resnet50v1.5/training/DGX1_RN50_FP16_90E.sh: -------------------------------------------------------------------------------- 1 | python 
./multiproc.py --nproc_per_node 8 ./main.py --raport-file raport.json -j5 -p 100 --lr 2.048 --optimizer-batch-size 2048 --warmup 8 --arch resnet50 -c fanin --label-smoothing 0.1 --data-backend pytorch --lr-schedule cosine --mom 0.875 --wd 3.0517578125e-05 --workspace $1 -b 256 --fp16 --static-loss-scale 128 --epochs 90 /data/imagenet -------------------------------------------------------------------------------- /Chapter10/10_deep_learning/04_framework_profile/pytorch/RN50v1.5/resnet50v1.5/training/DGX1_RN50_FP32_250E.sh: -------------------------------------------------------------------------------- 1 | python ./multiproc.py --nproc_per_node 8 ./main.py --raport-file raport.json -j5 -p 100 --lr 2.048 --optimizer-batch-size 2048 --warmup 8 --arch resnet50 -c fanin --label-smoothing 0.1 --data-backend pytorch --lr-schedule cosine --mom 0.875 --wd 3.0517578125e-05 --workspace $1 -b 128 --epochs 250 --mixup 0.2 /data/imagenet -------------------------------------------------------------------------------- /Chapter10/10_deep_learning/04_framework_profile/pytorch/RN50v1.5/resnet50v1.5/training/DGX1_RN50_FP32_50E.sh: -------------------------------------------------------------------------------- 1 | python ./multiproc.py --nproc_per_node 8 ./main.py --raport-file raport.json -j5 -p 100 --lr 2.048 --optimizer-batch-size 2048 --warmup 8 --arch resnet50 -c fanin --label-smoothing 0.1 --data-backend pytorch --lr-schedule cosine --mom 0.875 --wd 3.0517578125e-05 --workspace $1 -b 128 --epochs 50 /data/imagenet -------------------------------------------------------------------------------- /Chapter10/10_deep_learning/04_framework_profile/pytorch/RN50v1.5/resnet50v1.5/training/DGX1_RN50_FP32_90E.sh: -------------------------------------------------------------------------------- 1 | python ./multiproc.py --nproc_per_node 8 ./main.py --raport-file raport.json -j5 -p 100 --lr 2.048 --optimizer-batch-size 2048 --warmup 8 --arch resnet50 -c fanin --label-smoothing 0.1 --data-backend pytorch --lr-schedule cosine --mom 0.875 --wd 3.0517578125e-05 --workspace $1 -b 128 --epochs 90 /data/imagenet -------------------------------------------------------------------------------- /Chapter10/10_deep_learning/04_framework_profile/pytorch/RN50v1.5/resnet50v1.5/training/DGX2_RN50_FP16_250E.sh: -------------------------------------------------------------------------------- 1 | python ./multiproc.py --nproc_per_node 16 ./main.py --raport-file raport.json -j5 -p 100 --lr 4.096 --optimizer-batch-size 4096 --warmup 16 --arch resnet50 -c fanin --label-smoothing 0.1 --data-backend pytorch --lr-schedule cosine --mom 0.875 --wd 3.0517578125e-05 --workspace $1 -b 256 --fp16 --static-loss-scale 128 --epochs 250 --mixup 0.2 /data/imagenet -------------------------------------------------------------------------------- /Chapter10/10_deep_learning/04_framework_profile/pytorch/RN50v1.5/resnet50v1.5/training/DGX2_RN50_FP16_50E.sh: -------------------------------------------------------------------------------- 1 | python ./multiproc.py --nproc_per_node 16 ./main.py --raport-file raport.json -j5 -p 100 --lr 4.096 --optimizer-batch-size 4096 --warmup 16 --arch resnet50 -c fanin --label-smoothing 0.1 --data-backend pytorch --lr-schedule cosine --mom 0.875 --wd 3.0517578125e-05 --workspace $1 -b 256 --fp16 --static-loss-scale 128 --epochs 50 /data/imagenet -------------------------------------------------------------------------------- 
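The DGX-1 and DGX-2 training scripts in this directory (above and below) all derive --lr from --optimizer-batch-size with the linear-scaling rule of 0.256 per 256 images in the optimizer batch (2.048 for an optimizer batch of 2048, 4.096 for 4096); the FP32 variants keep the same rule and rely on gradient accumulation to reach the optimizer batch from the smaller per-GPU batch. As a minimal Python sketch of that calculation only, assuming the 0.256/256 base used in these scripts (the helper name is illustrative and not part of the repository):

def linear_scaled_lr(optimizer_batch_size, base_lr=0.256, base_batch=256):
    # Linear learning-rate scaling: base_lr per base_batch images in the optimizer batch.
    return base_lr * optimizer_batch_size / base_batch

# Reproduces the values hard-coded in the launch scripts:
print(linear_scaled_lr(2048))  # 2.048 -> DGX-1 scripts (--optimizer-batch-size 2048)
print(linear_scaled_lr(4096))  # 4.096 -> DGX-2 scripts (--optimizer-batch-size 4096)

Keeping the per-image learning rate constant while the optimizer batch grows is the usual justification for the linear-scaling rule, which is why the 8-GPU and 16-GPU recipes change only the learning rate and warmup while keeping the epoch budget the same.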
/Chapter10/10_deep_learning/04_framework_profile/pytorch/RN50v1.5/resnet50v1.5/training/DGX2_RN50_FP16_90E.sh: -------------------------------------------------------------------------------- 1 | python ./multiproc.py --nproc_per_node 16 ./main.py --raport-file raport.json -j5 -p 100 --lr 4.096 --optimizer-batch-size 4096 --warmup 16 --arch resnet50 -c fanin --label-smoothing 0.1 --data-backend pytorch --lr-schedule cosine --mom 0.875 --wd 3.0517578125e-05 --workspace $1 -b 256 --fp16 --static-loss-scale 128 --epochs 90 /data/imagenet -------------------------------------------------------------------------------- /Chapter10/10_deep_learning/04_framework_profile/pytorch/RN50v1.5/resnet50v1.5/training/DGX2_RN50_FP32_250E.sh: -------------------------------------------------------------------------------- 1 | python ./multiproc.py --nproc_per_node 16 ./main.py --raport-file raport.json -j5 -p 100 --lr 4.096 --optimizer-batch-size 4096 --warmup 16 --arch resnet50 -c fanin --label-smoothing 0.1 --data-backend pytorch --lr-schedule cosine --mom 0.875 --wd 3.0517578125e-05 --workspace $1 -b 128 --epochs 250 --mixup 0.2 /data/imagenet -------------------------------------------------------------------------------- /Chapter10/10_deep_learning/04_framework_profile/pytorch/RN50v1.5/resnet50v1.5/training/DGX2_RN50_FP32_50E.sh: -------------------------------------------------------------------------------- 1 | python ./multiproc.py --nproc_per_node 16 ./main.py --raport-file raport.json -j5 -p 100 --lr 4.096 --optimizer-batch-size 4096 --warmup 16 --arch resnet50 -c fanin --label-smoothing 0.1 --data-backend pytorch --lr-schedule cosine --mom 0.875 --wd 3.0517578125e-05 --workspace $1 -b 128 --epochs 50 /data/imagenet -------------------------------------------------------------------------------- /Chapter10/10_deep_learning/04_framework_profile/pytorch/RN50v1.5/resnet50v1.5/training/DGX2_RN50_FP32_90E.sh: -------------------------------------------------------------------------------- 1 | python ./multiproc.py --nproc_per_node 16 ./main.py --raport-file raport.json -j5 -p 100 --lr 4.096 --optimizer-batch-size 4096 --warmup 16 --arch resnet50 -c fanin --label-smoothing 0.1 --data-backend pytorch --lr-schedule cosine --mom 0.875 --wd 3.0517578125e-05 --workspace $1 -b 128 --epochs 90 /data/imagenet -------------------------------------------------------------------------------- /Chapter10/10_deep_learning/04_framework_profile/pytorch/RN50v1.5/test.qdrep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Learn-CUDA-Programming/10bd9b59bfcc680d608ea1417da5089f73d91b11/Chapter10/10_deep_learning/04_framework_profile/pytorch/RN50v1.5/test.qdrep -------------------------------------------------------------------------------- /Chapter10/10_deep_learning/04_framework_profile/pytorch/nsys-nvtx.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | CODE_PATH="RN50v1.5" 4 | DATASET_PATH="/raid/datasets/imagenet/raw-data/" 5 | OUTPUT_NAME="resnet50_pyt" 6 | 7 | # default profile 8 | docker run --rm -ti --runtime=nvidia \ 9 | -v $(pwd)/${CODE_PATH}:/workspace \ 10 | -v ${DATASET_PATH}:/imagenet \ 11 | nvcr.io/nvidia/pytorch:19.08-py3 \ 12 | nsys profile -t cuda,nvtx,cudnn,cublas -o ${OUTPUT_NAME} -f true -w true -y 60 -d 20 \ 13 | python /workspace/main.py --arch resnet50 -b 64 --fp16 /imagenet 14 |
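The nsys-nvtx.sh launcher above collects NVTX ranges together with the CUDA, cuDNN, and cuBLAS traces (-t cuda,nvtx,cudnn,cublas), so any NVTX annotations emitted by the training script appear as named ranges on the Nsight Systems timeline. As a hedged, standalone Python sketch of how such annotations look in PyTorch (an illustration only, not the repository's main.py):

import torch
import torch.cuda.nvtx as nvtx

# Annotate one training step with NVTX ranges so that nsys can group the
# kernels of each phase (forward / backward / optimizer) on the timeline.
model = torch.nn.Linear(1024, 1000).cuda()
criterion = torch.nn.CrossEntropyLoss().cuda()
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)

data = torch.randn(64, 1024, device="cuda")
target = torch.randint(0, 1000, (64,), device="cuda")

for step in range(10):
    nvtx.range_push("iteration {}".format(step))

    nvtx.range_push("forward")
    loss = criterion(model(data), target)
    nvtx.range_pop()

    nvtx.range_push("backward")
    optimizer.zero_grad()
    loss.backward()
    nvtx.range_pop()

    nvtx.range_push("optimizer step")
    optimizer.step()
    nvtx.range_pop()

    nvtx.range_pop()  # iteration

Profiling a script annotated this way with the nsys command above makes each iteration and phase visible in the NVTX row of the generated report; the .qdrep files checked in alongside (for example resnet50_pyt.qdrep) are such Nsight Systems reports.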
/Chapter10/10_deep_learning/04_framework_profile/tensorflow/RN50v1.5/baseline.qdrep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Learn-CUDA-Programming/10bd9b59bfcc680d608ea1417da5089f73d91b11/Chapter10/10_deep_learning/04_framework_profile/tensorflow/RN50v1.5/baseline.qdrep -------------------------------------------------------------------------------- /Chapter10/10_deep_learning/04_framework_profile/tensorflow/RN50v1.5/dllogger/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Learn-CUDA-Programming/10bd9b59bfcc680d608ea1417da5089f73d91b11/Chapter10/10_deep_learning/04_framework_profile/tensorflow/RN50v1.5/dllogger/__init__.py -------------------------------------------------------------------------------- /Chapter10/10_deep_learning/04_framework_profile/tensorflow/RN50v1.5/model/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from model import layers 16 | from model import blocks 17 | from model import resnet_v1_5 -------------------------------------------------------------------------------- /Chapter10/10_deep_learning/04_framework_profile/tensorflow/RN50v1.5/model/blocks/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | 18 | 19 | from model.blocks.conv2d_block import conv2d_block 20 | from model.blocks.resnet_bottleneck_block import bottleneck_block 21 | 22 | __all__ = [ 23 | 24 | # conv + bn + act block 25 | 'conv2d_block', 26 | 27 | # resnet blocks 28 | 'bottleneck_block' 29 | ] 30 | -------------------------------------------------------------------------------- /Chapter10/10_deep_learning/04_framework_profile/tensorflow/RN50v1.5/model/layers/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 
2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | #!/usr/bin/env python 16 | # -*- coding: utf-8 -*- 17 | 18 | from model.layers.activation import relu 19 | from model.layers.activation import softmax 20 | from model.layers.activation import tanh 21 | 22 | from model.layers.conv2d import conv2d 23 | 24 | from model.layers.dense import dense 25 | 26 | from model.layers.math_ops import reduce_mean 27 | 28 | from model.layers.normalization import batch_norm 29 | 30 | from model.layers.padding import pad 31 | 32 | from model.layers.pooling import average_pooling2d 33 | from model.layers.pooling import max_pooling2d 34 | 35 | __all__ = [ 36 | 37 | # activation layers 38 | 'relu', 39 | 'softmax', 40 | 'tanh', 41 | 42 | # conv layers 43 | 'conv2d', 44 | 45 | # dense layers 46 | 'dense', 47 | 48 | # math_ops layers 49 | 'reduce_mean', 50 | 51 | # normalization layers 52 | 'batch_norm', 53 | 54 | # padding layers 55 | 'pad', 56 | 57 | # pooling layers 58 | 'average_pooling2d', 59 | 'max_pooling2d' 60 | ] 61 | -------------------------------------------------------------------------------- /Chapter10/10_deep_learning/04_framework_profile/tensorflow/RN50v1.5/model/layers/activation.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | #!/usr/bin/env python 16 | # -*- coding: utf-8 -*- 17 | 18 | import tensorflow as tf 19 | 20 | __all__ = ['relu', 'softmax', 'tanh'] 21 | 22 | 23 | def relu(inputs, name='relu'): 24 | 25 | net = tf.nn.relu(inputs, name=name) 26 | 27 | return net 28 | 29 | 30 | def softmax(inputs, axis=None, name="softmax"): 31 | 32 | net = tf.nn.softmax( 33 | inputs, 34 | axis=axis, 35 | name=name, 36 | ) 37 | 38 | return net 39 | 40 | 41 | def tanh(inputs, name='tanh'): 42 | 43 | net = tf.math.tanh(inputs, name=name) 44 | 45 | return net 46 | -------------------------------------------------------------------------------- /Chapter10/10_deep_learning/04_framework_profile/tensorflow/RN50v1.5/model/layers/dense.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import tensorflow as tf 16 | 17 | __all__ = ['dense'] 18 | 19 | 20 | def dense( 21 | inputs, 22 | units, 23 | use_bias=True, 24 | trainable=True, 25 | kernel_initializer=tf.variance_scaling_initializer(), 26 | bias_initializer=tf.zeros_initializer() 27 | ): 28 | 29 | net = tf.layers.dense( 30 | inputs, 31 | units=units, 32 | activation=None, 33 | use_bias=use_bias, 34 | kernel_initializer=kernel_initializer, 35 | bias_initializer=bias_initializer, 36 | trainable=trainable 37 | ) 38 | 39 | return net 40 | -------------------------------------------------------------------------------- /Chapter10/10_deep_learning/04_framework_profile/tensorflow/RN50v1.5/model/layers/math_ops.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | #!/usr/bin/env python 16 | # -*- coding: utf-8 -*- 17 | 18 | import tensorflow as tf 19 | 20 | __all__ = ['reduce_mean'] 21 | 22 | 23 | def reduce_mean(inputs, keepdims=None, data_format='channels_last', name='spatial_mean'): 24 | 25 | if data_format not in ['NHWC', 'NCHW']: 26 | raise ValueError("Unknown data format: `%s` (accepted: ['NHWC', 'NCHW'])" % data_format) 27 | 28 | axes = [1, 2] if data_format == 'NHWC' else [2, 3] 29 | 30 | net = tf.math.reduce_mean(inputs, axis=axes, keepdims=keepdims, name=name) 31 | 32 | return net 33 | -------------------------------------------------------------------------------- /Chapter10/10_deep_learning/04_framework_profile/tensorflow/RN50v1.5/model/layers/padding.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | #!/usr/bin/env python 16 | # -*- coding: utf-8 -*- 17 | 18 | import tensorflow as tf 19 | 20 | __all__ = ['pad'] 21 | 22 | 23 | def pad(inputs, paddings, mode='CONSTANT', name='padding', constant_values=0): 24 | 25 | if mode.upper() not in ['CONSTANT', 'REFLECT', 'SYMMETRIC']: 26 | raise ValueError("Unknown padding mode: `%s` (accepted: ['CONSTANT', 'REFLECT', 'SYMMETRIC'])" % mode) 27 | 28 | net = tf.pad(inputs, paddings=paddings, mode=mode, name=name, constant_values=constant_values) 29 | 30 | return net 31 | -------------------------------------------------------------------------------- /Chapter10/10_deep_learning/04_framework_profile/tensorflow/RN50v1.5/requirements.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Learn-CUDA-Programming/10bd9b59bfcc680d608ea1417da5089f73d91b11/Chapter10/10_deep_learning/04_framework_profile/tensorflow/RN50v1.5/requirements.txt -------------------------------------------------------------------------------- /Chapter10/10_deep_learning/04_framework_profile/tensorflow/RN50v1.5/results/events.out.tfevents.1566195554.5b8c84c05f4e: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Learn-CUDA-Programming/10bd9b59bfcc680d608ea1417da5089f73d91b11/Chapter10/10_deep_learning/04_framework_profile/tensorflow/RN50v1.5/results/events.out.tfevents.1566195554.5b8c84c05f4e -------------------------------------------------------------------------------- /Chapter10/10_deep_learning/04_framework_profile/tensorflow/RN50v1.5/results/model.ckpt-1000.index: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Learn-CUDA-Programming/10bd9b59bfcc680d608ea1417da5089f73d91b11/Chapter10/10_deep_learning/04_framework_profile/tensorflow/RN50v1.5/results/model.ckpt-1000.index -------------------------------------------------------------------------------- /Chapter10/10_deep_learning/04_framework_profile/tensorflow/RN50v1.5/results/model.ckpt-2000.data-00001-of-00002: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Learn-CUDA-Programming/10bd9b59bfcc680d608ea1417da5089f73d91b11/Chapter10/10_deep_learning/04_framework_profile/tensorflow/RN50v1.5/results/model.ckpt-2000.data-00001-of-00002 -------------------------------------------------------------------------------- /Chapter10/10_deep_learning/04_framework_profile/tensorflow/RN50v1.5/runtime/__init__.py: -------------------------------------------------------------------------------- 1 | from runtime.runner import Runner -------------------------------------------------------------------------------- /Chapter10/10_deep_learning/04_framework_profile/tensorflow/RN50v1.5/scripts/RN50_FP16_16GPU.sh: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | # This script launches ResNet50 training in FP16 on 16 GPUs using 4096 batch size (256 per GPU) 16 | # Usage ./RN50_FP16_16GPU.sh 17 | 18 | mpiexec --allow-run-as-root --bind-to socket -np 16 \ 19 | python $1/main.py --mode=train_and_evaluate --iter_unit=epoch --num_iter=50 --batch_size=256 --warmup_steps=100 --use_cosine_lr --label_smoothing 0.1 --lr_init=0.256 --lr_warmup_epochs=8 --momentum=0.875 --weight_decay=3.0517578125e-05 --use_tf_amp --data_dir=$2 --results_dir=$3 -------------------------------------------------------------------------------- /Chapter10/10_deep_learning/04_framework_profile/tensorflow/RN50v1.5/scripts/RN50_FP16_1GPU.sh: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | # This script launches ResNet50 training in FP16 on 1 GPUs using 256 batch size (256 per GPU) 16 | # Usage ./RN50_FP16_1GPU.sh 17 | 18 | python $1/main.py --mode=train_and_evaluate --iter_unit=epoch --num_iter=50 --batch_size=256 --warmup_steps=100 --use_cosine_lr --label_smoothing 0.1 --lr_init=0.256 --lr_warmup_epochs=8 --momentum=0.875 --weight_decay=3.0517578125e-05 --use_tf_amp --data_dir=$2 --results_dir=$3 -------------------------------------------------------------------------------- /Chapter10/10_deep_learning/04_framework_profile/tensorflow/RN50v1.5/scripts/RN50_FP16_4GPU.sh: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | # This script launches ResNet50 training in FP16 on 4 GPUs using 1024 batch size (256 per GPU) 16 | # Usage ./RN50_FP16_4GPU.sh 17 | 18 | mpiexec --allow-run-as-root --bind-to socket -np 4 \ 19 | python $1/main.py --mode=train_and_evaluate --iter_unit=epoch --num_iter=50 --batch_size=256 --warmup_steps=100 --use_cosine_lr --label_smoothing 0.1 --lr_init=0.256 --lr_warmup_epochs=8 --momentum=0.875 --weight_decay=3.0517578125e-05 --use_tf_amp --data_dir=$2 --results_dir=$3 -------------------------------------------------------------------------------- /Chapter10/10_deep_learning/04_framework_profile/tensorflow/RN50v1.5/scripts/RN50_FP16_8GPU.sh: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | # This script launches ResNet50 training in FP16 on 8 GPUs using 2048 batch size (256 per GPU) 16 | # Usage ./RN50_FP16_8GPU.sh 17 | 18 | mpiexec --allow-run-as-root --bind-to socket -np 8 \ 19 | python $1/main.py --mode=train_and_evaluate --iter_unit=epoch --num_iter=50 --batch_size=256 --warmup_steps=100 --use_cosine_lr --label_smoothing 0.1 --lr_init=0.256 --lr_warmup_epochs=8 --momentum=0.875 --weight_decay=3.0517578125e-05 --use_tf_amp --data_dir=$2 --results_dir=$3 -------------------------------------------------------------------------------- /Chapter10/10_deep_learning/04_framework_profile/tensorflow/RN50v1.5/scripts/RN50_FP16_EVAL.sh: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | # This script evaluates ResNet50 model in FP16 using 256 batch size on 1 GPU 16 | # Usage: ./RN50_FP16_EVAL.sh 17 | 18 | python $1/main.py --mode=evaluate --data_dir=$2 --batch_size=256 --num_iter=1 --iter_unit=epoch --use_tf_amp --results_dir=$3 19 | -------------------------------------------------------------------------------- /Chapter10/10_deep_learning/04_framework_profile/tensorflow/RN50v1.5/scripts/RN50_FP32_16GPU.sh: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | # This script launches ResNet50 training in FP32 on 16 GPUs using 2048 batch size (128 per GPU) 16 | ## Usage ./RN50_FP32_16GPU.sh 17 | 18 | mpiexec --allow-run-as-root --bind-to socket -np 16 \ 19 | python $1/main.py --mode=train_and_evaluate --iter_unit=epoch --num_iter=50 --batch_size=128 --warmup_steps=100 --use_cosine_lr --label_smoothing 0.1 --lr_init=0.256 --lr_warmup_epochs=8 --momentum=0.875 --weight_decay=3.0517578125e-05 --data_dir=$2 --results_dir=$3 -------------------------------------------------------------------------------- /Chapter10/10_deep_learning/04_framework_profile/tensorflow/RN50v1.5/scripts/RN50_FP32_1GPU.sh: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | # This script launches ResNet50 training in FP32 on 1 GPUs using 128 batch size (128 per GPU) 16 | # Usage ./RN50_FP32_1GPU.sh 17 | 18 | python $1/main.py --mode=train_and_evaluate --iter_unit=epoch --num_iter=50 --batch_size=128 --warmup_steps=100 --use_cosine_lr --label_smoothing 0.1 --lr_init=0.256 --lr_warmup_epochs=8 --momentum=0.875 --weight_decay=3.0517578125e-05 --data_dir=$2 --results_dir=$3 -------------------------------------------------------------------------------- /Chapter10/10_deep_learning/04_framework_profile/tensorflow/RN50v1.5/scripts/RN50_FP32_4GPU.sh: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | # This script launches ResNet50 training in FP32 on 4 GPUs using 512 batch size (128 per GPU) 16 | # Usage ./RN50_FP32_4GPU.sh 17 | 18 | mpiexec --allow-run-as-root --bind-to socket -np 4 \ 19 | python $1/main.py --mode=train_and_evaluate --iter_unit=epoch --num_iter=50 --batch_size=128 --warmup_steps=100 --use_cosine_lr --label_smoothing 0.1 --lr_init=0.256 --lr_warmup_epochs=8 --momentum=0.875 --weight_decay=3.0517578125e-05 --data_dir=$2 --results_dir=$3 -------------------------------------------------------------------------------- /Chapter10/10_deep_learning/04_framework_profile/tensorflow/RN50v1.5/scripts/RN50_FP32_8GPU.sh: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | # This script launches ResNet50 training in FP32 on 8 GPUs using 1024 batch size (128 per GPU) 16 | ## Usage ./RN50_FP32_8GPU.sh 17 | 18 | mpiexec --allow-run-as-root --bind-to socket -np 8 \ 19 | python $1/main.py --mode=train_and_evaluate --iter_unit=epoch --num_iter=50 --batch_size=128 --warmup_steps=100 --use_cosine_lr --label_smoothing 0.1 --lr_init=0.256 --lr_warmup_epochs=8 --momentum=0.875 --weight_decay=3.0517578125e-05 --data_dir=$2 --results_dir=$3 -------------------------------------------------------------------------------- /Chapter10/10_deep_learning/04_framework_profile/tensorflow/RN50v1.5/scripts/RN50_FP32_EVAL.sh: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | # This script evaluates ResNet50 model in FP32 using 128 batch size on 1 GPU 16 | # Usage: ./RN50_FP32_EVAL.sh 17 | 18 | python $1/main.py --mode=evaluate --data_dir=$2 --batch_size=128 --num_iter=1 --iter_unit=epoch --results_dir=$3 19 | -------------------------------------------------------------------------------- /Chapter10/10_deep_learning/04_framework_profile/tensorflow/RN50v1.5/scripts/benchmarking/DGX1V_inferbench_fp16.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | mkdir -p /tmp/results 4 | 5 | python ./scripts/benchmarking/benchmark.py --mode inference --bench-warmup 100 --bench-iterations 200 --ngpus 1 --bs 1 2 4 8 16 32 64 128 256 --baseline ./scripts/benchmarking/baselines/DGX1V_RN50_tensorflow_infer_fp16.json --perf_args "use_tf_amp" --data_dir $1 --results_dir $2 6 | 7 | python ./scripts/benchmarking/benchmark.py --mode inference --bench-warmup 100 --bench-iterations 200 --ngpus 1 --bs 1 2 4 8 16 32 64 128 192 --baseline ./scripts/benchmarking/baselines/DGX1V_RN50_tensorflow_infer_fp16.json --perf_args "use_tf_amp" "use_xla" --data_dir $1 --results_dir $2/xla 8 | -------------------------------------------------------------------------------- /Chapter10/10_deep_learning/04_framework_profile/tensorflow/RN50v1.5/scripts/benchmarking/DGX1V_inferbench_fp32.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | mkdir -p /tmp/results 4 | 5 | python ./scripts/benchmarking/benchmark.py --mode inference --bench-warmup 100 --bench-iterations 200 --ngpus 1 --bs 1 2 4 8 16 32 64 128 --baseline ./scripts/benchmarking/baselines/DGX1V_RN50_tensorflow_infer_fp32.json --data_dir $1 --results_dir $2 6 | 7 | python ./scripts/benchmarking/benchmark.py --mode inference --bench-warmup 100 --bench-iterations 200 --ngpus 1 --bs 1 2 4 8 16 32 64 96 --baseline ./scripts/benchmarking/baselines/DGX1V_RN50_tensorflow_infer_fp32.json --perf_args "use_xla" --data_dir $1 --results_dir $2/xla -------------------------------------------------------------------------------- /Chapter10/10_deep_learning/04_framework_profile/tensorflow/RN50v1.5/scripts/benchmarking/DGX1V_trainbench_fp16.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | mkdir -p /tmp/results 4 | 5 | python ./scripts/benchmarking/benchmark.py --mode training --bench-warmup 200 --bench-iterations 500 --ngpus 1 4 8 --bs 64 128 256 --baseline ./scripts/benchmarking/baselines/DGX1V_RN50_tensorflow_train_fp16.json --data_dir $1 --perf_args "use_tf_amp" --results_dir $2 6 | 7 | python ./scripts/benchmarking/benchmark.py --mode training --bench-warmup 200 --bench-iterations 500 --ngpus 1 4 8 --bs 32 64 128 192 --baseline ./scripts/benchmarking/baselines/DGX1V_RN50_tensorflow_train_fp16.json --perf_args "use_xla" "use_tf_amp" --data_dir $1 --results_dir $2/xla 8 | 9 | -------------------------------------------------------------------------------- /Chapter10/10_deep_learning/04_framework_profile/tensorflow/RN50v1.5/scripts/benchmarking/DGX1V_trainbench_fp32.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | mkdir -p /tmp/results 4 | 5 | python ./scripts/benchmarking/benchmark.py --mode training --bench-warmup 200 --bench-iterations 500 --ngpus 1 4 8 --bs 32 64 128 --baseline ./scripts/benchmarking/baselines/DGX1V_RN50_tensorflow_train_fp32.json --data_dir $1 --results_dir $2 6 | 7 | python 
./scripts/benchmarking/benchmark.py --mode training --bench-warmup 200 --bench-iterations 500 --ngpus 1 4 8 --bs 32 64 96 --baseline ./scripts/benchmarking/baselines/DGX1V_RN50_tensorflow_train_fp32.json --perf_args "use_xla" --data_dir $1 --results_dir $2/xla 8 | -------------------------------------------------------------------------------- /Chapter10/10_deep_learning/04_framework_profile/tensorflow/RN50v1.5/scripts/benchmarking/DGX2_inferbench_fp16.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | mkdir -p /tmp/results 4 | 5 | python ./scripts/benchmarking/benchmark.py --mode inference --bench-warmup 100 --bench-iterations 200 --ngpus 1 --bs 1 2 4 8 16 32 64 128 256 --baseline ./scripts/benchmarking/baselines/DGX2_RN50_tensorflow_infer_fp16.json --perf_args "use_tf_amp" --data_dir $1 --results_dir $2 6 | 7 | python ./scripts/benchmarking/benchmark.py --mode inference --bench-warmup 100 --bench-iterations 200 --ngpus 1 --bs 1 2 4 8 16 32 64 128 256 --baseline ./scripts/benchmarking/baselines/DGX2_RN50_tensorflow_infer_fp16.json --perf_args "use_xla" "use_tf_amp" --data_dir $1 --results_dir $2/xla -------------------------------------------------------------------------------- /Chapter10/10_deep_learning/04_framework_profile/tensorflow/RN50v1.5/scripts/benchmarking/DGX2_inferbench_fp32.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | mkdir -p /tmp/results 4 | 5 | python ./scripts/benchmarking/benchmark.py --mode inference --bench-warmup 100 --bench-iterations 200 --ngpus 1 --bs 1 2 4 8 16 32 64 128 --baseline ./scripts/benchmarking/baselines/DGX2_RN50_tensorflow_infer_fp32.json --data_dir $1 --results_dir $2 6 | 7 | python ./scripts/benchmarking/benchmark.py --mode inference --bench-warmup 100 --bench-iterations 200 --ngpus 1 --bs 1 2 4 8 16 32 64 128 --baseline ./scripts/benchmarking/baselines/DGX2_RN50_tensorflow_infer_fp32.json --perf_args "use_xla" --data_dir $1 --results_dir $2/xla -------------------------------------------------------------------------------- /Chapter10/10_deep_learning/04_framework_profile/tensorflow/RN50v1.5/scripts/benchmarking/DGX2_trainbench_fp16.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | mkdir -p /tmp/results 4 | 5 | python ./scripts/benchmarking/benchmark.py --mode training --bench-warmup 200 --bench-iterations 500 --ngpus 1 4 8 16 --bs 64 128 256 --baseline ./scripts/benchmarking/baselines/DGX2_RN50_tensorflow_train_fp16.json --perf_args "use_tf_amp" --data_dir $1 --results_dir $2 6 | 7 | python ./scripts/benchmarking/benchmark.py --mode training --bench-warmup 200 --bench-iterations 500 --ngpus 1 4 8 16 --bs 64 128 256 --baseline ./scripts/benchmarking/baselines/DGX2_RN50_tensorflow_train_fp16.json --perf_args "use_xla" "use_tf_amp" --data_dir $1 --results_dir $2/xla -------------------------------------------------------------------------------- /Chapter10/10_deep_learning/04_framework_profile/tensorflow/RN50v1.5/scripts/benchmarking/DGX2_trainbench_fp32.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | mkdir -p /tmp/results 4 | 5 | python ./scripts/benchmarking/benchmark.py --mode training --bench-warmup 200 --bench-iterations 500 --ngpus 1 4 8 16 --bs 32 64 128 --baseline ./scripts/benchmarking/baselines/DGX2_RN50_tensorflow_train_fp32.json --data_dir $1 --results_dir $2 6 | 7 | python 
./scripts/benchmarking/benchmark.py --mode training --bench-warmup 200 --bench-iterations 500 --ngpus 1 4 8 16 --bs 32 64 128 --baseline ./scripts/benchmarking/baselines/DGX2_RN50_tensorflow_train_fp32.json --perf_args "use_xla" --data_dir $1 --results_dir $2/xla -------------------------------------------------------------------------------- /Chapter10/10_deep_learning/04_framework_profile/tensorflow/RN50v1.5/scripts/benchmarking/baselines/DGX1V_RN50_tensorflow_infer_fp16.json: -------------------------------------------------------------------------------- 1 | { 2 | "metric_keys": [ 3 | "total_ips" 4 | ], 5 | "metrics": { 6 | "1": { 7 | "16": { 8 | "total_ips": 1300.0 9 | }, 10 | "32": { 11 | "total_ips": 1600.0 12 | }, 13 | "1": { 14 | "total_ips": 160.0 15 | }, 16 | "2": { 17 | "total_ips": 320.0 18 | }, 19 | "64": { 20 | "total_ips": 1800.0 21 | }, 22 | "4": { 23 | "total_ips": 550.0 24 | }, 25 | "128": { 26 | "total_ips": 1950.0 27 | }, 28 | "8": { 29 | "total_ips": 950.0 30 | }, 31 | "256": { 32 | "total_ips": 2050.0 33 | } 34 | } 35 | }, 36 | "model": "", 37 | "ngpus": [ 38 | 1 39 | ], 40 | "bs": [ 41 | 1, 42 | 2, 43 | 4, 44 | 8, 45 | 16, 46 | 32, 47 | 64, 48 | 128, 49 | 256 50 | ] 51 | } 52 | -------------------------------------------------------------------------------- /Chapter10/10_deep_learning/04_framework_profile/tensorflow/RN50v1.5/scripts/benchmarking/baselines/DGX1V_RN50_tensorflow_infer_fp32.json: -------------------------------------------------------------------------------- 1 | { 2 | "metric_keys": [ 3 | "total_ips" 4 | ], 5 | "metrics": { 6 | "1": { 7 | "16": { 8 | "total_ips": 800.0 9 | }, 10 | "32": { 11 | "total_ips": 920.0 12 | }, 13 | "1": { 14 | "total_ips": 150.0 15 | }, 16 | "2": { 17 | "total_ips": 270.0 18 | }, 19 | "64": { 20 | "total_ips": 1000.0 21 | }, 22 | "4": { 23 | "total_ips": 450.0 24 | }, 25 | "128": { 26 | "total_ips": 1075.0 27 | }, 28 | "8": { 29 | "total_ips": 650.0 30 | } 31 | } 32 | }, 33 | "model": "", 34 | "ngpus": [ 35 | 1 36 | ], 37 | "bs": [ 38 | 1, 39 | 2, 40 | 4, 41 | 8, 42 | 16, 43 | 32, 44 | 64, 45 | 128 46 | ] 47 | } 48 | -------------------------------------------------------------------------------- /Chapter10/10_deep_learning/04_framework_profile/tensorflow/RN50v1.5/scripts/benchmarking/baselines/DGX1V_RN50_tensorflow_train_fp16.json: -------------------------------------------------------------------------------- 1 | { 2 | "metric_keys": [ 3 | "total_ips" 4 | ], 5 | "metrics": { 6 | "1": { 7 | "64": { 8 | "total_ips": 630.0 9 | }, 10 | "128": { 11 | "total_ips": 710.0 12 | }, 13 | "256": { 14 | "total_ips": 750.0 15 | } 16 | }, 17 | "4": { 18 | "64": { 19 | "total_ips": 2250.0 20 | }, 21 | "128": { 22 | "total_ips": 2600.0 23 | }, 24 | "256": { 25 | "total_ips": 2900.0 26 | } 27 | }, 28 | "8": { 29 | "64": { 30 | "total_ips": 4500.0 31 | }, 32 | "128": { 33 | "total_ips": 5300.0 34 | }, 35 | "256": { 36 | "total_ips": 5800.0 37 | } 38 | } 39 | }, 40 | "model": "", 41 | "ngpus": [ 42 | 1, 43 | 4, 44 | 8 45 | ], 46 | "bs": [ 47 | 64, 48 | 128, 49 | 256 50 | ] 51 | } 52 | -------------------------------------------------------------------------------- /Chapter10/10_deep_learning/04_framework_profile/tensorflow/RN50v1.5/scripts/benchmarking/baselines/DGX1V_RN50_tensorflow_train_fp32.json: -------------------------------------------------------------------------------- 1 | { 2 | "metric_keys": [ 3 | "total_ips" 4 | ], 5 | "metrics": { 6 | "1": { 7 | "32": { 8 | "total_ips": 300.0 9 | }, 10 | "64": { 11 | "total_ips": 330.0 12 | }, 
13 | "128": { 14 | "total_ips": 350.0 15 | } 16 | }, 17 | "4": { 18 | "32": { 19 | "total_ips": 1050.0 20 | }, 21 | "64": { 22 | "total_ips": 1250.0 23 | }, 24 | "128": { 25 | "total_ips": 1350.0 26 | } 27 | }, 28 | "8": { 29 | "32": { 30 | "total_ips": 2100.0 31 | }, 32 | "64": { 33 | "total_ips": 2500.0 34 | }, 35 | "128": { 36 | "total_ips": 2700.0 37 | } 38 | } 39 | }, 40 | "model": "", 41 | "ngpus": [ 42 | 1, 43 | 4, 44 | 8 45 | ], 46 | "bs": [ 47 | 32, 48 | 64, 49 | 128 50 | ] 51 | } 52 | -------------------------------------------------------------------------------- /Chapter10/10_deep_learning/04_framework_profile/tensorflow/RN50v1.5/scripts/benchmarking/baselines/DGX2_RN50_tensorflow_infer_fp16.json: -------------------------------------------------------------------------------- 1 | { 2 | "metric_keys": [ 3 | "total_ips" 4 | ], 5 | "metrics": { 6 | "1": { 7 | "16": { 8 | "total_ips": 1300.0 9 | }, 10 | "32": { 11 | "total_ips": 1600.0 12 | }, 13 | "1": { 14 | "total_ips": 160.0 15 | }, 16 | "2": { 17 | "total_ips": 320.0 18 | }, 19 | "64": { 20 | "total_ips": 1800.0 21 | }, 22 | "4": { 23 | "total_ips": 550.0 24 | }, 25 | "128": { 26 | "total_ips": 1950.0 27 | }, 28 | "8": { 29 | "total_ips": 950.0 30 | }, 31 | "256": { 32 | "total_ips": 2050.0 33 | } 34 | } 35 | }, 36 | "model": "", 37 | "ngpus": [ 38 | 1 39 | ], 40 | "bs": [ 41 | 1, 42 | 2, 43 | 4, 44 | 8, 45 | 16, 46 | 32, 47 | 64, 48 | 128, 49 | 256 50 | ] 51 | } 52 | -------------------------------------------------------------------------------- /Chapter10/10_deep_learning/04_framework_profile/tensorflow/RN50v1.5/scripts/benchmarking/baselines/DGX2_RN50_tensorflow_infer_fp32.json: -------------------------------------------------------------------------------- 1 | { 2 | "metric_keys": [ 3 | "total_ips" 4 | ], 5 | "metrics": { 6 | "1": { 7 | "16": { 8 | "total_ips": 800.0 9 | }, 10 | "32": { 11 | "total_ips": 920.0 12 | }, 13 | "1": { 14 | "total_ips": 150.0 15 | }, 16 | "2": { 17 | "total_ips": 270.0 18 | }, 19 | "64": { 20 | "total_ips": 1000.0 21 | }, 22 | "4": { 23 | "total_ips": 450.0 24 | }, 25 | "128": { 26 | "total_ips": 1075.0 27 | }, 28 | "8": { 29 | "total_ips": 650.0 30 | } 31 | } 32 | }, 33 | "model": "", 34 | "ngpus": [ 35 | 1 36 | ], 37 | "bs": [ 38 | 1, 39 | 2, 40 | 4, 41 | 8, 42 | 16, 43 | 32, 44 | 64, 45 | 128 46 | ] 47 | } 48 | -------------------------------------------------------------------------------- /Chapter10/10_deep_learning/04_framework_profile/tensorflow/RN50v1.5/scripts/benchmarking/baselines/DGX2_RN50_tensorflow_train_fp16.json: -------------------------------------------------------------------------------- 1 | { 2 | "metric_keys": [ 3 | "total_ips" 4 | ], 5 | "metrics": { 6 | "1": { 7 | "64": { 8 | "total_ips": 630.0 9 | }, 10 | "128": { 11 | "total_ips": 710.0 12 | }, 13 | "256": { 14 | "total_ips": 750.0 15 | } 16 | }, 17 | "4": { 18 | "64": { 19 | "total_ips": 2250.0 20 | }, 21 | "128": { 22 | "total_ips": 2600.0 23 | }, 24 | "256": { 25 | "total_ips": 2900.0 26 | } 27 | }, 28 | "8": { 29 | "64": { 30 | "total_ips": 4650.0 31 | }, 32 | "128": { 33 | "total_ips": 5500.0 34 | }, 35 | "256": { 36 | "total_ips": 6000.0 37 | } 38 | }, 39 | "16": { 40 | "64": { 41 | "total_ips": 9000.0 42 | }, 43 | "128": { 44 | "total_ips": 10500.0 45 | }, 46 | "256": { 47 | "total_ips": 11500.0 48 | } 49 | } 50 | }, 51 | "model": "", 52 | "ngpus": [ 53 | 1, 54 | 4, 55 | 8, 56 | 16 57 | ], 58 | "bs": [ 59 | 64, 60 | 128, 61 | 256 62 | ] 63 | } 64 | 
-------------------------------------------------------------------------------- /Chapter10/10_deep_learning/04_framework_profile/tensorflow/RN50v1.5/scripts/benchmarking/baselines/DGX2_RN50_tensorflow_train_fp32.json: -------------------------------------------------------------------------------- 1 | { 2 | "metric_keys": [ 3 | "total_ips" 4 | ], 5 | "metrics": { 6 | "1": { 7 | "32": { 8 | "total_ips": 300.0 9 | }, 10 | "64": { 11 | "total_ips": 330.0 12 | }, 13 | "128": { 14 | "total_ips": 350.0 15 | } 16 | }, 17 | "4": { 18 | "32": { 19 | "total_ips": 1050.0 20 | }, 21 | "64": { 22 | "total_ips": 1250.0 23 | }, 24 | "128": { 25 | "total_ips": 1350.0 26 | } 27 | }, 28 | "8": { 29 | "32": { 30 | "total_ips": 2100.0 31 | }, 32 | "64": { 33 | "total_ips": 2500.0 34 | }, 35 | "128": { 36 | "total_ips": 2700.0 37 | } 38 | }, 39 | "16": { 40 | "32": { 41 | "total_ips": 4100.0 42 | }, 43 | "64": { 44 | "total_ips": 5100.0 45 | }, 46 | "128": { 47 | "total_ips": 5500.0 48 | } 49 | } 50 | }, 51 | "model": "", 52 | "ngpus": [ 53 | 1, 54 | 4, 55 | 8, 56 | 16 57 | ], 58 | "bs": [ 59 | 32, 60 | 64, 61 | 128 62 | ] 63 | } 64 | -------------------------------------------------------------------------------- /Chapter10/10_deep_learning/04_framework_profile/tensorflow/RN50v1.5/scripts/docker/build.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | docker build . --rm -t rn50v15_tf 4 | -------------------------------------------------------------------------------- /Chapter10/10_deep_learning/04_framework_profile/tensorflow/RN50v1.5/scripts/docker/interactive.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | nvidia-docker run -it --rm --ipc=host --shm-size=1g --ulimit memlock=-1 --ulimit stack=67108864 -v $PWD:/workspace/rn50v15_tf/ rn50v15_tf bash 4 | -------------------------------------------------------------------------------- /Chapter10/10_deep_learning/04_framework_profile/tensorflow/RN50v1.5/utils/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | # Copyright 2018 The TensorFlow Authors. All Rights Reserved. 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 
17 | # ============================================================================== 18 | 19 | from utils import hooks 20 | 21 | from utils import var_storage 22 | 23 | from utils import cmdline_helper 24 | 25 | from utils import data_utils 26 | from utils import image_processing 27 | 28 | from utils import learning_rate 29 | 30 | from utils import dali_utils 31 | -------------------------------------------------------------------------------- /Chapter10/10_deep_learning/04_framework_profile/tensorflow/RN50v1.5/utils/hooks/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | from utils.hooks.training_hooks import * 5 | from utils.hooks.benchmark_hooks import * 6 | from utils.hooks.prefill_hook import * 7 | -------------------------------------------------------------------------------- /Chapter10/10_deep_learning/04_framework_profile/tensorflow/RN50v1.5/utils/hooks/prefill_hook.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | 4 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | 18 | import time 19 | 20 | import numpy as np 21 | import tensorflow as tf 22 | 23 | __all__ = ['PrefillStagingAreasHook'] 24 | 25 | 26 | class PrefillStagingAreasHook(tf.train.SessionRunHook): 27 | 28 | def after_create_session(self, session, coord): 29 | # TODO: This assumes TF collections are ordered; is this safe? 30 | enqueue_ops = tf.get_collection('STAGING_AREA_PUTS') 31 | for i in range(len(enqueue_ops)): 32 | session.run(enqueue_ops[:i + 1]) 33 | -------------------------------------------------------------------------------- /Chapter10/10_deep_learning/04_framework_profile/tensorflow/RN50v1.5/utils/hvd_utils.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 
17 | 18 | import os 19 | 20 | __all__ = ["is_using_hvd"] 21 | 22 | 23 | def is_using_hvd(): 24 | env_vars = ["OMPI_COMM_WORLD_RANK", "OMPI_COMM_WORLD_SIZE"] 25 | 26 | if all([var in os.environ for var in env_vars]): 27 | return True 28 | else: 29 | return False 30 | -------------------------------------------------------------------------------- /Chapter10/10_deep_learning/04_framework_profile/tensorflow/nsys-nvtx-2g.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | CODE_PATH="RN50v1.5" 4 | DATASET_PATH="/raid/datasets/imagenet/tfrecord" 5 | OUTPUT_NAME="resnet50_tf" 6 | 7 | # default profile: trace CUDA, NVTX, cuDNN, and cuBLAS for a 2-process (mpiexec -np 2) run; delay capture 40 s (-y), record 20 s (-d) 8 | docker run --rm -ti --runtime=nvidia \ 9 | -v $(pwd):/result \ 10 | -v $(pwd)/${CODE_PATH}:/workspace \ 11 | -v ${DATASET_PATH}:/imagenet \ 12 | --ipc host --net host \ 13 | nvcr.io/nvidia/tensorflow:19.08-py3 \ 14 | nsys profile -t cuda,nvtx,cudnn,cublas -o ${OUTPUT_NAME} -f true -w true -y 40 -d 20 \ 15 | mpiexec --allow-run-as-root --bind-to socket -np 2 \ 16 | python /workspace/main.py --data_dir=/imagenet --mode=training_benchmark --warmup_steps 200 \ 17 | --num_iter 500 --iter_unit batch --results_dir=results --batch_size 64 18 | -------------------------------------------------------------------------------- /Chapter10/10_deep_learning/04_framework_profile/tensorflow/nsys-nvtx.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | CODE_PATH="RN50v1.5" 4 | DATASET_PATH="/raid/datasets/imagenet/tfrecord" 5 | OUTPUT_NAME="resnet50_tf" 6 | 7 | # default profile: trace CUDA, NVTX, cuDNN, and cuBLAS for a single-process run; delay capture 40 s (-y), record 20 s (-d) 8 | docker run --rm -ti --runtime=nvidia \ 9 | -v $(pwd):/result \ 10 | -v $(pwd)/${CODE_PATH}:/workspace \ 11 | -v ${DATASET_PATH}:/imagenet \ 12 | nvcr.io/nvidia/tensorflow:19.08-py3 \ 13 | nsys profile -t cuda,nvtx,cudnn,cublas -o ${OUTPUT_NAME} -f true -w true -y 40 -d 20 \ 14 | python /workspace/main.py --data_dir=/imagenet --mode=training_benchmark --warmup_steps 200 \ 15 | --num_iter 500 --iter_unit batch --results_dir=results --batch_size 64 16 | -------------------------------------------------------------------------------- /Chapter10/10_deep_learning/Makefile: -------------------------------------------------------------------------------- 1 | # Project folders that contain CUDA recipes 2 | PROJECTS ?= $(shell find \ 3 | $(shell ls -d */) \ 4 | -name Makefile) 5 | 6 | %.ph_build: 7 | +@$(MAKE) -C $(dir $*) $(MAKECMDGOALS) 8 | 9 | %.ph_clean: 10 | +@$(MAKE) -C $(dir $*) clean $(USE_DEVICE) 11 | 12 | all: $(addsuffix .ph_build,$(PROJECTS)) 13 | @echo "Finished building CUDA Recipes" 14 | 15 | build: $(addsuffix .ph_build,$(PROJECTS)) 16 | 17 | tidy: 18 | @find * | egrep "#" | xargs rm -f 19 | @find * | egrep "\~" | xargs rm -f 20 | @find * | egrep "nvvp" | xargs rm -f 21 | 22 | .PHONY: clean 23 | clean: tidy $(addsuffix .ph_clean,$(PROJECTS)) 24 | 25 | test: 26 | echo $(DIRECTORY) 27 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 Packt 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do
so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | --------------------------------------------------------------------------------