├── slurm_pmi_config
│   ├── lib
│   │   └── .gitkeep
│   └── include
│       ├── smd_ns.h
│       └── slurm_errno.h
├── labs
│   └── CFD
│       ├── English
│       │   ├── C
│       │   │   ├── source_code
│       │   │   │   ├── mpi
│       │   │   │   │   ├── .gitkeep
│       │   │   │   │   ├── hello_world.c
│       │   │   │   │   ├── containerization
│       │   │   │   │   │   ├── Makefile
│       │   │   │   │   │   └── jacobi_kernels.cu
│       │   │   │   │   ├── Makefile
│       │   │   │   │   └── jacobi_kernels.cu
│       │   │   │   ├── nccl
│       │   │   │   │   ├── .gitkeep
│       │   │   │   │   ├── Makefile
│       │   │   │   │   └── jacobi_kernels.cu
│       │   │   │   ├── nvshmem
│       │   │   │   │   ├── .gitkeep
│       │   │   │   │   ├── Makefile
│       │   │   │   │   └── left_shift.cu
│       │   │   │   ├── p2pBandwidthLatencyTest
│       │   │   │   │   ├── Common
│       │   │   │   │   │   ├── GL
│       │   │   │   │   │   │   ├── freeglut.h
│       │   │   │   │   │   │   └── freeglut_ext.h
│       │   │   │   │   │   ├── rendercheck_d3d11.h
│       │   │   │   │   │   ├── UtilNPP
│       │   │   │   │   │   │   ├── SignalAllocatorsCPU.h
│       │   │   │   │   │   │   ├── ImageAllocatorsCPU.h
│       │   │   │   │   │   │   ├── Pixel.h
│       │   │   │   │   │   │   ├── SignalsCPU.h
│       │   │   │   │   │   │   ├── SignalsNPP.h
│       │   │   │   │   │   │   ├── ImagesCPU.h
│       │   │   │   │   │   │   ├── Image.h
│       │   │   │   │   │   │   ├── Signal.h
│       │   │   │   │   │   │   ├── ImageIO.h
│       │   │   │   │   │   │   ├── ImagePacked.h
│       │   │   │   │   │   │   ├── ImagesNPP.h
│       │   │   │   │   │   │   └── Exceptions.h
│       │   │   │   │   │   ├── helper_functions.h
│       │   │   │   │   │   ├── helper_multiprocess.h
│       │   │   │   │   │   ├── rendercheck_d3d11.cpp
│       │   │   │   │   │   ├── helper_cusolver.h
│       │   │   │   │   │   ├── exception.h
│       │   │   │   │   │   ├── dynlink_d3d11.h
│       │   │   │   │   │   └── nvrtc_helper.h
│       │   │   │   │   └── Makefile
│       │   │   │   ├── cuda
│       │   │   │   │   └── Makefile
│       │   │   │   └── single_gpu
│       │   │   │       ├── Makefile
│       │   │   │       └── jacobi.cu
│       │   │   ├── jupyter_notebook
│       │   │   │   ├── mpi
│       │   │   │   │   ├── .gitkeep
│       │   │   │   │   └── multi_node_intro.ipynb
│       │   │   │   ├── nccl
│       │   │   │   │   └── .gitkeep
│       │   │   │   └── nvhsmem
│       │   │   │       └── .gitkeep
│       │   │   └── images
│       │   │       ├── jacobi_algo.jpg
│       │   │       ├── git_branching.jpg
│       │   │       ├── gpudirect_p2p.png
│       │   │       ├── gpudirect_rdma.png
│       │   │       ├── halo_exchange.png
│       │   │       ├── mpi_overview.png
│       │   │       ├── nsys_overview.png
│       │   │       ├── memcpy_gpu_util.png
│       │   │       ├── memcpy_host_staging.png
│       │   │       ├── memcpy_p2p_overview.png
│       │   │       ├── memcpy_serialized.png
│       │   │       ├── mpi_container_setup.png
│       │   │       ├── mpi_memcpy_overview.png
│       │   │       ├── nccl_architecture.png
│       │   │       ├── nccl_dgx1_topology.png
│       │   │       ├── nvidia_smi_p2p_gpu0.png
│       │   │       ├── cuda_streams_overview.png
│       │   │       ├── domain_decomposition.png
│       │   │       ├── memcpy_util_selection.png
│       │   │       ├── memcpyasync_parallel.png
│       │   │       ├── mpi_host_staging_time.png
│       │   │       ├── mpi_memcpy_large_time.png
│       │   │       ├── mpi_memcpy_nvtx_stats.png
│       │   │       ├── nccl_profiler_output.png
│       │   │       ├── nvshmem_memory_model.png
│       │   │       ├── open_terminal_session.png
│       │   │       ├── p2p_2_gpu_memcpy_nsys.png
│       │   │       ├── dgx1_8x_tesla_v100_topo.png
│       │   │       ├── gpu_programming_process.png
│       │   │       ├── intra_node_topology_map.png
│       │   │       ├── jacobi_memcpy_p2p_report.png
│       │   │       ├── jupyter_lab_navigation.png
│       │   │       ├── nsys_cli_sample_output.png
│       │   │       ├── nsys_single_gpu_analysis.png
│       │   │       ├── nvidia_smi_topo_output.png
│       │   │       ├── nvshmem_mpi_comparison.png
│       │   │       ├── nvshmem_profiler_report.png
│       │   │       ├── streams_util_selection.png
│       │   │       ├── mpi_cuda_aware_gdr_latency.png
│       │   │       ├── mpi_cuda_aware_p2p_metrics.png
│       │   │       ├── nvshmem_left_shift_output.png
│       │   │       ├── nvshmem_thread_level_comm.png
│       │   │       ├── jacobi_memcpy_report_events.png
│       │   │       ├── jacobi_memcpy_report_overview.png
│       │   │       ├── mpi_memcpy_halo_exchange_latency.png
│       │   │       ├── mpi_cuda_aware_halo_exchange_latency.png
│       │   │       ├── mpi_host_staging_throughput_latency.png
│       │   │       └── jacobi_memcpy_streams_events_p2p_report.png
│       │   ├── Presentations
│       │   │   └── README.md
│       │   └── start_here.ipynb
│       └── LICENSE
├── .gitignore
├── Singularity
├── CONTRIBUTING.md
└── README.md
/slurm_pmi_config/lib/.gitkeep: 
-------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /labs/CFD/English/C/source_code/mpi/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /labs/CFD/English/C/jupyter_notebook/mpi/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /labs/CFD/English/C/source_code/nccl/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /labs/CFD/English/C/source_code/nvshmem/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /labs/CFD/English/C/jupyter_notebook/nccl/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /labs/CFD/English/C/jupyter_notebook/nvhsmem/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /labs/CFD/English/C/images/jacobi_algo.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openhackathons-org/nways_multi_gpu/main/labs/CFD/English/C/images/jacobi_algo.jpg -------------------------------------------------------------------------------- /labs/CFD/English/C/images/git_branching.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openhackathons-org/nways_multi_gpu/main/labs/CFD/English/C/images/git_branching.jpg -------------------------------------------------------------------------------- /labs/CFD/English/C/images/gpudirect_p2p.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openhackathons-org/nways_multi_gpu/main/labs/CFD/English/C/images/gpudirect_p2p.png -------------------------------------------------------------------------------- /labs/CFD/English/C/images/gpudirect_rdma.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openhackathons-org/nways_multi_gpu/main/labs/CFD/English/C/images/gpudirect_rdma.png -------------------------------------------------------------------------------- /labs/CFD/English/C/images/halo_exchange.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openhackathons-org/nways_multi_gpu/main/labs/CFD/English/C/images/halo_exchange.png -------------------------------------------------------------------------------- /labs/CFD/English/C/images/mpi_overview.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openhackathons-org/nways_multi_gpu/main/labs/CFD/English/C/images/mpi_overview.png -------------------------------------------------------------------------------- /labs/CFD/English/C/images/nsys_overview.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/openhackathons-org/nways_multi_gpu/main/labs/CFD/English/C/images/nsys_overview.png -------------------------------------------------------------------------------- /labs/CFD/English/C/images/memcpy_gpu_util.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openhackathons-org/nways_multi_gpu/main/labs/CFD/English/C/images/memcpy_gpu_util.png -------------------------------------------------------------------------------- /labs/CFD/English/C/images/memcpy_host_staging.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openhackathons-org/nways_multi_gpu/main/labs/CFD/English/C/images/memcpy_host_staging.png -------------------------------------------------------------------------------- /labs/CFD/English/C/images/memcpy_p2p_overview.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openhackathons-org/nways_multi_gpu/main/labs/CFD/English/C/images/memcpy_p2p_overview.png -------------------------------------------------------------------------------- /labs/CFD/English/C/images/memcpy_serialized.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openhackathons-org/nways_multi_gpu/main/labs/CFD/English/C/images/memcpy_serialized.png -------------------------------------------------------------------------------- /labs/CFD/English/C/images/mpi_container_setup.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openhackathons-org/nways_multi_gpu/main/labs/CFD/English/C/images/mpi_container_setup.png -------------------------------------------------------------------------------- /labs/CFD/English/C/images/mpi_memcpy_overview.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openhackathons-org/nways_multi_gpu/main/labs/CFD/English/C/images/mpi_memcpy_overview.png -------------------------------------------------------------------------------- /labs/CFD/English/C/images/nccl_architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openhackathons-org/nways_multi_gpu/main/labs/CFD/English/C/images/nccl_architecture.png -------------------------------------------------------------------------------- /labs/CFD/English/C/images/nccl_dgx1_topology.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openhackathons-org/nways_multi_gpu/main/labs/CFD/English/C/images/nccl_dgx1_topology.png -------------------------------------------------------------------------------- /labs/CFD/English/C/images/nvidia_smi_p2p_gpu0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openhackathons-org/nways_multi_gpu/main/labs/CFD/English/C/images/nvidia_smi_p2p_gpu0.png -------------------------------------------------------------------------------- /labs/CFD/English/C/images/cuda_streams_overview.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openhackathons-org/nways_multi_gpu/main/labs/CFD/English/C/images/cuda_streams_overview.png 
-------------------------------------------------------------------------------- /labs/CFD/English/C/images/domain_decomposition.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openhackathons-org/nways_multi_gpu/main/labs/CFD/English/C/images/domain_decomposition.png -------------------------------------------------------------------------------- /labs/CFD/English/C/images/memcpy_util_selection.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openhackathons-org/nways_multi_gpu/main/labs/CFD/English/C/images/memcpy_util_selection.png -------------------------------------------------------------------------------- /labs/CFD/English/C/images/memcpyasync_parallel.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openhackathons-org/nways_multi_gpu/main/labs/CFD/English/C/images/memcpyasync_parallel.png -------------------------------------------------------------------------------- /labs/CFD/English/C/images/mpi_host_staging_time.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openhackathons-org/nways_multi_gpu/main/labs/CFD/English/C/images/mpi_host_staging_time.png -------------------------------------------------------------------------------- /labs/CFD/English/C/images/mpi_memcpy_large_time.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openhackathons-org/nways_multi_gpu/main/labs/CFD/English/C/images/mpi_memcpy_large_time.png -------------------------------------------------------------------------------- /labs/CFD/English/C/images/mpi_memcpy_nvtx_stats.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openhackathons-org/nways_multi_gpu/main/labs/CFD/English/C/images/mpi_memcpy_nvtx_stats.png -------------------------------------------------------------------------------- /labs/CFD/English/C/images/nccl_profiler_output.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openhackathons-org/nways_multi_gpu/main/labs/CFD/English/C/images/nccl_profiler_output.png -------------------------------------------------------------------------------- /labs/CFD/English/C/images/nvshmem_memory_model.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openhackathons-org/nways_multi_gpu/main/labs/CFD/English/C/images/nvshmem_memory_model.png -------------------------------------------------------------------------------- /labs/CFD/English/C/images/open_terminal_session.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openhackathons-org/nways_multi_gpu/main/labs/CFD/English/C/images/open_terminal_session.png -------------------------------------------------------------------------------- /labs/CFD/English/C/images/p2p_2_gpu_memcpy_nsys.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openhackathons-org/nways_multi_gpu/main/labs/CFD/English/C/images/p2p_2_gpu_memcpy_nsys.png -------------------------------------------------------------------------------- /labs/CFD/English/C/images/dgx1_8x_tesla_v100_topo.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/openhackathons-org/nways_multi_gpu/main/labs/CFD/English/C/images/dgx1_8x_tesla_v100_topo.png -------------------------------------------------------------------------------- /labs/CFD/English/C/images/gpu_programming_process.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openhackathons-org/nways_multi_gpu/main/labs/CFD/English/C/images/gpu_programming_process.png -------------------------------------------------------------------------------- /labs/CFD/English/C/images/intra_node_topology_map.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openhackathons-org/nways_multi_gpu/main/labs/CFD/English/C/images/intra_node_topology_map.png -------------------------------------------------------------------------------- /labs/CFD/English/C/images/jacobi_memcpy_p2p_report.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openhackathons-org/nways_multi_gpu/main/labs/CFD/English/C/images/jacobi_memcpy_p2p_report.png -------------------------------------------------------------------------------- /labs/CFD/English/C/images/jupyter_lab_navigation.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openhackathons-org/nways_multi_gpu/main/labs/CFD/English/C/images/jupyter_lab_navigation.png -------------------------------------------------------------------------------- /labs/CFD/English/C/images/nsys_cli_sample_output.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openhackathons-org/nways_multi_gpu/main/labs/CFD/English/C/images/nsys_cli_sample_output.png -------------------------------------------------------------------------------- /labs/CFD/English/C/images/nsys_single_gpu_analysis.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openhackathons-org/nways_multi_gpu/main/labs/CFD/English/C/images/nsys_single_gpu_analysis.png -------------------------------------------------------------------------------- /labs/CFD/English/C/images/nvidia_smi_topo_output.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openhackathons-org/nways_multi_gpu/main/labs/CFD/English/C/images/nvidia_smi_topo_output.png -------------------------------------------------------------------------------- /labs/CFD/English/C/images/nvshmem_mpi_comparison.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openhackathons-org/nways_multi_gpu/main/labs/CFD/English/C/images/nvshmem_mpi_comparison.png -------------------------------------------------------------------------------- /labs/CFD/English/C/images/nvshmem_profiler_report.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openhackathons-org/nways_multi_gpu/main/labs/CFD/English/C/images/nvshmem_profiler_report.png -------------------------------------------------------------------------------- /labs/CFD/English/C/images/streams_util_selection.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/openhackathons-org/nways_multi_gpu/main/labs/CFD/English/C/images/streams_util_selection.png -------------------------------------------------------------------------------- /labs/CFD/English/C/images/mpi_cuda_aware_gdr_latency.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openhackathons-org/nways_multi_gpu/main/labs/CFD/English/C/images/mpi_cuda_aware_gdr_latency.png -------------------------------------------------------------------------------- /labs/CFD/English/C/images/mpi_cuda_aware_p2p_metrics.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openhackathons-org/nways_multi_gpu/main/labs/CFD/English/C/images/mpi_cuda_aware_p2p_metrics.png -------------------------------------------------------------------------------- /labs/CFD/English/C/images/nvshmem_left_shift_output.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openhackathons-org/nways_multi_gpu/main/labs/CFD/English/C/images/nvshmem_left_shift_output.png -------------------------------------------------------------------------------- /labs/CFD/English/C/images/nvshmem_thread_level_comm.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openhackathons-org/nways_multi_gpu/main/labs/CFD/English/C/images/nvshmem_thread_level_comm.png -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .ipynb_checkpoints 2 | */.ipynb_checkpoints/* 3 | alk.traj.dcd 4 | *.simg 5 | *.so* 6 | *.a 7 | *.la 8 | mgpm 9 | *.o 10 | *.out 11 | */.ses/* 12 | */.log/* 13 | 14 | -------------------------------------------------------------------------------- /labs/CFD/English/C/images/jacobi_memcpy_report_events.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openhackathons-org/nways_multi_gpu/main/labs/CFD/English/C/images/jacobi_memcpy_report_events.png -------------------------------------------------------------------------------- /labs/CFD/English/C/images/jacobi_memcpy_report_overview.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openhackathons-org/nways_multi_gpu/main/labs/CFD/English/C/images/jacobi_memcpy_report_overview.png -------------------------------------------------------------------------------- /labs/CFD/English/C/images/mpi_memcpy_halo_exchange_latency.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openhackathons-org/nways_multi_gpu/main/labs/CFD/English/C/images/mpi_memcpy_halo_exchange_latency.png -------------------------------------------------------------------------------- /labs/CFD/English/C/images/mpi_cuda_aware_halo_exchange_latency.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openhackathons-org/nways_multi_gpu/main/labs/CFD/English/C/images/mpi_cuda_aware_halo_exchange_latency.png -------------------------------------------------------------------------------- /labs/CFD/English/C/images/mpi_host_staging_throughput_latency.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/openhackathons-org/nways_multi_gpu/main/labs/CFD/English/C/images/mpi_host_staging_throughput_latency.png -------------------------------------------------------------------------------- /labs/CFD/English/C/images/jacobi_memcpy_streams_events_p2p_report.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openhackathons-org/nways_multi_gpu/main/labs/CFD/English/C/images/jacobi_memcpy_streams_events_p2p_report.png -------------------------------------------------------------------------------- /labs/CFD/English/Presentations/README.md: -------------------------------------------------------------------------------- 1 | Partners who are interested in delivering the critical hands-on skills needed to advance science in the form of a Bootcamp can reach out to us via the [GPU Hackathon Partner](https://gpuhackathons.org/partners) website. In addition to the current bootcamp material, Partners will be provided with the following: 2 | 3 | - Presentation: All the Bootcamps are accompanied by training material presentations that can be used during the Bootcamp session. 4 | - Mini challenge: To test the knowledge gained during this Bootcamp, a mini application challenge is provided along with a sample solution. 5 | - Additional Support: On a case-by-case basis, Partners can also be trained on how to effectively deliver the Bootcamp with maximal impact. -------------------------------------------------------------------------------- /labs/CFD/English/C/source_code/p2pBandwidthLatencyTest/Common/GL/freeglut.h: -------------------------------------------------------------------------------- 1 | #ifndef __FREEGLUT_H__ 2 | #define __FREEGLUT_H__ 3 | 4 | /* 5 | * freeglut.h 6 | * 7 | * The freeglut library include file 8 | * 9 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS 10 | * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 11 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 12 | * PAWEL W. OLSZTA BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 13 | * IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 14 | * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 15 | */ 16 | 17 | #include "freeglut_std.h" 18 | #include "freeglut_ext.h" 19 | 20 | /*** END OF FILE ***/ 21 | 22 | #endif /* __FREEGLUT_H__ */ 23 | -------------------------------------------------------------------------------- /labs/CFD/English/C/source_code/mpi/hello_world.c: -------------------------------------------------------------------------------- 1 | #include <mpi.h> 2 | #include <stdio.h> 3 | 4 | int main(int argc, char** argv) { 5 | // Initialize the MPI environment 6 | MPI_Init(NULL, NULL); 7 | 8 | // Get the number of processes 9 | int size; 10 | MPI_Comm_size(MPI_COMM_WORLD, &size); 11 | 12 | // Get the rank of the process 13 | int rank; 14 | MPI_Comm_rank(MPI_COMM_WORLD, &rank); 15 | 16 | // Get the name of the processor 17 | char processor_name[MPI_MAX_PROCESSOR_NAME]; 18 | int name_len; 19 | MPI_Get_processor_name(processor_name, &name_len); 20 | 21 | // Print a hello world message 22 | printf("Hello world from processor %s, rank %d out of %d processors\n", 23 | processor_name, rank, size); 24 | 25 | // Finalize the MPI environment. 
26 | MPI_Finalize(); 27 | } 28 | -------------------------------------------------------------------------------- /labs/CFD/English/C/source_code/nccl/Makefile: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 2 | NVCC=nvcc 3 | MPICXX=mpicxx 4 | MPIRUN ?= mpirun 5 | #CUDA_HOME ?= /usr/local/cuda 6 | #NCCL_HOME ?= /usr/nccl/ 7 | GENCODE_SM70 := -gencode arch=compute_70,code=sm_70 8 | GENCODE_SM80 := -gencode arch=compute_80,code=sm_80 9 | GENCODE_FLAGS := $(GENCODE_SM70) $(GENCODE_SM80) 10 | 11 | NVCC_FLAGS += -lineinfo $(GENCODE_FLAGS) -std=c++14 12 | MPICXX_FLAGS = -DUSE_NVTX -I$(CUDA_HOME)/include -I$(NCCL_HOME)/include -fopenmp -std=c++14 13 | LD_FLAGS = -L$(CUDA_HOME)/lib64 -lcudart -lnvToolsExt -lnccl 14 | 15 | jacobi_nccl: Makefile jacobi_nccl.cpp jacobi_kernels.o 16 | $(MPICXX) $(MPICXX_FLAGS) jacobi_nccl.cpp jacobi_kernels.o $(LD_FLAGS) -o jacobi_nccl 17 | 18 | jacobi_kernels.o: Makefile jacobi_kernels.cu 19 | $(NVCC) $(NVCC_FLAGS) jacobi_kernels.cu -c 20 | 21 | .PHONY.: clean 22 | clean: 23 | rm -rf jacobi_nccl jacobi_kernels.o *.qdrep *.sqlite 24 | 25 | -------------------------------------------------------------------------------- /labs/CFD/English/C/source_code/mpi/containerization/Makefile: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved. 2 | NVCC=nvcc 3 | MPICXX=mpicxx 4 | #CUDA_HOME=/opt/nvidia/hpc_sdk/Linux_x86_64/21.5/cuda/ 5 | GENCODE_SM70 := -gencode arch=compute_70,code=sm_70 6 | GENCODE_SM80 := -gencode arch=compute_80,code=sm_80 7 | GENCODE_FLAGS := $(GENCODE_SM70) $(GENCODE_SM80) 8 | 9 | NVCC_FLAGS += -lineinfo $(GENCODE_FLAGS) -std=c++14 10 | MPICXX_FLAGS = -g -I$(CUDA_HOME)/include -fopenmp -std=c++14 11 | LD_FLAGS = -L$(CUDA_HOME)/lib64 -lcudart -lnvToolsExt 12 | 13 | jacobi_cuda_aware_mpi: Makefile jacobi_cuda_aware_mpi.cpp jacobi_kernels.o 14 | $(MPICXX) $(MPICXX_FLAGS) jacobi_cuda_aware_mpi.cpp jacobi_kernels.o $(LD_FLAGS) -o jacobi_cuda_aware_mpi 15 | 16 | jacobi_kernels.o: Makefile jacobi_kernels.cu 17 | $(NVCC) $(NVCC_FLAGS) jacobi_kernels.cu -c 18 | 19 | all: jacobi_cuda_aware_mpi 20 | 21 | .PHONY.: clean 22 | clean: 23 | rm -rf jacobi_cuda_aware_mpi *.o *.qdrep *.sqlite 24 | 25 | -------------------------------------------------------------------------------- /labs/CFD/English/C/source_code/cuda/Makefile: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved. 
2 | NVCC=nvcc 3 | CUDA_HOME=/opt/nvidia/hpc_sdk/Linux_x86_64/21.5/cuda/ 4 | GENCODE_SM70 := -gencode arch=compute_70,code=sm_70 5 | GENCODE_SM80 := -gencode arch=compute_80,code=sm_80 -gencode arch=compute_80,code=compute_80 6 | GENCODE_FLAGS := $(GENCODE_SM70) $(GENCODE_SM80) 7 | NVCC_FLAGS += -Xcompiler -fopenmp -lineinfo -DUSE_NVTX -lnvToolsExt $(GENCODE_FLAGS) -std=c++14 8 | 9 | jacobi_memcpy: jacobi_memcpy.cu 10 | $(NVCC) $(NVCC_FLAGS) jacobi_memcpy.cu -o jacobi_memcpy 11 | 12 | jacobi_streams: jacobi_streams.cu 13 | $(NVCC) $(NVCC_FLAGS) jacobi_streams.cu -o jacobi_streams 14 | 15 | jacobi_streams_events: jacobi_streams_events.cu 16 | $(NVCC) $(NVCC_FLAGS) jacobi_streams_events.cu -o jacobi_streams_events 17 | 18 | all: jacobi_memcpy jacobi_streams jacobi_streams_events 19 | 20 | .PHONY: clean 21 | clean: 22 | rm -f jacobi_memcpy jacobi_streams jacobi_streams_events *.qdrep *.sqlite 23 | 24 | -------------------------------------------------------------------------------- /labs/CFD/English/C/source_code/nvshmem/Makefile: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 2 | NP ?= 1 3 | NVCC=nvcc 4 | MPIRUN ?= mpirun 5 | CUDA_HOME ?= /usr/local/cuda 6 | ifndef NVSHMEM_HOME 7 | $(error NVSHMEM_HOME is not set) 8 | endif 9 | ifndef MPI_HOME 10 | $(error MPI_HOME is not set) 11 | endif 12 | GENCODE_SM70 := -gencode arch=compute_70,code=sm_70 13 | GENCODE_SM80 := -gencode arch=compute_80,code=sm_80 -gencode arch=compute_80,code=compute_80 14 | GENCODE_FLAGS := $(GENCODE_SM70) $(GENCODE_SM80) 15 | 16 | NVCC_FLAGS += -dc -Xcompiler -fopenmp -lineinfo -lnvToolsExt $(GENCODE_FLAGS) -std=c++14 -I$(NVSHMEM_HOME)/include -I$(MPI_HOME)/include 17 | NVCC_LDFLAGS = -ccbin=mpic++ -L$(NVSHMEM_HOME)/lib -lnvshmem -L$(MPI_HOME)/lib -lmpi -L$(CUDA_HOME)/lib64 -lcuda -lcudart -lnvToolsExt 18 | 19 | left_shift: Makefile left_shift.cu 20 | $(NVCC) $(NVCC_FLAGS) left_shift.cu -c -o left_shift.o 21 | $(NVCC) $(GENCODE_FLAGS) left_shift.o -o left_shift $(NVCC_LDFLAGS) 22 | 23 | jacobi_nvshmem: Makefile jacobi_nvshmem.cu 24 | $(NVCC) $(NVCC_FLAGS) jacobi_nvshmem.cu -c -o jacobi_nvshmem.o 25 | $(NVCC) $(GENCODE_FLAGS) jacobi_nvshmem.o -o jacobi_nvshmem $(NVCC_LDFLAGS) 26 | 27 | .PHONY.: clean 28 | clean: 29 | rm -rf jacobi_nvshmem left_shift *.o *.qdrep *.sqlite 30 | -------------------------------------------------------------------------------- /labs/CFD/English/C/source_code/single_gpu/Makefile: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved. 
2 | NVCC=nvcc 3 | #CUDA_HOME=hpc_sdk_path/Linux_x86_64/21.3/cuda/11.2/ 4 | GENCODE_SM30 := -gencode arch=compute_30,code=sm_30 5 | GENCODE_SM35 := -gencode arch=compute_35,code=sm_35 6 | GENCODE_SM37 := -gencode arch=compute_37,code=sm_37 7 | GENCODE_SM50 := -gencode arch=compute_50,code=sm_50 8 | GENCODE_SM52 := -gencode arch=compute_52,code=sm_52 9 | GENCODE_SM60 := -gencode arch=compute_60,code=sm_60 10 | GENCODE_SM70 := -gencode arch=compute_70,code=sm_70 11 | GENCODE_SM80 := -gencode arch=compute_80,code=sm_80 -gencode arch=compute_80,code=compute_80 12 | GENCODE_FLAGS := $(GENCODE_SM70) $(GENCODE_SM80) 13 | ifdef DISABLE_CUB 14 | NVCC_FLAGS = -Xptxas --optimize-float-atomics 15 | else 16 | NVCC_FLAGS = -DHAVE_CUB 17 | endif 18 | NVCC_FLAGS += -Xcompiler -fopenmp -lineinfo -DUSE_NVTX -lnvToolsExt $(GENCODE_FLAGS) -std=c++14 19 | jacobi: Makefile jacobi.cu 20 | $(NVCC) $(NVCC_FLAGS) jacobi.cu -o jacobi 21 | 22 | .PHONY.: clean 23 | clean: 24 | rm -f jacobi jacobi.qdrep 25 | 26 | sanitize: jacobi 27 | compute-sanitizer ./jacobi -niter 10 28 | 29 | run: jacobi 30 | ./jacobi 31 | 32 | profile: jacobi 33 | nsys profile --trace=cuda,nvtx -o jacobi ./jacobi -niter 10 34 | -------------------------------------------------------------------------------- /labs/CFD/English/C/source_code/mpi/Makefile: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved. 2 | NVCC=nvcc 3 | MPICXX=mpicxx 4 | #CUDA_HOME=/opt/nvidia/hpc_sdk/Linux_x86_64/21.5/cuda/ 5 | GENCODE_SM70 := -gencode arch=compute_70,code=sm_70 6 | GENCODE_SM80 := -gencode arch=compute_80,code=sm_80 7 | GENCODE_FLAGS := $(GENCODE_SM70) $(GENCODE_SM80) 8 | 9 | NVCC_FLAGS += -lineinfo $(GENCODE_FLAGS) -std=c++14 10 | MPICXX_FLAGS = -g -I$(CUDA_HOME)/include -fopenmp -std=c++14 11 | LD_FLAGS = -L$(CUDA_HOME)/lib64 -lcudart -lnvToolsExt 12 | 13 | hello_world: Makefile hello_world.c 14 | $(MPICXX) $(MPICXX_FLAGS) hello_world.c $(LD_FLAGS) -o hello_world 15 | 16 | jacobi_memcpy_mpi: Makefile jacobi_memcpy_mpi.cpp jacobi_kernels.o 17 | $(MPICXX) $(MPICXX_FLAGS) jacobi_memcpy_mpi.cpp jacobi_kernels.o $(LD_FLAGS) -o jacobi_memcpy_mpi 18 | 19 | jacobi_cuda_aware_mpi: Makefile jacobi_cuda_aware_mpi.cpp jacobi_kernels.o 20 | $(MPICXX) $(MPICXX_FLAGS) jacobi_cuda_aware_mpi.cpp jacobi_kernels.o $(LD_FLAGS) -o jacobi_cuda_aware_mpi 21 | 22 | jacobi_kernels.o: Makefile jacobi_kernels.cu 23 | $(NVCC) $(NVCC_FLAGS) jacobi_kernels.cu -c 24 | 25 | all: hello_world jacobi_memcpy_mpi jacobi_cuda_aware_mpi 26 | 27 | .PHONY.: clean 28 | clean: 29 | rm -rf hello_world jacobi_memcpy_mpi jacobi_cuda_aware_mpi *.o *.qdrep *.sqlite 30 | 31 | -------------------------------------------------------------------------------- /labs/CFD/LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2018, National Center for Computational Sciences, Oak Ridge National Laboratory 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions are met: 6 | 7 | * Redistributions of source code must retain the above copyright notice, this 8 | list of conditions and the following disclaimer. 9 | 10 | * Redistributions in binary form must reproduce the above copyright notice, 11 | this list of conditions and the following disclaimer in the documentation 12 | and/or other materials provided with the distribution. 
13 | 14 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 15 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 16 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 17 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 18 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 19 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 20 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 21 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 22 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 23 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -------------------------------------------------------------------------------- /labs/CFD/English/C/source_code/nvshmem/left_shift.cu: -------------------------------------------------------------------------------- 1 | #include <stdio.h> 2 | #include "mpi.h" 3 | #include "nvshmem.h" 4 | #include "nvshmemx.h" 5 | 6 | #define CUDA_CHECK(stmt) \ 7 | do { \ 8 | cudaError_t result = (stmt); \ 9 | if (cudaSuccess != result) { \ 10 | fprintf(stderr, "[%s:%d] CUDA failed with %s \n", \ 11 | __FILE__, __LINE__, cudaGetErrorString(result)); \ 12 | exit(-1); \ 13 | } \ 14 | } while (0) 15 | 16 | __global__ void simple_shift(int *destination) { 17 | int mype = nvshmem_my_pe(); 18 | int npes = nvshmem_n_pes(); 19 | int peer = (mype + 1) % npes; 20 | 21 | nvshmem_int_p(destination, mype, peer); 22 | } 23 | 24 | int main (int argc, char *argv[]) { 25 | int mype_node, msg; 26 | cudaStream_t stream; 27 | int rank, nranks; 28 | MPI_Comm mpi_comm = MPI_COMM_WORLD; 29 | nvshmemx_init_attr_t attr; 30 | 31 | MPI_Init(&argc, &argv); 32 | MPI_Comm_rank(MPI_COMM_WORLD, &rank); 33 | MPI_Comm_size(MPI_COMM_WORLD, &nranks); 34 | 35 | attr.mpi_comm = &mpi_comm; 36 | nvshmemx_init_attr(NVSHMEMX_INIT_WITH_MPI_COMM, &attr); 37 | mype_node = nvshmem_team_my_pe(NVSHMEMX_TEAM_NODE); 38 | 39 | CUDA_CHECK(cudaSetDevice(mype_node)); 40 | CUDA_CHECK(cudaStreamCreate(&stream)); 41 | int *destination = (int *) nvshmem_malloc (sizeof(int)); 42 | 43 | simple_shift<<<1, 1, 0, stream>>>(destination); 44 | nvshmemx_barrier_all_on_stream(stream); 45 | CUDA_CHECK(cudaMemcpyAsync(&msg, destination, sizeof(int), 46 | cudaMemcpyDeviceToHost, stream)); 47 | 48 | CUDA_CHECK(cudaStreamSynchronize(stream)); 49 | printf("%d: received message %d\n", nvshmem_my_pe(), msg); 50 | 51 | nvshmem_free(destination); 52 | nvshmem_finalize(); 53 | MPI_Finalize(); 54 | return 0; 55 | } -------------------------------------------------------------------------------- /labs/CFD/English/C/source_code/p2pBandwidthLatencyTest/Common/rendercheck_d3d11.h: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 2 | * 3 | * Redistribution and use in source and binary forms, with or without 4 | * modification, are permitted provided that the following conditions 5 | * are met: 6 | * * Redistributions of source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer. 8 | * * Redistributions in binary form must reproduce the above copyright 9 | * notice, this list of conditions and the following disclaimer in the 10 | * documentation and/or other materials provided with the distribution. 
11 | * * Neither the name of NVIDIA CORPORATION nor the names of its 12 | * contributors may be used to endorse or promote products derived 13 | * from this software without specific prior written permission. 14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | */ 27 | 28 | #pragma once 29 | 30 | #ifndef _RENDERCHECK_D3D11_H_ 31 | #define _RENDERCHECK_D3D11_H_ 32 | 33 | #include 34 | #include 35 | #include 36 | #include 37 | #include 38 | 39 | class CheckRenderD3D11 40 | { 41 | public: 42 | 43 | CheckRenderD3D11() {} 44 | 45 | static HRESULT ActiveRenderTargetToPPM(ID3D11Device *pDevice, const char *zFileName); 46 | static HRESULT ResourceToPPM(ID3D11Device *pDevice, ID3D11Resource *pResource, const char *zFileName); 47 | 48 | static bool PPMvsPPM(const char *src_file, const char *ref_file, const char *exec_path, 49 | const float epsilon, const float threshold = 0.0f); 50 | }; 51 | 52 | #endif -------------------------------------------------------------------------------- /labs/CFD/English/C/source_code/p2pBandwidthLatencyTest/Common/UtilNPP/SignalAllocatorsCPU.h: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 2 | * 3 | * Redistribution and use in source and binary forms, with or without 4 | * modification, are permitted provided that the following conditions 5 | * are met: 6 | * * Redistributions of source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer. 8 | * * Redistributions in binary form must reproduce the above copyright 9 | * notice, this list of conditions and the following disclaimer in the 10 | * documentation and/or other materials provided with the distribution. 11 | * * Neither the name of NVIDIA CORPORATION nor the names of its 12 | * contributors may be used to endorse or promote products derived 13 | * from this software without specific prior written permission. 14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | */ 27 | 28 | 29 | #ifndef NV_UTIL_NPP_SIGNAL_ALLOCATORS_CPU_H 30 | #define NV_UTIL_NPP_SIGNAL_ALLOCATORS_CPU_H 31 | 32 | #include "Exceptions.h" 33 | 34 | namespace npp 35 | { 36 | 37 | template 38 | class SignalAllocatorCPU 39 | { 40 | public: 41 | static 42 | D * 43 | Malloc1D(unsigned int nSize) 44 | { 45 | return new D[nSize];; 46 | }; 47 | 48 | static 49 | void 50 | Free1D(D *pPixels) 51 | { 52 | delete[] pPixels; 53 | }; 54 | 55 | static 56 | void 57 | Copy1D(D *pDst, const D *pSrc, size_t nSize) 58 | { 59 | memcpy(pDst, pSrc, nSize * sizeof(D)); 60 | }; 61 | 62 | }; 63 | 64 | } // npp namespace 65 | 66 | #endif // NV_UTIL_NPP_SIGNAL_ALLOCATORS_CPU_H 67 | -------------------------------------------------------------------------------- /labs/CFD/English/C/source_code/p2pBandwidthLatencyTest/Common/helper_functions.h: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 2 | * 3 | * Redistribution and use in source and binary forms, with or without 4 | * modification, are permitted provided that the following conditions 5 | * are met: 6 | * * Redistributions of source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer. 8 | * * Redistributions in binary form must reproduce the above copyright 9 | * notice, this list of conditions and the following disclaimer in the 10 | * documentation and/or other materials provided with the distribution. 11 | * * Neither the name of NVIDIA CORPORATION nor the names of its 12 | * contributors may be used to endorse or promote products derived 13 | * from this software without specific prior written permission. 14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
26 | */ 27 | 28 | // These are helper functions for the SDK samples (string parsing, 29 | // timers, image helpers, etc) 30 | #ifndef COMMON_HELPER_FUNCTIONS_H_ 31 | #define COMMON_HELPER_FUNCTIONS_H_ 32 | 33 | #ifdef WIN32 34 | #pragma warning(disable : 4996) 35 | #endif 36 | 37 | // includes, project 38 | #include 39 | #include 40 | #include 41 | #include 42 | #include 43 | 44 | #include 45 | #include 46 | #include 47 | #include 48 | #include 49 | 50 | // includes, timer, string parsing, image helpers 51 | #include // helper functions for image compare, dump, data comparisons 52 | #include // helper functions for string parsing 53 | #include // helper functions for timers 54 | 55 | #ifndef EXIT_WAIVED 56 | #define EXIT_WAIVED 2 57 | #endif 58 | 59 | #endif // COMMON_HELPER_FUNCTIONS_H_ 60 | -------------------------------------------------------------------------------- /Singularity: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021 NVIDIA Corporation. All rights reserved. 2 | 3 | Bootstrap: docker 4 | FROM: nvcr.io/nvidia/nvhpc:21.5-devel-cuda_multi-ubuntu20.04 5 | 6 | %environment 7 | export XDG_RUNTIME_DIR= 8 | export PATH="/opt/openmpi/ompi/bin/:/usr/local/bin:/opt/anaconda3/bin:/usr/bin:/opt/nvidia/nsight-systems/2020.5.1/bin:/opt/nvidia/nsight-compute/2020.2.1:$PATH" 9 | export LD_LIBRARY_PATH="/opt/openmpi/ompi/lib:/pmi_utils/lib/:/usr/local/lib:/opt/nvidia/hpc_sdk/Linux_x86_64/21.5/cuda/lib64/:$LD_LIBRARY_PATH" 10 | 11 | %post 12 | build_tmp=$(mktemp -d) && cd ${build_tmp} 13 | 14 | apt-get -y update 15 | apt-get -y dist-upgrade 16 | DEBIAN_FRONTEND=noninteractive apt-get -yq install --no-install-recommends \ 17 | m4 vim-nox emacs-nox nano zip\ 18 | python3-pip python3-setuptools git-core inotify-tools \ 19 | curl git-lfs \ 20 | build-essential libtbb-dev 21 | rm -rf /var/lib/apt/cache/* 22 | 23 | pip3 install --upgrade pip 24 | pip3 install --no-cache-dir jupyter 25 | pip3 install --no-cache-dir jupyterlab 26 | pip3 install gdown 27 | 28 | apt-get install --no-install-recommends -y build-essential 29 | 30 | # NVIDIA nsight-systems-2020.5.1 ,nsight-compute-2 31 | apt-get update -y 32 | DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends apt-transport-https ca-certificates gnupg wget 33 | apt-key adv --keyserver hkp://keyserver.ubuntu.com:80 --recv-keys F60F4B3D7FA2AF80 34 | echo "deb https://developer.download.nvidia.com/devtools/repos/ubuntu2004/amd64/ /" >> /etc/apt/sources.list.d/nsight.list 35 | apt-get update -y 36 | DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends nsight-systems-2020.5.1 nsight-compute-2020.2.1 37 | apt-get install --no-install-recommends -y build-essential 38 | 39 | wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh 40 | bash Miniconda3-latest-Linux-x86_64.sh -b -p /opt/anaconda3 41 | rm Miniconda3-latest-Linux-x86_64.sh 42 | 43 | # Install CUDA-aware OpenMPI with UCX and PMI 44 | mkdir -p /opt/openmpi && cd /opt/openmpi 45 | wget https://download.open-mpi.org/release/open-mpi/v4.1/openmpi-4.1.1.tar.gz 46 | tar -xvzf openmpi-4.1.1.tar.gz 47 | mkdir -p /opt/openmpi/ompi/ 48 | cd /opt/openmpi/openmpi-4.1.1/ 49 | ./configure --prefix=/opt/openmpi/ompi/ --with-libevent=internal --with-xpmem --with-cuda=/opt/nvidia/hpc_sdk/Linux_x86_64/21.5/cuda/ --with-slurm --with-pmix=internal --with-pmi=/pmi_utils/ --enable-mpi1-compatibility --with-verbs --with-hcoll=/opt/nvidia/hpc_sdk/Linux_x86_64/21.5/comm_libs/hpcx/hpcx-2.8.1/hcoll/ 
--with-ucx=/opt/nvidia/hpc_sdk/Linux_x86_64/21.5/comm_libs/hpcx/hpcx-2.8.1/ucx/ 50 | export LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/pmi_utils/lib/" 51 | make all install 52 | 53 | cd / 54 | rm -rf ${build_tmp} 55 | 56 | %files 57 | labs/ /labs 58 | slurm_pmi_config/ /pmi_utils 59 | 60 | %runscript 61 | "$@" 62 | 63 | %labels 64 | AUTHOR Anish-Saxena 65 | -------------------------------------------------------------------------------- /labs/CFD/English/C/source_code/p2pBandwidthLatencyTest/Makefile: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Redistribution and use in source and binary forms, with or without 5 | # modification, are permitted provided that the following conditions 6 | # are met: 7 | # * Redistributions of source code must retain the above copyright 8 | # notice, this list of conditions and the following disclaimer. 9 | # * Redistributions in binary form must reproduce the above copyright 10 | # notice, this list of conditions and the following disclaimer in the 11 | # documentation and/or other materials provided with the distribution. 12 | # * Neither the name of NVIDIA CORPORATION nor the names of its 13 | # contributors may be used to endorse or promote products derived 14 | # from this software without specific prior written permission. 15 | # 16 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 17 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 19 | # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 20 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 21 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 22 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 23 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 24 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 25 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 26 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
27 | # 28 | ################################################################################ 29 | # 30 | # Makefile project only supported on Mac OS X and Linux Platforms) 31 | # 32 | ################################################################################ 33 | 34 | # Location of the CUDA Toolkit 35 | 36 | HOST_COMPILER ?= g++ 37 | NVCC := nvcc -ccbin $(HOST_COMPILER) 38 | 39 | # internal flags 40 | NVCCFLAGS := 41 | CCFLAGS := 42 | LDFLAGS := 43 | 44 | SAMPLE_ENABLED := 1 45 | 46 | # Common includes and paths for CUDA 47 | INCLUDES := -I./Common 48 | LIBRARIES := 49 | 50 | ################################################################################ 51 | GENCODE_SM70 := -gencode arch=compute_70,code=sm_70 52 | GENCODE_SM80 := -gencode arch=compute_80,code=sm_80 53 | GENCODE_FLAGS := $(GENCODE_SM70) $(GENCODE_SM80) 54 | 55 | NVCC_FLAGS += -std=c++14 56 | LD_FLAGS += -lcudart 57 | ################################################################################ 58 | 59 | # Target rules 60 | all: build 61 | 62 | build: p2pBandwidthLatencyTest 63 | 64 | p2pBandwidthLatencyTest.o:p2pBandwidthLatencyTest.cu 65 | $(NVCC) $(INCLUDES) $(NVCC_FLAGS) $(GENCODE_FLAGS) -o $@ -c $< 66 | 67 | p2pBandwidthLatencyTest: p2pBandwidthLatencyTest.o 68 | $(NVCC) $(LD_FLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES) 69 | 70 | clean: 71 | rm -f p2pBandwidthLatencyTest p2pBandwidthLatencyTest.o 72 | 73 | clobber: clean 74 | -------------------------------------------------------------------------------- /labs/CFD/English/C/source_code/p2pBandwidthLatencyTest/Common/UtilNPP/ImageAllocatorsCPU.h: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 2 | * 3 | * Redistribution and use in source and binary forms, with or without 4 | * modification, are permitted provided that the following conditions 5 | * are met: 6 | * * Redistributions of source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer. 8 | * * Redistributions in binary form must reproduce the above copyright 9 | * notice, this list of conditions and the following disclaimer in the 10 | * documentation and/or other materials provided with the distribution. 11 | * * Neither the name of NVIDIA CORPORATION nor the names of its 12 | * contributors may be used to endorse or promote products derived 13 | * from this software without specific prior written permission. 14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
26 | */ 27 | 28 | #ifndef NV_UTIL_NPP_IMAGE_ALLOCATORS_CPU_H 29 | #define NV_UTIL_NPP_IMAGE_ALLOCATORS_CPU_H 30 | 31 | #include "Exceptions.h" 32 | 33 | namespace npp 34 | { 35 | 36 | template 37 | class ImageAllocatorCPU 38 | { 39 | public: 40 | static 41 | D * 42 | Malloc2D(unsigned int nWidth, unsigned int nHeight, unsigned int *pPitch) 43 | { 44 | NPP_ASSERT(nWidth * nHeight > 0); 45 | 46 | D *pResult = new D[nWidth * N * nHeight]; 47 | *pPitch = nWidth * sizeof(D) * N; 48 | 49 | return pResult; 50 | }; 51 | 52 | static 53 | void 54 | Free2D(D *pPixels) 55 | { 56 | delete[] pPixels; 57 | }; 58 | 59 | static 60 | void 61 | Copy2D(D *pDst, size_t nDstPitch, const D *pSrc, size_t nSrcPitch, size_t nWidth, size_t nHeight) 62 | { 63 | const void *pSrcLine = pSrc; 64 | void *pDstLine = pDst; 65 | 66 | for (size_t iLine = 0; iLine < nHeight; ++iLine) 67 | { 68 | // copy one line worth of data 69 | memcpy(pDst, pSrc, nWidth * N * sizeof(D)); 70 | // move data pointers to next line 71 | pDst += nDstPitch; 72 | pSrc += nSrcPitch; 73 | } 74 | }; 75 | 76 | }; 77 | 78 | } // npp namespace 79 | 80 | #endif // NV_UTIL_NPP_IMAGE_ALLOCATORS_CPU_H 81 | -------------------------------------------------------------------------------- /slurm_pmi_config/include/smd_ns.h: -------------------------------------------------------------------------------- 1 | /*****************************************************************************\ 2 | * smd_ns.h - Library for fault tolerant application support 3 | ***************************************************************************** 4 | * Copyright (C) 2013-2014 SchedMD LLC 5 | * Written by Morris Jette and David Bigagli (SchedMD LLC) 6 | * 7 | * This file is part of Slurm, a resource management program. 8 | * For details, see . 9 | * Please also read the included file: DISCLAIMER. 10 | * 11 | * Slurm is free software; you can redistribute it and/or modify it under 12 | * the terms of the GNU General Public License as published by the Free 13 | * Software Foundation; either version 2 of the License, or (at your option) 14 | * any later version. 15 | * 16 | * In addition, as a special exception, the copyright holders give permission 17 | * to link the code of portions of this program with the OpenSSL library under 18 | * certain conditions as described in each individual source file, and 19 | * distribute linked combinations including the two. You must obey the GNU 20 | * General Public License in all respects for all of the code used other than 21 | * OpenSSL. If you modify file(s) with this exception, you may extend this 22 | * exception to your version of the file(s), but you are not obligated to do 23 | * so. If you do not wish to do so, delete this exception statement from your 24 | * version. If you delete this exception statement from all source files in 25 | * the program, then also delete it here. 26 | * 27 | * Slurm is distributed in the hope that it will be useful, but WITHOUT ANY 28 | * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS 29 | * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more 30 | * details. 31 | * 32 | * You should have received a copy of the GNU General Public License along 33 | * with Slurm; if not, write to the Free Software Foundation, Inc., 34 | * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
35 | \*****************************************************************************/ 36 | 37 | #ifndef _HAVE_SMD_NS_H 38 | #define _HAVE_SMD_NS_H 39 | 40 | #include 41 | #include 42 | #include 43 | #include 44 | #include 45 | #include 46 | #include 47 | #include 48 | #include 49 | #include 50 | #include 51 | #include 52 | #include 53 | #include 54 | #include 55 | #include 56 | #include 57 | #include 58 | #include 59 | #include 60 | #include 61 | #include 62 | #include 63 | 64 | /* Faulty can be in state FAILED or FAILING 65 | * these flags tell the controller which one 66 | * the caller is interested in. 67 | */ 68 | #define FAILED_NODES (1 << 1) 69 | #define FAILING_NODES (1 << 2) 70 | 71 | /* These are the events sent from slurm to the client that 72 | * has registered for any of these events. 73 | * We use define as user can subscribe to more than one 74 | * events. 75 | */ 76 | #define SMD_EVENT_NODE_FAILED (1 << 1) /* node has failed */ 77 | #define SMD_EVENT_NODE_FAILING (1 << 2) /* node failing can be drained */ 78 | #define SMD_EVENT_NODE_REPLACE (1 << 3) /* replacement ready */ 79 | 80 | #endif /* _HAVE_SMD_NS_H */ 81 | -------------------------------------------------------------------------------- /labs/CFD/English/C/source_code/p2pBandwidthLatencyTest/Common/UtilNPP/Pixel.h: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 2 | * 3 | * Redistribution and use in source and binary forms, with or without 4 | * modification, are permitted provided that the following conditions 5 | * are met: 6 | * * Redistributions of source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer. 8 | * * Redistributions in binary form must reproduce the above copyright 9 | * notice, this list of conditions and the following disclaimer in the 10 | * documentation and/or other materials provided with the distribution. 11 | * * Neither the name of NVIDIA CORPORATION nor the names of its 12 | * contributors may be used to endorse or promote products derived 13 | * from this software without specific prior written permission. 14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
26 | */ 27 | 28 | 29 | #ifndef NV_UTIL_PIXEL_H 30 | #define NV_UTIL_PIXEL_H 31 | 32 | #include "Exceptions.h" 33 | 34 | namespace npp 35 | { 36 | template 37 | struct Pixel 38 | { }; 39 | 40 | template 41 | struct Pixel 42 | { 43 | D x; 44 | 45 | const D & 46 | operator[](size_t iChannel) 47 | const 48 | { 49 | NPP_ASSERT(iChannel < 1); 50 | return (&x)[iChannel]; 51 | } 52 | 53 | D & 54 | operator[](size_t iChannel) 55 | { 56 | NPP_ASSERT(iChannel < 1); 57 | return (&x)[iChannel]; 58 | } 59 | }; 60 | 61 | template 62 | struct Pixel 63 | { 64 | D x,y; 65 | 66 | const D & 67 | operator[](size_t iChannel) 68 | const 69 | { 70 | NPP_ASSERT(iChannel < 2); 71 | return (&x)[iChannel]; 72 | } 73 | 74 | D & 75 | operator[](size_t iChannel) 76 | { 77 | NPP_ASSERT(iChannel < 2); 78 | return (&x)[iChannel]; 79 | } 80 | }; 81 | 82 | template 83 | struct Pixel 84 | { 85 | D x,y,z; 86 | 87 | const D & 88 | operator[](size_t iChannel) 89 | const 90 | { 91 | NPP_ASSERT(iChannel < 3); 92 | return (&x)[iChannel]; 93 | } 94 | 95 | D & 96 | operator[](size_t iChannel) 97 | { 98 | NPP_ASSERT(iChannel < 3); 99 | return (&x)[iChannel]; 100 | } 101 | }; 102 | 103 | template 104 | struct Pixel 105 | { 106 | D x, y, z, w; 107 | 108 | const D & 109 | operator[](size_t iChannel) 110 | const 111 | { 112 | NPP_ASSERT(iChannel < 4); 113 | return (&x)[iChannel]; 114 | } 115 | 116 | D & 117 | operator[](size_t iChannel) 118 | { 119 | NPP_ASSERT(iChannel < 4); 120 | return (&x)[iChannel]; 121 | } 122 | }; 123 | 124 | } // npp namespace 125 | 126 | #endif // NV_UTIL_PIXEL_H 127 | -------------------------------------------------------------------------------- /labs/CFD/English/C/source_code/p2pBandwidthLatencyTest/Common/UtilNPP/SignalsCPU.h: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 2 | * 3 | * Redistribution and use in source and binary forms, with or without 4 | * modification, are permitted provided that the following conditions 5 | * are met: 6 | * * Redistributions of source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer. 8 | * * Redistributions in binary form must reproduce the above copyright 9 | * notice, this list of conditions and the following disclaimer in the 10 | * documentation and/or other materials provided with the distribution. 11 | * * Neither the name of NVIDIA CORPORATION nor the names of its 12 | * contributors may be used to endorse or promote products derived 13 | * from this software without specific prior written permission. 14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
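Note that this export has stripped the template parameter lists in Pixel.h above; in the NVIDIA CUDA Samples version of this header the primary template is declared with an element type and a channel count, with specializations for one to four channels. The following is a minimal, self-contained sketch of the access pattern it enables (using a stand-alone three-channel stand-in rather than the real npp::Pixel, so it compiles without the NPP headers):

#include <cstddef>
#include <cassert>

// Stand-in for the stripped three-channel specialization in Pixel.h.
template <typename D>
struct Pixel3
{
    D x, y, z;
    const D &operator[](size_t iChannel) const { assert(iChannel < 3); return (&x)[iChannel]; }
    D       &operator[](size_t iChannel)       { assert(iChannel < 3); return (&x)[iChannel]; }
};

int main()
{
    Pixel3<unsigned char> rgb;           // same layout idea as a 3-channel npp::Pixel
    rgb.x = 255; rgb.y = 128; rgb.z = 0; // named channel access
    rgb[1] = 64;                         // indexed access reaches the same storage
    return (rgb[0] == 255 && rgb[1] == 64) ? 0 : 1;
}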
26 | */ 27 | 28 | 29 | #ifndef NV_UTIL_NPP_SIGNALS_CPU_H 30 | #define NV_UTIL_NPP_SIGNALS_CPU_H 31 | 32 | #include "Signal.h" 33 | 34 | #include "SignalAllocatorsCPU.h" 35 | #include "Exceptions.h" 36 | 37 | #include 38 | 39 | 40 | namespace npp 41 | { 42 | 43 | template 44 | class SignalCPU: public npp::SignalTemplate 45 | { 46 | public: 47 | typedef typename npp::SignalTemplate::tData tData; 48 | 49 | SignalCPU() 50 | { 51 | ; 52 | } 53 | 54 | SignalCPU(size_t nSize): SignalTemplate(nSize) 55 | { 56 | ; 57 | } 58 | 59 | SignalCPU(const SignalCPU &rSignal): SignalTemplate(rSignal) 60 | { 61 | ; 62 | } 63 | 64 | virtual 65 | ~SignalCPU() 66 | { 67 | ; 68 | } 69 | 70 | SignalCPU & 71 | operator= (const SignalCPU &rSignal) 72 | { 73 | SignalTemplate::operator= (rSignal); 74 | 75 | return *this; 76 | } 77 | 78 | tData & 79 | operator [](unsigned int i) 80 | { 81 | return *SignalTemplate::values(i); 82 | } 83 | 84 | tData 85 | operator [](unsigned int i) 86 | const 87 | { 88 | return *SignalTemplate::values(i); 89 | } 90 | 91 | }; 92 | 93 | typedef SignalCPU > SignalCPU_8u; 94 | typedef SignalCPU > SignalCPU_32s; 95 | typedef SignalCPU > SignalCPU_16s; 96 | typedef SignalCPU > SignalCPU_16sc; 97 | typedef SignalCPU > SignalCPU_32sc; 98 | typedef SignalCPU > SignalCPU_32f; 99 | typedef SignalCPU > SignalCPU_32fc; 100 | typedef SignalCPU > SignalCPU_64s; 101 | typedef SignalCPU > SignalCPU_64sc; 102 | typedef SignalCPU > SignalCPU_64f; 103 | typedef SignalCPU > SignalCPU_64fc; 104 | 105 | } // npp namespace 106 | 107 | #endif // NV_UTIL_NPP_SIGNALS_CPU_H 108 | -------------------------------------------------------------------------------- /labs/CFD/English/start_here.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Multi-GPU Programming and Performance Analysis\n", 8 | "\n", 9 | "## Learning objectives\n", 10 | "\n", 11 | "Scaling applications to multiple GPUs across multiple nodes requires one to be adept at not just the programming models and optimization techniques, but also at performing root-cause analysis using in-depth profiling to identify and minimize bottlenecks. In this bootcamp, participants will learn to improve the performance of an application step-by-step, taking cues from profilers along the way. Moreover, understanding of the underlying technologies and communication topology will help us utilize high-performance NVIDIA libraries to extract more performance out of the system.\n", 12 | "\n", 13 | "By the end of this bootcamp session, participants will be adept at:\n", 14 | "* Reviewing communication architecture and topology\n", 15 | "* Developing CUDA-aware multi-node multi-GPU MPI applications\n", 16 | "* Profiling the application using NVIDIA Nsight Systems\n", 17 | "* Applying optimizations like CUDA streams, events, and overlapping compute and communication\n", 18 | "* Understanding GPUDirect technologies like P2P and RDMA\n", 19 | "* Learning to use NVIDIA NCCL and NVSHMEM libraries\n", 20 | "\n", 21 | "### Bootcamp Duration\n", 22 | "\n", 23 | "The bootcamp will take 8 hours to complete. 
A link to download all materials will be available at the end of the lab.\n", 24 | "\n", 25 | "### Content Level\n", 26 | "Intermediate, Advanced\n", 27 | "\n", 28 | "### Target Audience and Prerequisites\n", 29 | "The target audience for this lab is researchers, graduate students, and developers who are interested in scaling their scientific applications to multiple nodes using multi-GPU implementations.\n", 30 | "\n", 31 | "Experience in C/C++ and basic CUDA programming is required. Experience with parallel programming frameworks like OpenMP or MPI is not required, but a basic understanding of MPI is highly recommended.\n", 32 | "\n", 33 | "### Bootcamp Outline\n", 34 | "\n", 35 | "This tutorial uses the Jacobi solver, an iterative technique for solving a system of linear equations. To begin, click on the first link below:\n", 36 | "\n", 37 | "1. [Overview of single-GPU code and Nsight Systems Profiler](C/jupyter_notebook/single_gpu/single_gpu_overview.ipynb)\n", 38 | "2. Single Node Multi-GPU:\n", 39 | " * [CUDA Memcpy and Peer-to-Peer Memory Access](C/jupyter_notebook/cuda/memcpy.ipynb)\n", 40 | " * [Intra-node topology](C/jupyter_notebook/advanced_concepts/single_node_topology.ipynb)\n", 41 | " * [CUDA Streams and Events](C/jupyter_notebook/cuda/streams.ipynb)\n", 42 | "3. Multi-Node Multi-GPU:\n", 43 | " * [Introduction to MPI and Multi-Node execution overview](C/jupyter_notebook/mpi/multi_node_intro.ipynb)\n", 44 | " * [MPI with CUDA Memcpy](C/jupyter_notebook/mpi/memcpy.ipynb)\n", 45 | " * [CUDA-aware MPI](C/jupyter_notebook/mpi/cuda_aware.ipynb)\n", 46 | " * [Supplemental: Configuring MPI in a containerized environment](C/jupyter_notebook/mpi/containers_and_mpi.ipynb)\n", 47 | "4. [NVIDIA Collective Communications Library (NCCL)](C/jupyter_notebook/nccl/nccl.ipynb)\n", 48 | "5. [NVSHMEM Library](C/jupyter_notebook/nvshmem/nvshmem.ipynb)\n", 49 | "\n", 50 | "--- \n", 51 | "\n", 52 | "## Licensing \n", 53 | "\n", 54 | "Copyright © 2022 OpenACC-Standard.org. This material is released by OpenACC-Standard.org, in collaboration with NVIDIA Corporation, under the Creative Commons Attribution 4.0 International (CC BY 4.0). These materials may include references to hardware and software developed by other entities; all applicable licensing and copyrights apply." 55 | ] 56 | } 57 | ], 58 | "metadata": { 59 | "kernelspec": { 60 | "display_name": "Python 3", 61 | "language": "python", 62 | "name": "python3" 63 | }, 64 | "language_info": { 65 | "codemirror_mode": { 66 | "name": "ipython", 67 | "version": 3 68 | }, 69 | "file_extension": ".py", 70 | "mimetype": "text/x-python", 71 | "name": "python", 72 | "nbconvert_exporter": "python", 73 | "pygments_lexer": "ipython3", 74 | "version": "3.7.4" 75 | } 76 | }, 77 | "nbformat": 4, 78 | "nbformat_minor": 4 79 | } 80 | -------------------------------------------------------------------------------- /labs/CFD/English/C/source_code/p2pBandwidthLatencyTest/Common/helper_multiprocess.h: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 2 | * 3 | * Redistribution and use in source and binary forms, with or without 4 | * modification, are permitted provided that the following conditions 5 | * are met: 6 | * * Redistributions of source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer.
8 | * * Redistributions in binary form must reproduce the above copyright 9 | * notice, this list of conditions and the following disclaimer in the 10 | * documentation and/or other materials provided with the distribution. 11 | * * Neither the name of NVIDIA CORPORATION nor the names of its 12 | * contributors may be used to endorse or promote products derived 13 | * from this software without specific prior written permission. 14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | */ 27 | 28 | #ifndef HELPER_MULTIPROCESS_H 29 | #define HELPER_MULTIPROCESS_H 30 | 31 | #if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) 32 | #ifndef WIN32_LEAN_AND_MEAN 33 | #define WIN32_LEAN_AND_MEAN 34 | #endif 35 | #include 36 | #include 37 | #include 38 | #include 39 | #include 40 | #include 41 | #include 42 | #include 43 | #else 44 | #include 45 | #include 46 | #include 47 | #include 48 | #include 49 | #include 50 | #include 51 | #include 52 | #include 53 | #include 54 | #endif 55 | #include 56 | 57 | typedef struct sharedMemoryInfo_st { 58 | void *addr; 59 | size_t size; 60 | #if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) 61 | HANDLE shmHandle; 62 | #else 63 | int shmFd; 64 | #endif 65 | } sharedMemoryInfo; 66 | 67 | int sharedMemoryCreate(const char *name, size_t sz, sharedMemoryInfo *info); 68 | 69 | int sharedMemoryOpen(const char *name, size_t sz, sharedMemoryInfo *info); 70 | 71 | void sharedMemoryClose(sharedMemoryInfo *info); 72 | 73 | 74 | #if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) 75 | typedef PROCESS_INFORMATION Process; 76 | #else 77 | typedef pid_t Process; 78 | #endif 79 | 80 | int spawnProcess(Process *process, const char *app, char * const *args); 81 | 82 | int waitProcess(Process *process); 83 | 84 | #define checkIpcErrors(ipcFuncResult) \ 85 | if (ipcFuncResult == -1) { fprintf(stderr, "Failure at %u %s\n", __LINE__, __FILE__); exit(EXIT_FAILURE); } 86 | 87 | #if defined(__linux__) 88 | struct ipcHandle_st { 89 | int socket; 90 | char *socketName; 91 | }; 92 | typedef int ShareableHandle; 93 | #elif defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) 94 | struct ipcHandle_st { 95 | std::vector hMailslot; // 1 Handle in case of child and `num children` Handles for parent. 
96 | }; 97 | typedef HANDLE ShareableHandle; 98 | #endif 99 | 100 | typedef struct ipcHandle_st ipcHandle; 101 | 102 | int 103 | ipcCreateSocket(ipcHandle *&handle, const char *name, const std::vector& processes); 104 | 105 | int 106 | ipcOpenSocket(ipcHandle *&handle); 107 | 108 | int 109 | ipcCloseSocket(ipcHandle *handle); 110 | 111 | int 112 | ipcRecvShareableHandles(ipcHandle *handle, std::vector& shareableHandles); 113 | 114 | int 115 | ipcSendShareableHandles(ipcHandle *handle, const std::vector& shareableHandles, const std::vector& processes); 116 | 117 | int 118 | ipcCloseShareableHandle(ShareableHandle shHandle); 119 | 120 | #endif // HELPER_MULTIPROCESS_H 121 | -------------------------------------------------------------------------------- /labs/CFD/English/C/source_code/p2pBandwidthLatencyTest/Common/UtilNPP/SignalsNPP.h: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 2 | * 3 | * Redistribution and use in source and binary forms, with or without 4 | * modification, are permitted provided that the following conditions 5 | * are met: 6 | * * Redistributions of source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer. 8 | * * Redistributions in binary form must reproduce the above copyright 9 | * notice, this list of conditions and the following disclaimer in the 10 | * documentation and/or other materials provided with the distribution. 11 | * * Neither the name of NVIDIA CORPORATION nor the names of its 12 | * contributors may be used to endorse or promote products derived 13 | * from this software without specific prior written permission. 14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
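helper_multiprocess.h above only declares the shared-memory and process-spawning API. As a rough usage sketch, assuming it is linked against the matching helper_multiprocess implementation from the CUDA samples, that ./demo_child is a hypothetical child binary, and that the functions return 0 on success (error handling is kept minimal):

#include <cstdio>
#include <cstring>
#include "helper_multiprocess.h"

int main()
{
    // Parent: create a named shared-memory region, then spawn and wait for one child.
    sharedMemoryInfo shm;
    if (sharedMemoryCreate("demoShm", 4096, &shm) != 0) {   // name and size are illustrative
        std::fprintf(stderr, "sharedMemoryCreate failed\n");
        return 1;
    }
    std::memset(shm.addr, 0, shm.size);                     // zero the mapped region

    Process child;
    char *const childArgv[] = { nullptr };                  // hypothetical child binary, no extra args
    if (spawnProcess(&child, "./demo_child", childArgv) == 0) {
        waitProcess(&child);                                // block until the child exits
    }

    sharedMemoryClose(&shm);
    return 0;
}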
26 | */ 27 | 28 | 29 | #ifndef NV_UTIL_NPP_SIGNALS_NPP_H 30 | #define NV_UTIL_NPP_SIGNALS_NPP_H 31 | 32 | #include "Exceptions.h" 33 | #include "Signal.h" 34 | 35 | #include "SignalAllocatorsNPP.h" 36 | #include 37 | 38 | namespace npp 39 | { 40 | // forward declaration 41 | template class SignalCPU; 42 | 43 | template 44 | class SignalNPP: public npp::SignalTemplate > 45 | { 46 | public: 47 | SignalNPP() 48 | { 49 | ; 50 | } 51 | 52 | explicit 53 | SignalNPP(size_t nSize): SignalTemplate >(nSize) 54 | { 55 | ; 56 | } 57 | 58 | SignalNPP(const SignalNPP &rSignal): SignalTemplate >(rSignal) 59 | { 60 | ; 61 | } 62 | 63 | template 64 | explicit 65 | SignalNPP(const SignalCPU &rSignal): SignalTemplate >(rSignal.size()) 66 | { 67 | npp::SignalAllocator::HostToDeviceCopy1D(SignalTemplate >::values(), 68 | rSignal.values(), SignalTemplate >::size()); 69 | } 70 | 71 | virtual 72 | ~SignalNPP() 73 | { 74 | ; 75 | } 76 | 77 | SignalNPP & 78 | operator= (const SignalNPP &rSignal) 79 | { 80 | SignalTemplate >::operator= (rSignal); 81 | 82 | return *this; 83 | } 84 | 85 | void 86 | copyTo(D *pValues) 87 | const 88 | { 89 | npp::SignalAllocator::DeviceToHostCopy1D(pValues, SignalTemplate >::values(), SignalTemplate >::size()); 90 | } 91 | 92 | void 93 | copyFrom(D *pValues) 94 | { 95 | npp::SignalAllocator::HostToDeviceCopy1D(SignalTemplate >::values(), pValues, SignalTemplate >::size()); 96 | } 97 | }; 98 | 99 | typedef SignalNPP SignalNPP_8u; 100 | typedef SignalNPP SignalNPP_16s; 101 | typedef SignalNPP SignalNPP_16sc; 102 | typedef SignalNPP SignalNPP_32s; 103 | typedef SignalNPP SignalNPP_32sc; 104 | typedef SignalNPP SignalNPP_32f; 105 | typedef SignalNPP SignalNPP_32fc; 106 | typedef SignalNPP SignalNPP_64s; 107 | typedef SignalNPP SignalNPP_64sc; 108 | typedef SignalNPP SignalNPP_64f; 109 | typedef SignalNPP SignalNPP_64fc; 110 | 111 | } // npp namespace 112 | 113 | #endif // NV_UTIL_NPP_SIGNALS_NPP_H 114 | -------------------------------------------------------------------------------- /labs/CFD/English/C/source_code/p2pBandwidthLatencyTest/Common/GL/freeglut_ext.h: -------------------------------------------------------------------------------- 1 | #ifndef __FREEGLUT_EXT_H__ 2 | #define __FREEGLUT_EXT_H__ 3 | 4 | /* 5 | * freeglut_ext.h 6 | * 7 | * The non-GLUT-compatible extensions to the freeglut library include file 8 | * 9 | * Copyright (c) 1999-2000 Pawel W. Olszta. All Rights Reserved. 10 | * Written by Pawel W. Olszta, 11 | * Creation date: Thu Dec 2 1999 12 | * 13 | * Permission is hereby granted, free of charge, to any person obtaining a 14 | * copy of this software and associated documentation files (the "Software"), 15 | * to deal in the Software without restriction, including without limitation 16 | * the rights to use, copy, modify, merge, publish, distribute, sublicense, 17 | * and/or sell copies of the Software, and to permit persons to whom the 18 | * Software is furnished to do so, subject to the following conditions: 19 | * 20 | * The above copyright notice and this permission notice shall be included 21 | * in all copies or substantial portions of the Software. 22 | * 23 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS 24 | * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 25 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 26 | * PAWEL W. 
OLSZTA BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 27 | * IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 28 | * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 29 | */ 30 | 31 | #ifdef __cplusplus 32 | extern "C" { 33 | #endif 34 | 35 | /* 36 | * GLUT API Extension macro definitions -- behaviour when the user clicks on an "x" to close a window 37 | */ 38 | #define GLUT_ACTION_EXIT 0 39 | #define GLUT_ACTION_GLUTMAINLOOP_RETURNS 1 40 | #define GLUT_ACTION_CONTINUE_EXECUTION 2 41 | 42 | /* 43 | * Create a new rendering context when the user opens a new window? 44 | */ 45 | #define GLUT_CREATE_NEW_CONTEXT 0 46 | #define GLUT_USE_CURRENT_CONTEXT 1 47 | 48 | /* 49 | * GLUT API Extension macro definitions -- the glutGet parameters 50 | */ 51 | #define GLUT_ACTION_ON_WINDOW_CLOSE 0x01F9 52 | 53 | #define GLUT_WINDOW_BORDER_WIDTH 0x01FA 54 | #define GLUT_WINDOW_HEADER_HEIGHT 0x01FB 55 | 56 | #define GLUT_VERSION 0x01FC 57 | 58 | #define GLUT_RENDERING_CONTEXT 0x01FD 59 | 60 | /* 61 | * Process loop function, see freeglut_main.c 62 | */ 63 | FGAPI void FGAPIENTRY glutMainLoopEvent(void); 64 | FGAPI void FGAPIENTRY glutLeaveMainLoop(void); 65 | 66 | /* 67 | * Window-specific callback functions, see freeglut_callbacks.c 68 | */ 69 | FGAPI void FGAPIENTRY glutMouseWheelFunc(void (* callback)(int, int, int, int)); 70 | FGAPI void FGAPIENTRY glutCloseFunc(void (* callback)(void)); 71 | FGAPI void FGAPIENTRY glutWMCloseFunc(void (* callback)(void)); 72 | /* A. Donev: Also a destruction callback for menus */ 73 | FGAPI void FGAPIENTRY glutMenuDestroyFunc(void (* callback)(void)); 74 | 75 | /* 76 | * State setting and retrieval functions, see freeglut_state.c 77 | */ 78 | FGAPI void FGAPIENTRY glutSetOption(GLenum option_flag, int value) ; 79 | /* A.Donev: User-data manipulation */ 80 | FGAPI void *FGAPIENTRY glutGetWindowData(void); 81 | FGAPI void FGAPIENTRY glutSetWindowData(void *data); 82 | FGAPI void *FGAPIENTRY glutGetMenuData(void); 83 | FGAPI void FGAPIENTRY glutSetMenuData(void *data); 84 | 85 | /* 86 | * Font stuff, see freeglut_font.c 87 | */ 88 | FGAPI int FGAPIENTRY glutBitmapHeight(void *font); 89 | FGAPI GLfloat FGAPIENTRY glutStrokeHeight(void *font); 90 | FGAPI void FGAPIENTRY glutBitmapString(void *font, const unsigned char *string); 91 | FGAPI void FGAPIENTRY glutStrokeString(void *font, const unsigned char *string); 92 | 93 | /* 94 | * Geometry functions, see freeglut_geometry.c 95 | */ 96 | FGAPI void FGAPIENTRY glutWireRhombicDodecahedron(void); 97 | FGAPI void FGAPIENTRY glutSolidRhombicDodecahedron(void); 98 | FGAPI void FGAPIENTRY glutWireSierpinskiSponge(int num_levels, GLdouble offset[3], GLdouble scale) ; 99 | FGAPI void FGAPIENTRY glutSolidSierpinskiSponge(int num_levels, GLdouble offset[3], GLdouble scale) ; 100 | FGAPI void FGAPIENTRY glutWireCylinder(GLdouble radius, GLdouble height, GLint slices, GLint stacks); 101 | FGAPI void FGAPIENTRY glutSolidCylinder(GLdouble radius, GLdouble height, GLint slices, GLint stacks); 102 | 103 | /* 104 | * Extension functions, see freeglut_ext.c 105 | */ 106 | FGAPI void *FGAPIENTRY glutGetProcAddress(const char *procName); 107 | 108 | 109 | #ifdef __cplusplus 110 | } 111 | #endif 112 | 113 | /*** END OF FILE ***/ 114 | 115 | #endif /* __FREEGLUT_EXT_H__ */ 116 | -------------------------------------------------------------------------------- /labs/CFD/English/C/source_code/p2pBandwidthLatencyTest/Common/UtilNPP/ImagesCPU.h: 
-------------------------------------------------------------------------------- 1 | /* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 2 | * 3 | * Redistribution and use in source and binary forms, with or without 4 | * modification, are permitted provided that the following conditions 5 | * are met: 6 | * * Redistributions of source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer. 8 | * * Redistributions in binary form must reproduce the above copyright 9 | * notice, this list of conditions and the following disclaimer in the 10 | * documentation and/or other materials provided with the distribution. 11 | * * Neither the name of NVIDIA CORPORATION nor the names of its 12 | * contributors may be used to endorse or promote products derived 13 | * from this software without specific prior written permission. 14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | */ 27 | 28 | #ifndef NV_UTIL_NPP_IMAGES_CPU_H 29 | #define NV_UTIL_NPP_IMAGES_CPU_H 30 | 31 | #include "ImagePacked.h" 32 | 33 | #include "ImageAllocatorsCPU.h" 34 | #include "Exceptions.h" 35 | 36 | #include 37 | 38 | 39 | namespace npp 40 | { 41 | 42 | template 43 | class ImageCPU: public npp::ImagePacked 44 | { 45 | public: 46 | 47 | ImageCPU() 48 | { 49 | ; 50 | } 51 | 52 | ImageCPU(unsigned int nWidth, unsigned int nHeight): ImagePacked(nWidth, nHeight) 53 | { 54 | ; 55 | } 56 | 57 | explicit 58 | ImageCPU(const npp::Image::Size &rSize): ImagePacked(rSize) 59 | { 60 | ; 61 | } 62 | 63 | ImageCPU(const ImageCPU &rImage): Image(rImage) 64 | { 65 | ; 66 | } 67 | 68 | virtual 69 | ~ImageCPU() 70 | { 71 | ; 72 | } 73 | 74 | ImageCPU & 75 | operator= (const ImageCPU &rImage) 76 | { 77 | ImagePacked::operator= (rImage); 78 | 79 | return *this; 80 | } 81 | 82 | npp::Pixel & 83 | operator()(unsigned int iX, unsigned int iY) 84 | { 85 | return *ImagePacked::pixels(iX, iY); 86 | } 87 | 88 | npp::Pixel 89 | operator()(unsigned int iX, unsigned int iY) 90 | const 91 | { 92 | return *ImagePacked::pixels(iX, iY); 93 | } 94 | 95 | }; 96 | 97 | 98 | typedef ImageCPU > ImageCPU_8u_C1; 99 | typedef ImageCPU > ImageCPU_8u_C2; 100 | typedef ImageCPU > ImageCPU_8u_C3; 101 | typedef ImageCPU > ImageCPU_8u_C4; 102 | 103 | typedef ImageCPU > ImageCPU_16u_C1; 104 | typedef ImageCPU > ImageCPU_16u_C3; 105 | typedef ImageCPU > ImageCPU_16u_C4; 106 | 107 | typedef ImageCPU > ImageCPU_16s_C1; 108 | typedef ImageCPU > ImageCPU_16s_C3; 109 | typedef ImageCPU > ImageCPU_16s_C4; 110 | 111 | typedef ImageCPU > ImageCPU_32s_C1; 112 | typedef ImageCPU > ImageCPU_32s_C3; 113 | typedef ImageCPU > ImageCPU_32s_C4; 114 | 115 | typedef ImageCPU > ImageCPU_32f_C1; 116 | typedef ImageCPU > ImageCPU_32f_C3; 117 | typedef ImageCPU > 
ImageCPU_32f_C4; 118 | 119 | } // npp namespace 120 | 121 | #endif // NV_IMAGE_IPP_H 122 | -------------------------------------------------------------------------------- /labs/CFD/English/C/source_code/p2pBandwidthLatencyTest/Common/UtilNPP/Image.h: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 2 | * 3 | * Redistribution and use in source and binary forms, with or without 4 | * modification, are permitted provided that the following conditions 5 | * are met: 6 | * * Redistributions of source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer. 8 | * * Redistributions in binary form must reproduce the above copyright 9 | * notice, this list of conditions and the following disclaimer in the 10 | * documentation and/or other materials provided with the distribution. 11 | * * Neither the name of NVIDIA CORPORATION nor the names of its 12 | * contributors may be used to endorse or promote products derived 13 | * from this software without specific prior written permission. 14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
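The ImagesCPU.h and Image.h headers above also lose their template parameter lists in this export. Assuming the NVIDIA CUDA Samples originals (where ImageCPU_8u_C1 is an 8-bit, single-channel host image) and the CUDA/NPP headers on the include path, a typical host-side usage looks roughly like this:

#include <cstdio>
#include "ImagesCPU.h"   // requires the UtilNPP headers above plus the NPP SDK

int main()
{
    npp::ImageCPU_8u_C1 img(640, 480);       // host image, 8-bit, single channel

    // operator() returns a reference to the npp::Pixel at (x, y);
    // for the single-channel pixel the payload is its .x member.
    img(10, 20).x = 255;

    std::printf("size = %u x %u, pixel(10,20) = %u\n",
                img.width(), img.height(), (unsigned)img(10, 20).x);
    return 0;
}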
26 | */ 27 | 28 | #ifndef NV_UTIL_NPP_IMAGE_H 29 | #define NV_UTIL_NPP_IMAGE_H 30 | 31 | #include 32 | 33 | namespace npp 34 | { 35 | 36 | class Image 37 | { 38 | public: 39 | struct Size 40 | { 41 | unsigned int nWidth; 42 | unsigned int nHeight; 43 | 44 | Size() : nWidth(0), nHeight(0) 45 | { }; 46 | 47 | Size(unsigned int nWidthNew, unsigned nHeightNew) : nWidth(nWidthNew), nHeight(nHeightNew) 48 | { }; 49 | 50 | Size(const Size &rSize) : nWidth(rSize.nWidth), nHeight(rSize.nHeight) 51 | { }; 52 | 53 | Size & 54 | operator= (const Size &rSize) 55 | { 56 | if (&rSize == this) 57 | { 58 | return *this; 59 | } 60 | 61 | nWidth = rSize.nWidth; 62 | nHeight = rSize.nHeight; 63 | 64 | return *this; 65 | } 66 | 67 | void 68 | swap(Size &rSize) 69 | { 70 | unsigned int nTemp; 71 | nTemp = nWidth; 72 | nWidth = rSize.nWidth; 73 | rSize.nWidth = nTemp; 74 | 75 | nTemp = nHeight; 76 | nHeight = rSize.nHeight; 77 | rSize.nHeight = nTemp; 78 | } 79 | }; 80 | 81 | Image() 82 | { }; 83 | 84 | Image(unsigned int nWidth, unsigned int nHeight) : oSize_(nWidth, nHeight) 85 | { }; 86 | 87 | Image(const Image::Size &rSize) : oSize_(rSize) 88 | { }; 89 | 90 | Image(const Image &rImage) : oSize_(rImage.oSize_) 91 | { }; 92 | 93 | virtual 94 | ~Image() 95 | { }; 96 | 97 | Image & 98 | operator= (const Image &rImage) 99 | { 100 | if (&rImage == this) 101 | { 102 | return *this; 103 | } 104 | 105 | oSize_ = rImage.oSize_; 106 | return *this; 107 | }; 108 | 109 | unsigned int 110 | width() 111 | const 112 | { 113 | return oSize_.nWidth; 114 | } 115 | 116 | unsigned int 117 | height() 118 | const 119 | { 120 | return oSize_.nHeight; 121 | } 122 | 123 | Size 124 | size() 125 | const 126 | { 127 | return oSize_; 128 | } 129 | 130 | void 131 | swap(Image &rImage) 132 | { 133 | oSize_.swap(rImage.oSize_); 134 | } 135 | 136 | private: 137 | Size oSize_; 138 | }; 139 | 140 | bool 141 | operator== (const Image::Size &rFirst, const Image::Size &rSecond) 142 | { 143 | return rFirst.nWidth == rSecond.nWidth && rFirst.nHeight == rSecond.nHeight; 144 | } 145 | 146 | bool 147 | operator!= (const Image::Size &rFirst, const Image::Size &rSecond) 148 | { 149 | return rFirst.nWidth != rSecond.nWidth || rFirst.nHeight != rSecond.nHeight; 150 | } 151 | 152 | } // npp namespace 153 | 154 | 155 | #endif // NV_UTIL_NPP_IMAGE_H 156 | -------------------------------------------------------------------------------- /labs/CFD/English/C/source_code/mpi/jacobi_kernels.cu: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved. 2 | * 3 | * Redistribution and use in source and binary forms, with or without 4 | * modification, are permitted provided that the following conditions 5 | * are met: 6 | * * Redistributions of source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer. 8 | * * Redistributions in binary form must reproduce the above copyright 9 | * notice, this list of conditions and the following disclaimer in the 10 | * documentation and/or other materials provided with the distribution. 11 | * * Neither the name of NVIDIA CORPORATION nor the names of its 12 | * contributors may be used to endorse or promote products derived 13 | * from this software without specific prior written permission. 
14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | */ 27 | #include 28 | 29 | #define BLOCK_DIM_X 32 30 | #define BLOCK_DIM_Y 32 31 | 32 | #define CUDA_RT_CALL(call) \ 33 | { \ 34 | cudaError_t cudaStatus = call; \ 35 | if (cudaSuccess != cudaStatus) \ 36 | fprintf(stderr, \ 37 | "ERROR: CUDA RT call \"%s\" in line %d of file %s failed " \ 38 | "with " \ 39 | "%s (%d).\n", \ 40 | #call, __LINE__, __FILE__, cudaGetErrorString(cudaStatus), cudaStatus); \ 41 | } 42 | 43 | __global__ void initialize_boundaries(float* a_new, float* a, const float pi, const int offset, 44 | const int nx, const int my_ny, const int ny) { 45 | for (int iy = blockIdx.x * blockDim.x + threadIdx.x; iy < my_ny; iy += blockDim.x * gridDim.x) { 46 | const float y0 = sin(2.0 * pi * (offset + iy) / (ny - 1)); 47 | a[iy * nx + 0] = y0; 48 | a[iy * nx + (nx - 1)] = y0; 49 | a_new[iy * nx + 0] = y0; 50 | a_new[iy * nx + (nx - 1)] = y0; 51 | } 52 | } 53 | 54 | __global__ void jacobi_kernel(float* a_new, const float* a, float* l2_norm, const int iy_start, 55 | const int iy_end, const int nx) { 56 | int iy = blockIdx.y * blockDim.y + threadIdx.y + iy_start; 57 | int ix = blockIdx.x * blockDim.x + threadIdx.x + 1; 58 | __shared__ float block_l2_sum[BLOCK_DIM_X*BLOCK_DIM_Y]; 59 | unsigned thread_index = threadIdx.y*BLOCK_DIM_X + threadIdx.x; 60 | 61 | if (iy < iy_end && ix < (nx - 1)) { 62 | // Update grid point 63 | const float new_val = 0.25 * (a[iy * nx + ix + 1] + a[iy * nx + ix - 1] + 64 | a[(iy + 1) * nx + ix] + a[(iy - 1) * nx + ix]); 65 | a_new[iy * nx + ix] = new_val; 66 | float residue = new_val - a[iy * nx + ix]; 67 | // Set block-level L2 norm value for this grid point 68 | block_l2_sum[thread_index] = residue * residue; 69 | } 70 | else { 71 | block_l2_sum[thread_index] = 0; 72 | } 73 | // Reduce L2 norm for the block in parallel 74 | for (unsigned stride = 1; stride < BLOCK_DIM_X*BLOCK_DIM_Y; stride *= 2) { 75 | __syncthreads(); 76 | if ((thread_index) % (2*stride) == 0) { 77 | block_l2_sum[thread_index] += block_l2_sum[thread_index + stride]; 78 | } 79 | } 80 | // Atomically update global L2 norm with block-reduced L2 norm 81 | if (thread_index == 0) { 82 | atomicAdd(l2_norm, block_l2_sum[0]); 83 | } 84 | } 85 | 86 | void launch_initialize_boundaries(float* a_new, float* a, const float pi, const int offset, 87 | const int nx, const int my_ny, const int ny){ 88 | initialize_boundaries<<>>(a_new, a, pi, offset, nx, my_ny, ny); 89 | } 90 | 91 | void launch_jacobi_kernel(float* a_new, const float* a, float* l2_norm, const int iy_start, 92 | const int iy_end, const int nx) { 93 | dim3 dim_block(BLOCK_DIM_X, BLOCK_DIM_Y, 1); 94 | dim3 dim_grid((nx + BLOCK_DIM_X - 1) / BLOCK_DIM_X, 95 | ((iy_end - iy_start) + BLOCK_DIM_Y - 1) / BLOCK_DIM_Y, 1); 96 | 
jacobi_kernel<<>>(a_new, a, l2_norm, iy_start, iy_end, nx); 97 | } 98 | -------------------------------------------------------------------------------- /labs/CFD/English/C/source_code/mpi/containerization/jacobi_kernels.cu: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved. 2 | * 3 | * Redistribution and use in source and binary forms, with or without 4 | * modification, are permitted provided that the following conditions 5 | * are met: 6 | * * Redistributions of source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer. 8 | * * Redistributions in binary form must reproduce the above copyright 9 | * notice, this list of conditions and the following disclaimer in the 10 | * documentation and/or other materials provided with the distribution. 11 | * * Neither the name of NVIDIA CORPORATION nor the names of its 12 | * contributors may be used to endorse or promote products derived 13 | * from this software without specific prior written permission. 14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
26 | */ 27 | #include 28 | 29 | #define BLOCK_DIM_X 32 30 | #define BLOCK_DIM_Y 32 31 | 32 | #define CUDA_RT_CALL(call) \ 33 | { \ 34 | cudaError_t cudaStatus = call; \ 35 | if (cudaSuccess != cudaStatus) \ 36 | fprintf(stderr, \ 37 | "ERROR: CUDA RT call \"%s\" in line %d of file %s failed " \ 38 | "with " \ 39 | "%s (%d).\n", \ 40 | #call, __LINE__, __FILE__, cudaGetErrorString(cudaStatus), cudaStatus); \ 41 | } 42 | 43 | __global__ void initialize_boundaries(float* a_new, float* a, const float pi, const int offset, 44 | const int nx, const int my_ny, const int ny) { 45 | for (int iy = blockIdx.x * blockDim.x + threadIdx.x; iy < my_ny; iy += blockDim.x * gridDim.x) { 46 | const float y0 = sin(2.0 * pi * (offset + iy) / (ny - 1)); 47 | a[iy * nx + 0] = y0; 48 | a[iy * nx + (nx - 1)] = y0; 49 | a_new[iy * nx + 0] = y0; 50 | a_new[iy * nx + (nx - 1)] = y0; 51 | } 52 | } 53 | 54 | __global__ void jacobi_kernel(float* a_new, const float* a, float* l2_norm, const int iy_start, 55 | const int iy_end, const int nx) { 56 | int iy = blockIdx.y * blockDim.y + threadIdx.y + iy_start; 57 | int ix = blockIdx.x * blockDim.x + threadIdx.x + 1; 58 | __shared__ float block_l2_sum[BLOCK_DIM_X*BLOCK_DIM_Y]; 59 | unsigned thread_index = threadIdx.y*BLOCK_DIM_X + threadIdx.x; 60 | 61 | if (iy < iy_end && ix < (nx - 1)) { 62 | // Update grid point 63 | const float new_val = 0.25 * (a[iy * nx + ix + 1] + a[iy * nx + ix - 1] + 64 | a[(iy + 1) * nx + ix] + a[(iy - 1) * nx + ix]); 65 | a_new[iy * nx + ix] = new_val; 66 | float residue = new_val - a[iy * nx + ix]; 67 | // Set block-level L2 norm value for this grid point 68 | block_l2_sum[thread_index] = residue * residue; 69 | } 70 | else { 71 | block_l2_sum[thread_index] = 0; 72 | } 73 | // Reduce L2 norm for the block in parallel 74 | for (unsigned stride = 1; stride < BLOCK_DIM_X*BLOCK_DIM_Y; stride *= 2) { 75 | __syncthreads(); 76 | if ((thread_index) % (2*stride) == 0) { 77 | block_l2_sum[thread_index] += block_l2_sum[thread_index + stride]; 78 | } 79 | } 80 | // Atomically update global L2 norm with block-reduced L2 norm 81 | if (thread_index == 0) { 82 | atomicAdd(l2_norm, block_l2_sum[0]); 83 | } 84 | } 85 | 86 | void launch_initialize_boundaries(float* a_new, float* a, const float pi, const int offset, 87 | const int nx, const int my_ny, const int ny){ 88 | initialize_boundaries<<>>(a_new, a, pi, offset, nx, my_ny, ny); 89 | } 90 | 91 | void launch_jacobi_kernel(float* a_new, const float* a, float* l2_norm, const int iy_start, 92 | const int iy_end, const int nx) { 93 | dim3 dim_block(BLOCK_DIM_X, BLOCK_DIM_Y, 1); 94 | dim3 dim_grid((nx + BLOCK_DIM_X - 1) / BLOCK_DIM_X, 95 | ((iy_end - iy_start) + BLOCK_DIM_Y - 1) / BLOCK_DIM_Y, 1); 96 | jacobi_kernel<<>>(a_new, a, l2_norm, iy_start, iy_end, nx); 97 | } 98 | -------------------------------------------------------------------------------- /labs/CFD/English/C/source_code/nccl/jacobi_kernels.cu: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 2 | * 3 | * Redistribution and use in source and binary forms, with or without 4 | * modification, are permitted provided that the following conditions 5 | * are met: 6 | * * Redistributions of source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer. 
8 | * * Redistributions in binary form must reproduce the above copyright 9 | * notice, this list of conditions and the following disclaimer in the 10 | * documentation and/or other materials provided with the distribution. 11 | * * Neither the name of NVIDIA CORPORATION nor the names of its 12 | * contributors may be used to endorse or promote products derived 13 | * from this software without specific prior written permission. 14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | */ 27 | #include 28 | 29 | #define BLOCK_DIM_X 32 30 | #define BLOCK_DIM_Y 32 31 | 32 | #define CUDA_RT_CALL(call) \ 33 | { \ 34 | cudaError_t cudaStatus = call; \ 35 | if (cudaSuccess != cudaStatus) \ 36 | fprintf(stderr, \ 37 | "ERROR: CUDA RT call \"%s\" in line %d of file %s failed " \ 38 | "with " \ 39 | "%s (%d).\n", \ 40 | #call, __LINE__, __FILE__, cudaGetErrorString(cudaStatus), cudaStatus); \ 41 | } 42 | 43 | __global__ void initialize_boundaries(float* a_new, float* a, const float pi, const int offset, 44 | const int nx, const int my_ny, const int ny) { 45 | for (int iy = blockIdx.x * blockDim.x + threadIdx.x; iy < my_ny; iy += blockDim.x * gridDim.x) { 46 | const float y0 = sin(2.0 * pi * (offset + iy) / (ny - 1)); 47 | a[iy * nx + 0] = y0; 48 | a[iy * nx + (nx - 1)] = y0; 49 | a_new[iy * nx + 0] = y0; 50 | a_new[iy * nx + (nx - 1)] = y0; 51 | } 52 | } 53 | 54 | __global__ void jacobi_kernel(float* a_new, const float* a, float* l2_norm, const int iy_start, 55 | const int iy_end, const int nx) { 56 | int iy = blockIdx.y * blockDim.y + threadIdx.y + iy_start; 57 | int ix = blockIdx.x * blockDim.x + threadIdx.x + 1; 58 | __shared__ float block_l2_sum[BLOCK_DIM_X*BLOCK_DIM_Y]; 59 | unsigned thread_index = threadIdx.y*BLOCK_DIM_X + threadIdx.x; 60 | 61 | if (iy < iy_end && ix < (nx - 1)) { 62 | // Update grid point 63 | const float new_val = 0.25 * (a[iy * nx + ix + 1] + a[iy * nx + ix - 1] + 64 | a[(iy + 1) * nx + ix] + a[(iy - 1) * nx + ix]); 65 | a_new[iy * nx + ix] = new_val; 66 | float residue = new_val - a[iy * nx + ix]; 67 | // Set block-level L2 norm value for this grid point 68 | block_l2_sum[thread_index] = residue * residue; 69 | } 70 | else { 71 | block_l2_sum[thread_index] = 0; 72 | } 73 | // Reduce L2 norm for the block in parallel 74 | for (unsigned stride = 1; stride < BLOCK_DIM_X*BLOCK_DIM_Y; stride *= 2) { 75 | __syncthreads(); 76 | if ((thread_index) % (2*stride) == 0) { 77 | block_l2_sum[thread_index] += block_l2_sum[thread_index + stride]; 78 | } 79 | } 80 | // Atomically update global L2 norm with block-reduced L2 norm 81 | if (thread_index == 0) { 82 | atomicAdd(l2_norm, block_l2_sum[0]); 83 | } 84 | } 85 | 86 | void launch_initialize_boundaries(float* a_new, float* a, const float pi, const int offset, 87 | const int nx, 
const int my_ny, const int ny) { 88 | initialize_boundaries<<>>(a_new, a, pi, offset, nx, my_ny, ny); 89 | } 90 | 91 | void launch_jacobi_kernel(float* a_new, const float* a, float* l2_norm, const int iy_start, 92 | const int iy_end, const int nx, cudaStream_t stream) { 93 | dim3 dim_block(BLOCK_DIM_X, BLOCK_DIM_Y, 1); 94 | dim3 dim_grid((nx + BLOCK_DIM_X - 1) / BLOCK_DIM_X, 95 | ((iy_end - iy_start) + BLOCK_DIM_Y - 1) / BLOCK_DIM_Y, 1); 96 | jacobi_kernel<<>>(a_new, a, l2_norm, iy_start, iy_end, nx); 97 | } 98 | 99 | -------------------------------------------------------------------------------- /labs/CFD/English/C/source_code/p2pBandwidthLatencyTest/Common/rendercheck_d3d11.cpp: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 2 | * 3 | * Redistribution and use in source and binary forms, with or without 4 | * modification, are permitted provided that the following conditions 5 | * are met: 6 | * * Redistributions of source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer. 8 | * * Redistributions in binary form must reproduce the above copyright 9 | * notice, this list of conditions and the following disclaimer in the 10 | * documentation and/or other materials provided with the distribution. 11 | * * Neither the name of NVIDIA CORPORATION nor the names of its 12 | * contributors may be used to endorse or promote products derived 13 | * from this software without specific prior written permission. 14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | */ 27 | 28 | //////////////////////////////////////////////////////////////////////////////// 29 | // 30 | // Utility funcs to wrap up saving a surface or the back buffer as a PPM file 31 | // In addition, wraps up a threshold comparision of two PPMs. 32 | // 33 | // These functions are designed to be used to implement an automated QA testing for SDK samples. 34 | // 35 | // Author: Bryan Dudash 36 | // Email: sdkfeedback@nvidia.com 37 | // 38 | // Copyright (c) NVIDIA Corporation. All rights reserved. 
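The three jacobi_kernels.cu variants above only define the launch_* wrappers; the drivers that call them live in the corresponding jacobi.cpp/jacobi.cu sources, which are not reproduced in this dump. The following is a hypothetical single-GPU driver sketch against the NCCL-variant prototypes (the ones taking a cudaStream_t), with made-up grid sizes and no error checking, to show how the L2 norm accumulated by jacobi_kernel is typically reset, read back, and used as a convergence test:

#include <cmath>
#include <cstdio>
#include <utility>
#include <cuda_runtime.h>

// Launcher prototypes as defined in the NCCL-variant jacobi_kernels.cu above.
void launch_initialize_boundaries(float* a_new, float* a, const float pi, const int offset,
                                  const int nx, const int my_ny, const int ny);
void launch_jacobi_kernel(float* a_new, const float* a, float* l2_norm, const int iy_start,
                          const int iy_end, const int nx, cudaStream_t stream);

int main()
{
    const int nx = 1024, ny = 1024;            // illustrative grid size
    const int iy_start = 1, iy_end = ny - 1;   // a single rank owns the whole interior
    const float pi = 2.0f * std::asin(1.0f);

    float *a, *a_new, *l2_norm_d;
    cudaMalloc(&a, nx * ny * sizeof(float));
    cudaMalloc(&a_new, nx * ny * sizeof(float));
    cudaMalloc(&l2_norm_d, sizeof(float));
    cudaMemset(a, 0, nx * ny * sizeof(float));
    cudaMemset(a_new, 0, nx * ny * sizeof(float));

    cudaStream_t stream;
    cudaStreamCreate(&stream);

    // Dirichlet boundary values on the left/right columns of both grids.
    launch_initialize_boundaries(a_new, a, pi, 0, nx, ny, ny);

    float l2_norm_h = 1.0f;
    for (int iter = 0; iter < 1000 && l2_norm_h > 1.0e-4f; ++iter) {
        cudaMemsetAsync(l2_norm_d, 0, sizeof(float), stream);             // reset the accumulator
        launch_jacobi_kernel(a_new, a, l2_norm_d, iy_start, iy_end, nx, stream);
        cudaMemcpyAsync(&l2_norm_h, l2_norm_d, sizeof(float), cudaMemcpyDeviceToHost, stream);
        cudaStreamSynchronize(stream);
        l2_norm_h = std::sqrt(l2_norm_h);      // the kernel accumulated squared residuals
        std::swap(a, a_new);                   // ping-pong the two grids
    }
    std::printf("final L2 norm: %f\n", l2_norm_h);

    cudaStreamDestroy(stream);
    cudaFree(a); cudaFree(a_new); cudaFree(l2_norm_d);
    return 0;
}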
39 | //////////////////////////////////////////////////////////////////////////////// 40 | 41 | #include 42 | #include 43 | 44 | HRESULT CheckRenderD3D11::ActiveRenderTargetToPPM(ID3D11Device *pDevice, const char *zFileName) 45 | { 46 | ID3D11DeviceContext *pDeviceCtxt; 47 | pDevice->GetImmediateContext(&pDeviceCtxt); 48 | ID3D11RenderTargetView *pRTV = NULL; 49 | pDeviceCtxt->OMGetRenderTargets(1,&pRTV,NULL); 50 | 51 | ID3D11Resource *pSourceResource = NULL; 52 | pRTV->GetResource(&pSourceResource); 53 | 54 | return ResourceToPPM(pDevice,pSourceResource,zFileName); 55 | } 56 | 57 | HRESULT CheckRenderD3D11::ResourceToPPM(ID3D11Device *pDevice, ID3D11Resource *pResource, const char *zFileName) 58 | { 59 | ID3D11DeviceContext *pDeviceCtxt; 60 | pDevice->GetImmediateContext(&pDeviceCtxt); 61 | D3D11_RESOURCE_DIMENSION rType; 62 | pResource->GetType(&rType); 63 | 64 | if (rType != D3D11_RESOURCE_DIMENSION_TEXTURE2D) 65 | { 66 | printf("SurfaceToPPM: pResource is not a 2D texture! Aborting...\n"); 67 | return E_FAIL; 68 | } 69 | 70 | ID3D11Texture2D *pSourceTexture = (ID3D11Texture2D *)pResource; 71 | ID3D11Texture2D *pTargetTexture = NULL; 72 | 73 | D3D11_TEXTURE2D_DESC desc; 74 | pSourceTexture->GetDesc(&desc); 75 | desc.BindFlags = 0; 76 | desc.CPUAccessFlags = D3D11_CPU_ACCESS_READ; 77 | desc.Usage = D3D11_USAGE_STAGING; 78 | 79 | if (FAILED(pDevice->CreateTexture2D(&desc,NULL,&pTargetTexture))) 80 | { 81 | printf("SurfaceToPPM: Unable to create target Texture resoruce! Aborting... \n"); 82 | return E_FAIL; 83 | } 84 | 85 | pDeviceCtxt->CopyResource(pTargetTexture,pSourceTexture); 86 | 87 | D3D11_MAPPED_SUBRESOURCE mappedTex2D; 88 | pDeviceCtxt->Map(pTargetTexture, 0, D3D11_MAP_READ,0,&mappedTex2D); 89 | 90 | // Need to convert from dx pitch to pitch=width 91 | unsigned char *pPPMData = new unsigned char[desc.Width*desc.Height*4]; 92 | 93 | for (unsigned int iHeight = 0; iHeightUnmap(pTargetTexture, 0); 99 | 100 | // Prepends the PPM header info and bumps byte data afterwards 101 | sdkSavePPM4ub(zFileName, pPPMData, desc.Width, desc.Height); 102 | 103 | delete [] pPPMData; 104 | pTargetTexture->Release(); 105 | 106 | return S_OK; 107 | } 108 | 109 | bool CheckRenderD3D11::PPMvsPPM(const char *src_file, const char *ref_file, const char *exec_path, 110 | const float epsilon, const float threshold) 111 | { 112 | char *ref_file_path = sdkFindFilePath(ref_file, exec_path); 113 | 114 | if (ref_file_path == NULL) 115 | { 116 | printf("CheckRenderD3D11::PPMvsPPM unable to find <%s> in <%s> Aborting comparison!\n", ref_file, exec_path); 117 | printf(">>> Check info.xml and [project//data] folder <%s> <<<\n", ref_file); 118 | printf("Aborting comparison!\n"); 119 | printf(" FAILURE!\n"); 120 | return false; 121 | } 122 | 123 | return sdkComparePPM(src_file,ref_file_path,epsilon,threshold,true) == true; 124 | } -------------------------------------------------------------------------------- /labs/CFD/English/C/source_code/p2pBandwidthLatencyTest/Common/UtilNPP/Signal.h: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 2 | * 3 | * Redistribution and use in source and binary forms, with or without 4 | * modification, are permitted provided that the following conditions 5 | * are met: 6 | * * Redistributions of source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer. 
8 | * * Redistributions in binary form must reproduce the above copyright 9 | * notice, this list of conditions and the following disclaimer in the 10 | * documentation and/or other materials provided with the distribution. 11 | * * Neither the name of NVIDIA CORPORATION nor the names of its 12 | * contributors may be used to endorse or promote products derived 13 | * from this software without specific prior written permission. 14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | */ 27 | 28 | 29 | #ifndef NV_UTIL_NPP_SIGNAL_H 30 | #define NV_UTIL_NPP_SIGNAL_H 31 | 32 | #include 33 | 34 | namespace npp 35 | { 36 | class Signal 37 | { 38 | public: 39 | Signal() : nSize_(0) 40 | { }; 41 | 42 | explicit 43 | Signal(size_t nSize) : nSize_(nSize) 44 | { }; 45 | 46 | Signal(const Signal &rSignal) : nSize_(rSignal.nSize_) 47 | { }; 48 | 49 | virtual 50 | ~Signal() 51 | { } 52 | 53 | Signal & 54 | operator= (const Signal &rSignal) 55 | { 56 | nSize_ = rSignal.nSize_; 57 | return *this; 58 | } 59 | 60 | size_t 61 | size() 62 | const 63 | { 64 | return nSize_; 65 | } 66 | 67 | void 68 | swap(Signal &rSignal) 69 | { 70 | size_t nTemp = nSize_; 71 | nSize_ = rSignal.nSize_; 72 | rSignal.nSize_ = nTemp; 73 | } 74 | 75 | 76 | private: 77 | size_t nSize_; 78 | }; 79 | 80 | template 81 | class SignalTemplate: public Signal 82 | { 83 | public: 84 | typedef D tData; 85 | 86 | SignalTemplate(): aValues_(0) 87 | { 88 | ; 89 | } 90 | 91 | SignalTemplate(size_t nSize): Signal(nSize) 92 | , aValues_(0) 93 | { 94 | aValues_ = A::Malloc1D(size()); 95 | } 96 | 97 | SignalTemplate(const SignalTemplate &rSignal): Signal(rSignal) 98 | , aValues_(0) 99 | { 100 | aValues_ = A::Malloc1D(size()); 101 | A::Copy1D(aValues_, rSignal.values(), size()); 102 | } 103 | 104 | virtual 105 | ~SignalTemplate() 106 | { 107 | A::Free1D(aValues_); 108 | } 109 | 110 | SignalTemplate & 111 | operator= (const SignalTemplate &rSignal) 112 | { 113 | // in case of self-assignment 114 | if (&rSignal == this) 115 | { 116 | return *this; 117 | } 118 | 119 | A::Free1D(aValues_); 120 | this->aPixels_ = 0; 121 | 122 | // assign parent class's data fields (width, height) 123 | Signal::operator =(rSignal); 124 | 125 | aValues_ = A::Malloc1D(size()); 126 | A::Copy1D(aValues_, rSignal.value(), size()); 127 | 128 | return *this; 129 | } 130 | 131 | /// Get a pointer to the pixel array. 132 | /// The result pointer can be offset to pixel at position (x, y) and 133 | /// even negative offsets are allowed. 134 | /// \param nX Horizontal pointer/array offset. 135 | /// \param nY Vertical pointer/array offset. 136 | /// \return Pointer to the pixel array (or first pixel in array with coordinates (nX, nY). 
137 | tData * 138 | values(int i = 0) 139 | { 140 | return aValues_ + i; 141 | } 142 | 143 | const 144 | tData * 145 | values(int i = 0) 146 | const 147 | { 148 | return aValues_ + i; 149 | } 150 | 151 | void 152 | swap(SignalTemplate &rSignal) 153 | { 154 | Signal::swap(rSignal); 155 | 156 | tData *aTemp = this->aValues_; 157 | this->aValues_ = rSignal.aValues_; 158 | rSignal.aValues_ = aTemp; 159 | } 160 | 161 | private: 162 | D *aValues_; 163 | }; 164 | 165 | } // npp namespace 166 | 167 | 168 | #endif // NV_UTIL_NPP_SIGNAL_H 169 | -------------------------------------------------------------------------------- /labs/CFD/English/C/source_code/p2pBandwidthLatencyTest/Common/helper_cusolver.h: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 2 | * 3 | * Redistribution and use in source and binary forms, with or without 4 | * modification, are permitted provided that the following conditions 5 | * are met: 6 | * * Redistributions of source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer. 8 | * * Redistributions in binary form must reproduce the above copyright 9 | * notice, this list of conditions and the following disclaimer in the 10 | * documentation and/or other materials provided with the distribution. 11 | * * Neither the name of NVIDIA CORPORATION nor the names of its 12 | * contributors may be used to endorse or promote products derived 13 | * from this software without specific prior written permission. 14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | */ 27 | 28 | #ifndef HELPER_CUSOLVER 29 | #define HELPER_CUSOLVER 30 | 31 | #include 32 | #include 33 | #include 34 | #include 35 | #include 36 | #include 37 | 38 | #include "cusparse.h" 39 | 40 | #define SWITCH_CHAR '-' 41 | 42 | struct testOpts { 43 | char *sparse_mat_filename; // by switch -F 44 | const char *testFunc; // by switch -R 45 | const char *reorder; // by switch -P 46 | int lda; // by switch -lda 47 | }; 48 | 49 | double vec_norminf(int n, const double *x) { 50 | double norminf = 0; 51 | for (int j = 0; j < n; j++) { 52 | double x_abs = fabs(x[j]); 53 | norminf = (norminf > x_abs) ? norminf : x_abs; 54 | } 55 | return norminf; 56 | } 57 | 58 | /* 59 | * |A| = max { |A|*ones(m,1) } 60 | */ 61 | double mat_norminf(int m, int n, const double *A, int lda) { 62 | double norminf = 0; 63 | for (int i = 0; i < m; i++) { 64 | double sum = 0.0; 65 | for (int j = 0; j < n; j++) { 66 | double A_abs = fabs(A[i + j * lda]); 67 | sum += A_abs; 68 | } 69 | norminf = (norminf > sum) ? 
norminf : sum; 70 | } 71 | return norminf; 72 | } 73 | 74 | /* 75 | * |A| = max { |A|*ones(m,1) } 76 | */ 77 | double csr_mat_norminf(int m, int n, int nnzA, const cusparseMatDescr_t descrA, 78 | const double *csrValA, const int *csrRowPtrA, 79 | const int *csrColIndA) { 80 | const int baseA = 81 | (CUSPARSE_INDEX_BASE_ONE == cusparseGetMatIndexBase(descrA)) ? 1 : 0; 82 | 83 | double norminf = 0; 84 | for (int i = 0; i < m; i++) { 85 | double sum = 0.0; 86 | const int start = csrRowPtrA[i] - baseA; 87 | const int end = csrRowPtrA[i + 1] - baseA; 88 | for (int colidx = start; colidx < end; colidx++) { 89 | // const int j = csrColIndA[colidx] - baseA; 90 | double A_abs = fabs(csrValA[colidx]); 91 | sum += A_abs; 92 | } 93 | norminf = (norminf > sum) ? norminf : sum; 94 | } 95 | return norminf; 96 | } 97 | 98 | void display_matrix(int m, int n, int nnzA, const cusparseMatDescr_t descrA, 99 | const double *csrValA, const int *csrRowPtrA, 100 | const int *csrColIndA) { 101 | const int baseA = 102 | (CUSPARSE_INDEX_BASE_ONE == cusparseGetMatIndexBase(descrA)) ? 1 : 0; 103 | 104 | printf("m = %d, n = %d, nnz = %d, matlab base-1\n", m, n, nnzA); 105 | 106 | for (int row = 0; row < m; row++) { 107 | const int start = csrRowPtrA[row] - baseA; 108 | const int end = csrRowPtrA[row + 1] - baseA; 109 | for (int colidx = start; colidx < end; colidx++) { 110 | const int col = csrColIndA[colidx] - baseA; 111 | double Areg = csrValA[colidx]; 112 | printf("A(%d, %d) = %20.16E\n", row + 1, col + 1, Areg); 113 | } 114 | } 115 | } 116 | 117 | #if defined(_WIN32) 118 | #if !defined(WIN32_LEAN_AND_MEAN) 119 | #define WIN32_LEAN_AND_MEAN 120 | #endif 121 | #include 122 | double second(void) { 123 | LARGE_INTEGER t; 124 | static double oofreq; 125 | static int checkedForHighResTimer; 126 | static BOOL hasHighResTimer; 127 | 128 | if (!checkedForHighResTimer) { 129 | hasHighResTimer = QueryPerformanceFrequency(&t); 130 | oofreq = 1.0 / (double)t.QuadPart; 131 | checkedForHighResTimer = 1; 132 | } 133 | if (hasHighResTimer) { 134 | QueryPerformanceCounter(&t); 135 | return (double)t.QuadPart * oofreq; 136 | } else { 137 | return (double)GetTickCount() / 1000.0; 138 | } 139 | } 140 | 141 | #elif defined(__linux__) || defined(__QNX__) 142 | #include 143 | #include 144 | #include 145 | double second(void) { 146 | struct timeval tv; 147 | gettimeofday(&tv, NULL); 148 | return (double)tv.tv_sec + (double)tv.tv_usec / 1000000.0; 149 | } 150 | 151 | #elif defined(__APPLE__) 152 | #include 153 | #include 154 | #include 155 | #include 156 | #include 157 | double second(void) { 158 | struct timeval tv; 159 | gettimeofday(&tv, NULL); 160 | return (double)tv.tv_sec + (double)tv.tv_usec / 1000000.0; 161 | } 162 | #else 163 | #error unsupported platform 164 | #endif 165 | 166 | #endif 167 | -------------------------------------------------------------------------------- /labs/CFD/English/C/source_code/p2pBandwidthLatencyTest/Common/UtilNPP/ImageIO.h: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 2 | * 3 | * Redistribution and use in source and binary forms, with or without 4 | * modification, are permitted provided that the following conditions 5 | * are met: 6 | * * Redistributions of source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer. 
8 | * * Redistributions in binary form must reproduce the above copyright 9 | * notice, this list of conditions and the following disclaimer in the 10 | * documentation and/or other materials provided with the distribution. 11 | * * Neither the name of NVIDIA CORPORATION nor the names of its 12 | * contributors may be used to endorse or promote products derived 13 | * from this software without specific prior written permission. 14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | */ 27 | 28 | #ifndef NV_UTIL_NPP_IMAGE_IO_H 29 | #define NV_UTIL_NPP_IMAGE_IO_H 30 | 31 | #include "ImagesCPU.h" 32 | #include "ImagesNPP.h" 33 | 34 | #include "FreeImage.h" 35 | #include "Exceptions.h" 36 | 37 | #include 38 | #include "string.h" 39 | 40 | 41 | // Error handler for FreeImage library. 42 | // In case this handler is invoked, it throws an NPP exception. 43 | void 44 | FreeImageErrorHandler(FREE_IMAGE_FORMAT oFif, const char *zMessage) 45 | { 46 | throw npp::Exception(zMessage); 47 | } 48 | 49 | namespace npp 50 | { 51 | // Load a gray-scale image from disk. 52 | void 53 | loadImage(const std::string &rFileName, ImageCPU_8u_C1 &rImage) 54 | { 55 | // set your own FreeImage error handler 56 | FreeImage_SetOutputMessage(FreeImageErrorHandler); 57 | 58 | FREE_IMAGE_FORMAT eFormat = FreeImage_GetFileType(rFileName.c_str()); 59 | 60 | // no signature? try to guess the file format from the file extension 61 | if (eFormat == FIF_UNKNOWN) 62 | { 63 | eFormat = FreeImage_GetFIFFromFilename(rFileName.c_str()); 64 | } 65 | 66 | NPP_ASSERT(eFormat != FIF_UNKNOWN); 67 | // check that the plugin has reading capabilities ... 
68 | FIBITMAP *pBitmap; 69 | 70 | if (FreeImage_FIFSupportsReading(eFormat)) 71 | { 72 | pBitmap = FreeImage_Load(eFormat, rFileName.c_str()); 73 | } 74 | 75 | NPP_ASSERT(pBitmap != 0); 76 | // make sure this is an 8-bit single channel image 77 | NPP_ASSERT(FreeImage_GetColorType(pBitmap) == FIC_MINISBLACK); 78 | NPP_ASSERT(FreeImage_GetBPP(pBitmap) == 8); 79 | 80 | // create an ImageCPU to receive the loaded image data 81 | ImageCPU_8u_C1 oImage(FreeImage_GetWidth(pBitmap), FreeImage_GetHeight(pBitmap)); 82 | 83 | // Copy the FreeImage data into the new ImageCPU 84 | unsigned int nSrcPitch = FreeImage_GetPitch(pBitmap); 85 | const Npp8u *pSrcLine = FreeImage_GetBits(pBitmap) + nSrcPitch * (FreeImage_GetHeight(pBitmap) -1); 86 | Npp8u *pDstLine = oImage.data(); 87 | unsigned int nDstPitch = oImage.pitch(); 88 | 89 | for (size_t iLine = 0; iLine < oImage.height(); ++iLine) 90 | { 91 | memcpy(pDstLine, pSrcLine, oImage.width() * sizeof(Npp8u)); 92 | pSrcLine -= nSrcPitch; 93 | pDstLine += nDstPitch; 94 | } 95 | 96 | // swap the user given image with our result image, effecively 97 | // moving our newly loaded image data into the user provided shell 98 | oImage.swap(rImage); 99 | } 100 | 101 | // Save an gray-scale image to disk. 102 | void 103 | saveImage(const std::string &rFileName, const ImageCPU_8u_C1 &rImage) 104 | { 105 | // create the result image storage using FreeImage so we can easily 106 | // save 107 | FIBITMAP *pResultBitmap = FreeImage_Allocate(rImage.width(), rImage.height(), 8 /* bits per pixel */); 108 | NPP_ASSERT_NOT_NULL(pResultBitmap); 109 | unsigned int nDstPitch = FreeImage_GetPitch(pResultBitmap); 110 | Npp8u *pDstLine = FreeImage_GetBits(pResultBitmap) + nDstPitch * (rImage.height()-1); 111 | const Npp8u *pSrcLine = rImage.data(); 112 | unsigned int nSrcPitch = rImage.pitch(); 113 | 114 | for (size_t iLine = 0; iLine < rImage.height(); ++iLine) 115 | { 116 | memcpy(pDstLine, pSrcLine, rImage.width() * sizeof(Npp8u)); 117 | pSrcLine += nSrcPitch; 118 | pDstLine -= nDstPitch; 119 | } 120 | 121 | // now save the result image 122 | bool bSuccess; 123 | bSuccess = FreeImage_Save(FIF_PGM, pResultBitmap, rFileName.c_str(), 0) == TRUE; 124 | NPP_ASSERT_MSG(bSuccess, "Failed to save result image."); 125 | } 126 | 127 | // Load a gray-scale image from disk. 128 | void 129 | loadImage(const std::string &rFileName, ImageNPP_8u_C1 &rImage) 130 | { 131 | ImageCPU_8u_C1 oImage; 132 | loadImage(rFileName, oImage); 133 | ImageNPP_8u_C1 oResult(oImage); 134 | rImage.swap(oResult); 135 | } 136 | 137 | // Save an gray-scale image to disk. 138 | void 139 | saveImage(const std::string &rFileName, const ImageNPP_8u_C1 &rImage) 140 | { 141 | ImageCPU_8u_C1 oHostImage(rImage.size()); 142 | // copy the device result data 143 | rImage.copyTo(oHostImage.data(), oHostImage.pitch()); 144 | saveImage(rFileName, oHostImage); 145 | } 146 | } 147 | 148 | 149 | #endif // NV_UTIL_NPP_IMAGE_IO_H 150 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | Contributing 2 | ------------ 3 | 4 | Please use the following guidelines when contributing to this project. 5 | 6 | Before contributing significant changes, please begin a discussion of the desired changes via a GitHub Issue to prevent doing unnecessary or overlapping work. 
7 |
8 | ## License
9 |
10 | The preferred license for source code contributed to this project is the Apache License 2.0 (https://www.apache.org/licenses/LICENSE-2.0) and for documentation, including Jupyter notebooks and text documentation, is the Creative Commons Attribution 4.0 International (CC BY 4.0) (https://creativecommons.org/licenses/by/4.0/). Contributions under other, compatible licenses will be considered on a case-by-case basis.
11 |
12 | ## Styling
13 |
14 | Please use the following style guidelines when making contributions.
15 |
16 | ### Source Code
17 | * Two-space indentation, no tabs
18 | * To the extent possible, variable names should be descriptive
19 | * Code should be documented with details such as what a function does and what it returns, making the code readable. The code should also carry the proper license at the beginning of the file.
20 | * Fortran codes should use free-form source files
21 | * Fortran codes should not use implicit variable names and should use `implicit none`
22 | * The following file extensions should be used appropriately
23 | * C - .c
24 | * C++ - .cpp
25 | * CUDA C/C++ - .cu
26 | * CUDA Fortran - .cuf
27 | * Fortran - .F90
28 |
29 | ### Jupyter Notebooks & Markdown
30 | * When they appear inline with the text, directive names, clauses, function or subroutine names, variable names, file names, commands, and command-line arguments should appear between two backticks.
31 | * Code blocks should begin with three backticks and either 'cpp' or 'fortran' to enable appropriate source formatting, and end with three backticks.
32 | * Leave an empty line before and after the code block.
33 | * Emphasis, including quotes made for emphasis and the introduction of new terms, should be highlighted between a single pair of asterisks.
34 | * A level 1 heading should appear at the top of the notebook as the title of the notebook.
35 | * A horizontal rule should appear between sections that begin with a level 2 heading.
36 |
37 | Please refer to the Jupyter notebook styling template on [GitHub](https://github.com/openhackathons-org/gpubootcamp/tree/master/misc).
38 |
39 | ## Contributing Labs/Modules
40 |
41 | ### Directory structure for GitHub
42 |
43 | Before starting to work on a new lab, it is important to follow the recommended git structure shown below to avoid any reformatting:
44 |
45 | ```
46 | ├── labs/CFD
47 | │ ├── LICENSE
48 | │ └── English
49 | │ ├── C
50 | │ ├── images
51 | │ ├── jupyter_notebook
52 | │ ├── advanced_concepts
53 | │ ├── x.ipynb
54 | │ └── ...
55 | │ ├── cuda
56 | │ ├── mpi
57 | │ ├── nccl
58 | │ ├── nvshmem
59 | │ └── single_gpu
60 | │ └── source_code
61 | │ ├── cuda
62 | │ ├── solutions
63 | │ ├── x.cu
64 | │ └── ...
65 | │ ├── mpi
66 | │ ├── nccl
67 | │ ├── nvshmem
68 | │ ├── p2pBandwidthLatencyTest
69 | │ └── single_gpu
70 | │ ├── Presentations
71 | │ └── start_here.ipynb
72 | ├── slurm_pmi_config
73 | ├── README.md
74 | └── Singularity
75 | ```
76 |
77 | Each lab will have the following files/directories consisting of training material for the lab.
78 | * `jupyter_notebook` folder: Consists of Jupyter notebooks for a specific programming language.
79 | * `source_code` folder: Source code is stored in a separate directory for each programming language (C/C++ and Fortran). The source code folder may optionally contain a Makefile, especially for HPC labs. This folder may also contain a `SOLUTIONS` folder with all the related solutions for that particular lab.
80 | * `Presentations` folder: Consists of presentations for the labs (PDF format is preferred).
81 | * Dockerfile and Singularity: Each lab should have both Docker and Singularity recipes.
82 |
83 | The lab may optionally add a custom license in case of any deviation from the top-level directory license (Apache 2.0).
84 |
85 | ### Git Branching
86 |
87 | Adding a new feature/lab follows a forking workflow, which means feature branch development happens on a forked repo that is later merged into our original project (GPUHackathons.org) repository.
88 |
89 |
90 | ![Git Branching Workflow](labs/CFD/English/C/images/git_branching.jpg)
91 |
92 | The 5 main steps depicted in the image above are as follows:
93 | 1. Fork: To create a new lab/feature, the GPUHackathons.org repository must be forked. Forking creates a snapshot of the GPUHackathons.org repository at the time it was forked. Any new feature/lab to be developed should be based on the develop branch of the repository.
94 | 2. Clone: The developer can then clone this new repository to the local machine.
95 | 3. Create Feature Branch: Create a new branch with a feature name in which your changes will be made. The recommended naming convention for the feature branch is: multigpu-. The new changes that the developer makes can be added, committed, and pushed.
96 | 4. Push: After the changes are committed, the developer pushes the changes to the remote branch. The push command uploads the local changes to the GitHub repository.
97 | 5. Pull: Submit a pull request. Upon receiving the pull request, a Hackathon team reviewer/owner will review the changes and, upon acceptance, merge them into the develop branch of GPUHackathons.org.
98 |
99 | Git branch details are as follows:
100 |
101 | * master branch: The stable branch.
102 | * origin/master is the main branch where the source code of HEAD always reflects a production-ready state.
103 | * Merge request is possible through: develop branch
104 | * develop branch: branched from the master branch
105 | * Must branch from: master branch
106 | * Must merge back into: master branch
107 | * It is the main development branch where the source code of HEAD always reflects a state with the latest delivered development changes for the next release.
108 | * When the source code in the develop branch reaches a stable point and is ready to be released, all of the changes should be merged back into master and then tagged with a release number.
109 | * All feature development should happen by forking GPUHackathons.org and branching from the develop branch only.
--------------------------------------------------------------------------------
/labs/CFD/English/C/source_code/p2pBandwidthLatencyTest/Common/UtilNPP/ImagePacked.h:
--------------------------------------------------------------------------------
1 | /* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
2 | *
3 | * Redistribution and use in source and binary forms, with or without
4 | * modification, are permitted provided that the following conditions
5 | * are met:
6 | * * Redistributions of source code must retain the above copyright
7 | * notice, this list of conditions and the following disclaimer.
8 | * * Redistributions in binary form must reproduce the above copyright
9 | * notice, this list of conditions and the following disclaimer in the
10 | * documentation and/or other materials provided with the distribution.
11 | * * Neither the name of NVIDIA CORPORATION nor the names of its 12 | * contributors may be used to endorse or promote products derived 13 | * from this software without specific prior written permission. 14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | */ 27 | 28 | #ifndef NV_UTIL_NPP_IMAGE_PACKED_H 29 | #define NV_UTIL_NPP_IMAGE_PACKED_H 30 | 31 | #include "Image.h" 32 | #include "Pixel.h" 33 | 34 | namespace npp 35 | { 36 | template 37 | class ImagePacked: public npp::Image 38 | { 39 | public: 40 | typedef npp::Pixel tPixel; 41 | typedef D tData; 42 | static const size_t gnChannels = N; 43 | typedef npp::Image::Size tSize; 44 | 45 | ImagePacked(): aPixels_(0) 46 | , nPitch_(0) 47 | { 48 | ; 49 | } 50 | 51 | ImagePacked(unsigned int nWidth, unsigned int nHeight): Image(nWidth, nHeight) 52 | , aPixels_(0) 53 | , nPitch_(0) 54 | { 55 | aPixels_ = A::Malloc2D(width(), height(), &nPitch_); 56 | } 57 | 58 | ImagePacked(unsigned int nWidth, unsigned int nHeight, bool bTight): Image(nWidth, nHeight) 59 | , aPixels_(0) 60 | , nPitch_(0) 61 | { 62 | aPixels_ = A::Malloc2D(width(), height(), &nPitch_, bTight); 63 | } 64 | 65 | ImagePacked(const tSize &rSize): Image(rSize) 66 | , aPixels_(0) 67 | , nPitch_(0) 68 | { 69 | aPixels_ = A::Malloc2D(width(), height(), &nPitch_); 70 | } 71 | 72 | ImagePacked(const ImagePacked &rImage): Image(rImage) 73 | , aPixels_(0) 74 | , nPitch_(rImage.pitch()) 75 | { 76 | aPixels_ = A::Malloc2D(width(), height(), &nPitch_); 77 | A::Copy2D(aPixels_, nPitch_, rImage.pixels(), rImage.pitch(), width(), height()); 78 | } 79 | 80 | virtual 81 | ~ImagePacked() 82 | { 83 | A::Free2D(aPixels_); 84 | } 85 | 86 | ImagePacked & 87 | operator= (const ImagePacked &rImage) 88 | { 89 | // in case of self-assignment 90 | if (&rImage == this) 91 | { 92 | return *this; 93 | } 94 | 95 | A::Free2D(aPixels_); 96 | aPixels_ = 0; 97 | nPitch_ = 0; 98 | 99 | // assign parent class's data fields (width, height) 100 | Image::operator =(rImage); 101 | 102 | aPixels_ = A::Malloc2D(width(), height(), &nPitch_); 103 | A::Copy2D(aPixels_, nPitch_, rImage.data(), rImage.pitch(), width(), height()); 104 | 105 | return *this; 106 | } 107 | 108 | unsigned int 109 | pitch() 110 | const 111 | { 112 | return nPitch_; 113 | } 114 | 115 | /// Get a pointer to the pixel array. 116 | /// The result pointer can be offset to pixel at position (x, y) and 117 | /// even negative offsets are allowed. 118 | /// \param nX Horizontal pointer/array offset. 119 | /// \param nY Vertical pointer/array offset. 120 | /// \return Pointer to the pixel array (or first pixel in array with coordinates (nX, nY). 
121 | tPixel * 122 | pixels(int nX = 0, int nY = 0) 123 | { 124 | return reinterpret_cast(reinterpret_cast(aPixels_) + nY * pitch() + nX * gnChannels * sizeof(D)); 125 | } 126 | 127 | const 128 | tPixel * 129 | pixels(int nX = 0, int nY = 0) 130 | const 131 | { 132 | return reinterpret_cast(reinterpret_cast(aPixels_) + nY * pitch() + nX * gnChannels * sizeof(D)); 133 | } 134 | 135 | D * 136 | data(int nX = 0, int nY = 0) 137 | { 138 | return reinterpret_cast(pixels(nX, nY)); 139 | } 140 | 141 | const 142 | D * 143 | data(int nX = 0, int nY = 0) 144 | const 145 | { 146 | return reinterpret_cast(pixels(nX, nY)); 147 | } 148 | 149 | void 150 | swap(ImagePacked &rImage) 151 | { 152 | Image::swap(rImage); 153 | 154 | tData *aTemp = aPixels_; 155 | aPixels_ = rImage.aPixels_; 156 | rImage.aPixels_ = aTemp; 157 | 158 | unsigned int nTemp = nPitch_; 159 | nPitch_ = rImage.nPitch_; 160 | rImage.nPitch_ = nTemp; 161 | } 162 | 163 | private: 164 | D *aPixels_; 165 | unsigned int nPitch_; 166 | }; 167 | 168 | } // npp namespace 169 | 170 | 171 | #endif // NV_IMAGE_IPP_H 172 | -------------------------------------------------------------------------------- /labs/CFD/English/C/source_code/p2pBandwidthLatencyTest/Common/exception.h: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 2 | * 3 | * Redistribution and use in source and binary forms, with or without 4 | * modification, are permitted provided that the following conditions 5 | * are met: 6 | * * Redistributions of source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer. 8 | * * Redistributions in binary form must reproduce the above copyright 9 | * notice, this list of conditions and the following disclaimer in the 10 | * documentation and/or other materials provided with the distribution. 11 | * * Neither the name of NVIDIA CORPORATION nor the names of its 12 | * contributors may be used to endorse or promote products derived 13 | * from this software without specific prior written permission. 14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | */ 27 | 28 | /* CUda UTility Library */ 29 | #ifndef COMMON_EXCEPTION_H_ 30 | #define COMMON_EXCEPTION_H_ 31 | 32 | // includes, system 33 | #include 34 | #include 35 | #include 36 | #include 37 | #include 38 | 39 | //! Exception wrapper. 40 | //! @param Std_Exception Exception out of namespace std for easy typing. 41 | template 42 | class Exception : public Std_Exception { 43 | public: 44 | //! @brief Static construction interface 45 | //! @return Alwayss throws ( Located_Exception) 46 | //! @param file file in which the Exception occurs 47 | //! 
@param line line in which the Exception occurs 48 | //! @param detailed details on the code fragment causing the Exception 49 | static void throw_it(const char *file, const int line, 50 | const char *detailed = "-"); 51 | 52 | //! Static construction interface 53 | //! @return Alwayss throws ( Located_Exception) 54 | //! @param file file in which the Exception occurs 55 | //! @param line line in which the Exception occurs 56 | //! @param detailed details on the code fragment causing the Exception 57 | static void throw_it(const char *file, const int line, 58 | const std::string &detailed); 59 | 60 | //! Destructor 61 | virtual ~Exception() throw(); 62 | 63 | private: 64 | //! Constructor, default (private) 65 | Exception(); 66 | 67 | //! Constructor, standard 68 | //! @param str string returned by what() 69 | explicit Exception(const std::string &str); 70 | }; 71 | 72 | //////////////////////////////////////////////////////////////////////////////// 73 | //! Exception handler function for arbitrary exceptions 74 | //! @param ex exception to handle 75 | //////////////////////////////////////////////////////////////////////////////// 76 | template 77 | inline void handleException(const Exception_Typ &ex) { 78 | std::cerr << ex.what() << std::endl; 79 | 80 | exit(EXIT_FAILURE); 81 | } 82 | 83 | //! Convenience macros 84 | 85 | //! Exception caused by dynamic program behavior, e.g. file does not exist 86 | #define RUNTIME_EXCEPTION(msg) \ 87 | Exception::throw_it(__FILE__, __LINE__, msg) 88 | 89 | //! Logic exception in program, e.g. an assert failed 90 | #define LOGIC_EXCEPTION(msg) \ 91 | Exception::throw_it(__FILE__, __LINE__, msg) 92 | 93 | //! Out of range exception 94 | #define RANGE_EXCEPTION(msg) \ 95 | Exception::throw_it(__FILE__, __LINE__, msg) 96 | 97 | //////////////////////////////////////////////////////////////////////////////// 98 | //! Implementation 99 | 100 | // includes, system 101 | #include 102 | 103 | //////////////////////////////////////////////////////////////////////////////// 104 | //! Static construction interface. 105 | //! @param Exception causing code fragment (file and line) and detailed infos. 106 | //////////////////////////////////////////////////////////////////////////////// 107 | /*static*/ template 108 | void Exception::throw_it(const char *file, const int line, 109 | const char *detailed) { 110 | std::stringstream s; 111 | 112 | // Quiet heavy-weight but exceptions are not for 113 | // performance / release versions 114 | s << "Exception in file '" << file << "' in line " << line << "\n" 115 | << "Detailed description: " << detailed << "\n"; 116 | 117 | throw Exception(s.str()); 118 | } 119 | 120 | //////////////////////////////////////////////////////////////////////////////// 121 | //! Static construction interface. 122 | //! @param Exception causing code fragment (file and line) and detailed infos. 123 | //////////////////////////////////////////////////////////////////////////////// 124 | /*static*/ template 125 | void Exception::throw_it(const char *file, const int line, 126 | const std::string &msg) { 127 | throw_it(file, line, msg.c_str()); 128 | } 129 | 130 | //////////////////////////////////////////////////////////////////////////////// 131 | //! Constructor, default (private). 
132 | //////////////////////////////////////////////////////////////////////////////// 133 | template 134 | Exception::Exception() : Std_Exception("Unknown Exception.\n") {} 135 | 136 | //////////////////////////////////////////////////////////////////////////////// 137 | //! Constructor, standard (private). 138 | //! String returned by what(). 139 | //////////////////////////////////////////////////////////////////////////////// 140 | template 141 | Exception::Exception(const std::string &s) : Std_Exception(s) {} 142 | 143 | //////////////////////////////////////////////////////////////////////////////// 144 | //! Destructor 145 | //////////////////////////////////////////////////////////////////////////////// 146 | template 147 | Exception::~Exception() throw() {} 148 | 149 | // functions, exported 150 | 151 | #endif // COMMON_EXCEPTION_H_ 152 | -------------------------------------------------------------------------------- /labs/CFD/English/C/source_code/p2pBandwidthLatencyTest/Common/dynlink_d3d11.h: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 2 | * 3 | * Redistribution and use in source and binary forms, with or without 4 | * modification, are permitted provided that the following conditions 5 | * are met: 6 | * * Redistributions of source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer. 8 | * * Redistributions in binary form must reproduce the above copyright 9 | * notice, this list of conditions and the following disclaimer in the 10 | * documentation and/or other materials provided with the distribution. 11 | * * Neither the name of NVIDIA CORPORATION nor the names of its 12 | * contributors may be used to endorse or promote products derived 13 | * from this software without specific prior written permission. 14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | */ 27 | 28 | //-------------------------------------------------------------------------------------- 29 | // File: dynlink_d3d11.h 30 | // 31 | // Shortcut macros and functions for using DX objects 32 | // 33 | // Copyright (c) Microsoft Corporation. 
All rights reserved 34 | //-------------------------------------------------------------------------------------- 35 | 36 | #ifndef _DYNLINK_D3D11_H_ 37 | #define _DYNLINK_D3D11_H_ 38 | 39 | // Standard Windows includes 40 | #include 41 | #include 42 | #include 43 | #include 44 | #include 45 | #include // for InitCommonControls() 46 | #include // for ExtractIcon() 47 | #include // for placement new 48 | #include 49 | #include 50 | #include 51 | #include 52 | 53 | // CRT's memory leak detection 54 | #if defined(DEBUG) || defined(_DEBUG) 55 | #include 56 | #endif 57 | 58 | // Direct3D10 includes 59 | #include 60 | #include 61 | // #include <..\Samples\C++\Effects11\Inc\d3dx11effect.h> 62 | 63 | // XInput includes 64 | #include 65 | 66 | // strsafe.h deprecates old unsecure string functions. If you 67 | // really do not want to it to (not recommended), then uncomment the next line 68 | //#define STRSAFE_NO_DEPRECATE 69 | 70 | #ifndef STRSAFE_NO_DEPRECATE 71 | #pragma deprecated("strncpy") 72 | #pragma deprecated("wcsncpy") 73 | #pragma deprecated("_tcsncpy") 74 | #pragma deprecated("wcsncat") 75 | #pragma deprecated("strncat") 76 | #pragma deprecated("_tcsncat") 77 | #endif 78 | 79 | #pragma warning( disable : 4996 ) // disable deprecated warning 80 | #include 81 | #pragma warning( default : 4996 ) 82 | 83 | typedef HRESULT(WINAPI *LPCREATEDXGIFACTORY)(REFIID, void **); 84 | typedef HRESULT(WINAPI *LPD3D11CREATEDEVICEANDSWAPCHAIN)(__in_opt IDXGIAdapter *pAdapter, D3D_DRIVER_TYPE DriverType, HMODULE Software, UINT Flags, __in_ecount_opt(FeatureLevels) CONST D3D_FEATURE_LEVEL *pFeatureLevels, UINT FeatureLevels, UINT SDKVersion, __in_opt CONST DXGI_SWAP_CHAIN_DESC *pSwapChainDesc, __out_opt IDXGISwapChain **ppSwapChain, __out_opt ID3D11Device **ppDevice, __out_opt D3D_FEATURE_LEVEL *pFeatureLevel, __out_opt ID3D11DeviceContext **ppImmediateContext); 85 | typedef HRESULT(WINAPI *LPD3D11CREATEDEVICE)(IDXGIAdapter *, D3D_DRIVER_TYPE, HMODULE, UINT32, D3D_FEATURE_LEVEL *, UINT, UINT32, ID3D11Device **, D3D_FEATURE_LEVEL *, ID3D11DeviceContext **); 86 | 87 | static HMODULE s_hModDXGI = NULL; 88 | static LPCREATEDXGIFACTORY sFnPtr_CreateDXGIFactory = NULL; 89 | static HMODULE s_hModD3D11 = NULL; 90 | static LPD3D11CREATEDEVICE sFnPtr_D3D11CreateDevice = NULL; 91 | static LPD3D11CREATEDEVICEANDSWAPCHAIN sFnPtr_D3D11CreateDeviceAndSwapChain = NULL; 92 | 93 | // unload the D3D10 DLLs 94 | static bool dynlinkUnloadD3D11API(void) 95 | { 96 | if (s_hModDXGI) 97 | { 98 | FreeLibrary(s_hModDXGI); 99 | s_hModDXGI = NULL; 100 | } 101 | 102 | if (s_hModD3D11) 103 | { 104 | FreeLibrary(s_hModD3D11); 105 | s_hModD3D11 = NULL; 106 | } 107 | 108 | return true; 109 | } 110 | 111 | // Dynamically load the D3D11 DLLs loaded and map the function pointers 112 | static bool dynlinkLoadD3D11API(void) 113 | { 114 | // If both modules are non-NULL, this function has already been called. Note 115 | // that this doesn't guarantee that all ProcAddresses were found. 
116 | if (s_hModD3D11 != NULL && s_hModDXGI != NULL) 117 | { 118 | return true; 119 | } 120 | 121 | #if 1 122 | // This may fail if Direct3D 11 isn't installed 123 | s_hModD3D11 = LoadLibrary("d3d11.dll"); 124 | 125 | if (s_hModD3D11 != NULL) 126 | { 127 | sFnPtr_D3D11CreateDevice = (LPD3D11CREATEDEVICE)GetProcAddress(s_hModD3D11, "D3D11CreateDevice"); 128 | sFnPtr_D3D11CreateDeviceAndSwapChain = (LPD3D11CREATEDEVICEANDSWAPCHAIN)GetProcAddress(s_hModD3D11, "D3D11CreateDeviceAndSwapChain"); 129 | } 130 | else 131 | { 132 | printf("\nLoad d3d11.dll failed\n"); 133 | fflush(0); 134 | } 135 | 136 | if (!sFnPtr_CreateDXGIFactory) 137 | { 138 | s_hModDXGI = LoadLibrary("dxgi.dll"); 139 | 140 | if (s_hModDXGI) 141 | { 142 | sFnPtr_CreateDXGIFactory = (LPCREATEDXGIFACTORY)GetProcAddress(s_hModDXGI, "CreateDXGIFactory1"); 143 | } 144 | 145 | return (s_hModDXGI != NULL) && (s_hModD3D11 != NULL); 146 | } 147 | 148 | return (s_hModD3D11 != NULL); 149 | #else 150 | sFnPtr_D3D11CreateDevice = (LPD3D11CREATEDEVICE)D3D11CreateDeviceAndSwapChain; 151 | sFnPtr_D3D11CreateDeviceAndSwapChain = (LPD3D11CREATEDEVICEANDSWAPCHAIN)D3D11CreateDeviceAndSwapChain; 152 | //sFnPtr_D3DX11CreateEffectFromMemory = ( LPD3DX11CREATEEFFECTFROMMEMORY )D3DX11CreateEffectFromMemory; 153 | sFnPtr_D3DX11CompileFromMemory = (LPD3DX11COMPILEFROMMEMORY)D3DX11CompileFromMemory; 154 | sFnPtr_CreateDXGIFactory = (LPCREATEDXGIFACTORY)CreateDXGIFactory; 155 | return true; 156 | #endif 157 | return true; 158 | } 159 | 160 | #endif 161 | -------------------------------------------------------------------------------- /labs/CFD/English/C/source_code/p2pBandwidthLatencyTest/Common/UtilNPP/ImagesNPP.h: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 2 | * 3 | * Redistribution and use in source and binary forms, with or without 4 | * modification, are permitted provided that the following conditions 5 | * are met: 6 | * * Redistributions of source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer. 8 | * * Redistributions in binary form must reproduce the above copyright 9 | * notice, this list of conditions and the following disclaimer in the 10 | * documentation and/or other materials provided with the distribution. 11 | * * Neither the name of NVIDIA CORPORATION nor the names of its 12 | * contributors may be used to endorse or promote products derived 13 | * from this software without specific prior written permission. 14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
26 | */ 27 | 28 | 29 | #ifndef NV_UTIL_NPP_IMAGES_NPP_H 30 | #define NV_UTIL_NPP_IMAGES_NPP_H 31 | 32 | #include "Exceptions.h" 33 | #include "ImagePacked.h" 34 | 35 | #include "ImageAllocatorsNPP.h" 36 | #include 37 | 38 | namespace npp 39 | { 40 | // forward declaration 41 | template class ImageCPU; 42 | 43 | template 44 | class ImageNPP: public npp::ImagePacked > 45 | { 46 | public: 47 | ImageNPP() 48 | { 49 | ; 50 | } 51 | 52 | ImageNPP(unsigned int nWidth, unsigned int nHeight, bool bTight = false): ImagePacked >(nWidth, nHeight, bTight) 53 | { 54 | ; 55 | } 56 | 57 | ImageNPP(const npp::Image::Size &rSize): ImagePacked >(rSize) 58 | { 59 | ; 60 | } 61 | 62 | ImageNPP(const ImageNPP &rImage): Image(rImage) 63 | { 64 | ; 65 | } 66 | 67 | template 68 | explicit 69 | ImageNPP(const ImageCPU &rImage, bool bTight = false): ImagePacked >(rImage.width(), rImage.height(), bTight) 70 | { 71 | npp::ImageAllocator::HostToDeviceCopy2D(ImagePacked >::data(), 72 | ImagePacked >::pitch(), 73 | rImage.data(), 74 | rImage.pitch(), 75 | ImagePacked >::width(), 76 | ImagePacked >::height()); 77 | } 78 | 79 | virtual 80 | ~ImageNPP() 81 | { 82 | ; 83 | } 84 | 85 | ImageNPP & 86 | operator= (const ImageNPP &rImage) 87 | { 88 | ImagePacked >::operator= (rImage); 89 | 90 | return *this; 91 | } 92 | 93 | void 94 | copyTo(D *pData, unsigned int nPitch) 95 | const 96 | { 97 | NPP_ASSERT((ImagePacked >::width() * sizeof(npp::Pixel) <= nPitch)); 98 | npp::ImageAllocator::DeviceToHostCopy2D(pData, 99 | nPitch, 100 | ImagePacked >::data(), 101 | ImagePacked >::pitch(), 102 | ImagePacked >::width(), 103 | ImagePacked >::height()); 104 | } 105 | 106 | void 107 | copyFrom(D *pData, unsigned int nPitch) 108 | { 109 | NPP_ASSERT((ImagePacked >::width() * sizeof(npp::Pixel) <= nPitch)); 110 | npp::ImageAllocator::HostToDeviceCopy2D(ImagePacked >::data(), 111 | ImagePacked >::pitch(), 112 | pData, 113 | nPitch, 114 | ImagePacked >::width(), 115 | ImagePacked >::height()); 116 | } 117 | }; 118 | 119 | typedef ImageNPP ImageNPP_8u_C1; 120 | typedef ImageNPP ImageNPP_8u_C2; 121 | typedef ImageNPP ImageNPP_8u_C3; 122 | typedef ImageNPP ImageNPP_8u_C4; 123 | 124 | typedef ImageNPP ImageNPP_16u_C1; 125 | typedef ImageNPP ImageNPP_16u_C2; 126 | typedef ImageNPP ImageNPP_16u_C3; 127 | typedef ImageNPP ImageNPP_16u_C4; 128 | 129 | typedef ImageNPP ImageNPP_16s_C1; 130 | typedef ImageNPP ImageNPP_16s_C3; 131 | typedef ImageNPP ImageNPP_16s_C4; 132 | 133 | typedef ImageNPP ImageNPP_32s_C1; 134 | typedef ImageNPP ImageNPP_32s_C3; 135 | typedef ImageNPP ImageNPP_32s_C4; 136 | 137 | typedef ImageNPP ImageNPP_32f_C1; 138 | typedef ImageNPP ImageNPP_32f_C2; 139 | typedef ImageNPP ImageNPP_32f_C3; 140 | typedef ImageNPP ImageNPP_32f_C4; 141 | 142 | typedef ImageNPP ImageNPP_64f_C1; 143 | typedef ImageNPP ImageNPP_64f_C2; 144 | typedef ImageNPP ImageNPP_64f_C3; 145 | typedef ImageNPP ImageNPP_64f_C4; 146 | 147 | } // npp namespace 148 | 149 | #endif // NV_UTIL_NPP_IMAGES_NPP_H 150 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ** Please note this repository is archived and no more actively maintained ** 2 | 3 | # N-Ways to Multi-GPU Programming 4 | 5 | This repository contains mini applications for GPU Bootcamps. This bootcamp focuses on multi-GPU programming models. 
6 |
7 | Scaling applications to multiple GPUs across multiple nodes requires one to be adept at not just the programming models and optimization techniques, but also at performing root-cause analysis using in-depth profiling to identify and minimize bottlenecks. In this bootcamp, participants will learn to improve the performance of an application step-by-step, taking cues from profilers along the way. Moreover, understanding of the underlying technologies and communication topology will help us utilize high-performance NVIDIA libraries to extract more performance out of the system.
8 |
9 | ## Bootcamp Outline
10 |
11 | * Overview of single-GPU code and Nsight Systems Profiler
12 | * Single Node Multi-GPU:
13 |   - CUDA Memcpy and Peer-to-Peer Memory Access
14 |   - Intra-node topology
15 |   - CUDA Streams and Events
16 | * Multi-Node Multi-GPU:
17 |   - Introduction to MPI and Multi-Node execution overview
18 |   - MPI with CUDA Memcpy
19 |   - CUDA-aware MPI
20 |   - Supplemental: Configuring MPI in a containerized environment
21 | * NVIDIA Collective Communications Library (NCCL)
22 | * NVSHMEM Library
23 |
24 | ## Prerequisites
25 |
26 | This bootcamp requires a multi-node system with multiple GPUs in each node (at least 2 GPUs per node).
27 |
28 | ## Tutorial Duration
29 |
30 | The total bootcamp material would take approximately 8 hours.
31 |
32 | ### Using NVIDIA HPC SDK
33 |
34 | A multi-node installation of [NVIDIA's HPC SDK](https://developer.nvidia.com/hpc-sdk) is desired. Refer to the [NVIDIA HPC SDK Installation Guide](https://docs.nvidia.com/hpc-sdk/hpc-sdk-install-guide/index.html) for detailed instructions. Ensure that your installation contains HPCX with UCX.
35 |
36 | After installation, make sure to add the HPC SDK to the environment as follows (for example, the paths below are for HPC SDK 21.5):
37 |
38 | ```bash
39 | # Add HPC-SDK to PATH:
40 | export PATH="/Linux_x86_64/21.5/compilers/bin:/Linux_x86_64/21.5/cuda/bin:$PATH"
41 | # Add HPC-SDK to LD_LIBRARY_PATH:
42 | export LD_LIBRARY_PATH="/Linux_x86_64/21.5/comm_libs/nvshmem/lib:/Linux_x86_64/21.5/comm_libs/nccl/lib:/Linux_x86_64/21.5/comm_libs/mpi/lib:/Linux_x86_64/21.5/math_libs/lib64:/Linux_x86_64/21.5/compilers/lib:/Linux_x86_64/21.5/cuda/extras/CUPTI/lib64:/Linux_x86_64/21.5/cuda/lib64:$LD_LIBRARY_PATH"
43 | # Add CUDA and NVSHMEM home directory paths
44 | export CUDA_HOME=/Linux_x86_64/21.5/cuda
45 | export NVSHMEM_HOME=/Linux_x86_64/21.5/comm_libs/nvshmem
46 | ```
47 | **Note:** If you don't use the Slurm workload manager, remove the `--with-slurm` flag below.
48 |
49 | Then, install OpenMPI as follows:
50 |
51 | ```bash
52 | # Download and extract the OpenMPI tarball
53 | wget https://download.open-mpi.org/release/open-mpi/v4.1/openmpi-4.1.1.tar.gz
54 | tar -xvzf openmpi-4.1.1.tar.gz
55 | cd openmpi-4.1.1/
56 | mkdir -p build
57 | # Configure OpenMPI
58 | ./configure --prefix=$PWD/build --with-libevent=internal --with-xpmem --with-cuda=/Linux_x86_64/21.5/cuda/ --with-slurm --enable-mpi1-compatibility --with-verbs --with-hcoll=/Linux_x86_64/21.5/comm_libs/hpcx/hpcx-2.8.1/hcoll/lib --with-ucx=/Linux_x86_64/21.5/comm_libs/hpcx/hpcx-2.8.1/ucx/
59 | # Install OpenMPI
60 | make all install
61 | ```
62 |
63 | Now, add OpenMPI to the environment:
64 |
65 | ```bash
66 | export PATH="/build/bin/:$PATH"
67 | export LD_LIBRARY_PATH="/build/lib/:$LD_LIBRARY_PATH"
68 | ```
69 |
70 | The OpenMPI binaries, including `mpirun`, are now available in the `/build/bin` directory.
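
As an optional sanity check (not part of the original setup steps), you can ask `ompi_info` whether the OpenMPI build you just installed is CUDA-aware before moving on to the labs; the `mpi_built_with_cuda_support` parameter queried below is reported by OpenMPI 4.x:

```bash
# Optional: verify that this OpenMPI build was compiled with CUDA support.
# A CUDA-aware build prints a line ending in ":value:true".
ompi_info --parsable --all | grep mpi_built_with_cuda_support:value
```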
71 |
72 | ### Without Using NVIDIA HPC SDK
73 |
74 | Multi-node compatible versions of the following are required:
75 |
76 | * [OpenMPI](https://www.open-mpi.org/)
77 | * [HPCX](https://developer.nvidia.com/networking/hpc-x)
78 | * [CUDA Toolkit](https://developer.nvidia.com/cuda-toolkit)
79 | * [NCCL](https://developer.nvidia.com/nccl)
80 | * [NVSHMEM](https://developer.nvidia.com/nvshmem)
81 |
82 | ## Testing
83 |
84 | We have tested all the codes with CUDA driver 460.32.03, CUDA 11.3.0.0, OpenMPI 4.1.1, HPCX 2.8.1, Singularity 3.6.1, NCCL 2.9.9.1, and NVSHMEM 2.1.2. Note that OpenMPI in our cluster was compiled with CUDA, HCOLL, and UCX support.
85 |
86 | ## Running Jupyter Lab
87 |
88 | As this bootcamp covers multi-node CUDA-aware MPI concepts, it is primarily designed to run without any containers. After the prerequisite software has been installed, follow these steps to install and run Jupyter Lab:
89 |
90 | ```bash
91 | # Install Miniconda3
92 | wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh
93 | bash Miniconda3-latest-Linux-x86_64.sh -b -p
94 | # Add conda to PATH
95 | export PATH=$PATH:/bin/
96 | # Install Jupyter Lab
97 | conda install -c conda-forge jupyterlab
98 | # Run Jupyter Lab
99 | jupyter lab --notebook-dir=/hpc/multi_gpu_nways/labs/ --port=8000 --ip=0.0.0.0 --no-browser --NotebookApp.token=""
100 | ```
101 |
102 | After running Jupyter Lab, open [http://localhost:8000](http://localhost:8000/) in a web browser and start the `introduction.ipynb` notebook.
103 |
104 | ## Optional: Containerized Build with Singularity
105 |
106 | This material is designed primarily to run in containerless environments, that is, directly on the cluster. Thus, building the Singularity container is OPTIONAL.
107 |
108 | If containerization is desired, follow the steps outlined in the notebook [MPI in Containerized Environments](labs/CFD/English/C/jupyter_notebook/mpi/containers_and_mpi.ipynb).
109 |
110 | Follow the steps below to build the Singularity container image and run Jupyter Lab:
111 |
112 | ```bash
113 | # Build the container
114 | singularity build multi_gpu_nways.simg Singularity
115 | # Run Jupyter Lab
116 | singularity run --nv multi_gpu_nways.simg jupyter lab --notebook-dir=/hpc/multi_gpu_nways/labs/ --port=8000 --ip=0.0.0.0 --no-browser --NotebookApp.token=""
117 | ```
118 |
119 | Then, access Jupyter Lab on [http://localhost:8000](http://localhost:8000/).
120 |
121 |
122 | ## Known issues
123 |
124 | #### Compiler throws errors
125 |
126 | If compiling any program throws an error related to CUDA/NCCL/NVSHMEM/MPI libraries or header files not being found, ensure that `LD_LIBRARY_PATH` is correctly set. Moreover, make sure the environment variables `CUDA_HOME`, `NCCL_HOME`, and `NVSHMEM_HOME` are set either during installation or manually inside each `Makefile`.
127 |
128 | - Please go through the list of existing bugs/issues or file a new issue at [GitHub](https://github.com/gpuhackathons-org/gpubootcamp/issues).
129 |
130 |
131 | ## Questions?
132 |
133 | Please join the [OpenACC Slack Channel](https://openacclang.slack.com/messages/openaccusergroup) to raise questions.
134 |
135 | If you observe any errors or issues, please file an issue on the [GPUBootcamp GitHub repository](https://github.com/gpuhackathons-org/gpubootcamp).
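
As a concrete illustration of the Known issues note above, the sketch below shows one way to set those environment variables by hand before rebuilding a lab; the installation prefixes are placeholders and will differ on your system:

```bash
# Illustrative prefixes only -- substitute the install locations on your cluster.
export CUDA_HOME=/usr/local/cuda
export NCCL_HOME=/usr/local/nccl
export NVSHMEM_HOME=/usr/local/nvshmem
export LD_LIBRARY_PATH="$CUDA_HOME/lib64:$NCCL_HOME/lib:$NVSHMEM_HOME/lib:$LD_LIBRARY_PATH"
# Rebuild the lab's source code once the environment is set.
make clean && make
```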
136 | -------------------------------------------------------------------------------- /labs/CFD/English/C/source_code/p2pBandwidthLatencyTest/Common/nvrtc_helper.h: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 2 | * 3 | * Redistribution and use in source and binary forms, with or without 4 | * modification, are permitted provided that the following conditions 5 | * are met: 6 | * * Redistributions of source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer. 8 | * * Redistributions in binary form must reproduce the above copyright 9 | * notice, this list of conditions and the following disclaimer in the 10 | * documentation and/or other materials provided with the distribution. 11 | * * Neither the name of NVIDIA CORPORATION nor the names of its 12 | * contributors may be used to endorse or promote products derived 13 | * from this software without specific prior written permission. 14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
26 | */ 27 | 28 | #ifndef COMMON_NVRTC_HELPER_H_ 29 | 30 | #define COMMON_NVRTC_HELPER_H_ 1 31 | 32 | #include 33 | #include 34 | #include 35 | #include 36 | #include 37 | #include 38 | #include 39 | 40 | #define NVRTC_SAFE_CALL(Name, x) \ 41 | do { \ 42 | nvrtcResult result = x; \ 43 | if (result != NVRTC_SUCCESS) { \ 44 | std::cerr << "\nerror: " << Name << " failed with error " \ 45 | << nvrtcGetErrorString(result); \ 46 | exit(1); \ 47 | } \ 48 | } while (0) 49 | 50 | void compileFileToCUBIN(char *filename, int argc, char **argv, char **cubinResult, 51 | size_t *cubinResultSize, int requiresCGheaders) { 52 | std::ifstream inputFile(filename, 53 | std::ios::in | std::ios::binary | std::ios::ate); 54 | 55 | if (!inputFile.is_open()) { 56 | std::cerr << "\nerror: unable to open " << filename << " for reading!\n"; 57 | exit(1); 58 | } 59 | 60 | std::streampos pos = inputFile.tellg(); 61 | size_t inputSize = (size_t)pos; 62 | char *memBlock = new char[inputSize + 1]; 63 | 64 | inputFile.seekg(0, std::ios::beg); 65 | inputFile.read(memBlock, inputSize); 66 | inputFile.close(); 67 | memBlock[inputSize] = '\x0'; 68 | 69 | int numCompileOptions = 0; 70 | 71 | char *compileParams[2]; 72 | 73 | int major = 0, minor = 0; 74 | char deviceName[256]; 75 | 76 | // Picks the best CUDA device available 77 | CUdevice cuDevice = findCudaDeviceDRV(argc, (const char **)argv); 78 | 79 | // get compute capabilities and the devicename 80 | checkCudaErrors(cuDeviceGetAttribute( 81 | &major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevice)); 82 | checkCudaErrors(cuDeviceGetAttribute( 83 | &minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevice)); 84 | 85 | { 86 | // Compile cubin for the GPU arch on which are going to run cuda kernel. 87 | std::string compileOptions; 88 | compileOptions = "--gpu-architecture=sm_"; 89 | 90 | compileParams[numCompileOptions] = reinterpret_cast( 91 | malloc(sizeof(char) * (compileOptions.length() + 10))); 92 | #if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) 93 | sprintf_s(compileParams[numCompileOptions], sizeof(char) * (compileOptions.length() + 10), 94 | "%s%d%d", compileOptions.c_str(), major, minor); 95 | #else 96 | snprintf(compileParams[numCompileOptions], compileOptions.size() + 10, "%s%d%d", 97 | compileOptions.c_str(), major, minor); 98 | #endif 99 | } 100 | 101 | numCompileOptions++; 102 | 103 | if (requiresCGheaders) { 104 | std::string compileOptions; 105 | char HeaderNames[256]; 106 | #if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) 107 | sprintf_s(HeaderNames, sizeof(HeaderNames), "%s", "cooperative_groups.h"); 108 | #else 109 | snprintf(HeaderNames, sizeof(HeaderNames), "%s", "cooperative_groups.h"); 110 | #endif 111 | 112 | compileOptions = "--include-path="; 113 | 114 | std::string path = sdkFindFilePath(HeaderNames, argv[0]); 115 | if (!path.empty()) { 116 | std::size_t found = path.find(HeaderNames); 117 | path.erase(found); 118 | } else { 119 | printf( 120 | "\nCooperativeGroups headers not found, please install it in %s " 121 | "sample directory..\n Exiting..\n", 122 | argv[0]); 123 | } 124 | compileOptions += path.c_str(); 125 | compileParams[numCompileOptions] = reinterpret_cast( 126 | malloc(sizeof(char) * (compileOptions.length() + 1))); 127 | #if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) 128 | sprintf_s(compileParams[numCompileOptions], sizeof(char) * (compileOptions.length() + 1), 129 | "%s", compileOptions.c_str()); 130 | #else 131 | 
snprintf(compileParams[numCompileOptions], compileOptions.size(), "%s", 132 | compileOptions.c_str()); 133 | #endif 134 | numCompileOptions++; 135 | } 136 | 137 | // compile 138 | nvrtcProgram prog; 139 | NVRTC_SAFE_CALL("nvrtcCreateProgram", 140 | nvrtcCreateProgram(&prog, memBlock, filename, 0, NULL, NULL)); 141 | 142 | nvrtcResult res = nvrtcCompileProgram(prog, numCompileOptions, compileParams); 143 | 144 | // dump log 145 | size_t logSize; 146 | NVRTC_SAFE_CALL("nvrtcGetProgramLogSize", 147 | nvrtcGetProgramLogSize(prog, &logSize)); 148 | char *log = reinterpret_cast(malloc(sizeof(char) * logSize + 1)); 149 | NVRTC_SAFE_CALL("nvrtcGetProgramLog", nvrtcGetProgramLog(prog, log)); 150 | log[logSize] = '\x0'; 151 | 152 | if (strlen(log) >= 2) { 153 | std::cerr << "\n compilation log ---\n"; 154 | std::cerr << log; 155 | std::cerr << "\n end log ---\n"; 156 | } 157 | 158 | free(log); 159 | 160 | NVRTC_SAFE_CALL("nvrtcCompileProgram", res); 161 | 162 | size_t codeSize; 163 | NVRTC_SAFE_CALL("nvrtcGetCUBINSize", nvrtcGetCUBINSize(prog, &codeSize)); 164 | char *code = new char[codeSize]; 165 | NVRTC_SAFE_CALL("nvrtcGetCUBIN", nvrtcGetCUBIN(prog, code)); 166 | *cubinResult = code; 167 | *cubinResultSize = codeSize; 168 | 169 | for (int i = 0; i < numCompileOptions; i++) { 170 | free(compileParams[i]); 171 | } 172 | } 173 | 174 | CUmodule loadCUBIN(char *cubin, int argc, char **argv) { 175 | CUmodule module; 176 | CUcontext context; 177 | int major = 0, minor = 0; 178 | char deviceName[256]; 179 | 180 | // Picks the best CUDA device available 181 | CUdevice cuDevice = findCudaDeviceDRV(argc, (const char **)argv); 182 | 183 | // get compute capabilities and the devicename 184 | checkCudaErrors(cuDeviceGetAttribute( 185 | &major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevice)); 186 | checkCudaErrors(cuDeviceGetAttribute( 187 | &minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevice)); 188 | checkCudaErrors(cuDeviceGetName(deviceName, 256, cuDevice)); 189 | printf("> GPU Device has SM %d.%d compute capability\n", major, minor); 190 | 191 | checkCudaErrors(cuInit(0)); 192 | checkCudaErrors(cuCtxCreate(&context, 0, cuDevice)); 193 | 194 | checkCudaErrors(cuModuleLoadData(&module, cubin)); 195 | free(cubin); 196 | 197 | return module; 198 | } 199 | 200 | #endif // COMMON_NVRTC_HELPER_H_ 201 | -------------------------------------------------------------------------------- /labs/CFD/English/C/source_code/single_gpu/jacobi.cu: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved. 2 | * 3 | * Redistribution and use in source and binary forms, with or without 4 | * modification, are permitted provided that the following conditions 5 | * are met: 6 | * * Redistributions of source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer. 8 | * * Redistributions in binary form must reproduce the above copyright 9 | * notice, this list of conditions and the following disclaimer in the 10 | * documentation and/or other materials provided with the distribution. 11 | * * Neither the name of NVIDIA CORPORATION nor the names of its 12 | * contributors may be used to endorse or promote products derived 13 | * from this software without specific prior written permission. 
14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | */ 27 | #include 28 | #include 29 | #include 30 | #include 31 | #include 32 | #include 33 | #include 34 | #include 35 | 36 | #include 37 | #include 38 | 39 | #define BLOCK_DIM_X 32 40 | #define BLOCK_DIM_Y 32 41 | 42 | #define CUDA_RT_CALL(call) \ 43 | { \ 44 | cudaError_t cudaStatus = call; \ 45 | if (cudaSuccess != cudaStatus) \ 46 | fprintf(stderr, \ 47 | "ERROR: CUDA RT call \"%s\" in line %d of file %s failed " \ 48 | "with " \ 49 | "%s (%d).\n", \ 50 | #call, __LINE__, __FILE__, cudaGetErrorString(cudaStatus), cudaStatus); \ 51 | } 52 | 53 | constexpr float tol = 1.0e-8; 54 | 55 | const float PI = 2.0 * std::asin(1.0); 56 | 57 | __global__ void initialize_boundaries(float* a_new, float* a, const float pi, const int offset, 58 | const int nx, const int my_ny, const int ny) { 59 | for (int iy = blockIdx.x * blockDim.x + threadIdx.x; iy < my_ny; iy += blockDim.x * gridDim.x) { 60 | const float y0 = sin(2.0 * pi * (offset + iy) / (ny - 1)); 61 | a[iy * nx + 0] = y0; 62 | a[iy * nx + (nx - 1)] = y0; 63 | a_new[iy * nx + 0] = y0; 64 | a_new[iy * nx + (nx - 1)] = y0; 65 | } 66 | } 67 | 68 | __global__ void jacobi_kernel(float* a_new, const float* a, float* l2_norm, const int iy_start, 69 | const int iy_end, const int nx) { 70 | int iy = blockIdx.y * blockDim.y + threadIdx.y + iy_start; 71 | int ix = blockIdx.x * blockDim.x + threadIdx.x + 1; 72 | __shared__ float block_l2_sum[BLOCK_DIM_X*BLOCK_DIM_Y]; 73 | unsigned thread_index = threadIdx.y*BLOCK_DIM_X + threadIdx.x; 74 | 75 | if (iy < iy_end && ix < (nx - 1)) { 76 | // Update grid point 77 | const float new_val = 0.25 * (a[iy * nx + ix + 1] + a[iy * nx + ix - 1] + 78 | a[(iy + 1) * nx + ix] + a[(iy - 1) * nx + ix]); 79 | a_new[iy * nx + ix] = new_val; 80 | float residue = new_val - a[iy * nx + ix]; 81 | // Set block-level L2 norm value for this grid point 82 | block_l2_sum[thread_index] = residue * residue; 83 | } 84 | else { 85 | block_l2_sum[thread_index] = 0; 86 | } 87 | // Reduce L2 norm for the block in parallel 88 | for (unsigned stride = 1; stride < BLOCK_DIM_X*BLOCK_DIM_Y; stride *= 2) { 89 | __syncthreads(); 90 | if ((thread_index) % (2*stride) == 0) { 91 | block_l2_sum[thread_index] += block_l2_sum[thread_index + stride]; 92 | } 93 | } 94 | // Atomically update global L2 norm with block-reduced L2 norm 95 | if (thread_index == 0) { 96 | atomicAdd(l2_norm, block_l2_sum[0]); 97 | } 98 | } 99 | 100 | int get_argval(char** begin, char** end, const std::string& arg, const int default_val) { 101 | int argval = default_val; 102 | char** itr = std::find(begin, end, arg); 103 | if (itr != end && ++itr != end) { 104 | std::istringstream inbuf(*itr); 105 | inbuf >> argval; 106 | } 107 | return argval; 108 | } 109 | 110 | 
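// Forward declaration of the solver driver defined below: single_gpu() allocates the
// nx-by-ny grid on the active device, applies the sine-wave Dirichlet boundary values
// on the left and right edges, then runs up to iter_max Jacobi sweeps (stopping early
// once the block-reduced L2 residual norm falls below tol), copies the final grid into
// a_ref_h, and returns the elapsed solver time in seconds.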
double single_gpu(const int nx, const int ny, const int iter_max, float* const a_ref_h); 111 | 112 | int main(int argc, char* argv[]) { 113 | const int iter_max = get_argval(argv, argv + argc, "-niter", 1000); 114 | const int nx = get_argval(argv, argv + argc, "-nx", 16384); 115 | const int ny = get_argval(argv, argv + argc, "-ny", 16384); 116 | 117 | CUDA_RT_CALL(cudaSetDevice(0)); 118 | CUDA_RT_CALL(cudaFree(0)); 119 | 120 | float* a_ref_h; 121 | CUDA_RT_CALL(cudaMallocHost(&a_ref_h, nx * ny * sizeof(float))); 122 | 123 | double runtime_serial = single_gpu(nx, ny, iter_max, a_ref_h); 124 | 125 | printf("%dx%d: 1 GPU: %8.4f s\n", nx, ny, runtime_serial); 126 | 127 | return 0; 128 | } 129 | 130 | double single_gpu(const int nx, const int ny, const int iter_max, float* const a_ref_h) { 131 | float* a; 132 | float* a_new; 133 | 134 | float* l2_norm_d; 135 | float* l2_norm_h; 136 | 137 | int iy_start = 1; 138 | int iy_end = (ny - 1); 139 | 140 | CUDA_RT_CALL(cudaMalloc(&a, nx * ny * sizeof(float))); 141 | CUDA_RT_CALL(cudaMalloc(&a_new, nx * ny * sizeof(float))); 142 | 143 | CUDA_RT_CALL(cudaMemset(a, 0, nx * ny * sizeof(float))); 144 | CUDA_RT_CALL(cudaMemset(a_new, 0, nx * ny * sizeof(float))); 145 | 146 | // Set Dirichlet boundary conditions on left and right border 147 | nvtxRangePush("Init boundaries"); 148 | initialize_boundaries<<<ny / 128 + 1, 128>>>(a, a_new, PI, 0, nx, ny, ny); 149 | CUDA_RT_CALL(cudaGetLastError()); 150 | CUDA_RT_CALL(cudaDeviceSynchronize()); 151 | nvtxRangePop(); 152 | 153 | CUDA_RT_CALL(cudaMalloc(&l2_norm_d, sizeof(float))); 154 | CUDA_RT_CALL(cudaMallocHost(&l2_norm_h, sizeof(float))); 155 | 156 | CUDA_RT_CALL(cudaDeviceSynchronize()); 157 | 158 | printf("Single GPU jacobi relaxation: %d iterations on %d x %d mesh\n", iter_max, nx, ny); 159 | 160 | dim3 dim_grid((nx + BLOCK_DIM_X - 1) / BLOCK_DIM_X, (ny + BLOCK_DIM_Y - 1) / BLOCK_DIM_Y, 1); 161 | dim3 dim_block(BLOCK_DIM_X, BLOCK_DIM_Y, 1); 162 | 163 | int iter = 0; 164 | float l2_norm = 1.0; 165 | 166 | double start = omp_get_wtime(); 167 | nvtxRangePush("Jacobi Solve"); 168 | while (l2_norm > tol && iter < iter_max) { 169 | CUDA_RT_CALL(cudaMemset(l2_norm_d, 0, sizeof(float))); 170 | 171 | // Compute grid points for this iteration 172 | jacobi_kernel<<<dim_grid, dim_block>>>(a_new, a, l2_norm_d, iy_start, iy_end, nx); 173 | CUDA_RT_CALL(cudaGetLastError()); 174 | CUDA_RT_CALL(cudaMemcpy(l2_norm_h, l2_norm_d, sizeof(float), cudaMemcpyDeviceToHost)); 175 | 176 | // Apply periodic boundary conditions 177 | 178 | CUDA_RT_CALL(cudaMemcpy(a_new, a_new + (iy_end - 1) * nx, nx * sizeof(float), 179 | cudaMemcpyDeviceToDevice)); 180 | CUDA_RT_CALL(cudaMemcpy(a_new + iy_end * nx, a_new + iy_start * nx, nx * sizeof(float), 181 | cudaMemcpyDeviceToDevice)); 182 | 183 | CUDA_RT_CALL(cudaDeviceSynchronize()); 184 | l2_norm = *l2_norm_h; 185 | l2_norm = std::sqrt(l2_norm); 186 | 187 | iter++; 188 | if ((iter % 100) == 0) printf("%5d, %0.6f\n", iter, l2_norm); 189 | 190 | std::swap(a_new, a); 191 | } 192 | nvtxRangePop(); 193 | double stop = omp_get_wtime(); 194 | 195 | CUDA_RT_CALL(cudaMemcpy(a_ref_h, a, nx * ny * sizeof(float), cudaMemcpyDeviceToHost)); 196 | 197 | CUDA_RT_CALL(cudaFreeHost(l2_norm_h)); 198 | CUDA_RT_CALL(cudaFree(l2_norm_d)); 199 | 200 | CUDA_RT_CALL(cudaFree(a_new)); 201 | CUDA_RT_CALL(cudaFree(a)); 202 | return (stop - start); 203 | } 204 | 205 | -------------------------------------------------------------------------------- /labs/CFD/English/C/source_code/p2pBandwidthLatencyTest/Common/UtilNPP/Exceptions.h:
-------------------------------------------------------------------------------- 1 | /* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 2 | * 3 | * Redistribution and use in source and binary forms, with or without 4 | * modification, are permitted provided that the following conditions 5 | * are met: 6 | * * Redistributions of source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer. 8 | * * Redistributions in binary form must reproduce the above copyright 9 | * notice, this list of conditions and the following disclaimer in the 10 | * documentation and/or other materials provided with the distribution. 11 | * * Neither the name of NVIDIA CORPORATION nor the names of its 12 | * contributors may be used to endorse or promote products derived 13 | * from this software without specific prior written permission. 14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | */ 27 | 28 | #ifndef NV_UTIL_NPP_EXCEPTIONS_H 29 | #define NV_UTIL_NPP_EXCEPTIONS_H 30 | 31 | 32 | #include 33 | #include 34 | #include 35 | 36 | /// All npp related C++ classes are put into the npp namespace. 37 | namespace npp 38 | { 39 | 40 | /// Exception base class. 41 | /// This exception base class will be used for everything C++ throught 42 | /// the NPP project. 43 | /// The exception contains a string message, as well as data fields for a string 44 | /// containing the name of the file as well as the line number where the exception was thrown. 45 | /// The easiest way of throwing exceptions and providing filename and line number is 46 | /// to use one of the ASSERT macros defined for that purpose. 47 | class Exception 48 | { 49 | public: 50 | /// Constructor. 51 | /// \param rMessage A message with information as to why the exception was thrown. 52 | /// \param rFileName The name of the file where the exception was thrown. 53 | /// \param nLineNumber Line number in the file where the exception was thrown. 54 | explicit 55 | Exception(const std::string &rMessage = "", const std::string &rFileName = "", unsigned int nLineNumber = 0) 56 | : sMessage_(rMessage), sFileName_(rFileName), nLineNumber_(nLineNumber) 57 | { }; 58 | 59 | Exception(const Exception &rException) 60 | : sMessage_(rException.sMessage_), sFileName_(rException.sFileName_), nLineNumber_(rException.nLineNumber_) 61 | { }; 62 | 63 | virtual 64 | ~Exception() 65 | { }; 66 | 67 | /// Get the exception's message. 68 | const 69 | std::string & 70 | message() 71 | const 72 | { 73 | return sMessage_; 74 | } 75 | 76 | /// Get the exception's file info. 77 | const 78 | std::string & 79 | fileName() 80 | const 81 | { 82 | return sFileName_; 83 | } 84 | 85 | /// Get the exceptions's line info. 
86 | unsigned int 87 | lineNumber() 88 | const 89 | { 90 | return nLineNumber_; 91 | } 92 | 93 | 94 | /// Create a clone of this exception. 95 | /// This creates a new Exception object on the heap. It is 96 | /// the responsibility of the user of this function to free this memory 97 | /// (delete x). 98 | virtual 99 | Exception * 100 | clone() 101 | const 102 | { 103 | return new Exception(*this); 104 | } 105 | 106 | /// Create a single string with all the exceptions information. 107 | /// The virtual toString() method is used by the operator<<() 108 | /// so that all exceptions derived from this base-class can print 109 | /// their full information correctly even if a reference to their 110 | /// exact type is not had at the time of printing (i.e. the basic 111 | /// operator<<() is used). 112 | virtual 113 | std::string 114 | toString() 115 | const 116 | { 117 | std::ostringstream oOutputString; 118 | oOutputString << fileName() << ":" << lineNumber() << ": " << message(); 119 | return oOutputString.str(); 120 | } 121 | 122 | private: 123 | std::string sMessage_; ///< Message regarding the cause of the exception. 124 | std::string sFileName_; ///< Name of the file where the exception was thrown. 125 | unsigned int nLineNumber_; ///< Line number in the file where the exception was thrown 126 | }; 127 | 128 | /// Output stream inserter for Exception. 129 | /// \param rOutputStream The stream the exception information is written to. 130 | /// \param rException The exception that's being written. 131 | /// \return Reference to the output stream being used. 132 | std::ostream & 133 | operator << (std::ostream &rOutputStream, const Exception &rException) 134 | { 135 | rOutputStream << rException.toString(); 136 | return rOutputStream; 137 | } 138 | 139 | /// Basic assert macro. 140 | /// This macro should be used to enforce any kind of pre or post conditions. 141 | /// Unlike the C-runtime assert macro, this macro does not abort execution, but throws 142 | /// a C++ exception. The exception is automatically filled with information about the failing 143 | /// condition, the filename and line number where the exception was thrown. 144 | /// \note The macro is written in such a way that omitting a semicolon after its usage 145 | /// causes a compiler error. The correct way to invoke this macro is: 146 | /// NPP_ASSERT(n < MAX); 147 | #define NPP_ASSERT(C) do {if (!(C)) throw npp::Exception(#C " assertion faild!", __FILE__, __LINE__);} while(false) 148 | 149 | // ASSERT macro. 150 | // Same functionality as the basic assert macro with the added ability to pass 151 | // a message M. M should be a string literal. 152 | // Note: Never use code inside ASSERT() that causes a side-effect ASSERT macros may get compiled 153 | // out in release mode. 154 | #define NPP_ASSERT_MSG(C, M) do {if (!(C)) throw npp::Exception(#C " assertion faild! Message: " M, __FILE__, __LINE__);} while(false) 155 | 156 | #ifdef _DEBUG 157 | /// Basic debug assert macro. 158 | /// This macro is identical in every respect to NPP_ASSERT(C) but it does get compiled to a 159 | /// no-op in release builds. It is therefor of utmost importance to not put statements into 160 | /// this macro that cause side effects required for correct program execution. 161 | #define NPP_DEBUG_ASSERT(C) do {if (!(C)) throw npp::Exception(#C " debug assertion faild!", __FILE__, __LINE__);} while(false) 162 | #else 163 | #define NPP_DEBUG_ASSERT(C) 164 | #endif 165 | 166 | /// ASSERT for null-pointer test. 
167 | /// It is safe to put code with side effects into this macro. Also: This macro never 168 | /// gets compiled to a no-op because resource allocation may fail based on external causes not under 169 | /// control of a software developer. 170 | #define NPP_ASSERT_NOT_NULL(P) do {if ((P) == 0) throw npp::Exception(#P " not null assertion faild!", __FILE__, __LINE__);} while(false) 171 | 172 | /// Macro for flagging methods as not implemented. 173 | /// The macro throws an exception with a message that an implementation was missing 174 | #define NPP_NOT_IMPLEMENTED() do {throw npp::Exception("Implementation missing!", __FILE__, __LINE__);} while(false) 175 | 176 | /// Macro for checking error return code of CUDA (runtime) calls. 177 | /// This macro never gets disabled. 178 | #define NPP_CHECK_CUDA(S) do {cudaError_t eCUDAResult; \ 179 | eCUDAResult = S; \ 180 | if (eCUDAResult != cudaSuccess) std::cout << "NPP_CHECK_CUDA - eCUDAResult = " << eCUDAResult << std::endl; \ 181 | NPP_ASSERT(eCUDAResult == cudaSuccess);} while (false) 182 | 183 | /// Macro for checking error return code for NPP calls. 184 | #define NPP_CHECK_NPP(S) do {NppStatus eStatusNPP; \ 185 | eStatusNPP = S; \ 186 | if (eStatusNPP != NPP_SUCCESS) std::cout << "NPP_CHECK_NPP - eStatusNPP = " << _cudaGetErrorEnum(eStatusNPP) << "("<< eStatusNPP << ")" << std::endl; \ 187 | NPP_ASSERT(eStatusNPP == NPP_SUCCESS);} while (false) 188 | 189 | /// Macro for checking error return codes from cuFFT calls. 190 | #define NPP_CHECK_CUFFT(S) do {cufftResult eCUFFTResult; \ 191 | eCUFFTResult = S; \ 192 | if (eCUFFTResult != NPP_SUCCESS) std::cout << "NPP_CHECK_CUFFT - eCUFFTResult = " << eCUFFTResult << std::endl; \ 193 | NPP_ASSERT(eCUFFTResult == CUFFT_SUCCESS);} while (false) 194 | 195 | } // npp namespace 196 | 197 | #endif // NV_UTIL_NPP_EXCEPTIONS_H 198 | -------------------------------------------------------------------------------- /slurm_pmi_config/include/slurm_errno.h: -------------------------------------------------------------------------------- 1 | /*****************************************************************************\ 2 | * slurm_errno.h - error codes and functions for slurm 3 | ****************************************************************************** 4 | * Copyright (C) 2002-2007 The Regents of the University of California. 5 | * Copyright (C) 2008-2009 Lawrence Livermore National Security. 6 | * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). 7 | * Written by Kevin Tew , 8 | * Jim Garlick , et. al. 9 | * CODE-OCEC-09-009. All rights reserved. 10 | * 11 | * This file is part of Slurm, a resource management program. 12 | * For details, see . 13 | * Please also read the included file: DISCLAIMER. 14 | * 15 | * Slurm is free software; you can redistribute it and/or modify it under 16 | * the terms of the GNU General Public License as published by the Free 17 | * Software Foundation; either version 2 of the License, or (at your option) 18 | * any later version. 19 | * 20 | * In addition, as a special exception, the copyright holders give permission 21 | * to link the code of portions of this program with the OpenSSL library under 22 | * certain conditions as described in each individual source file, and 23 | * distribute linked combinations including the two. You must obey the GNU 24 | * General Public License in all respects for all of the code used other than 25 | * OpenSSL. 
If you modify file(s) with this exception, you may extend this 26 | * exception to your version of the file(s), but you are not obligated to do 27 | * so. If you do not wish to do so, delete this exception statement from your 28 | * version. If you delete this exception statement from all source files in 29 | * the program, then also delete it here. 30 | * 31 | * Slurm is distributed in the hope that it will be useful, but WITHOUT ANY 32 | * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS 33 | * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more 34 | * details. 35 | * 36 | * You should have received a copy of the GNU General Public License along 37 | * with Slurm; if not, write to the Free Software Foundation, Inc., 38 | * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 39 | \*****************************************************************************/ 40 | #ifndef _SLURM_ERRNO_H 41 | #define _SLURM_ERRNO_H 42 | 43 | #ifdef __cplusplus 44 | extern "C" { 45 | #endif 46 | 47 | #include 48 | 49 | /* set errno to the specified value - then return -1 */ 50 | #define slurm_seterrno_ret(errnum) do { \ 51 | slurm_seterrno(errnum); \ 52 | return (errnum ? -1 : 0); \ 53 | } while (0) 54 | 55 | /* general return codes */ 56 | #define SLURM_SUCCESS 0 57 | #define SLURM_ERROR -1 58 | 59 | enum { 60 | /* General Message error codes */ 61 | SLURM_UNEXPECTED_MSG_ERROR = 1000, 62 | SLURM_COMMUNICATIONS_CONNECTION_ERROR, 63 | SLURM_COMMUNICATIONS_SEND_ERROR, 64 | SLURM_COMMUNICATIONS_RECEIVE_ERROR, 65 | SLURM_COMMUNICATIONS_SHUTDOWN_ERROR, 66 | SLURM_PROTOCOL_VERSION_ERROR, 67 | SLURM_PROTOCOL_IO_STREAM_VERSION_ERROR, 68 | SLURM_PROTOCOL_AUTHENTICATION_ERROR, 69 | SLURM_PROTOCOL_INSANE_MSG_LENGTH, 70 | SLURM_MPI_PLUGIN_NAME_INVALID, 71 | SLURM_MPI_PLUGIN_PRELAUNCH_SETUP_FAILED, 72 | SLURM_PLUGIN_NAME_INVALID, 73 | SLURM_UNKNOWN_FORWARD_ADDR, 74 | 75 | /* communication failures to/from slurmctld */ 76 | SLURMCTLD_COMMUNICATIONS_CONNECTION_ERROR = 1800, 77 | SLURMCTLD_COMMUNICATIONS_SEND_ERROR, 78 | SLURMCTLD_COMMUNICATIONS_RECEIVE_ERROR, 79 | SLURMCTLD_COMMUNICATIONS_SHUTDOWN_ERROR, 80 | 81 | /* _info.c/communication layer RESPONSE_SLURM_RC message codes */ 82 | SLURM_NO_CHANGE_IN_DATA = 1900, 83 | 84 | /* slurmctld error codes */ 85 | ESLURM_INVALID_PARTITION_NAME = 2000, 86 | ESLURM_DEFAULT_PARTITION_NOT_SET, 87 | ESLURM_ACCESS_DENIED, 88 | ESLURM_JOB_MISSING_REQUIRED_PARTITION_GROUP, 89 | ESLURM_REQUESTED_NODES_NOT_IN_PARTITION, 90 | ESLURM_TOO_MANY_REQUESTED_CPUS, 91 | ESLURM_INVALID_NODE_COUNT, 92 | ESLURM_ERROR_ON_DESC_TO_RECORD_COPY, 93 | ESLURM_JOB_MISSING_SIZE_SPECIFICATION, 94 | ESLURM_JOB_SCRIPT_MISSING, 95 | ESLURM_USER_ID_MISSING = 2010, 96 | ESLURM_DUPLICATE_JOB_ID, 97 | ESLURM_PATHNAME_TOO_LONG, 98 | ESLURM_NOT_TOP_PRIORITY, 99 | ESLURM_REQUESTED_NODE_CONFIG_UNAVAILABLE, 100 | ESLURM_REQUESTED_PART_CONFIG_UNAVAILABLE, 101 | ESLURM_NODES_BUSY, 102 | ESLURM_INVALID_JOB_ID, 103 | ESLURM_INVALID_NODE_NAME, 104 | ESLURM_WRITING_TO_FILE, 105 | ESLURM_TRANSITION_STATE_NO_UPDATE = 2020, 106 | ESLURM_ALREADY_DONE, 107 | ESLURM_INTERCONNECT_FAILURE, 108 | ESLURM_BAD_DIST, 109 | ESLURM_JOB_PENDING, 110 | ESLURM_BAD_TASK_COUNT, 111 | ESLURM_INVALID_JOB_CREDENTIAL, 112 | ESLURM_IN_STANDBY_MODE, 113 | ESLURM_INVALID_NODE_STATE, 114 | ESLURM_INVALID_FEATURE, 115 | ESLURM_INVALID_AUTHTYPE_CHANGE = 2030, 116 | ESLURM_ACTIVE_FEATURE_NOT_SUBSET, 117 | ESLURM_INVALID_SCHEDTYPE_CHANGE, 118 | ESLURM_INVALID_SELECTTYPE_CHANGE, 119 | 
ESLURM_INVALID_SWITCHTYPE_CHANGE, 120 | ESLURM_FRAGMENTATION, 121 | ESLURM_NOT_SUPPORTED, 122 | ESLURM_DISABLED, 123 | ESLURM_DEPENDENCY, 124 | ESLURM_BATCH_ONLY, 125 | ESLURM_TASKDIST_ARBITRARY_UNSUPPORTED = 2040, 126 | ESLURM_TASKDIST_REQUIRES_OVERCOMMIT, 127 | ESLURM_JOB_HELD, 128 | ESLURM_INVALID_CRED_TYPE_CHANGE, 129 | ESLURM_INVALID_TASK_MEMORY, 130 | ESLURM_INVALID_ACCOUNT, 131 | ESLURM_INVALID_PARENT_ACCOUNT, 132 | ESLURM_SAME_PARENT_ACCOUNT, 133 | ESLURM_INVALID_LICENSES, 134 | ESLURM_NEED_RESTART, 135 | ESLURM_ACCOUNTING_POLICY = 2050, 136 | ESLURM_INVALID_TIME_LIMIT, 137 | ESLURM_RESERVATION_ACCESS, 138 | ESLURM_RESERVATION_INVALID, 139 | ESLURM_INVALID_TIME_VALUE, 140 | ESLURM_RESERVATION_BUSY, 141 | ESLURM_RESERVATION_NOT_USABLE, 142 | ESLURM_INVALID_WCKEY, 143 | ESLURM_RESERVATION_OVERLAP, 144 | ESLURM_PORTS_BUSY, 145 | ESLURM_PORTS_INVALID = 2060, 146 | ESLURM_PROLOG_RUNNING, 147 | ESLURM_NO_STEPS, 148 | ESLURM_INVALID_BLOCK_STATE, 149 | ESLURM_INVALID_BLOCK_LAYOUT, 150 | ESLURM_INVALID_BLOCK_NAME, 151 | ESLURM_INVALID_QOS, 152 | ESLURM_QOS_PREEMPTION_LOOP, 153 | ESLURM_NODE_NOT_AVAIL, 154 | ESLURM_INVALID_CPU_COUNT, 155 | ESLURM_PARTITION_NOT_AVAIL = 2070, 156 | ESLURM_CIRCULAR_DEPENDENCY, 157 | ESLURM_INVALID_GRES, 158 | ESLURM_JOB_NOT_PENDING, 159 | ESLURM_QOS_THRES, 160 | ESLURM_PARTITION_IN_USE, 161 | ESLURM_STEP_LIMIT, 162 | ESLURM_JOB_SUSPENDED, 163 | ESLURM_CAN_NOT_START_IMMEDIATELY, 164 | ESLURM_INTERCONNECT_BUSY, 165 | ESLURM_RESERVATION_EMPTY = 2080, 166 | ESLURM_INVALID_ARRAY, 167 | ESLURM_RESERVATION_NAME_DUP, 168 | ESLURM_JOB_STARTED, 169 | ESLURM_JOB_FINISHED, 170 | ESLURM_JOB_NOT_RUNNING, 171 | ESLURM_JOB_NOT_PENDING_NOR_RUNNING, 172 | ESLURM_JOB_NOT_SUSPENDED, 173 | ESLURM_JOB_NOT_FINISHED, 174 | ESLURM_TRIGGER_DUP, 175 | ESLURM_INTERNAL = 2090, 176 | ESLURM_INVALID_BURST_BUFFER_CHANGE, 177 | ESLURM_BURST_BUFFER_PERMISSION, 178 | ESLURM_BURST_BUFFER_LIMIT, 179 | ESLURM_INVALID_BURST_BUFFER_REQUEST, 180 | ESLURM_PRIO_RESET_FAIL, 181 | ESLURM_POWER_NOT_AVAIL, 182 | ESLURM_POWER_RESERVED, 183 | ESLURM_INVALID_POWERCAP, 184 | ESLURM_INVALID_MCS_LABEL, 185 | ESLURM_BURST_BUFFER_WAIT = 2100, 186 | ESLURM_PARTITION_DOWN, 187 | ESLURM_DUPLICATE_GRES, 188 | ESLURM_JOB_SETTING_DB_INX, 189 | ESLURM_RSV_ALREADY_STARTED, 190 | ESLURM_SUBMISSIONS_DISABLED, 191 | ESLURM_NOT_HET_JOB, 192 | ESLURM_NOT_HET_JOB_LEADER, 193 | ESLURM_NOT_WHOLE_HET_JOB, 194 | ESLURM_CORE_RESERVATION_UPDATE, 195 | ESLURM_DUPLICATE_STEP_ID = 2110, 196 | ESLURM_INVALID_CORE_CNT, 197 | ESLURM_X11_NOT_AVAIL, 198 | ESLURM_GROUP_ID_MISSING, 199 | ESLURM_BATCH_CONSTRAINT, 200 | ESLURM_INVALID_TRES, 201 | ESLURM_INVALID_TRES_BILLING_WEIGHTS, 202 | ESLURM_INVALID_JOB_DEFAULTS, 203 | ESLURM_RESERVATION_MAINT, 204 | ESLURM_INVALID_GRES_TYPE, 205 | ESLURM_REBOOT_IN_PROGRESS = 2120, 206 | ESLURM_MULTI_KNL_CONSTRAINT, 207 | ESLURM_UNSUPPORTED_GRES, 208 | ESLURM_INVALID_NICE, 209 | ESLURM_INVALID_TIME_MIN_LIMIT, 210 | ESLURM_DEFER, 211 | ESLURM_CONFIGLESS_DISABLED, 212 | ESLURM_ENVIRONMENT_MISSING, 213 | 214 | /* slurmd error codes */ 215 | ESLURMD_PIPE_ERROR_ON_TASK_SPAWN = 4000, 216 | ESLURMD_KILL_TASK_FAILED, 217 | ESLURMD_KILL_JOB_ALREADY_COMPLETE, 218 | ESLURMD_INVALID_ACCT_FREQ, 219 | ESLURMD_INVALID_JOB_CREDENTIAL, 220 | ESLURMD_UID_NOT_FOUND, 221 | ESLURMD_GID_NOT_FOUND, 222 | ESLURMD_CREDENTIAL_EXPIRED, 223 | ESLURMD_CREDENTIAL_REVOKED, 224 | ESLURMD_CREDENTIAL_REPLAYED, 225 | ESLURMD_CREATE_BATCH_DIR_ERROR = 4010, 226 | ESLURMD_MODIFY_BATCH_DIR_ERROR, 227 | ESLURMD_CREATE_BATCH_SCRIPT_ERROR, 228 | 
ESLURMD_MODIFY_BATCH_SCRIPT_ERROR, 229 | ESLURMD_SETUP_ENVIRONMENT_ERROR, 230 | ESLURMD_SHARED_MEMORY_ERROR, 231 | ESLURMD_SET_UID_OR_GID_ERROR, 232 | ESLURMD_SET_SID_ERROR, 233 | ESLURMD_CANNOT_SPAWN_IO_THREAD, 234 | ESLURMD_FORK_FAILED, 235 | ESLURMD_EXECVE_FAILED = 4020, 236 | ESLURMD_IO_ERROR, 237 | ESLURMD_PROLOG_FAILED, 238 | ESLURMD_EPILOG_FAILED, 239 | ESLURMD_SESSION_KILLED, 240 | ESLURMD_TOOMANYSTEPS, 241 | ESLURMD_STEP_EXISTS, 242 | ESLURMD_JOB_NOTRUNNING, 243 | ESLURMD_STEP_SUSPENDED, 244 | ESLURMD_STEP_NOTSUSPENDED, 245 | ESLURMD_INVALID_SOCKET_NAME_LEN = 4030, 246 | 247 | /* slurmd errors in user batch job */ 248 | ESCRIPT_CHDIR_FAILED = 4100, 249 | ESCRIPT_OPEN_OUTPUT_FAILED, 250 | ESCRIPT_NON_ZERO_RETURN, 251 | 252 | /* socket specific Slurm communications error */ 253 | SLURM_PROTOCOL_SOCKET_IMPL_ZERO_RECV_LENGTH = 5000, 254 | SLURM_PROTOCOL_SOCKET_IMPL_NEGATIVE_RECV_LENGTH, 255 | SLURM_PROTOCOL_SOCKET_IMPL_NOT_ALL_DATA_SENT, 256 | ESLURM_PROTOCOL_INCOMPLETE_PACKET , 257 | SLURM_PROTOCOL_SOCKET_IMPL_TIMEOUT , 258 | SLURM_PROTOCOL_SOCKET_ZERO_BYTES_SENT, 259 | 260 | /* slurm_auth errors */ 261 | ESLURM_AUTH_CRED_INVALID = 6000, 262 | ESLURM_AUTH_FOPEN_ERROR, 263 | ESLURM_AUTH_NET_ERROR, 264 | ESLURM_AUTH_UNABLE_TO_SIGN, 265 | ESLURM_AUTH_BADARG, 266 | ESLURM_AUTH_MEMORY, 267 | ESLURM_AUTH_INVALID, 268 | ESLURM_AUTH_UNPACK, 269 | 270 | /* accounting errors */ 271 | ESLURM_DB_CONNECTION = 7000, 272 | ESLURM_JOBS_RUNNING_ON_ASSOC, 273 | ESLURM_CLUSTER_DELETED, 274 | ESLURM_ONE_CHANGE, 275 | ESLURM_BAD_NAME, 276 | ESLURM_OVER_ALLOCATE, 277 | ESLURM_RESULT_TOO_LARGE, 278 | ESLURM_DB_QUERY_TOO_WIDE, 279 | 280 | /* Federation Errors */ 281 | ESLURM_FED_CLUSTER_MAX_CNT = 7100, 282 | ESLURM_FED_CLUSTER_MULTIPLE_ASSIGNMENT, 283 | ESLURM_INVALID_CLUSTER_FEATURE, 284 | ESLURM_JOB_NOT_FEDERATED, 285 | ESLURM_INVALID_CLUSTER_NAME, 286 | ESLURM_FED_JOB_LOCK, 287 | ESLURM_FED_NO_VALID_CLUSTERS, 288 | 289 | /* plugin and custom errors */ 290 | ESLURM_MISSING_TIME_LIMIT = 8000, 291 | ESLURM_INVALID_KNL 292 | }; 293 | 294 | /* look up an errno value */ 295 | char * slurm_strerror(int errnum); 296 | 297 | /* set an errno value */ 298 | void slurm_seterrno(int errnum); 299 | 300 | /* get an errno value */ 301 | int slurm_get_errno(void); 302 | 303 | /* print message: error string for current errno value */ 304 | void slurm_perror(const char *msg); 305 | 306 | #ifdef __cplusplus 307 | } 308 | #endif 309 | 310 | #endif /* !_SLURM_ERRNO_H */ 311 | -------------------------------------------------------------------------------- /labs/CFD/English/C/jupyter_notebook/mpi/multi_node_intro.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Learning objectives\n", 8 | "\n", 9 | "In this lab we will learn about:\n", 10 | "\n", 11 | "* Multi-node Multi-GPU programming and importance of inter-process communication frameworks.\n", 12 | "* Introduction to MPI specification and APIs.\n", 13 | "* Execution of Hello World MPI binary on single as well as multiple nodes.\n", 14 | "\n", 15 | "# Multi-Node Multi-GPU Programming\n", 16 | "\n", 17 | "As we move from a single node to multiple nodes, the basic multi-GPU programming concepts like domain decomposition and application-specific concepts like halo exchange remain the same. 
However, the communication becomes complex.\n", 18 | "\n", 19 | "A single process can spawn threads that can be spread within a node (potentially on multiple sockets) but it cannot cross the node boundary. Thus, scalable multi-node programming requires the use of multiple processes.\n", 20 | "\n", 21 | "Inter-process communication is usually handled by libraries like OpenMPI, which expose communication APIs, synchronization constructs, and related utilities to the user. Let us now learn about programming in MPI.\n", 22 | "\n", 23 | "## MPI\n", 24 | "\n", 25 | "MPI is a specification for the developers and users of message passing libraries. By itself, it is not a library but rather a specification of what such a library should be. An example of an MPI-compliant library is OpenMPI.\n", 26 | "\n", 27 | "It primarily addresses the message-passing parallel programming model: data is moved from the address space of one process to that of another process through cooperative operations on each process.\n", 28 | "\n", 29 | "MPI is widely used in practice for HPC applications, in academia, government agencies, and industry alike. While we will introduce its APIs in this lab, a prior working understanding of MPI is highly desirable.\n", 30 | "\n", 31 | "### A Hello World Example\n", 32 | "\n", 33 | "A C-based Hello World program is shown below:\n", 34 | "\n", 35 | "```c\n", 36 | "#include <mpi.h>\n", 37 | "#include <stdio.h>\n", 38 | "\n", 39 | "int main(int argc, char** argv) {\n", 40 | " // Initialize the MPI environment\n", 41 | " MPI_Init(NULL, NULL);\n", 42 | " // Get the number of processes\n", 43 | " int size;\n", 44 | " MPI_Comm_size(MPI_COMM_WORLD, &size);\n", 45 | " // Get the rank of the process\n", 46 | " int rank;\n", 47 | " MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n", 48 | " // Get the name of the processor\n", 49 | " char processor_name[MPI_MAX_PROCESSOR_NAME];\n", 50 | " int name_len;\n", 51 | " MPI_Get_processor_name(processor_name, &name_len);\n", 52 | " // Print a hello world message\n", 53 | " printf(\"Hello world from processor %s, rank %d out of %d processors\\n\",\n", 54 | " processor_name, rank, size);\n", 55 | " // Finalize the MPI environment.\n", 56 | " MPI_Finalize();\n", 57 | "}\n", 58 | "```\n", 59 | "\n", 60 | "To access the program, open the [hello_world.c](../../source_code/mpi/hello_world.c) file. Alternatively, you can navigate to the `CFD/English/C/source_code/mpi/` directory in Jupyter's file browser in the left pane. Then, click to open the `hello_world.c` file.\n", 61 | "\n", 62 | "The MPI environment is initialized with `MPI_Init`, through which all of MPI’s global and internal variables are constructed. A \"communicator\" is created between all processes that are spawned, and unique ranks are assigned to each process. \n", 63 | "\n", 64 | "`MPI_Comm_size` returns the size of a communicator, that is, the number of processes within that communicator. In our example, this call will return the number of processes requested for the job.\n", 65 | "\n", 66 | "`MPI_Comm_rank` returns the rank of a process in a communicator. Each process inside of a communicator is assigned an incremental rank starting from zero. The ranks of the processes are primarily used for identification purposes when sending and receiving messages.\n", 67 | "\n", 68 | "`MPI_Get_processor_name` obtains the name of the processor on which the process is executing, and `MPI_Finalize` is used to clean up the MPI environment. 
No more MPI calls can be made after this call.\n", 69 | "\n", 70 | "## Running MPI with or without containers\n", 71 | "\n", 72 | "**We will run MPI directly on compute nodes without using containers.** The subsequent sections assume that at least 2 compute nodes with multiple GPUs in each node are available to the user. All our codes have been tested with CUDA-aware OpenMPI v4.1.1 with supporting libraries HPCX v2.8.1 (for UCX and HCOLL) and CUDA v11.3.0.0 on DGX-1 (8x Tesla V100) compute nodes. \n", 73 | "\n", 74 | "CUDA-awareness as a concept in MPI will be explained in subsequent labs.\n", 75 | "\n", 76 | "Usually, a cluster workload manager like Slurm or PBS is present and integrated with the MPI installation to launch multi-node jobs. We use the `mpirun` command to run MPI jobs, assuming that the user is logged into an interactive shell with multiple nodes allocated. The other common way is to use workload manager commands like `srun` (for Slurm) directly to run MPI jobs, as they are internally integrated with MPI. \n", 77 | "\n", 78 | "**Note:** We do outline the method to build and run containerized MPI using Singularity in tandem with the host MPI implementation in our supplemental notebook: [MPI in a containerized environment](./containers_and_mpi.ipynb). \n", 79 | "\n", 80 | "### Compilation\n", 81 | "\n", 82 | "The `mpicc` and `mpic++` (or `mpicxx`) compilers are used to compile and link programs with MPI. We can compile the Hello World program with the command:\n", 83 | "\n", 84 | "```bash\n", 85 | "mpicc -o hello_world hello_world.c\n", 86 | "```\n", 87 | "\n", 88 | "Ensure that MPI is installed (for example, if it is built from source) and available (for example, if loaded as a module) using the following command:" 89 | ] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "execution_count": null, 94 | "metadata": {}, 95 | "outputs": [], 96 | "source": [ 97 | "! mpirun --version" 98 | ] 99 | }, 100 | { 101 | "cell_type": "markdown", 102 | "metadata": {}, 103 | "source": [ 104 | "Now, let us compile the program:" 105 | ] 106 | }, 107 | { 108 | "cell_type": "code", 109 | "execution_count": null, 110 | "metadata": {}, 111 | "outputs": [], 112 | "source": [ 113 | "! cd ../../source_code/mpi && make clean && make hello_world" 114 | ] 115 | }, 116 | { 117 | "cell_type": "markdown", 118 | "metadata": {}, 119 | "source": [ 120 | "### Execution\n", 121 | "\n", 122 | "We run the program using the `mpirun` command as follows:\n", 123 | "\n", 124 | "```bash\n", 125 | "mpirun -np <num_procs> -npersocket <procs_per_socket> -hostfile <hostfile> ./hello_world\n", 126 | "```\n", 127 | "\n", 128 | "The `-np` option specifies the total number of processes spawned by the MPI runtime, and the `-npersocket` option specifies the number of processes to be spawned on each socket. The `-hostfile` option allows us to specify which hosts (compute nodes) to start MPI processes on. The file is a newline-separated list of hostnames which must be accessible to each other so that MPI processes can communicate.\n", 129 | "\n", 130 | "Note that DGX-1V is a dual-socket system and `<procs_per_socket>` should be less than or equal to the number of cores in a socket. Clearly, `<num_procs>` $\div$ (`procs_per_socket` $\times$ `<sockets_per_node>`) is the number of nodes used. There are several other options available to specify the `<hostfile>` that will be discussed in subsequent labs. 
As we are using an OpenMPI implementation in a workload manager-based environment, the `<hostfile>` will be provided by Slurm and we don't need to specify this option.\n", 131 | "\n", 132 | "There are numerous other configuration options that one can review using the `mpirun --help` command. You can check the number of sockets and cores per socket in your machine (the whole node) with the command `lscpu | grep -E 'Socket|Core'`. \n", 133 | "\n", 134 | "### Single Node\n", 135 | "\n", 136 | "Run the program binary on a single node:" 137 | ] 138 | }, 139 | { 140 | "cell_type": "code", 141 | "execution_count": null, 142 | "metadata": {}, 143 | "outputs": [], 144 | "source": [ 145 | "! cd ../../source_code/mpi && mpirun -np 2 -npersocket 1 ./hello_world" 146 | ] 147 | }, 148 | { 149 | "cell_type": "markdown", 150 | "metadata": {}, 151 | "source": [ 152 | "You may see some warnings. As long as the output is printed, you can ignore the warnings. In the output, you should see 2 unique ranks (0 and 1) and the node's name, like below:\n", 153 | "\n", 154 | "```bash\n", 155 | "Hello world from processor <node_name>, rank 0 out of 2 processors\n", 156 | "Hello world from processor <node_name>, rank 1 out of 2 processors\n", 157 | "```\n", 158 | "\n", 159 | "### Multiple Nodes\n", 160 | "\n", 161 | "Let us now run the Hello World program on 2 nodes with the following command:" 162 | ] 163 | }, 164 | { 165 | "cell_type": "code", 166 | "execution_count": null, 167 | "metadata": {}, 168 | "outputs": [], 169 | "source": [ 170 | "! cd ../../source_code/mpi && mpirun -np 4 -npersocket 1 ./hello_world" 171 | ] 172 | }, 173 | { 174 | "cell_type": "markdown", 175 | "metadata": {}, 176 | "source": [ 177 | "The output, excluding warnings, should be as follows (the order of output lines is not important):\n", 178 | "\n", 179 | "```bash\n", 180 | "Hello world from processor <node_name>, rank 1 out of 4 processors\n", 181 | "Hello world from processor <node_name>, rank 0 out of 4 processors\n", 182 | "Hello world from processor <node_name>, rank 3 out of 4 processors\n", 183 | "Hello world from processor <node_name>, rank 2 out of 4 processors\n", 184 | "```\n", 185 | "\n", 186 | "**Note:** Subsequent labs will assume the reader understands how to run a multi-node MPI job.\n", 187 | "\n", 188 | "Now, let us learn more MPI concepts and code a CUDA Memcpy and MPI-based Jacobi solver. 
Click below to move to the next lab:\n", 189 | "\n", 190 | "# [Next: CUDA Memcpy with MPI](../mpi/memcpy.ipynb)\n", 191 | "\n", 192 | "Here's a link to the home notebook through which all other notebooks are accessible:\n", 193 | "\n", 194 | "# [HOME](../../../start_here.ipynb)\n", 195 | "\n", 196 | "---\n", 197 | "## Links and Resources\n", 198 | "\n", 199 | "* [Programming: MPI Hello World Tutorial](https://mpitutorial.com/tutorials/mpi-hello-world/)\n", 200 | "* [Programming: OpenMPI Library](https://www.open-mpi.org/)\n", 201 | "* [Concepts: Singularity Containers with MPI](https://sylabs.io/guides/3.6/user-guide/mpi.html)\n", 202 | "* [Documentation: mpirun Command](https://www.open-mpi.org/doc/current/man1/mpirun.1.php)\n", 203 | "* [Code: Multi-GPU Programming Models](https://github.com/NVIDIA/multi-gpu-programming-models)\n", 204 | "* [Code: GPU Bootcamp](https://github.com/gpuhackathons-org/gpubootcamp/)\n", 205 | "\n", 206 | "Don't forget to check out additional [Open Hackathons Resources](https://www.openhackathons.org/s/technical-resources) and join our [OpenACC and Hackathons Slack Channel](https://www.openacc.org/community#slack) to share your experience and get more help from the community.\n", 207 | "\n", 208 | "## Licensing\n", 209 | "Copyright © 2022 OpenACC-Standard.org. This material is released by OpenACC-Standard.org, in collaboration with NVIDIA Corporation, under the Creative Commons Attribution 4.0 International (CC BY 4.0). These materials may include references to hardware and software developed by other entities; all applicable licensing and copyrights apply.\n" 210 | ] 211 | } 212 | ], 213 | "metadata": { 214 | "kernelspec": { 215 | "display_name": "Python 3", 216 | "language": "python", 217 | "name": "python3" 218 | }, 219 | "language_info": { 220 | "codemirror_mode": { 221 | "name": "ipython", 222 | "version": 3 223 | }, 224 | "file_extension": ".py", 225 | "mimetype": "text/x-python", 226 | "name": "python", 227 | "nbconvert_exporter": "python", 228 | "pygments_lexer": "ipython3", 229 | "version": "3.7.4" 230 | } 231 | }, 232 | "nbformat": 4, 233 | "nbformat_minor": 5 234 | } 235 | --------------------------------------------------------------------------------
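The hello world notebook above introduces ranks but stops just short of moving data between processes, which is the operation the later halo-exchange labs build on. The sketch below, which is not part of the lab's source tree, illustrates the simplest form of that pattern under stated assumptions: each rank passes one integer to its right neighbor and receives one from its left neighbor using `MPI_Sendrecv`. The file name `ring.c` and the integer payload are illustrative choices; only the MPI calls themselves are standard API.

```c
#include <mpi.h>
#include <stdio.h>

/* Minimal point-to-point sketch: every rank passes a token one step around a ring,
 * the simplest form of the neighbor communication used for halo exchanges later. */
int main(int argc, char** argv) {
    MPI_Init(&argc, &argv);

    int rank, size;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    const int right = (rank + 1) % size;         // neighbor we send to
    const int left  = (rank - 1 + size) % size;  // neighbor we receive from

    int send_token = rank;   // each rank contributes its own rank id (illustrative payload)
    int recv_token = -1;

    // Combined send+receive avoids the deadlock two blocking MPI_Send calls could cause.
    MPI_Sendrecv(&send_token, 1, MPI_INT, right, 0,
                 &recv_token, 1, MPI_INT, left, 0,
                 MPI_COMM_WORLD, MPI_STATUS_IGNORE);

    printf("Rank %d of %d received token %d from rank %d\n", rank, size, recv_token, left);

    MPI_Finalize();
    return 0;
}
```

Assuming it is saved as `ring.c`, it can be compiled with `mpicc -o ring ring.c` and launched the same way as the hello world binary (for example, `mpirun -np 4 ./ring`); each rank should then report the token received from its left neighbor.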