├── slurm_pmi_config
    ├── lib
    │   └── .gitkeep
    └── include
    │   ├── smd_ns.h
    │   └── slurm_errno.h
├── labs
    └── CFD
    │   ├── English
    │       ├── C
    │       │   ├── source_code
    │       │   │   ├── mpi
    │       │   │   │   ├── .gitkeep
    │       │   │   │   ├── hello_world.c
    │       │   │   │   ├── containerization
    │       │   │   │   │   ├── Makefile
    │       │   │   │   │   └── jacobi_kernels.cu
    │       │   │   │   ├── Makefile
    │       │   │   │   └── jacobi_kernels.cu
    │       │   │   ├── nccl
    │       │   │   │   ├── .gitkeep
    │       │   │   │   ├── Makefile
    │       │   │   │   └── jacobi_kernels.cu
    │       │   │   ├── nvshmem
    │       │   │   │   ├── .gitkeep
    │       │   │   │   ├── Makefile
    │       │   │   │   └── left_shift.cu
    │       │   │   ├── p2pBandwidthLatencyTest
    │       │   │   │   ├── Common
    │       │   │   │   │   ├── GL
    │       │   │   │   │   │   ├── freeglut.h
    │       │   │   │   │   │   └── freeglut_ext.h
    │       │   │   │   │   ├── rendercheck_d3d11.h
    │       │   │   │   │   ├── UtilNPP
    │       │   │   │   │   │   ├── SignalAllocatorsCPU.h
    │       │   │   │   │   │   ├── ImageAllocatorsCPU.h
    │       │   │   │   │   │   ├── Pixel.h
    │       │   │   │   │   │   ├── SignalsCPU.h
    │       │   │   │   │   │   ├── SignalsNPP.h
    │       │   │   │   │   │   ├── ImagesCPU.h
    │       │   │   │   │   │   ├── Image.h
    │       │   │   │   │   │   ├── Signal.h
    │       │   │   │   │   │   ├── ImageIO.h
    │       │   │   │   │   │   ├── ImagePacked.h
    │       │   │   │   │   │   ├── ImagesNPP.h
    │       │   │   │   │   │   └── Exceptions.h
    │       │   │   │   │   ├── helper_functions.h
    │       │   │   │   │   ├── helper_multiprocess.h
    │       │   │   │   │   ├── rendercheck_d3d11.cpp
    │       │   │   │   │   ├── helper_cusolver.h
    │       │   │   │   │   ├── exception.h
    │       │   │   │   │   ├── dynlink_d3d11.h
    │       │   │   │   │   └── nvrtc_helper.h
    │       │   │   │   └── Makefile
    │       │   │   ├── cuda
    │       │   │   │   └── Makefile
    │       │   │   └── single_gpu
    │       │   │   │   ├── Makefile
    │       │   │   │   └── jacobi.cu
    │       │   ├── jupyter_notebook
    │       │   │   ├── mpi
    │       │   │   │   ├── .gitkeep
    │       │   │   │   └── multi_node_intro.ipynb
    │       │   │   ├── nccl
    │       │   │   │   └── .gitkeep
    │       │   │   └── nvhsmem
    │       │   │   │   └── .gitkeep
    │       │   └── images
    │       │   │   ├── jacobi_algo.jpg
    │       │   │   ├── git_branching.jpg
    │       │   │   ├── gpudirect_p2p.png
    │       │   │   ├── gpudirect_rdma.png
    │       │   │   ├── halo_exchange.png
    │       │   │   ├── mpi_overview.png
    │       │   │   ├── nsys_overview.png
    │       │   │   ├── memcpy_gpu_util.png
    │       │   │   ├── memcpy_host_staging.png
    │       │   │   ├── memcpy_p2p_overview.png
    │       │   │   ├── memcpy_serialized.png
    │       │   │   ├── mpi_container_setup.png
    │       │   │   ├── mpi_memcpy_overview.png
    │       │   │   ├── nccl_architecture.png
    │       │   │   ├── nccl_dgx1_topology.png
    │       │   │   ├── nvidia_smi_p2p_gpu0.png
    │       │   │   ├── cuda_streams_overview.png
    │       │   │   ├── domain_decomposition.png
    │       │   │   ├── memcpy_util_selection.png
    │       │   │   ├── memcpyasync_parallel.png
    │       │   │   ├── mpi_host_staging_time.png
    │       │   │   ├── mpi_memcpy_large_time.png
    │       │   │   ├── mpi_memcpy_nvtx_stats.png
    │       │   │   ├── nccl_profiler_output.png
    │       │   │   ├── nvshmem_memory_model.png
    │       │   │   ├── open_terminal_session.png
    │       │   │   ├── p2p_2_gpu_memcpy_nsys.png
    │       │   │   ├── dgx1_8x_tesla_v100_topo.png
    │       │   │   ├── gpu_programming_process.png
    │       │   │   ├── intra_node_topology_map.png
    │       │   │   ├── jacobi_memcpy_p2p_report.png
    │       │   │   ├── jupyter_lab_navigation.png
    │       │   │   ├── nsys_cli_sample_output.png
    │       │   │   ├── nsys_single_gpu_analysis.png
    │       │   │   ├── nvidia_smi_topo_output.png
    │       │   │   ├── nvshmem_mpi_comparison.png
    │       │   │   ├── nvshmem_profiler_report.png
    │       │   │   ├── streams_util_selection.png
    │       │   │   ├── mpi_cuda_aware_gdr_latency.png
    │       │   │   ├── mpi_cuda_aware_p2p_metrics.png
    │       │   │   ├── nvshmem_left_shift_output.png
    │       │   │   ├── nvshmem_thread_level_comm.png
    │       │   │   ├── jacobi_memcpy_report_events.png
    │       │   │   ├── jacobi_memcpy_report_overview.png
    │       │   │   ├── mpi_memcpy_halo_exchange_latency.png
    │       │   │   ├── mpi_cuda_aware_halo_exchange_latency.png
    │       │   │   ├── mpi_host_staging_throughput_latency.png
    │       │   │   └── jacobi_memcpy_streams_events_p2p_report.png
    │       ├── Presentations
    │       │   └── README.md
    │       └── start_here.ipynb
    │   └── LICENSE
├── .gitignore
├── Singularity
├── CONTRIBUTING.md
└── README.md


/slurm_pmi_config/lib/.gitkeep:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/labs/CFD/English/C/source_code/mpi/.gitkeep:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/labs/CFD/English/C/jupyter_notebook/mpi/.gitkeep:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/labs/CFD/English/C/source_code/nccl/.gitkeep:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/labs/CFD/English/C/source_code/nvshmem/.gitkeep:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/labs/CFD/English/C/jupyter_notebook/nccl/.gitkeep:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/labs/CFD/English/C/jupyter_notebook/nvhsmem/.gitkeep:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/labs/CFD/English/C/images/jacobi_algo.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/nways_multi_gpu/main/labs/CFD/English/C/images/jacobi_algo.jpg


--------------------------------------------------------------------------------
/labs/CFD/English/C/images/git_branching.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/nways_multi_gpu/main/labs/CFD/English/C/images/git_branching.jpg


--------------------------------------------------------------------------------
/labs/CFD/English/C/images/gpudirect_p2p.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/nways_multi_gpu/main/labs/CFD/English/C/images/gpudirect_p2p.png


--------------------------------------------------------------------------------
/labs/CFD/English/C/images/gpudirect_rdma.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/nways_multi_gpu/main/labs/CFD/English/C/images/gpudirect_rdma.png


--------------------------------------------------------------------------------
/labs/CFD/English/C/images/halo_exchange.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/nways_multi_gpu/main/labs/CFD/English/C/images/halo_exchange.png


--------------------------------------------------------------------------------
/labs/CFD/English/C/images/mpi_overview.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/nways_multi_gpu/main/labs/CFD/English/C/images/mpi_overview.png


--------------------------------------------------------------------------------
/labs/CFD/English/C/images/nsys_overview.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/nways_multi_gpu/main/labs/CFD/English/C/images/nsys_overview.png


--------------------------------------------------------------------------------
/labs/CFD/English/C/images/memcpy_gpu_util.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/nways_multi_gpu/main/labs/CFD/English/C/images/memcpy_gpu_util.png


--------------------------------------------------------------------------------
/labs/CFD/English/C/images/memcpy_host_staging.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/nways_multi_gpu/main/labs/CFD/English/C/images/memcpy_host_staging.png


--------------------------------------------------------------------------------
/labs/CFD/English/C/images/memcpy_p2p_overview.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/nways_multi_gpu/main/labs/CFD/English/C/images/memcpy_p2p_overview.png


--------------------------------------------------------------------------------
/labs/CFD/English/C/images/memcpy_serialized.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/nways_multi_gpu/main/labs/CFD/English/C/images/memcpy_serialized.png


--------------------------------------------------------------------------------
/labs/CFD/English/C/images/mpi_container_setup.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/nways_multi_gpu/main/labs/CFD/English/C/images/mpi_container_setup.png


--------------------------------------------------------------------------------
/labs/CFD/English/C/images/mpi_memcpy_overview.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/nways_multi_gpu/main/labs/CFD/English/C/images/mpi_memcpy_overview.png


--------------------------------------------------------------------------------
/labs/CFD/English/C/images/nccl_architecture.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/nways_multi_gpu/main/labs/CFD/English/C/images/nccl_architecture.png


--------------------------------------------------------------------------------
/labs/CFD/English/C/images/nccl_dgx1_topology.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/nways_multi_gpu/main/labs/CFD/English/C/images/nccl_dgx1_topology.png


--------------------------------------------------------------------------------
/labs/CFD/English/C/images/nvidia_smi_p2p_gpu0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/nways_multi_gpu/main/labs/CFD/English/C/images/nvidia_smi_p2p_gpu0.png


--------------------------------------------------------------------------------
/labs/CFD/English/C/images/cuda_streams_overview.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/nways_multi_gpu/main/labs/CFD/English/C/images/cuda_streams_overview.png


--------------------------------------------------------------------------------
/labs/CFD/English/C/images/domain_decomposition.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/nways_multi_gpu/main/labs/CFD/English/C/images/domain_decomposition.png


--------------------------------------------------------------------------------
/labs/CFD/English/C/images/memcpy_util_selection.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/nways_multi_gpu/main/labs/CFD/English/C/images/memcpy_util_selection.png


--------------------------------------------------------------------------------
/labs/CFD/English/C/images/memcpyasync_parallel.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/nways_multi_gpu/main/labs/CFD/English/C/images/memcpyasync_parallel.png


--------------------------------------------------------------------------------
/labs/CFD/English/C/images/mpi_host_staging_time.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/nways_multi_gpu/main/labs/CFD/English/C/images/mpi_host_staging_time.png


--------------------------------------------------------------------------------
/labs/CFD/English/C/images/mpi_memcpy_large_time.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/nways_multi_gpu/main/labs/CFD/English/C/images/mpi_memcpy_large_time.png


--------------------------------------------------------------------------------
/labs/CFD/English/C/images/mpi_memcpy_nvtx_stats.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/nways_multi_gpu/main/labs/CFD/English/C/images/mpi_memcpy_nvtx_stats.png


--------------------------------------------------------------------------------
/labs/CFD/English/C/images/nccl_profiler_output.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/nways_multi_gpu/main/labs/CFD/English/C/images/nccl_profiler_output.png


--------------------------------------------------------------------------------
/labs/CFD/English/C/images/nvshmem_memory_model.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/nways_multi_gpu/main/labs/CFD/English/C/images/nvshmem_memory_model.png


--------------------------------------------------------------------------------
/labs/CFD/English/C/images/open_terminal_session.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/nways_multi_gpu/main/labs/CFD/English/C/images/open_terminal_session.png


--------------------------------------------------------------------------------
/labs/CFD/English/C/images/p2p_2_gpu_memcpy_nsys.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/nways_multi_gpu/main/labs/CFD/English/C/images/p2p_2_gpu_memcpy_nsys.png


--------------------------------------------------------------------------------
/labs/CFD/English/C/images/dgx1_8x_tesla_v100_topo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/nways_multi_gpu/main/labs/CFD/English/C/images/dgx1_8x_tesla_v100_topo.png


--------------------------------------------------------------------------------
/labs/CFD/English/C/images/gpu_programming_process.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/nways_multi_gpu/main/labs/CFD/English/C/images/gpu_programming_process.png


--------------------------------------------------------------------------------
/labs/CFD/English/C/images/intra_node_topology_map.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/nways_multi_gpu/main/labs/CFD/English/C/images/intra_node_topology_map.png


--------------------------------------------------------------------------------
/labs/CFD/English/C/images/jacobi_memcpy_p2p_report.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/nways_multi_gpu/main/labs/CFD/English/C/images/jacobi_memcpy_p2p_report.png


--------------------------------------------------------------------------------
/labs/CFD/English/C/images/jupyter_lab_navigation.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/nways_multi_gpu/main/labs/CFD/English/C/images/jupyter_lab_navigation.png


--------------------------------------------------------------------------------
/labs/CFD/English/C/images/nsys_cli_sample_output.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/nways_multi_gpu/main/labs/CFD/English/C/images/nsys_cli_sample_output.png


--------------------------------------------------------------------------------
/labs/CFD/English/C/images/nsys_single_gpu_analysis.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/nways_multi_gpu/main/labs/CFD/English/C/images/nsys_single_gpu_analysis.png


--------------------------------------------------------------------------------
/labs/CFD/English/C/images/nvidia_smi_topo_output.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/nways_multi_gpu/main/labs/CFD/English/C/images/nvidia_smi_topo_output.png


--------------------------------------------------------------------------------
/labs/CFD/English/C/images/nvshmem_mpi_comparison.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/nways_multi_gpu/main/labs/CFD/English/C/images/nvshmem_mpi_comparison.png


--------------------------------------------------------------------------------
/labs/CFD/English/C/images/nvshmem_profiler_report.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/nways_multi_gpu/main/labs/CFD/English/C/images/nvshmem_profiler_report.png


--------------------------------------------------------------------------------
/labs/CFD/English/C/images/streams_util_selection.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/nways_multi_gpu/main/labs/CFD/English/C/images/streams_util_selection.png


--------------------------------------------------------------------------------
/labs/CFD/English/C/images/mpi_cuda_aware_gdr_latency.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/nways_multi_gpu/main/labs/CFD/English/C/images/mpi_cuda_aware_gdr_latency.png


--------------------------------------------------------------------------------
/labs/CFD/English/C/images/mpi_cuda_aware_p2p_metrics.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/nways_multi_gpu/main/labs/CFD/English/C/images/mpi_cuda_aware_p2p_metrics.png


--------------------------------------------------------------------------------
/labs/CFD/English/C/images/nvshmem_left_shift_output.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/nways_multi_gpu/main/labs/CFD/English/C/images/nvshmem_left_shift_output.png


--------------------------------------------------------------------------------
/labs/CFD/English/C/images/nvshmem_thread_level_comm.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/nways_multi_gpu/main/labs/CFD/English/C/images/nvshmem_thread_level_comm.png


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | .ipynb_checkpoints
 2 | */.ipynb_checkpoints/*
 3 | alk.traj.dcd
 4 | *.simg
 5 | *.so*
 6 | *.a
 7 | *.la
 8 | mgpm
 9 | *.o
10 | *.out
11 | */.ses/*
12 | */.log/*
13 | 
14 | 


--------------------------------------------------------------------------------
/labs/CFD/English/C/images/jacobi_memcpy_report_events.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/nways_multi_gpu/main/labs/CFD/English/C/images/jacobi_memcpy_report_events.png


--------------------------------------------------------------------------------
/labs/CFD/English/C/images/jacobi_memcpy_report_overview.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/nways_multi_gpu/main/labs/CFD/English/C/images/jacobi_memcpy_report_overview.png


--------------------------------------------------------------------------------
/labs/CFD/English/C/images/mpi_memcpy_halo_exchange_latency.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/nways_multi_gpu/main/labs/CFD/English/C/images/mpi_memcpy_halo_exchange_latency.png


--------------------------------------------------------------------------------
/labs/CFD/English/C/images/mpi_cuda_aware_halo_exchange_latency.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/nways_multi_gpu/main/labs/CFD/English/C/images/mpi_cuda_aware_halo_exchange_latency.png


--------------------------------------------------------------------------------
/labs/CFD/English/C/images/mpi_host_staging_throughput_latency.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/nways_multi_gpu/main/labs/CFD/English/C/images/mpi_host_staging_throughput_latency.png


--------------------------------------------------------------------------------
/labs/CFD/English/C/images/jacobi_memcpy_streams_events_p2p_report.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/nways_multi_gpu/main/labs/CFD/English/C/images/jacobi_memcpy_streams_events_p2p_report.png


--------------------------------------------------------------------------------
/labs/CFD/English/Presentations/README.md:
--------------------------------------------------------------------------------
1 | For Partners who are interested in delivering the critical hands-on skills needed to advance science in form of Bootcamp can reach out to us at [GPU Hackathon Partner](https://gpuhackathons.org/partners) website. In addition to current bootcamp material the Partners will be provided with the following:
2 | 
3 | - Presentation: All the Bootcamps are accompanied with training material presentations which can be used during the Bootcamp session.
4 | - Mini challenge : To test the knowledge gained during this Bootcamp a mini application challenge is provided along with sample Solution.
5 | - Additional Support: On case to case basis the Partners can also be trained on how to effectively deliver the Bootcamp with maximal impact.


--------------------------------------------------------------------------------
/labs/CFD/English/C/source_code/p2pBandwidthLatencyTest/Common/GL/freeglut.h:
--------------------------------------------------------------------------------
 1 | #ifndef  __FREEGLUT_H__
 2 | #define  __FREEGLUT_H__
 3 | 
 4 | /*
 5 |  * freeglut.h
 6 |  *
 7 |  * The freeglut library include file
 8 |  *
 9 |  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
10 |  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
11 |  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
12 |  * PAWEL W. OLSZTA BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
13 |  * IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
14 |  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
15 |  */
16 | 
17 | #include "freeglut_std.h"
18 | #include "freeglut_ext.h"
19 | 
20 | /*** END OF FILE ***/
21 | 
22 | #endif /* __FREEGLUT_H__ */
23 | 


--------------------------------------------------------------------------------
/labs/CFD/English/C/source_code/mpi/hello_world.c:
--------------------------------------------------------------------------------
 1 | #include <mpi.h>
 2 | #include <stdio.h>
 3 | 
 4 | int main(int argc, char** argv) {
 5 |     // Initialize the MPI environment
 6 |     MPI_Init(NULL, NULL);
 7 | 
 8 |     // Get the number of processes
 9 |     int size;
10 |     MPI_Comm_size(MPI_COMM_WORLD, &size);
11 | 
12 |     // Get the rank of the process
13 |     int rank;
14 |     MPI_Comm_rank(MPI_COMM_WORLD, &rank);
15 | 
16 |     // Get the name of the processor
17 |     char processor_name[MPI_MAX_PROCESSOR_NAME];
18 |     int name_len;
19 |     MPI_Get_processor_name(processor_name, &name_len);
20 | 
21 |     // Print a hello world message
22 |     printf("Hello world from processor %s, rank %d out of %d processors\n",
23 |            processor_name, rank, size);
24 | 
25 |     // Finalize the MPI environment.
26 |     MPI_Finalize();
27 | }
28 | 


--------------------------------------------------------------------------------
/labs/CFD/English/C/source_code/nccl/Makefile:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
 2 | NVCC=nvcc
 3 | MPICXX=mpicxx
 4 | MPIRUN ?= mpirun
 5 | #CUDA_HOME ?= /usr/local/cuda
 6 | #NCCL_HOME ?= /usr/nccl/
 7 | GENCODE_SM70    := -gencode arch=compute_70,code=sm_70
 8 | GENCODE_SM80    := -gencode arch=compute_80,code=sm_80
 9 | GENCODE_FLAGS	:= $(GENCODE_SM70) $(GENCODE_SM80)
10 | 
11 | NVCC_FLAGS += -lineinfo $(GENCODE_FLAGS) -std=c++14
12 | MPICXX_FLAGS = -DUSE_NVTX -I$(CUDA_HOME)/include -I$(NCCL_HOME)/include -fopenmp -std=c++14
13 | LD_FLAGS = -L$(CUDA_HOME)/lib64 -lcudart -lnvToolsExt -lnccl
14 | 
15 | jacobi_nccl: Makefile jacobi_nccl.cpp jacobi_kernels.o
16 | 	$(MPICXX) $(MPICXX_FLAGS) jacobi_nccl.cpp jacobi_kernels.o $(LD_FLAGS) -o jacobi_nccl
17 | 
18 | jacobi_kernels.o: Makefile jacobi_kernels.cu
19 | 	$(NVCC) $(NVCC_FLAGS) jacobi_kernels.cu -c
20 | 
21 | .PHONY.: clean
22 | clean:
23 | 	rm -rf jacobi_nccl jacobi_kernels.o *.qdrep *.sqlite
24 | 
25 | 


--------------------------------------------------------------------------------
/labs/CFD/English/C/source_code/mpi/containerization/Makefile:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
 2 | NVCC=nvcc
 3 | MPICXX=mpicxx
 4 | #CUDA_HOME=/opt/nvidia/hpc_sdk/Linux_x86_64/21.5/cuda/
 5 | GENCODE_SM70    := -gencode arch=compute_70,code=sm_70
 6 | GENCODE_SM80    := -gencode arch=compute_80,code=sm_80 
 7 | GENCODE_FLAGS	:= $(GENCODE_SM70) $(GENCODE_SM80)
 8 | 
 9 | NVCC_FLAGS += -lineinfo $(GENCODE_FLAGS) -std=c++14
10 | MPICXX_FLAGS = -g -I$(CUDA_HOME)/include  -fopenmp -std=c++14
11 | LD_FLAGS = -L$(CUDA_HOME)/lib64 -lcudart -lnvToolsExt
12 | 
13 | jacobi_cuda_aware_mpi: Makefile jacobi_cuda_aware_mpi.cpp jacobi_kernels.o
14 | 	$(MPICXX) $(MPICXX_FLAGS) jacobi_cuda_aware_mpi.cpp jacobi_kernels.o $(LD_FLAGS) -o jacobi_cuda_aware_mpi
15 | 
16 | jacobi_kernels.o: Makefile jacobi_kernels.cu
17 | 	$(NVCC) $(NVCC_FLAGS) jacobi_kernels.cu -c
18 | 
19 | all: jacobi_cuda_aware_mpi
20 | 
21 | .PHONY.: clean
22 | clean:
23 | 	rm -rf jacobi_cuda_aware_mpi *.o *.qdrep *.sqlite
24 | 
25 | 


--------------------------------------------------------------------------------
/labs/CFD/English/C/source_code/cuda/Makefile:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
 2 | NVCC=nvcc
 3 | CUDA_HOME=/opt/nvidia/hpc_sdk/Linux_x86_64/21.5/cuda/
 4 | GENCODE_SM70    := -gencode arch=compute_70,code=sm_70
 5 | GENCODE_SM80    := -gencode arch=compute_80,code=sm_80 -gencode arch=compute_80,code=compute_80
 6 | GENCODE_FLAGS	:= $(GENCODE_SM70) $(GENCODE_SM80)
 7 | NVCC_FLAGS += -Xcompiler -fopenmp -lineinfo -DUSE_NVTX -lnvToolsExt $(GENCODE_FLAGS) -std=c++14
 8 | 
 9 | jacobi_memcpy: jacobi_memcpy.cu
10 | 	$(NVCC) $(NVCC_FLAGS) jacobi_memcpy.cu -o jacobi_memcpy
11 | 
12 | jacobi_streams: jacobi_streams.cu
13 | 	$(NVCC) $(NVCC_FLAGS) jacobi_streams.cu -o jacobi_streams
14 | 
15 | jacobi_streams_events: jacobi_streams_events.cu
16 | 	$(NVCC) $(NVCC_FLAGS) jacobi_streams_events.cu -o jacobi_streams_events
17 | 
18 | all: jacobi_memcpy jacobi_streams jacobi_streams_events
19 | 
20 | .PHONY: clean
21 | clean:
22 | 	rm -f jacobi_memcpy jacobi_streams jacobi_streams_events *.qdrep *.sqlite
23 | 
24 | 


--------------------------------------------------------------------------------
/labs/CFD/English/C/source_code/nvshmem/Makefile:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
 2 | NP ?= 1
 3 | NVCC=nvcc
 4 | MPIRUN ?= mpirun
 5 | CUDA_HOME ?= /usr/local/cuda
 6 | ifndef NVSHMEM_HOME
 7 | $(error NVSHMEM_HOME is not set)
 8 | endif
 9 | ifndef MPI_HOME
10 | $(error MPI_HOME is not set)
11 | endif
12 | GENCODE_SM70    := -gencode arch=compute_70,code=sm_70
13 | GENCODE_SM80    := -gencode arch=compute_80,code=sm_80 -gencode arch=compute_80,code=compute_80
14 | GENCODE_FLAGS	:= $(GENCODE_SM70) $(GENCODE_SM80)
15 | 
16 | NVCC_FLAGS += -dc -Xcompiler -fopenmp -lineinfo -lnvToolsExt $(GENCODE_FLAGS) -std=c++14 -I$(NVSHMEM_HOME)/include -I$(MPI_HOME)/include
17 | NVCC_LDFLAGS = -ccbin=mpic++ -L$(NVSHMEM_HOME)/lib -lnvshmem -L$(MPI_HOME)/lib -lmpi -L$(CUDA_HOME)/lib64 -lcuda -lcudart -lnvToolsExt
18 | 
19 | left_shift: Makefile left_shift.cu
20 | 	$(NVCC) $(NVCC_FLAGS) left_shift.cu -c -o left_shift.o
21 | 	$(NVCC) $(GENCODE_FLAGS) left_shift.o -o left_shift $(NVCC_LDFLAGS)
22 | 
23 | jacobi_nvshmem: Makefile jacobi_nvshmem.cu
24 | 	$(NVCC) $(NVCC_FLAGS) jacobi_nvshmem.cu -c -o jacobi_nvshmem.o
25 | 	$(NVCC) $(GENCODE_FLAGS) jacobi_nvshmem.o -o jacobi_nvshmem $(NVCC_LDFLAGS)
26 | 
27 | .PHONY.: clean
28 | clean:
29 | 	rm -rf jacobi_nvshmem left_shift *.o *.qdrep *.sqlite
30 | 


--------------------------------------------------------------------------------
/labs/CFD/English/C/source_code/single_gpu/Makefile:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
 2 | NVCC=nvcc
 3 | #CUDA_HOME=hpc_sdk_path/Linux_x86_64/21.3/cuda/11.2/
 4 | GENCODE_SM30	:= -gencode arch=compute_30,code=sm_30
 5 | GENCODE_SM35	:= -gencode arch=compute_35,code=sm_35
 6 | GENCODE_SM37	:= -gencode arch=compute_37,code=sm_37
 7 | GENCODE_SM50	:= -gencode arch=compute_50,code=sm_50
 8 | GENCODE_SM52	:= -gencode arch=compute_52,code=sm_52
 9 | GENCODE_SM60    := -gencode arch=compute_60,code=sm_60
10 | GENCODE_SM70    := -gencode arch=compute_70,code=sm_70
11 | GENCODE_SM80    := -gencode arch=compute_80,code=sm_80 -gencode arch=compute_80,code=compute_80
12 | GENCODE_FLAGS	:= $(GENCODE_SM70) $(GENCODE_SM80)
13 | ifdef DISABLE_CUB
14 |         NVCC_FLAGS = -Xptxas --optimize-float-atomics
15 | else
16 |         NVCC_FLAGS = -DHAVE_CUB
17 | endif
18 | NVCC_FLAGS += -Xcompiler -fopenmp -lineinfo -DUSE_NVTX -lnvToolsExt $(GENCODE_FLAGS) -std=c++14
19 | jacobi: Makefile jacobi.cu
20 | 	$(NVCC) $(NVCC_FLAGS) jacobi.cu -o jacobi
21 | 
22 | .PHONY.: clean
23 | clean:
24 | 	rm -f jacobi jacobi.qdrep
25 | 
26 | sanitize: jacobi
27 | 	compute-sanitizer ./jacobi -niter 10
28 | 
29 | run: jacobi
30 | 	./jacobi
31 | 
32 | profile: jacobi
33 | 	nsys profile --trace=cuda,nvtx -o jacobi ./jacobi -niter 10
34 | 


--------------------------------------------------------------------------------
/labs/CFD/English/C/source_code/mpi/Makefile:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
 2 | NVCC=nvcc
 3 | MPICXX=mpicxx
 4 | #CUDA_HOME=/opt/nvidia/hpc_sdk/Linux_x86_64/21.5/cuda/
 5 | GENCODE_SM70    := -gencode arch=compute_70,code=sm_70
 6 | GENCODE_SM80    := -gencode arch=compute_80,code=sm_80 
 7 | GENCODE_FLAGS	:= $(GENCODE_SM70) $(GENCODE_SM80)
 8 | 
 9 | NVCC_FLAGS += -lineinfo $(GENCODE_FLAGS) -std=c++14
10 | MPICXX_FLAGS = -g -I$(CUDA_HOME)/include  -fopenmp -std=c++14
11 | LD_FLAGS = -L$(CUDA_HOME)/lib64 -lcudart -lnvToolsExt
12 | 
13 | hello_world: Makefile hello_world.c
14 | 	$(MPICXX) $(MPICXX_FLAGS) hello_world.c $(LD_FLAGS) -o hello_world
15 | 
16 | jacobi_memcpy_mpi: Makefile jacobi_memcpy_mpi.cpp jacobi_kernels.o
17 | 	$(MPICXX) $(MPICXX_FLAGS) jacobi_memcpy_mpi.cpp jacobi_kernels.o $(LD_FLAGS) -o jacobi_memcpy_mpi
18 | 
19 | jacobi_cuda_aware_mpi: Makefile jacobi_cuda_aware_mpi.cpp jacobi_kernels.o
20 | 	$(MPICXX) $(MPICXX_FLAGS) jacobi_cuda_aware_mpi.cpp jacobi_kernels.o $(LD_FLAGS) -o jacobi_cuda_aware_mpi
21 | 
22 | jacobi_kernels.o: Makefile jacobi_kernels.cu
23 | 	$(NVCC) $(NVCC_FLAGS) jacobi_kernels.cu -c
24 | 
25 | all: hello_world jacobi_memcpy_mpi jacobi_cuda_aware_mpi
26 | 
27 | .PHONY.: clean
28 | clean:
29 | 	rm -rf hello_world jacobi_memcpy_mpi jacobi_cuda_aware_mpi *.o *.qdrep *.sqlite
30 | 
31 | 


--------------------------------------------------------------------------------
/labs/CFD/LICENSE:
--------------------------------------------------------------------------------
 1 | Copyright (c) 2018, National Center for Computational Sciences, Oak Ridge National Laboratory
 2 | All rights reserved.
 3 | 
 4 | Redistribution and use in source and binary forms, with or without
 5 | modification, are permitted provided that the following conditions are met:
 6 | 
 7 | * Redistributions of source code must retain the above copyright notice, this
 8 |   list of conditions and the following disclaimer.
 9 | 
10 | * Redistributions in binary form must reproduce the above copyright notice,
11 |   this list of conditions and the following disclaimer in the documentation
12 |   and/or other materials provided with the distribution.
13 | 
14 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
15 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
17 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
18 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
20 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
21 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
22 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
23 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.


--------------------------------------------------------------------------------
/labs/CFD/English/C/source_code/nvshmem/left_shift.cu:
--------------------------------------------------------------------------------
 1 | #include <stdio.h>
 2 | #include "mpi.h"
 3 | #include "nvshmem.h"
 4 | #include "nvshmemx.h"
 5 | 
 6 | #define CUDA_CHECK(stmt)                                  \
 7 | do {                                                      \
 8 |     cudaError_t result = (stmt);                          \
 9 |     if (cudaSuccess != result) {                          \
10 |         fprintf(stderr, "[%s:%d] CUDA failed with %s \n", \
11 |          __FILE__, __LINE__, cudaGetErrorString(result)); \
12 |         exit(-1);                                         \
13 |     }                                                     \
14 | } while (0)
15 | 
16 | __global__ void simple_shift(int *destination) {
17 |     int mype = nvshmem_my_pe();
18 |     int npes = nvshmem_n_pes();
19 |     int peer = (mype + 1) % npes;
20 | 
21 |     nvshmem_int_p(destination, mype, peer);
22 | }
23 | 
24 | int main (int argc, char *argv[]) {
25 |     int mype_node, msg;
26 |     cudaStream_t stream;
27 |     int rank, nranks;
28 |     MPI_Comm mpi_comm = MPI_COMM_WORLD;
29 |     nvshmemx_init_attr_t attr;
30 | 
31 |     MPI_Init(&argc, &argv);
32 |     MPI_Comm_rank(MPI_COMM_WORLD, &rank);
33 |     MPI_Comm_size(MPI_COMM_WORLD, &nranks);
34 | 
35 |     attr.mpi_comm = &mpi_comm;
36 |     nvshmemx_init_attr(NVSHMEMX_INIT_WITH_MPI_COMM, &attr);
37 |     mype_node = nvshmem_team_my_pe(NVSHMEMX_TEAM_NODE);
38 | 
39 |     CUDA_CHECK(cudaSetDevice(mype_node));
40 |     CUDA_CHECK(cudaStreamCreate(&stream));
41 |     int *destination = (int *) nvshmem_malloc (sizeof(int));
42 | 
43 |     simple_shift<<<1, 1, 0, stream>>>(destination);
44 |     nvshmemx_barrier_all_on_stream(stream);
45 |     CUDA_CHECK(cudaMemcpyAsync(&msg, destination, sizeof(int),
46 |                 cudaMemcpyDeviceToHost, stream));
47 | 
48 |     CUDA_CHECK(cudaStreamSynchronize(stream));
49 |     printf("%d: received message %d\n", nvshmem_my_pe(), msg);
50 | 
51 |     nvshmem_free(destination);
52 |     nvshmem_finalize();
53 |     MPI_Finalize();
54 |     return 0;
55 | }


--------------------------------------------------------------------------------
/labs/CFD/English/C/source_code/p2pBandwidthLatencyTest/Common/rendercheck_d3d11.h:
--------------------------------------------------------------------------------
 1 | /* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
 2 |  *
 3 |  * Redistribution and use in source and binary forms, with or without
 4 |  * modification, are permitted provided that the following conditions
 5 |  * are met:
 6 |  *  * Redistributions of source code must retain the above copyright
 7 |  *    notice, this list of conditions and the following disclaimer.
 8 |  *  * Redistributions in binary form must reproduce the above copyright
 9 |  *    notice, this list of conditions and the following disclaimer in the
10 |  *    documentation and/or other materials provided with the distribution.
11 |  *  * Neither the name of NVIDIA CORPORATION nor the names of its
12 |  *    contributors may be used to endorse or promote products derived
13 |  *    from this software without specific prior written permission.
14 |  *
15 |  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
16 |  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 |  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
18 |  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
19 |  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20 |  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21 |  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22 |  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
23 |  * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 |  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25 |  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 |  */
27 | 
28 | #pragma once
29 | 
30 | #ifndef _RENDERCHECK_D3D11_H_
31 | #define _RENDERCHECK_D3D11_H_
32 | 
33 | #include <stdio.h>
34 | #include <stdlib.h>
35 | #include <string.h>
36 | #include <assert.h>
37 | #include <d3d11.h>
38 | 
39 | class CheckRenderD3D11
40 | {
41 |     public:
42 | 
43 |         CheckRenderD3D11() {}
44 | 
45 |         static HRESULT ActiveRenderTargetToPPM(ID3D11Device  *pDevice, const char *zFileName);
46 |         static HRESULT ResourceToPPM(ID3D11Device *pDevice, ID3D11Resource *pResource, const char *zFileName);
47 | 
48 |         static bool PPMvsPPM(const char *src_file, const char *ref_file, const char *exec_path,
49 |                              const float epsilon, const float threshold = 0.0f);
50 | };
51 | 
52 | #endif


--------------------------------------------------------------------------------
/labs/CFD/English/C/source_code/p2pBandwidthLatencyTest/Common/UtilNPP/SignalAllocatorsCPU.h:
--------------------------------------------------------------------------------
 1 | /* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
 2 |  *
 3 |  * Redistribution and use in source and binary forms, with or without
 4 |  * modification, are permitted provided that the following conditions
 5 |  * are met:
 6 |  *  * Redistributions of source code must retain the above copyright
 7 |  *    notice, this list of conditions and the following disclaimer.
 8 |  *  * Redistributions in binary form must reproduce the above copyright
 9 |  *    notice, this list of conditions and the following disclaimer in the
10 |  *    documentation and/or other materials provided with the distribution.
11 |  *  * Neither the name of NVIDIA CORPORATION nor the names of its
12 |  *    contributors may be used to endorse or promote products derived
13 |  *    from this software without specific prior written permission.
14 |  *
15 |  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
16 |  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 |  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
18 |  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
19 |  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20 |  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21 |  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22 |  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
23 |  * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 |  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25 |  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 |  */
27 | 
28 | 
29 | #ifndef NV_UTIL_NPP_SIGNAL_ALLOCATORS_CPU_H
30 | #define NV_UTIL_NPP_SIGNAL_ALLOCATORS_CPU_H
31 | 
32 | #include "Exceptions.h"
33 | 
34 | namespace npp
35 | {
36 | 
37 |     template <typename D>
38 |     class SignalAllocatorCPU
39 |     {
40 |         public:
41 |             static
42 |             D *
43 |             Malloc1D(unsigned int nSize)
44 |             {
45 |                 return new D[nSize];;
46 |             };
47 | 
48 |             static
49 |             void
50 |             Free1D(D *pPixels)
51 |             {
52 |                 delete[] pPixels;
53 |             };
54 | 
55 |             static
56 |             void
57 |             Copy1D(D *pDst, const D *pSrc, size_t nSize)
58 |             {
59 |                 memcpy(pDst, pSrc, nSize * sizeof(D));
60 |             };
61 | 
62 |     };
63 | 
64 | } // npp namespace
65 | 
66 | #endif // NV_UTIL_NPP_SIGNAL_ALLOCATORS_CPU_H
67 | 


--------------------------------------------------------------------------------
/labs/CFD/English/C/source_code/p2pBandwidthLatencyTest/Common/helper_functions.h:
--------------------------------------------------------------------------------
 1 | /* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
 2 |  *
 3 |  * Redistribution and use in source and binary forms, with or without
 4 |  * modification, are permitted provided that the following conditions
 5 |  * are met:
 6 |  *  * Redistributions of source code must retain the above copyright
 7 |  *    notice, this list of conditions and the following disclaimer.
 8 |  *  * Redistributions in binary form must reproduce the above copyright
 9 |  *    notice, this list of conditions and the following disclaimer in the
10 |  *    documentation and/or other materials provided with the distribution.
11 |  *  * Neither the name of NVIDIA CORPORATION nor the names of its
12 |  *    contributors may be used to endorse or promote products derived
13 |  *    from this software without specific prior written permission.
14 |  *
15 |  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
16 |  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 |  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
18 |  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
19 |  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20 |  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21 |  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22 |  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
23 |  * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 |  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25 |  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 |  */
27 | 
28 | // These are helper functions for the SDK samples (string parsing,
29 | // timers, image helpers, etc)
30 | #ifndef COMMON_HELPER_FUNCTIONS_H_
31 | #define COMMON_HELPER_FUNCTIONS_H_
32 | 
33 | #ifdef WIN32
34 | #pragma warning(disable : 4996)
35 | #endif
36 | 
37 | // includes, project
38 | #include <assert.h>
39 | #include <exception.h>
40 | #include <math.h>
41 | #include <stdio.h>
42 | #include <stdlib.h>
43 | 
44 | #include <algorithm>
45 | #include <fstream>
46 | #include <iostream>
47 | #include <string>
48 | #include <vector>
49 | 
50 | // includes, timer, string parsing, image helpers
51 | #include <helper_image.h>  // helper functions for image compare, dump, data comparisons
52 | #include <helper_string.h>  // helper functions for string parsing
53 | #include <helper_timer.h>   // helper functions for timers
54 | 
55 | #ifndef EXIT_WAIVED
56 | #define EXIT_WAIVED 2
57 | #endif
58 | 
59 | #endif  // COMMON_HELPER_FUNCTIONS_H_
60 | 


--------------------------------------------------------------------------------
/Singularity:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2021 NVIDIA Corporation.  All rights reserved. 
 2 | 
 3 | Bootstrap: docker
 4 | FROM: nvcr.io/nvidia/nvhpc:21.5-devel-cuda_multi-ubuntu20.04
 5 | 
 6 | %environment
 7 |     export XDG_RUNTIME_DIR=
 8 |     export PATH="/opt/openmpi/ompi/bin/:/usr/local/bin:/opt/anaconda3/bin:/usr/bin:/opt/nvidia/nsight-systems/2020.5.1/bin:/opt/nvidia/nsight-compute/2020.2.1:$PATH"
 9 |     export LD_LIBRARY_PATH="/opt/openmpi/ompi/lib:/pmi_utils/lib/:/usr/local/lib:/opt/nvidia/hpc_sdk/Linux_x86_64/21.5/cuda/lib64/:$LD_LIBRARY_PATH"
10 | 
11 | %post
12 |     build_tmp=$(mktemp -d) && cd ${build_tmp}
13 | 
14 |     apt-get -y update
15 |     apt-get -y dist-upgrade 
16 |     DEBIAN_FRONTEND=noninteractive apt-get -yq install --no-install-recommends \
17 | 	    m4 vim-nox emacs-nox nano zip\
18 |  	    python3-pip python3-setuptools git-core inotify-tools \
19 | 	    curl git-lfs \
20 | 	    build-essential libtbb-dev
21 |     rm -rf /var/lib/apt/cache/* 
22 | 
23 |     pip3 install --upgrade pip
24 |     pip3 install --no-cache-dir jupyter
25 |     pip3 install --no-cache-dir jupyterlab
26 |     pip3 install gdown
27 | 
28 |     apt-get install --no-install-recommends -y build-essential 
29 | 
30 | # NVIDIA nsight-systems-2020.5.1 ,nsight-compute-2
31 |     apt-get update -y   
32 |     DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends apt-transport-https ca-certificates gnupg wget
33 |     apt-key adv --keyserver hkp://keyserver.ubuntu.com:80 --recv-keys F60F4B3D7FA2AF80
34 |     echo "deb https://developer.download.nvidia.com/devtools/repos/ubuntu2004/amd64/ /" >> /etc/apt/sources.list.d/nsight.list 
35 |     apt-get update -y 
36 |     DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends nsight-systems-2020.5.1 nsight-compute-2020.2.1 
37 |     apt-get install --no-install-recommends -y build-essential
38 | 
39 |     wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh 
40 |     bash Miniconda3-latest-Linux-x86_64.sh -b -p /opt/anaconda3 
41 |     rm Miniconda3-latest-Linux-x86_64.sh 
42 | 
43 | # Install CUDA-aware OpenMPI with UCX and PMI
44 |     mkdir -p /opt/openmpi && cd /opt/openmpi
45 |     wget https://download.open-mpi.org/release/open-mpi/v4.1/openmpi-4.1.1.tar.gz
46 |     tar -xvzf openmpi-4.1.1.tar.gz
47 |     mkdir -p /opt/openmpi/ompi/
48 |     cd /opt/openmpi/openmpi-4.1.1/
49 |     ./configure --prefix=/opt/openmpi/ompi/ --with-libevent=internal --with-xpmem --with-cuda=/opt/nvidia/hpc_sdk/Linux_x86_64/21.5/cuda/ --with-slurm --with-pmix=internal --with-pmi=/pmi_utils/ --enable-mpi1-compatibility --with-verbs --with-hcoll=/opt/nvidia/hpc_sdk/Linux_x86_64/21.5/comm_libs/hpcx/hpcx-2.8.1/hcoll/ --with-ucx=/opt/nvidia/hpc_sdk/Linux_x86_64/21.5/comm_libs/hpcx/hpcx-2.8.1/ucx/
50 |     export LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/pmi_utils/lib/"
51 |     make all install
52 |     
53 |     cd /
54 |     rm -rf ${build_tmp}
55 | 
56 | %files
57 |     labs/ /labs
58 |     slurm_pmi_config/ /pmi_utils
59 | 
60 | %runscript
61 |     "$@"
62 | 
63 | %labels
64 |     AUTHOR Anish-Saxena
65 | 


--------------------------------------------------------------------------------
/labs/CFD/English/C/source_code/p2pBandwidthLatencyTest/Makefile:
--------------------------------------------------------------------------------
 1 | ################################################################################
 2 | # Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
 3 | #
 4 | # Redistribution and use in source and binary forms, with or without
 5 | # modification, are permitted provided that the following conditions
 6 | # are met:
 7 | #  * Redistributions of source code must retain the above copyright
 8 | #    notice, this list of conditions and the following disclaimer.
 9 | #  * Redistributions in binary form must reproduce the above copyright
10 | #    notice, this list of conditions and the following disclaimer in the
11 | #    documentation and/or other materials provided with the distribution.
12 | #  * Neither the name of NVIDIA CORPORATION nor the names of its
13 | #    contributors may be used to endorse or promote products derived
14 | #    from this software without specific prior written permission.
15 | #
16 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
17 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
19 | # PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
20 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
21 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
22 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
23 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
24 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
26 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 | #
28 | ################################################################################
29 | #
30 | # Makefile project only supported on Mac OS X and Linux Platforms)
31 | #
32 | ################################################################################
33 | 
34 | # Location of the CUDA Toolkit
35 | 
36 | HOST_COMPILER ?= g++
37 | NVCC          := nvcc -ccbin $(HOST_COMPILER)
38 | 
39 | # internal flags
40 | NVCCFLAGS   :=
41 | CCFLAGS     :=
42 | LDFLAGS     :=
43 | 
44 | SAMPLE_ENABLED := 1
45 | 
46 | # Common includes and paths for CUDA
47 | INCLUDES  := -I./Common
48 | LIBRARIES :=
49 | 
50 | ################################################################################
51 | GENCODE_SM70    := -gencode arch=compute_70,code=sm_70
52 | GENCODE_SM80    := -gencode arch=compute_80,code=sm_80 
53 | GENCODE_FLAGS	:= $(GENCODE_SM70) $(GENCODE_SM80)
54 | 
55 | NVCC_FLAGS += -std=c++14
56 | LD_FLAGS += -lcudart
57 | ################################################################################
58 | 
59 | # Target rules
60 | all: build
61 | 
62 | build: p2pBandwidthLatencyTest
63 | 
64 | p2pBandwidthLatencyTest.o:p2pBandwidthLatencyTest.cu
65 | 	$(NVCC) $(INCLUDES) $(NVCC_FLAGS) $(GENCODE_FLAGS) -o $@ -c $<
66 | 
67 | p2pBandwidthLatencyTest: p2pBandwidthLatencyTest.o
68 | 	$(NVCC) $(LD_FLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES)
69 | 
70 | clean:
71 | 	rm -f p2pBandwidthLatencyTest p2pBandwidthLatencyTest.o
72 | 
73 | clobber: clean
74 | 


--------------------------------------------------------------------------------
/labs/CFD/English/C/source_code/p2pBandwidthLatencyTest/Common/UtilNPP/ImageAllocatorsCPU.h:
--------------------------------------------------------------------------------
 1 | /* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
 2 |  *
 3 |  * Redistribution and use in source and binary forms, with or without
 4 |  * modification, are permitted provided that the following conditions
 5 |  * are met:
 6 |  *  * Redistributions of source code must retain the above copyright
 7 |  *    notice, this list of conditions and the following disclaimer.
 8 |  *  * Redistributions in binary form must reproduce the above copyright
 9 |  *    notice, this list of conditions and the following disclaimer in the
10 |  *    documentation and/or other materials provided with the distribution.
11 |  *  * Neither the name of NVIDIA CORPORATION nor the names of its
12 |  *    contributors may be used to endorse or promote products derived
13 |  *    from this software without specific prior written permission.
14 |  *
15 |  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
16 |  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 |  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
18 |  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
19 |  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20 |  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21 |  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22 |  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
23 |  * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 |  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25 |  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 |  */
27 | 
28 | #ifndef NV_UTIL_NPP_IMAGE_ALLOCATORS_CPU_H
29 | #define NV_UTIL_NPP_IMAGE_ALLOCATORS_CPU_H
30 | 
31 | #include "Exceptions.h"
32 | 
33 | namespace npp
34 | {
35 | 
36 |     template <typename D, size_t N>
37 |     class ImageAllocatorCPU
38 |     {
39 |         public:
40 |             static
41 |             D *
42 |             Malloc2D(unsigned int nWidth, unsigned int nHeight, unsigned int *pPitch)
43 |             {
44 |                 NPP_ASSERT(nWidth * nHeight > 0);
45 | 
46 |                 D *pResult = new D[nWidth * N * nHeight];
47 |                 *pPitch = nWidth * sizeof(D) * N;
48 | 
49 |                 return pResult;
50 |             };
51 | 
52 |             static
53 |             void
54 |             Free2D(D *pPixels)
55 |             {
56 |                 delete[] pPixels;
57 |             };
58 | 
59 |             static
60 |             void
61 |             Copy2D(D *pDst, size_t nDstPitch, const D *pSrc, size_t nSrcPitch, size_t nWidth, size_t nHeight)
62 |             {
63 |                 const void *pSrcLine = pSrc;
64 |                 void        *pDstLine = pDst;
65 | 
66 |                 for (size_t iLine = 0; iLine < nHeight; ++iLine)
67 |                 {
68 |                     // copy one line worth of data
69 |                     memcpy(pDst, pSrc, nWidth * N * sizeof(D));
70 |                     // move data pointers to next line
71 |                     pDst += nDstPitch;
72 |                     pSrc += nSrcPitch;
73 |                 }
74 |             };
75 | 
76 |     };
77 | 
78 | } // npp namespace
79 | 
80 | #endif // NV_UTIL_NPP_IMAGE_ALLOCATORS_CPU_H
81 | 


--------------------------------------------------------------------------------
/slurm_pmi_config/include/smd_ns.h:
--------------------------------------------------------------------------------
 1 | /*****************************************************************************\
 2 |  *  smd_ns.h - Library for fault tolerant application support
 3 |  *****************************************************************************
 4 |  *  Copyright (C) 2013-2014 SchedMD LLC
 5 |  *  Written by Morris Jette and David Bigagli (SchedMD LLC)
 6 |  *
 7 |  *  This file is part of Slurm, a resource management program.
 8 |  *  For details, see <https://slurm.schedmd.com/>.
 9 |  *  Please also read the included file: DISCLAIMER.
10 |  *
11 |  *  Slurm is free software; you can redistribute it and/or modify it under
12 |  *  the terms of the GNU General Public License as published by the Free
13 |  *  Software Foundation; either version 2 of the License, or (at your option)
14 |  *  any later version.
15 |  *
16 |  *  In addition, as a special exception, the copyright holders give permission
17 |  *  to link the code of portions of this program with the OpenSSL library under
18 |  *  certain conditions as described in each individual source file, and
19 |  *  distribute linked combinations including the two. You must obey the GNU
20 |  *  General Public License in all respects for all of the code used other than
21 |  *  OpenSSL. If you modify file(s) with this exception, you may extend this
22 |  *  exception to your version of the file(s), but you are not obligated to do
23 |  *  so. If you do not wish to do so, delete this exception statement from your
24 |  *  version.  If you delete this exception statement from all source files in
25 |  *  the program, then also delete it here.
26 |  *
27 |  *  Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
28 |  *  WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
29 |  *  FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
30 |  *  details.
31 |  *
32 |  *  You should have received a copy of the GNU General Public License along
33 |  *  with Slurm; if not, write to the Free Software Foundation, Inc.,
34 |  *  51 Franklin Street, Fifth Floor, Boston, MA 02110-1301  USA.
35 | \*****************************************************************************/
36 | 
37 | #ifndef _HAVE_SMD_NS_H
38 | #define _HAVE_SMD_NS_H
39 | 
40 | #include <arpa/inet.h>
41 | #include <assert.h>
42 | #include <ctype.h>
43 | #include <errno.h>
44 | #include <getopt.h>
45 | #include <inttypes.h>
46 | #include <netdb.h>
47 | #include <netinet/in.h>
48 | #include <poll.h>
49 | #include <pthread.h>
50 | #include <stdarg.h>
51 | #include <stdbool.h>
52 | #include <stdio.h>
53 | #include <stdlib.h>
54 | #include <string.h>
55 | #include <sys/param.h>
56 | #include <sys/socket.h>
57 | #include <sys/stat.h>
58 | #include <sys/types.h>
59 | #include <sys/time.h>
60 | #include <sys/wait.h>
61 | #include <time.h>
62 | #include <unistd.h>
63 | 
64 | /* Faulty can be in state FAILED or FAILING
65 |  * these flags tell the controller which one
66 |  * the caller is interested in.
67 |  */
68 | #define FAILED_NODES   (1 << 1)
69 | #define FAILING_NODES  (1 << 2)
70 | 
71 | /* These are the events sent from slurm to the client that
72 |  * has registered for any of these events.
73 |  * We use define as user can subscribe to more than one
74 |  * events.
75 |  */
76 | #define	SMD_EVENT_NODE_FAILED  (1 << 1)	/* node has failed */
77 | #define SMD_EVENT_NODE_FAILING (1 << 2)	/* node failing can be drained */
78 | #define	SMD_EVENT_NODE_REPLACE (1 << 3)	/* replacement ready */
79 | 
80 | #endif	/* _HAVE_SMD_NS_H */
81 | 


--------------------------------------------------------------------------------
/labs/CFD/English/C/source_code/p2pBandwidthLatencyTest/Common/UtilNPP/Pixel.h:
--------------------------------------------------------------------------------
  1 | /* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
  2 |  *
  3 |  * Redistribution and use in source and binary forms, with or without
  4 |  * modification, are permitted provided that the following conditions
  5 |  * are met:
  6 |  *  * Redistributions of source code must retain the above copyright
  7 |  *    notice, this list of conditions and the following disclaimer.
  8 |  *  * Redistributions in binary form must reproduce the above copyright
  9 |  *    notice, this list of conditions and the following disclaimer in the
 10 |  *    documentation and/or other materials provided with the distribution.
 11 |  *  * Neither the name of NVIDIA CORPORATION nor the names of its
 12 |  *    contributors may be used to endorse or promote products derived
 13 |  *    from this software without specific prior written permission.
 14 |  *
 15 |  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
 16 |  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 17 |  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 18 |  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 19 |  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 20 |  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 21 |  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 22 |  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 23 |  * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 24 |  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 25 |  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 26 |  */
 27 | 
 28 | 
 29 | #ifndef NV_UTIL_PIXEL_H
 30 | #define NV_UTIL_PIXEL_H
 31 | 
 32 | #include "Exceptions.h"
 33 | 
 34 | namespace npp
 35 | {
 36 |     template <typename D, size_t N>
 37 |     struct Pixel
 38 |     { };
 39 | 
 40 |     template <typename D>
 41 |     struct Pixel<D, 1>
 42 |     {
 43 |         D x;
 44 | 
 45 |         const D &
 46 |         operator[](size_t iChannel)
 47 |         const
 48 |         {
 49 |             NPP_ASSERT(iChannel < 1);
 50 |             return (&x)[iChannel];
 51 |         }
 52 | 
 53 |         D &
 54 |         operator[](size_t iChannel)
 55 |         {
 56 |             NPP_ASSERT(iChannel < 1);
 57 |             return (&x)[iChannel];
 58 |         }
 59 |     };
 60 | 
 61 |     template <typename D>
 62 |     struct Pixel<D, 2>
 63 |     {
 64 |         D x,y;
 65 | 
 66 |         const D &
 67 |         operator[](size_t iChannel)
 68 |         const
 69 |         {
 70 |             NPP_ASSERT(iChannel < 2);
 71 |             return (&x)[iChannel];
 72 |         }
 73 | 
 74 |         D &
 75 |         operator[](size_t iChannel)
 76 |         {
 77 |             NPP_ASSERT(iChannel < 2);
 78 |             return (&x)[iChannel];
 79 |         }
 80 |     };
 81 | 
 82 |     template <typename D>
 83 |     struct Pixel<D, 3>
 84 |     {
 85 |         D x,y,z;
 86 | 
 87 |         const D &
 88 |         operator[](size_t iChannel)
 89 |         const
 90 |         {
 91 |             NPP_ASSERT(iChannel < 3);
 92 |             return (&x)[iChannel];
 93 |         }
 94 | 
 95 |         D &
 96 |         operator[](size_t iChannel)
 97 |         {
 98 |             NPP_ASSERT(iChannel < 3);
 99 |             return (&x)[iChannel];
100 |         }
101 |     };
102 | 
103 |     template <typename D>
104 |     struct Pixel<D, 4>
105 |     {
106 |         D x, y, z, w;
107 | 
108 |         const D &
109 |         operator[](size_t iChannel)
110 |         const
111 |         {
112 |             NPP_ASSERT(iChannel < 4);
113 |             return (&x)[iChannel];
114 |         }
115 | 
116 |         D &
117 |         operator[](size_t iChannel)
118 |         {
119 |             NPP_ASSERT(iChannel < 4);
120 |             return (&x)[iChannel];
121 |         }
122 |     };
123 | 
124 | } // npp namespace
125 | 
126 | #endif // NV_UTIL_PIXEL_H
127 | 


--------------------------------------------------------------------------------
/labs/CFD/English/C/source_code/p2pBandwidthLatencyTest/Common/UtilNPP/SignalsCPU.h:
--------------------------------------------------------------------------------
  1 | /* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
  2 |  *
  3 |  * Redistribution and use in source and binary forms, with or without
  4 |  * modification, are permitted provided that the following conditions
  5 |  * are met:
  6 |  *  * Redistributions of source code must retain the above copyright
  7 |  *    notice, this list of conditions and the following disclaimer.
  8 |  *  * Redistributions in binary form must reproduce the above copyright
  9 |  *    notice, this list of conditions and the following disclaimer in the
 10 |  *    documentation and/or other materials provided with the distribution.
 11 |  *  * Neither the name of NVIDIA CORPORATION nor the names of its
 12 |  *    contributors may be used to endorse or promote products derived
 13 |  *    from this software without specific prior written permission.
 14 |  *
 15 |  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
 16 |  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 17 |  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 18 |  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 19 |  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 20 |  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 21 |  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 22 |  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 23 |  * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 24 |  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 25 |  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 26 |  */
 27 | 
 28 | 
 29 | #ifndef NV_UTIL_NPP_SIGNALS_CPU_H
 30 | #define NV_UTIL_NPP_SIGNALS_CPU_H
 31 | 
 32 | #include "Signal.h"
 33 | 
 34 | #include "SignalAllocatorsCPU.h"
 35 | #include "Exceptions.h"
 36 | 
 37 | #include <npp.h>
 38 | 
 39 | 
 40 | namespace npp
 41 | {
 42 | 
 43 |     template<typename D, class A>
 44 |     class SignalCPU: public npp::SignalTemplate<D, A>
 45 |     {
 46 |         public:
 47 |             typedef typename npp::SignalTemplate<D, A>::tData tData;
 48 | 
 49 |             SignalCPU()
 50 |             {
 51 |                 ;
 52 |             }
 53 | 
 54 |             SignalCPU(size_t nSize): SignalTemplate<D, A>(nSize)
 55 |             {
 56 |                 ;
 57 |             }
 58 | 
 59 |             SignalCPU(const SignalCPU<D, A> &rSignal): SignalTemplate<D, A>(rSignal)
 60 |             {
 61 |                 ;
 62 |             }
 63 | 
 64 |             virtual
 65 |             ~SignalCPU()
 66 |             {
 67 |                 ;
 68 |             }
 69 | 
 70 |             SignalCPU &
 71 |             operator= (const SignalCPU<D,A> &rSignal)
 72 |             {
 73 |                 SignalTemplate<D, A>::operator= (rSignal);
 74 | 
 75 |                 return *this;
 76 |             }
 77 | 
 78 |             tData &
 79 |             operator [](unsigned int i)
 80 |             {
 81 |                 return *SignalTemplate<D, A>::values(i);
 82 |             }
 83 | 
 84 |             tData
 85 |             operator [](unsigned int i)
 86 |             const
 87 |             {
 88 |                 return *SignalTemplate<D, A>::values(i);
 89 |             }
 90 | 
 91 |     };
 92 | 
 93 |     typedef SignalCPU<Npp8u,   npp::SignalAllocatorCPU<Npp8u>   >   SignalCPU_8u;
 94 |     typedef SignalCPU<Npp32s,  npp::SignalAllocatorCPU<Npp32s>  >   SignalCPU_32s;
 95 |     typedef SignalCPU<Npp16s,  npp::SignalAllocatorCPU<Npp16s>  >   SignalCPU_16s;
 96 |     typedef SignalCPU<Npp16sc, npp::SignalAllocatorCPU<Npp16sc> >   SignalCPU_16sc;
 97 |     typedef SignalCPU<Npp32sc, npp::SignalAllocatorCPU<Npp32sc> >   SignalCPU_32sc;
 98 |     typedef SignalCPU<Npp32f,  npp::SignalAllocatorCPU<Npp32f>  >   SignalCPU_32f;
 99 |     typedef SignalCPU<Npp32fc, npp::SignalAllocatorCPU<Npp32fc> >   SignalCPU_32fc;
100 |     typedef SignalCPU<Npp64s,  npp::SignalAllocatorCPU<Npp64s>  >   SignalCPU_64s;
101 |     typedef SignalCPU<Npp64sc, npp::SignalAllocatorCPU<Npp64sc> >   SignalCPU_64sc;
102 |     typedef SignalCPU<Npp64f,  npp::SignalAllocatorCPU<Npp64f>  >   SignalCPU_64f;
103 |     typedef SignalCPU<Npp64fc, npp::SignalAllocatorCPU<Npp64fc> >   SignalCPU_64fc;
104 | 
105 | } // npp namespace
106 | 
107 | #endif // NV_UTIL_NPP_SIGNALS_CPU_H
108 | 


--------------------------------------------------------------------------------
/labs/CFD/English/start_here.ipynb:
--------------------------------------------------------------------------------
 1 | {
 2 |  "cells": [
 3 |   {
 4 |    "cell_type": "markdown",
 5 |    "metadata": {},
 6 |    "source": [
 7 |     "## Multi-GPU Programming and Performance Analysis\n",
 8 |     "\n",
 9 |     "## Learning objectives\n",
10 |     "\n",
11 |     "Scaling applications to multiple GPUs across multiple nodes requires one to be adept at not just the programming models and optimization techniques, but also at performing root-cause analysis using in-depth profiling to identify and minimize bottlenecks. In this bootcamp, participants will learn to improve the performance of an application step-by-step, taking cues from profilers along the way. Moreover, understanding of the underlying technologies and communication topology will help us utilize high-performance NVIDIA libraries to extract more performance out of the system.\n",
12 |     "\n",
13 |     "By the end of this bootcamp session, participants will be adept at:\n",
14 |     "* Reviewing communication architecture and topology\n",
15 |     "* Developing CUDA-aware multi-node multi-GPU MPI applications\n",
16 |     "* Profiling the application using NVIDIA Nsight Systems\n",
17 |     "* Applying optimizations like CUDA streams, events, and overlapping compute and communication\n",
18 |     "* Understanding GPUDirect technologies like P2P and RDMA\n",
19 |     "* Learning to use NVIDIA NCCL and NVSHMEM libraries\n",
20 |     "\n",
21 |     "### Bootcamp Duration\n",
22 |     "\n",
23 |     "The bootcamp will take 8 hours to complete. Link to download all materials will be available at the end of the lab.\n",
24 |     "\n",
25 |     "### Content Level\n",
26 |     "Intermediate, Advanced\n",
27 |     "\n",
28 |     "### Target Audience and Prerequisites\n",
29 |     "The target audience for this lab are researchers, graduate students, and developers who are interested in scaling their scientific applications to multiple nodes using multi-GPU implementations.\n",
30 |     "\n",
31 |     "Experience in C/ C++ and basic CUDA programming is required. Experience with parallel programming frameworks like OpenMP or MPI is not required but a basic understanding of MPI is highly recommended.\n",
32 |     "\n",
33 |     "### Bootcamp Outline\n",
34 |     "\n",
35 |     "We will take up the Jacobi Solver, an iterative technique for solving system of linear equations, in this tutorial. To begin, click on the first link below:\n",
36 |     "\n",
37 |     "1. [Overview of single-GPU code and Nsight Systems Profiler](C/jupyter_notebook/single_gpu/single_gpu_overview.ipynb)\n",
38 |     "2. Single Node Multi-GPU:\n",
39 |     "    * [CUDA Memcpy and Peer-to-Peer Memory Access](C/jupyter_notebook/cuda/memcpy.ipynb)\n",
40 |     "    * [Intra-node topology](C/jupyter_notebook/advanced_concepts/single_node_topology.ipynb)\n",
41 |     "    * [CUDA Streams and Events](C/jupyter_notebook/cuda/streams.ipynb)\n",
42 |     "3. Multi-Node Multi-GPU:\n",
43 |     "    * [Introduction to MPI and Multi-Node execution overview](C/jupyter_notebook/mpi/multi_node_intro.ipynb)\n",
44 |     "    * [MPI with CUDA Memcpy](C/jupyter_notebook/mpi/memcpy.ipynb)\n",
45 |     "    * [CUDA-aware MPI](C/jupyter_notebook/mpi/cuda_aware.ipynb)\n",
46 |     "    * [Supplemental: Configuring MPI in a containerized environment](C/jupyter_notebook/mpi/containers_and_mpi.ipynb)\n",
47 |     "4. [NVIDIA Collectives Communications Library (NCCL)](C/jupyter_notebook/nccl/nccl.ipynb)\n",
48 |     "5. [NVHSMEM Library](C/jupyter_notebook/nvshmem/nvshmem.ipynb)\n",
49 |     "\n",
50 |     "--- \n",
51 |     "\n",
52 |     "## Licensing \n",
53 |     "\n",
54 |     "Copyright © 2022 OpenACC-Standard.org.  This material is released by OpenACC-Standard.org, in collaboration with NVIDIA Corporation, under the Creative Commons Attribution 4.0 International (CC BY 4.0). These materials may include references to hardware and software developed by other entities; all applicable licensing and copyrights apply."
55 |    ]
56 |   }
57 |  ],
58 |  "metadata": {
59 |   "kernelspec": {
60 |    "display_name": "Python 3",
61 |    "language": "python",
62 |    "name": "python3"
63 |   },
64 |   "language_info": {
65 |    "codemirror_mode": {
66 |     "name": "ipython",
67 |     "version": 3
68 |    },
69 |    "file_extension": ".py",
70 |    "mimetype": "text/x-python",
71 |    "name": "python",
72 |    "nbconvert_exporter": "python",
73 |    "pygments_lexer": "ipython3",
74 |    "version": "3.7.4"
75 |   }
76 |  },
77 |  "nbformat": 4,
78 |  "nbformat_minor": 4
79 | }
80 | 


--------------------------------------------------------------------------------
/labs/CFD/English/C/source_code/p2pBandwidthLatencyTest/Common/helper_multiprocess.h:
--------------------------------------------------------------------------------
  1 | /* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
  2 |  *
  3 |  * Redistribution and use in source and binary forms, with or without
  4 |  * modification, are permitted provided that the following conditions
  5 |  * are met:
  6 |  *  * Redistributions of source code must retain the above copyright
  7 |  *    notice, this list of conditions and the following disclaimer.
  8 |  *  * Redistributions in binary form must reproduce the above copyright
  9 |  *    notice, this list of conditions and the following disclaimer in the
 10 |  *    documentation and/or other materials provided with the distribution.
 11 |  *  * Neither the name of NVIDIA CORPORATION nor the names of its
 12 |  *    contributors may be used to endorse or promote products derived
 13 |  *    from this software without specific prior written permission.
 14 |  *
 15 |  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
 16 |  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 17 |  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 18 |  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 19 |  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 20 |  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 21 |  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 22 |  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 23 |  * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 24 |  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 25 |  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 26 |  */
 27 | 
 28 | #ifndef HELPER_MULTIPROCESS_H
 29 | #define HELPER_MULTIPROCESS_H
 30 | 
 31 | #if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
 32 | #ifndef WIN32_LEAN_AND_MEAN
 33 | #define WIN32_LEAN_AND_MEAN
 34 | #endif
 35 | #include <windows.h>
 36 | #include <iostream>
 37 | #include <stdio.h>
 38 | #include <tchar.h>
 39 | #include <strsafe.h>
 40 | #include <sddl.h>
 41 | #include <aclapi.h>
 42 | #include <winternl.h>
 43 | #else
 44 | #include <stdio.h>
 45 | #include <fcntl.h>
 46 | #include <sys/mman.h>
 47 | #include <unistd.h>
 48 | #include <errno.h>
 49 | #include <sys/wait.h>
 50 | #include <sys/types.h>
 51 | #include <sys/socket.h>
 52 | #include <memory.h>
 53 | #include <sys/un.h>
 54 | #endif
 55 | #include <vector>
 56 | 
 57 | typedef struct sharedMemoryInfo_st {
 58 |     void *addr;
 59 |     size_t size;
 60 | #if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
 61 |     HANDLE shmHandle;
 62 | #else
 63 |     int shmFd;
 64 | #endif
 65 | } sharedMemoryInfo;
 66 | 
 67 | int sharedMemoryCreate(const char *name, size_t sz, sharedMemoryInfo *info);
 68 | 
 69 | int sharedMemoryOpen(const char *name, size_t sz, sharedMemoryInfo *info);
 70 | 
 71 | void sharedMemoryClose(sharedMemoryInfo *info);
 72 | 
 73 | 
 74 | #if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
 75 | typedef PROCESS_INFORMATION Process;
 76 | #else
 77 | typedef pid_t Process;
 78 | #endif
 79 | 
 80 | int spawnProcess(Process *process, const char *app, char * const *args);
 81 | 
 82 | int waitProcess(Process *process);
 83 | 
 84 | #define checkIpcErrors(ipcFuncResult) \
 85 |     if (ipcFuncResult == -1) { fprintf(stderr, "Failure at %u %s\n", __LINE__, __FILE__); exit(EXIT_FAILURE); }
 86 | 
 87 | #if defined(__linux__)
 88 | struct ipcHandle_st {
 89 |     int socket;
 90 |     char *socketName;
 91 | };
 92 | typedef int ShareableHandle;
 93 | #elif defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
 94 | struct ipcHandle_st {
 95 |     std::vector<HANDLE> hMailslot; // 1 Handle in case of child and `num children` Handles for parent.
 96 | };
 97 | typedef HANDLE ShareableHandle;
 98 | #endif
 99 | 
100 | typedef struct ipcHandle_st ipcHandle;
101 | 
102 | int
103 | ipcCreateSocket(ipcHandle *&handle, const char *name, const std::vector<Process>& processes);
104 | 
105 | int
106 | ipcOpenSocket(ipcHandle *&handle);
107 | 
108 | int
109 | ipcCloseSocket(ipcHandle *handle);
110 | 
111 | int
112 | ipcRecvShareableHandles(ipcHandle *handle, std::vector<ShareableHandle>& shareableHandles);
113 | 
114 | int
115 | ipcSendShareableHandles(ipcHandle *handle, const std::vector<ShareableHandle>& shareableHandles, const std::vector<Process>& processes);
116 | 
117 | int
118 | ipcCloseShareableHandle(ShareableHandle shHandle);
119 | 
120 | #endif // HELPER_MULTIPROCESS_H
121 | 


--------------------------------------------------------------------------------
/labs/CFD/English/C/source_code/p2pBandwidthLatencyTest/Common/UtilNPP/SignalsNPP.h:
--------------------------------------------------------------------------------
  1 | /* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
  2 |  *
  3 |  * Redistribution and use in source and binary forms, with or without
  4 |  * modification, are permitted provided that the following conditions
  5 |  * are met:
  6 |  *  * Redistributions of source code must retain the above copyright
  7 |  *    notice, this list of conditions and the following disclaimer.
  8 |  *  * Redistributions in binary form must reproduce the above copyright
  9 |  *    notice, this list of conditions and the following disclaimer in the
 10 |  *    documentation and/or other materials provided with the distribution.
 11 |  *  * Neither the name of NVIDIA CORPORATION nor the names of its
 12 |  *    contributors may be used to endorse or promote products derived
 13 |  *    from this software without specific prior written permission.
 14 |  *
 15 |  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
 16 |  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 17 |  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 18 |  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 19 |  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 20 |  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 21 |  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 22 |  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 23 |  * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 24 |  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 25 |  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 26 |  */
 27 | 
 28 | 
 29 | #ifndef NV_UTIL_NPP_SIGNALS_NPP_H
 30 | #define NV_UTIL_NPP_SIGNALS_NPP_H
 31 | 
 32 | #include "Exceptions.h"
 33 | #include "Signal.h"
 34 | 
 35 | #include "SignalAllocatorsNPP.h"
 36 | #include <cuda_runtime.h>
 37 | 
 38 | namespace npp
 39 | {
 40 |     // forward declaration
 41 |     template<typename D, class A> class SignalCPU;
 42 | 
 43 |     template<typename D>
 44 |     class SignalNPP: public npp::SignalTemplate<D, npp::SignalAllocator<D> >
 45 |     {
 46 |         public:
 47 |             SignalNPP()
 48 |             {
 49 |                 ;
 50 |             }
 51 | 
 52 |             explicit
 53 |             SignalNPP(size_t nSize): SignalTemplate<D, npp::SignalAllocator<D> >(nSize)
 54 |             {
 55 |                 ;
 56 |             }
 57 | 
 58 |             SignalNPP(const SignalNPP<D> &rSignal): SignalTemplate<D, npp::SignalAllocator<D> >(rSignal)
 59 |             {
 60 |                 ;
 61 |             }
 62 | 
 63 |             template<class X>
 64 |             explicit
 65 |             SignalNPP(const SignalCPU<D, X> &rSignal): SignalTemplate<D, npp::SignalAllocator<D> >(rSignal.size())
 66 |             {
 67 |                 npp::SignalAllocator<D>::HostToDeviceCopy1D(SignalTemplate<D, npp::SignalAllocator<D> >::values(),
 68 |                                                             rSignal.values(), SignalTemplate<D, npp::SignalAllocator<D> >::size());
 69 |             }
 70 | 
 71 |             virtual
 72 |             ~SignalNPP()
 73 |             {
 74 |                 ;
 75 |             }
 76 | 
 77 |             SignalNPP &
 78 |             operator= (const SignalNPP<D> &rSignal)
 79 |             {
 80 |                 SignalTemplate<D, npp::SignalAllocator<D> >::operator= (rSignal);
 81 | 
 82 |                 return *this;
 83 |             }
 84 | 
 85 |             void
 86 |             copyTo(D *pValues)
 87 |             const
 88 |             {
 89 |                 npp::SignalAllocator<D>::DeviceToHostCopy1D(pValues, SignalTemplate<D, npp::SignalAllocator<D> >::values(), SignalTemplate<D, npp::SignalAllocator<D> >::size());
 90 |             }
 91 | 
 92 |             void
 93 |             copyFrom(D *pValues)
 94 |             {
 95 |                 npp::SignalAllocator<D>::HostToDeviceCopy1D(SignalTemplate<D, npp::SignalAllocator<D> >::values(), pValues, SignalTemplate<D, npp::SignalAllocator<D> >::size());
 96 |             }
 97 |     };
 98 | 
 99 |     typedef SignalNPP<Npp8u>    SignalNPP_8u;
100 |     typedef SignalNPP<Npp16s>   SignalNPP_16s;
101 |     typedef SignalNPP<Npp16sc>  SignalNPP_16sc;
102 |     typedef SignalNPP<Npp32s>   SignalNPP_32s;
103 |     typedef SignalNPP<Npp32sc>  SignalNPP_32sc;
104 |     typedef SignalNPP<Npp32f>   SignalNPP_32f;
105 |     typedef SignalNPP<Npp32fc>  SignalNPP_32fc;
106 |     typedef SignalNPP<Npp64s>   SignalNPP_64s;
107 |     typedef SignalNPP<Npp64sc>  SignalNPP_64sc;
108 |     typedef SignalNPP<Npp64f>   SignalNPP_64f;
109 |     typedef SignalNPP<Npp64fc>  SignalNPP_64fc;
110 | 
111 | } // npp namespace
112 | 
113 | #endif // NV_UTIL_NPP_SIGNALS_NPP_H
114 | 


--------------------------------------------------------------------------------
/labs/CFD/English/C/source_code/p2pBandwidthLatencyTest/Common/GL/freeglut_ext.h:
--------------------------------------------------------------------------------
  1 | #ifndef  __FREEGLUT_EXT_H__
  2 | #define  __FREEGLUT_EXT_H__
  3 | 
  4 | /*
  5 |  * freeglut_ext.h
  6 |  *
  7 |  * The non-GLUT-compatible extensions to the freeglut library include file
  8 |  *
  9 |  * Copyright (c) 1999-2000 Pawel W. Olszta. All Rights Reserved.
 10 |  * Written by Pawel W. Olszta, <olszta@sourceforge.net>
 11 |  * Creation date: Thu Dec 2 1999
 12 |  *
 13 |  * Permission is hereby granted, free of charge, to any person obtaining a
 14 |  * copy of this software and associated documentation files (the "Software"),
 15 |  * to deal in the Software without restriction, including without limitation
 16 |  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 17 |  * and/or sell copies of the Software, and to permit persons to whom the
 18 |  * Software is furnished to do so, subject to the following conditions:
 19 |  *
 20 |  * The above copyright notice and this permission notice shall be included
 21 |  * in all copies or substantial portions of the Software.
 22 |  *
 23 |  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 24 |  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 25 |  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 26 |  * PAWEL W. OLSZTA BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
 27 |  * IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 28 |  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 29 |  */
 30 | 
 31 | #ifdef __cplusplus
 32 | extern "C" {
 33 | #endif
 34 | 
 35 | /*
 36 |  * GLUT API Extension macro definitions -- behaviour when the user clicks on an "x" to close a window
 37 |  */
 38 | #define GLUT_ACTION_EXIT                         0
 39 | #define GLUT_ACTION_GLUTMAINLOOP_RETURNS         1
 40 | #define GLUT_ACTION_CONTINUE_EXECUTION           2
 41 | 
 42 | /*
 43 |  * Create a new rendering context when the user opens a new window?
 44 |  */
 45 | #define GLUT_CREATE_NEW_CONTEXT                  0
 46 | #define GLUT_USE_CURRENT_CONTEXT                 1
 47 | 
 48 | /*
 49 |  * GLUT API Extension macro definitions -- the glutGet parameters
 50 |  */
 51 | #define  GLUT_ACTION_ON_WINDOW_CLOSE        0x01F9
 52 | 
 53 | #define  GLUT_WINDOW_BORDER_WIDTH           0x01FA
 54 | #define  GLUT_WINDOW_HEADER_HEIGHT          0x01FB
 55 | 
 56 | #define  GLUT_VERSION                       0x01FC
 57 | 
 58 | #define  GLUT_RENDERING_CONTEXT             0x01FD
 59 | 
 60 | /*
 61 |  * Process loop function, see freeglut_main.c
 62 |  */
 63 | FGAPI void    FGAPIENTRY glutMainLoopEvent(void);
 64 | FGAPI void    FGAPIENTRY glutLeaveMainLoop(void);
 65 | 
 66 | /*
 67 |  * Window-specific callback functions, see freeglut_callbacks.c
 68 |  */
 69 | FGAPI void    FGAPIENTRY glutMouseWheelFunc(void (* callback)(int, int, int, int));
 70 | FGAPI void    FGAPIENTRY glutCloseFunc(void (* callback)(void));
 71 | FGAPI void    FGAPIENTRY glutWMCloseFunc(void (* callback)(void));
 72 | /* A. Donev: Also a destruction callback for menus */
 73 | FGAPI void    FGAPIENTRY glutMenuDestroyFunc(void (* callback)(void));
 74 | 
 75 | /*
 76 |  * State setting and retrieval functions, see freeglut_state.c
 77 |  */
 78 | FGAPI void    FGAPIENTRY glutSetOption(GLenum option_flag, int value) ;
 79 | /* A.Donev: User-data manipulation */
 80 | FGAPI void   *FGAPIENTRY glutGetWindowData(void);
 81 | FGAPI void    FGAPIENTRY glutSetWindowData(void *data);
 82 | FGAPI void   *FGAPIENTRY glutGetMenuData(void);
 83 | FGAPI void    FGAPIENTRY glutSetMenuData(void *data);
 84 | 
 85 | /*
 86 |  * Font stuff, see freeglut_font.c
 87 |  */
 88 | FGAPI int     FGAPIENTRY glutBitmapHeight(void *font);
 89 | FGAPI GLfloat FGAPIENTRY glutStrokeHeight(void *font);
 90 | FGAPI void    FGAPIENTRY glutBitmapString(void *font, const unsigned char *string);
 91 | FGAPI void    FGAPIENTRY glutStrokeString(void *font, const unsigned char *string);
 92 | 
 93 | /*
 94 |  * Geometry functions, see freeglut_geometry.c
 95 |  */
 96 | FGAPI void    FGAPIENTRY glutWireRhombicDodecahedron(void);
 97 | FGAPI void    FGAPIENTRY glutSolidRhombicDodecahedron(void);
 98 | FGAPI void    FGAPIENTRY glutWireSierpinskiSponge(int num_levels, GLdouble offset[3], GLdouble scale) ;
 99 | FGAPI void    FGAPIENTRY glutSolidSierpinskiSponge(int num_levels, GLdouble offset[3], GLdouble scale) ;
100 | FGAPI void    FGAPIENTRY glutWireCylinder(GLdouble radius, GLdouble height, GLint slices, GLint stacks);
101 | FGAPI void    FGAPIENTRY glutSolidCylinder(GLdouble radius, GLdouble height, GLint slices, GLint stacks);
102 | 
103 | /*
104 |  * Extension functions, see freeglut_ext.c
105 |  */
106 | FGAPI void *FGAPIENTRY glutGetProcAddress(const char *procName);
107 | 
108 | 
109 | #ifdef __cplusplus
110 | }
111 | #endif
112 | 
113 | /*** END OF FILE ***/
114 | 
115 | #endif /* __FREEGLUT_EXT_H__ */
116 | 


--------------------------------------------------------------------------------
/labs/CFD/English/C/source_code/p2pBandwidthLatencyTest/Common/UtilNPP/ImagesCPU.h:
--------------------------------------------------------------------------------
  1 | /* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
  2 |  *
  3 |  * Redistribution and use in source and binary forms, with or without
  4 |  * modification, are permitted provided that the following conditions
  5 |  * are met:
  6 |  *  * Redistributions of source code must retain the above copyright
  7 |  *    notice, this list of conditions and the following disclaimer.
  8 |  *  * Redistributions in binary form must reproduce the above copyright
  9 |  *    notice, this list of conditions and the following disclaimer in the
 10 |  *    documentation and/or other materials provided with the distribution.
 11 |  *  * Neither the name of NVIDIA CORPORATION nor the names of its
 12 |  *    contributors may be used to endorse or promote products derived
 13 |  *    from this software without specific prior written permission.
 14 |  *
 15 |  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
 16 |  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 17 |  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 18 |  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 19 |  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 20 |  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 21 |  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 22 |  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 23 |  * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 24 |  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 25 |  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 26 |  */
 27 | 
 28 | #ifndef NV_UTIL_NPP_IMAGES_CPU_H
 29 | #define NV_UTIL_NPP_IMAGES_CPU_H
 30 | 
 31 | #include "ImagePacked.h"
 32 | 
 33 | #include "ImageAllocatorsCPU.h"
 34 | #include "Exceptions.h"
 35 | 
 36 | #include <npp.h>
 37 | 
 38 | 
 39 | namespace npp
 40 | {
 41 | 
 42 |     template<typename D, unsigned int N, class A>
 43 |     class ImageCPU: public npp::ImagePacked<D, N, A>
 44 |     {
 45 |         public:
 46 | 
 47 |             ImageCPU()
 48 |             {
 49 |                 ;
 50 |             }
 51 | 
 52 |             ImageCPU(unsigned int nWidth, unsigned int nHeight): ImagePacked<D, N, A>(nWidth, nHeight)
 53 |             {
 54 |                 ;
 55 |             }
 56 | 
 57 |             explicit
 58 |             ImageCPU(const npp::Image::Size &rSize): ImagePacked<D, N, A>(rSize)
 59 |             {
 60 |                 ;
 61 |             }
 62 | 
 63 |             ImageCPU(const ImageCPU<D, N, A> &rImage): Image(rImage)
 64 |             {
 65 |                 ;
 66 |             }
 67 | 
 68 |             virtual
 69 |             ~ImageCPU()
 70 |             {
 71 |                 ;
 72 |             }
 73 | 
 74 |             ImageCPU &
 75 |             operator= (const ImageCPU<D, N, A> &rImage)
 76 |             {
 77 |                 ImagePacked<D, N, A>::operator= (rImage);
 78 | 
 79 |                 return *this;
 80 |             }
 81 | 
 82 |             npp::Pixel<D, N> &
 83 |             operator()(unsigned int iX, unsigned int iY)
 84 |             {
 85 |                 return *ImagePacked<D, N, A>::pixels(iX, iY);
 86 |             }
 87 | 
 88 |             npp::Pixel<D, N>
 89 |             operator()(unsigned int iX, unsigned int iY)
 90 |             const
 91 |             {
 92 |                 return *ImagePacked<D, N, A>::pixels(iX, iY);
 93 |             }
 94 | 
 95 |     };
 96 | 
 97 | 
 98 |     typedef ImageCPU<Npp8u,  1, npp::ImageAllocatorCPU<Npp8u,      1>  >   ImageCPU_8u_C1;
 99 |     typedef ImageCPU<Npp8u,  2, npp::ImageAllocatorCPU<Npp8u,      2>  >   ImageCPU_8u_C2;
100 |     typedef ImageCPU<Npp8u,  3, npp::ImageAllocatorCPU<Npp8u,      3>  >   ImageCPU_8u_C3;
101 |     typedef ImageCPU<Npp8u,  4, npp::ImageAllocatorCPU<Npp8u,      4>  >   ImageCPU_8u_C4;
102 | 
103 |     typedef ImageCPU<Npp16u, 1, npp::ImageAllocatorCPU<Npp16u,     1>  >   ImageCPU_16u_C1;
104 |     typedef ImageCPU<Npp16u, 3, npp::ImageAllocatorCPU<Npp16u,     3>  >   ImageCPU_16u_C3;
105 |     typedef ImageCPU<Npp16u, 4, npp::ImageAllocatorCPU<Npp16u,     4>  >   ImageCPU_16u_C4;
106 | 
107 |     typedef ImageCPU<Npp16s, 1, npp::ImageAllocatorCPU<Npp16s,     1>  >   ImageCPU_16s_C1;
108 |     typedef ImageCPU<Npp16s, 3, npp::ImageAllocatorCPU<Npp16s,     3>  >   ImageCPU_16s_C3;
109 |     typedef ImageCPU<Npp16s, 4, npp::ImageAllocatorCPU<Npp16s,     4>  >   ImageCPU_16s_C4;
110 | 
111 |     typedef ImageCPU<Npp32s, 1, npp::ImageAllocatorCPU<Npp32s,     1>  >   ImageCPU_32s_C1;
112 |     typedef ImageCPU<Npp32s, 3, npp::ImageAllocatorCPU<Npp32s,     3>  >   ImageCPU_32s_C3;
113 |     typedef ImageCPU<Npp32s, 4, npp::ImageAllocatorCPU<Npp32s,     4>  >   ImageCPU_32s_C4;
114 | 
115 |     typedef ImageCPU<Npp32f, 1, npp::ImageAllocatorCPU<Npp32f,     1>  >   ImageCPU_32f_C1;
116 |     typedef ImageCPU<Npp32f, 3, npp::ImageAllocatorCPU<Npp32f,     3>  >   ImageCPU_32f_C3;
117 |     typedef ImageCPU<Npp32f, 4, npp::ImageAllocatorCPU<Npp32f,     4>  >   ImageCPU_32f_C4;
118 | 
119 | } // npp namespace
120 | 
121 | #endif // NV_IMAGE_IPP_H
122 | 


--------------------------------------------------------------------------------
/labs/CFD/English/C/source_code/p2pBandwidthLatencyTest/Common/UtilNPP/Image.h:
--------------------------------------------------------------------------------
  1 | /* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
  2 |  *
  3 |  * Redistribution and use in source and binary forms, with or without
  4 |  * modification, are permitted provided that the following conditions
  5 |  * are met:
  6 |  *  * Redistributions of source code must retain the above copyright
  7 |  *    notice, this list of conditions and the following disclaimer.
  8 |  *  * Redistributions in binary form must reproduce the above copyright
  9 |  *    notice, this list of conditions and the following disclaimer in the
 10 |  *    documentation and/or other materials provided with the distribution.
 11 |  *  * Neither the name of NVIDIA CORPORATION nor the names of its
 12 |  *    contributors may be used to endorse or promote products derived
 13 |  *    from this software without specific prior written permission.
 14 |  *
 15 |  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
 16 |  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 17 |  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 18 |  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 19 |  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 20 |  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 21 |  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 22 |  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 23 |  * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 24 |  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 25 |  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 26 |  */
 27 | 
 28 | #ifndef NV_UTIL_NPP_IMAGE_H
 29 | #define NV_UTIL_NPP_IMAGE_H
 30 | 
 31 | #include <cstddef>
 32 | 
 33 | namespace npp
 34 | {
 35 | 
 36 |     class Image
 37 |     {
 38 |         public:
 39 |             struct Size
 40 |             {
 41 |                 unsigned int nWidth;
 42 |                 unsigned int nHeight;
 43 | 
 44 |                 Size() : nWidth(0), nHeight(0)
 45 |                 { };
 46 | 
 47 |                 Size(unsigned int nWidthNew, unsigned nHeightNew) : nWidth(nWidthNew), nHeight(nHeightNew)
 48 |                 { };
 49 | 
 50 |                 Size(const Size &rSize) : nWidth(rSize.nWidth), nHeight(rSize.nHeight)
 51 |                 { };
 52 | 
 53 |                 Size &
 54 |                 operator= (const Size &rSize)
 55 |                 {
 56 |                     if (&rSize == this)
 57 |                     {
 58 |                         return *this;
 59 |                     }
 60 | 
 61 |                     nWidth = rSize.nWidth;
 62 |                     nHeight = rSize.nHeight;
 63 | 
 64 |                     return *this;
 65 |                 }
 66 | 
 67 |                 void
 68 |                 swap(Size &rSize)
 69 |                 {
 70 |                     unsigned int nTemp;
 71 |                     nTemp = nWidth;
 72 |                     nWidth = rSize.nWidth;
 73 |                     rSize.nWidth = nTemp;
 74 | 
 75 |                     nTemp = nHeight;
 76 |                     nHeight = rSize.nHeight;
 77 |                     rSize.nHeight = nTemp;
 78 |                 }
 79 |             };
 80 | 
 81 |             Image()
 82 |             { };
 83 | 
 84 |             Image(unsigned int nWidth, unsigned int nHeight) : oSize_(nWidth, nHeight)
 85 |             { };
 86 | 
 87 |             Image(const Image::Size &rSize) : oSize_(rSize)
 88 |             { };
 89 | 
 90 |             Image(const Image &rImage) : oSize_(rImage.oSize_)
 91 |             { };
 92 | 
 93 |             virtual
 94 |             ~Image()
 95 |             { };
 96 | 
 97 |             Image &
 98 |             operator= (const Image &rImage)
 99 |             {
100 |                 if (&rImage == this)
101 |                 {
102 |                     return *this;
103 |                 }
104 | 
105 |                 oSize_  = rImage.oSize_;
106 |                 return *this;
107 |             };
108 | 
109 |             unsigned int
110 |             width()
111 |             const
112 |             {
113 |                 return oSize_.nWidth;
114 |             }
115 | 
116 |             unsigned int
117 |             height()
118 |             const
119 |             {
120 |                 return oSize_.nHeight;
121 |             }
122 | 
123 |             Size
124 |             size()
125 |             const
126 |             {
127 |                 return oSize_;
128 |             }
129 | 
130 |             void
131 |             swap(Image &rImage)
132 |             {
133 |                 oSize_.swap(rImage.oSize_);
134 |             }
135 | 
136 |         private:
137 |             Size oSize_;
138 |     };
139 | 
140 |     bool
141 |     operator== (const Image::Size &rFirst, const Image::Size &rSecond)
142 |     {
143 |         return rFirst.nWidth == rSecond.nWidth && rFirst.nHeight == rSecond.nHeight;
144 |     }
145 | 
146 |     bool
147 |     operator!= (const Image::Size &rFirst, const Image::Size &rSecond)
148 |     {
149 |         return rFirst.nWidth != rSecond.nWidth || rFirst.nHeight != rSecond.nHeight;
150 |     }
151 | 
152 | } // npp namespace
153 | 
154 | 
155 | #endif // NV_UTIL_NPP_IMAGE_H
156 | 


--------------------------------------------------------------------------------
/labs/CFD/English/C/source_code/mpi/jacobi_kernels.cu:
--------------------------------------------------------------------------------
 1 | /* Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
 2 |  *
 3 |  * Redistribution and use in source and binary forms, with or without
 4 |  * modification, are permitted provided that the following conditions
 5 |  * are met:
 6 |  *  * Redistributions of source code must retain the above copyright
 7 |  *    notice, this list of conditions and the following disclaimer.
 8 |  *  * Redistributions in binary form must reproduce the above copyright
 9 |  *    notice, this list of conditions and the following disclaimer in the
10 |  *    documentation and/or other materials provided with the distribution.
11 |  *  * Neither the name of NVIDIA CORPORATION nor the names of its
12 |  *    contributors may be used to endorse or promote products derived
13 |  *    from this software without specific prior written permission.
14 |  *
15 |  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
16 |  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 |  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
18 |  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
19 |  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20 |  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21 |  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22 |  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
23 |  * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 |  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25 |  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 |  */
27 | #include <cstdio>
28 | 
29 | #define BLOCK_DIM_X 32
30 | #define BLOCK_DIM_Y 32
31 | 
32 | #define CUDA_RT_CALL(call)                                                                  \
33 |     {                                                                                       \
34 |         cudaError_t cudaStatus = call;                                                      \
35 |         if (cudaSuccess != cudaStatus)                                                      \
36 |             fprintf(stderr,                                                                 \
37 |                     "ERROR: CUDA RT call \"%s\" in line %d of file %s failed "              \
38 |                     "with "                                                                 \
39 |                     "%s (%d).\n",                                                           \
40 |                     #call, __LINE__, __FILE__, cudaGetErrorString(cudaStatus), cudaStatus); \
41 |     }
42 | 
43 | __global__ void initialize_boundaries(float*  a_new, float*  a, const float pi, const int offset, 
44 |                     const int nx, const int my_ny, const int ny) {
45 |     for (int iy = blockIdx.x * blockDim.x + threadIdx.x; iy < my_ny; iy += blockDim.x * gridDim.x) {
46 |         const float y0 = sin(2.0 * pi * (offset + iy) / (ny - 1));
47 |         a[iy * nx + 0] = y0;
48 |         a[iy * nx + (nx - 1)] = y0;
49 |         a_new[iy * nx + 0] = y0;
50 |         a_new[iy * nx + (nx - 1)] = y0;
51 |     }
52 | }
53 | 
54 | __global__ void jacobi_kernel(float*  a_new, const float*  a, float*  l2_norm, const int iy_start,
55 |                               const int iy_end, const int nx) {
56 |     int iy = blockIdx.y * blockDim.y + threadIdx.y + iy_start;
57 |     int ix = blockIdx.x * blockDim.x + threadIdx.x + 1;
58 |     __shared__ float block_l2_sum[BLOCK_DIM_X*BLOCK_DIM_Y];
59 |     unsigned thread_index = threadIdx.y*BLOCK_DIM_X + threadIdx.x;
60 | 
61 |     if (iy < iy_end && ix < (nx - 1)) {
62 |         // Update grid point
63 |         const float new_val = 0.25 * (a[iy * nx + ix + 1] + a[iy * nx + ix - 1] +
64 |                                      a[(iy + 1) * nx + ix] + a[(iy - 1) * nx + ix]);
65 |         a_new[iy * nx + ix] = new_val;
66 |         float residue = new_val - a[iy * nx + ix];
67 |         // Set block-level L2 norm value for this grid point
68 |         block_l2_sum[thread_index] = residue * residue;
69 |     }
70 |     else {
71 |         block_l2_sum[thread_index] = 0;
72 |     }
73 |     // Reduce L2 norm for the block in parallel
74 |     for (unsigned stride = 1; stride < BLOCK_DIM_X*BLOCK_DIM_Y; stride *= 2) {
75 |         __syncthreads();
76 |         if ((thread_index) % (2*stride) == 0) {
77 |             block_l2_sum[thread_index] += block_l2_sum[thread_index + stride];
78 |         }
79 |     }
80 |     // Atomically update global L2 norm with block-reduced L2 norm
81 |     if (thread_index == 0) {
82 |         atomicAdd(l2_norm, block_l2_sum[0]);
83 |     }
84 | }
85 | 
86 | void launch_initialize_boundaries(float*  a_new, float*  a, const float pi, const int offset, 
87 |                                     const int nx, const int my_ny, const int ny){
88 |     initialize_boundaries<<<my_ny / 128 + 1, 128>>>(a_new, a, pi, offset, nx, my_ny, ny);
89 | }
90 | 
91 | void launch_jacobi_kernel(float*  a_new, const float*  a, float*  l2_norm, const int iy_start,
92 |                               const int iy_end, const int nx) {
93 |     dim3 dim_block(BLOCK_DIM_X, BLOCK_DIM_Y, 1);
94 |     dim3 dim_grid((nx + BLOCK_DIM_X - 1) / BLOCK_DIM_X,
95 |                   ((iy_end - iy_start) + BLOCK_DIM_Y - 1) / BLOCK_DIM_Y, 1);
96 |     jacobi_kernel<<<dim_grid, dim_block>>>(a_new, a, l2_norm, iy_start, iy_end, nx);
97 | }
98 | 


--------------------------------------------------------------------------------
/labs/CFD/English/C/source_code/mpi/containerization/jacobi_kernels.cu:
--------------------------------------------------------------------------------
 1 | /* Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
 2 |  *
 3 |  * Redistribution and use in source and binary forms, with or without
 4 |  * modification, are permitted provided that the following conditions
 5 |  * are met:
 6 |  *  * Redistributions of source code must retain the above copyright
 7 |  *    notice, this list of conditions and the following disclaimer.
 8 |  *  * Redistributions in binary form must reproduce the above copyright
 9 |  *    notice, this list of conditions and the following disclaimer in the
10 |  *    documentation and/or other materials provided with the distribution.
11 |  *  * Neither the name of NVIDIA CORPORATION nor the names of its
12 |  *    contributors may be used to endorse or promote products derived
13 |  *    from this software without specific prior written permission.
14 |  *
15 |  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
16 |  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 |  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
18 |  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
19 |  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20 |  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21 |  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22 |  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
23 |  * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 |  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25 |  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 |  */
27 | #include <cstdio>
28 | 
29 | #define BLOCK_DIM_X 32
30 | #define BLOCK_DIM_Y 32
31 | 
32 | #define CUDA_RT_CALL(call)                                                                  \
33 |     {                                                                                       \
34 |         cudaError_t cudaStatus = call;                                                      \
35 |         if (cudaSuccess != cudaStatus)                                                      \
36 |             fprintf(stderr,                                                                 \
37 |                     "ERROR: CUDA RT call \"%s\" in line %d of file %s failed "              \
38 |                     "with "                                                                 \
39 |                     "%s (%d).\n",                                                           \
40 |                     #call, __LINE__, __FILE__, cudaGetErrorString(cudaStatus), cudaStatus); \
41 |     }
42 | 
43 | __global__ void initialize_boundaries(float*  a_new, float*  a, const float pi, const int offset, 
44 |                     const int nx, const int my_ny, const int ny) {
45 |     for (int iy = blockIdx.x * blockDim.x + threadIdx.x; iy < my_ny; iy += blockDim.x * gridDim.x) {
46 |         const float y0 = sin(2.0 * pi * (offset + iy) / (ny - 1));
47 |         a[iy * nx + 0] = y0;
48 |         a[iy * nx + (nx - 1)] = y0;
49 |         a_new[iy * nx + 0] = y0;
50 |         a_new[iy * nx + (nx - 1)] = y0;
51 |     }
52 | }
53 | 
54 | __global__ void jacobi_kernel(float*  a_new, const float*  a, float*  l2_norm, const int iy_start,
55 |                               const int iy_end, const int nx) {
56 |     int iy = blockIdx.y * blockDim.y + threadIdx.y + iy_start;
57 |     int ix = blockIdx.x * blockDim.x + threadIdx.x + 1;
58 |     __shared__ float block_l2_sum[BLOCK_DIM_X*BLOCK_DIM_Y];
59 |     unsigned thread_index = threadIdx.y*BLOCK_DIM_X + threadIdx.x;
60 | 
61 |     if (iy < iy_end && ix < (nx - 1)) {
62 |         // Update grid point
63 |         const float new_val = 0.25 * (a[iy * nx + ix + 1] + a[iy * nx + ix - 1] +
64 |                                      a[(iy + 1) * nx + ix] + a[(iy - 1) * nx + ix]);
65 |         a_new[iy * nx + ix] = new_val;
66 |         float residue = new_val - a[iy * nx + ix];
67 |         // Set block-level L2 norm value for this grid point
68 |         block_l2_sum[thread_index] = residue * residue;
69 |     }
70 |     else {
71 |         block_l2_sum[thread_index] = 0;
72 |     }
73 |     // Reduce L2 norm for the block in parallel
74 |     for (unsigned stride = 1; stride < BLOCK_DIM_X*BLOCK_DIM_Y; stride *= 2) {
75 |         __syncthreads();
76 |         if ((thread_index) % (2*stride) == 0) {
77 |             block_l2_sum[thread_index] += block_l2_sum[thread_index + stride];
78 |         }
79 |     }
80 |     // Atomically update global L2 norm with block-reduced L2 norm
81 |     if (thread_index == 0) {
82 |         atomicAdd(l2_norm, block_l2_sum[0]);
83 |     }
84 | }
85 | 
86 | void launch_initialize_boundaries(float*  a_new, float*  a, const float pi, const int offset, 
87 |                                     const int nx, const int my_ny, const int ny){
88 |     initialize_boundaries<<<my_ny / 128 + 1, 128>>>(a_new, a, pi, offset, nx, my_ny, ny);
89 | }
90 | 
91 | void launch_jacobi_kernel(float*  a_new, const float*  a, float*  l2_norm, const int iy_start,
92 |                               const int iy_end, const int nx) {
93 |     dim3 dim_block(BLOCK_DIM_X, BLOCK_DIM_Y, 1);
94 |     dim3 dim_grid((nx + BLOCK_DIM_X - 1) / BLOCK_DIM_X,
95 |                   ((iy_end - iy_start) + BLOCK_DIM_Y - 1) / BLOCK_DIM_Y, 1);
96 |     jacobi_kernel<<<dim_grid, dim_block>>>(a_new, a, l2_norm, iy_start, iy_end, nx);
97 | }
98 | 


--------------------------------------------------------------------------------
/labs/CFD/English/C/source_code/nccl/jacobi_kernels.cu:
--------------------------------------------------------------------------------
 1 | /* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
 2 |  *
 3 |  * Redistribution and use in source and binary forms, with or without
 4 |  * modification, are permitted provided that the following conditions
 5 |  * are met:
 6 |  *  * Redistributions of source code must retain the above copyright
 7 |  *    notice, this list of conditions and the following disclaimer.
 8 |  *  * Redistributions in binary form must reproduce the above copyright
 9 |  *    notice, this list of conditions and the following disclaimer in the
10 |  *    documentation and/or other materials provided with the distribution.
11 |  *  * Neither the name of NVIDIA CORPORATION nor the names of its
12 |  *    contributors may be used to endorse or promote products derived
13 |  *    from this software without specific prior written permission.
14 |  *
15 |  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
16 |  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 |  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
18 |  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
19 |  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20 |  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21 |  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22 |  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
23 |  * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 |  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25 |  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 |  */
27 | #include <cstdio>
28 | 
29 | #define BLOCK_DIM_X 32
30 | #define BLOCK_DIM_Y 32
31 | 
32 | #define CUDA_RT_CALL(call)                                                                  \
33 |     {                                                                                       \
34 |         cudaError_t cudaStatus = call;                                                      \
35 |         if (cudaSuccess != cudaStatus)                                                      \
36 |             fprintf(stderr,                                                                 \
37 |                     "ERROR: CUDA RT call \"%s\" in line %d of file %s failed "              \
38 |                     "with "                                                                 \
39 |                     "%s (%d).\n",                                                           \
40 |                     #call, __LINE__, __FILE__, cudaGetErrorString(cudaStatus), cudaStatus); \
41 |     }
42 | 
43 | __global__ void initialize_boundaries(float*  a_new, float*  a, const float pi, const int offset, 
44 |                     const int nx, const int my_ny, const int ny) {
45 |     for (int iy = blockIdx.x * blockDim.x + threadIdx.x; iy < my_ny; iy += blockDim.x * gridDim.x) {
46 |         const float y0 = sin(2.0 * pi * (offset + iy) / (ny - 1));
47 |         a[iy * nx + 0] = y0;
48 |         a[iy * nx + (nx - 1)] = y0;
49 |         a_new[iy * nx + 0] = y0;
50 |         a_new[iy * nx + (nx - 1)] = y0;
51 |     }
52 | }
53 | 
54 | __global__ void jacobi_kernel(float*  a_new, const float*  a, float*  l2_norm, const int iy_start,
55 |                               const int iy_end, const int nx) {
56 |     int iy = blockIdx.y * blockDim.y + threadIdx.y + iy_start;
57 |     int ix = blockIdx.x * blockDim.x + threadIdx.x + 1;
58 |     __shared__ float block_l2_sum[BLOCK_DIM_X*BLOCK_DIM_Y];
59 |     unsigned thread_index = threadIdx.y*BLOCK_DIM_X + threadIdx.x;
60 | 
61 |     if (iy < iy_end && ix < (nx - 1)) {
62 |         // Update grid point
63 |         const float new_val = 0.25 * (a[iy * nx + ix + 1] + a[iy * nx + ix - 1] +
64 |                                      a[(iy + 1) * nx + ix] + a[(iy - 1) * nx + ix]);
65 |         a_new[iy * nx + ix] = new_val;
66 |         float residue = new_val - a[iy * nx + ix];
67 |         // Set block-level L2 norm value for this grid point
68 |         block_l2_sum[thread_index] = residue * residue;
69 |     }
70 |     else {
71 |         block_l2_sum[thread_index] = 0;
72 |     }
73 |     // Reduce L2 norm for the block in parallel
74 |     for (unsigned stride = 1; stride < BLOCK_DIM_X*BLOCK_DIM_Y; stride *= 2) {
75 |         __syncthreads();
76 |         if ((thread_index) % (2*stride) == 0) {
77 |             block_l2_sum[thread_index] += block_l2_sum[thread_index + stride];
78 |         }
79 |     }
80 |     // Atomically update global L2 norm with block-reduced L2 norm
81 |     if (thread_index == 0) {
82 |         atomicAdd(l2_norm, block_l2_sum[0]);
83 |     }
84 | }
85 | 
86 | void launch_initialize_boundaries(float*  a_new, float*  a, const float pi, const int offset, 
87 |                                     const int nx, const int my_ny, const int ny) {
88 |     initialize_boundaries<<<my_ny / 128 + 1, 128>>>(a_new, a, pi, offset, nx, my_ny, ny);
89 | }
90 | 
91 | void launch_jacobi_kernel(float*  a_new, const float*  a, float*  l2_norm, const int iy_start,
92 |                               const int iy_end, const int nx, cudaStream_t stream) {
93 |     dim3 dim_block(BLOCK_DIM_X, BLOCK_DIM_Y, 1);
94 |     dim3 dim_grid((nx + BLOCK_DIM_X - 1) / BLOCK_DIM_X,
95 |                   ((iy_end - iy_start) + BLOCK_DIM_Y - 1) / BLOCK_DIM_Y, 1);
96 |     jacobi_kernel<<<dim_grid, dim_block, 0, stream>>>(a_new, a, l2_norm, iy_start, iy_end, nx);
97 | }
98 | 
99 | 


--------------------------------------------------------------------------------
/labs/CFD/English/C/source_code/p2pBandwidthLatencyTest/Common/rendercheck_d3d11.cpp:
--------------------------------------------------------------------------------
  1 | /* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
  2 |  *
  3 |  * Redistribution and use in source and binary forms, with or without
  4 |  * modification, are permitted provided that the following conditions
  5 |  * are met:
  6 |  *  * Redistributions of source code must retain the above copyright
  7 |  *    notice, this list of conditions and the following disclaimer.
  8 |  *  * Redistributions in binary form must reproduce the above copyright
  9 |  *    notice, this list of conditions and the following disclaimer in the
 10 |  *    documentation and/or other materials provided with the distribution.
 11 |  *  * Neither the name of NVIDIA CORPORATION nor the names of its
 12 |  *    contributors may be used to endorse or promote products derived
 13 |  *    from this software without specific prior written permission.
 14 |  *
 15 |  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
 16 |  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 17 |  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 18 |  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 19 |  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 20 |  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 21 |  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 22 |  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 23 |  * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 24 |  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 25 |  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 26 |  */
 27 | 
 28 | ////////////////////////////////////////////////////////////////////////////////
 29 | //
 30 | //  Utility funcs to wrap up saving a surface or the back buffer as a PPM file
 31 | //  In addition, wraps up a threshold comparision of two PPMs.
 32 | //
 33 | //  These functions are designed to be used to implement an automated QA testing for SDK samples.
 34 | //
 35 | //  Author: Bryan Dudash
 36 | //  Email: sdkfeedback@nvidia.com
 37 | //
 38 | // Copyright (c) NVIDIA Corporation. All rights reserved.
 39 | ////////////////////////////////////////////////////////////////////////////////
 40 | 
 41 | #include <helper_functions.h>
 42 | #include <rendercheck_d3d11.h>
 43 | 
 44 | HRESULT CheckRenderD3D11::ActiveRenderTargetToPPM(ID3D11Device *pDevice, const char *zFileName)
 45 | {
 46 |     ID3D11DeviceContext *pDeviceCtxt;
 47 |     pDevice->GetImmediateContext(&pDeviceCtxt);
 48 |     ID3D11RenderTargetView *pRTV = NULL;
 49 |     pDeviceCtxt->OMGetRenderTargets(1,&pRTV,NULL);
 50 | 
 51 |     ID3D11Resource *pSourceResource = NULL;
 52 |     pRTV->GetResource(&pSourceResource);
 53 | 
 54 |     return ResourceToPPM(pDevice,pSourceResource,zFileName);
 55 | }
 56 | 
 57 | HRESULT CheckRenderD3D11::ResourceToPPM(ID3D11Device *pDevice, ID3D11Resource *pResource, const char *zFileName)
 58 | {
 59 |     ID3D11DeviceContext *pDeviceCtxt;
 60 |     pDevice->GetImmediateContext(&pDeviceCtxt);
 61 |     D3D11_RESOURCE_DIMENSION rType;
 62 |     pResource->GetType(&rType);
 63 | 
 64 |     if (rType != D3D11_RESOURCE_DIMENSION_TEXTURE2D)
 65 |     {
 66 |         printf("SurfaceToPPM: pResource is not a 2D texture! Aborting...\n");
 67 |         return E_FAIL;
 68 |     }
 69 | 
 70 |     ID3D11Texture2D *pSourceTexture = (ID3D11Texture2D *)pResource;
 71 |     ID3D11Texture2D *pTargetTexture = NULL;
 72 | 
 73 |     D3D11_TEXTURE2D_DESC desc;
 74 |     pSourceTexture->GetDesc(&desc);
 75 |     desc.BindFlags = 0;
 76 |     desc.CPUAccessFlags = D3D11_CPU_ACCESS_READ;
 77 |     desc.Usage = D3D11_USAGE_STAGING;
 78 | 
 79 |     if (FAILED(pDevice->CreateTexture2D(&desc,NULL,&pTargetTexture)))
 80 |     {
 81 |         printf("SurfaceToPPM: Unable to create target Texture resoruce! Aborting... \n");
 82 |         return E_FAIL;
 83 |     }
 84 | 
 85 |     pDeviceCtxt->CopyResource(pTargetTexture,pSourceTexture);
 86 | 
 87 |     D3D11_MAPPED_SUBRESOURCE mappedTex2D;
 88 |     pDeviceCtxt->Map(pTargetTexture, 0, D3D11_MAP_READ,0,&mappedTex2D);
 89 | 
 90 |     // Need to convert from dx pitch to pitch=width
 91 |     unsigned char *pPPMData = new unsigned char[desc.Width*desc.Height*4];
 92 | 
 93 |     for (unsigned int iHeight = 0; iHeight<desc.Height; iHeight++)
 94 |     {
 95 |         memcpy(&(pPPMData[iHeight*desc.Width*4]),(unsigned char *)(mappedTex2D.pData)+iHeight*mappedTex2D.RowPitch,desc.Width*4);
 96 |     }
 97 | 
 98 |     pDeviceCtxt->Unmap(pTargetTexture, 0);
 99 | 
100 |     // Prepends the PPM header info and bumps byte data afterwards
101 |     sdkSavePPM4ub(zFileName, pPPMData, desc.Width, desc.Height);
102 | 
103 |     delete [] pPPMData;
104 |     pTargetTexture->Release();
105 | 
106 |     return S_OK;
107 | }
108 | 
109 | bool CheckRenderD3D11::PPMvsPPM(const char *src_file, const char *ref_file, const char *exec_path,
110 |                                 const float epsilon, const float threshold)
111 | {
112 |     char *ref_file_path = sdkFindFilePath(ref_file, exec_path);
113 | 
114 |     if (ref_file_path == NULL)
115 |     {
116 |         printf("CheckRenderD3D11::PPMvsPPM unable to find <%s> in <%s> Aborting comparison!\n", ref_file, exec_path);
117 |         printf(">>> Check info.xml and [project//data] folder <%s> <<<\n", ref_file);
118 |         printf("Aborting comparison!\n");
119 |         printf("  FAILURE!\n");
120 |         return false;
121 |     }
122 | 
123 |     return sdkComparePPM(src_file,ref_file_path,epsilon,threshold,true) == true;
124 | }


--------------------------------------------------------------------------------
/labs/CFD/English/C/source_code/p2pBandwidthLatencyTest/Common/UtilNPP/Signal.h:
--------------------------------------------------------------------------------
  1 | /* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
  2 |  *
  3 |  * Redistribution and use in source and binary forms, with or without
  4 |  * modification, are permitted provided that the following conditions
  5 |  * are met:
  6 |  *  * Redistributions of source code must retain the above copyright
  7 |  *    notice, this list of conditions and the following disclaimer.
  8 |  *  * Redistributions in binary form must reproduce the above copyright
  9 |  *    notice, this list of conditions and the following disclaimer in the
 10 |  *    documentation and/or other materials provided with the distribution.
 11 |  *  * Neither the name of NVIDIA CORPORATION nor the names of its
 12 |  *    contributors may be used to endorse or promote products derived
 13 |  *    from this software without specific prior written permission.
 14 |  *
 15 |  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
 16 |  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 17 |  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 18 |  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 19 |  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 20 |  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 21 |  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 22 |  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 23 |  * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 24 |  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 25 |  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 26 |  */
 27 | 
 28 | 
 29 | #ifndef NV_UTIL_NPP_SIGNAL_H
 30 | #define NV_UTIL_NPP_SIGNAL_H
 31 | 
 32 | #include <cstring>
 33 | 
 34 | namespace npp
 35 | {
 36 |     class Signal
 37 |     {
 38 |         public:
 39 |             Signal() : nSize_(0)
 40 |             { };
 41 | 
 42 |             explicit
 43 |             Signal(size_t nSize) : nSize_(nSize)
 44 |             { };
 45 | 
 46 |             Signal(const Signal &rSignal) : nSize_(rSignal.nSize_)
 47 |             { };
 48 | 
 49 |             virtual
 50 |             ~Signal()
 51 |             { }
 52 | 
 53 |             Signal &
 54 |             operator= (const Signal &rSignal)
 55 |             {
 56 |                 nSize_ = rSignal.nSize_;
 57 |                 return *this;
 58 |             }
 59 | 
 60 |             size_t
 61 |             size()
 62 |             const
 63 |             {
 64 |                 return nSize_;
 65 |             }
 66 | 
 67 |             void
 68 |             swap(Signal &rSignal)
 69 |             {
 70 |                 size_t nTemp = nSize_;
 71 |                 nSize_ = rSignal.nSize_;
 72 |                 rSignal.nSize_ = nTemp;
 73 |             }
 74 | 
 75 | 
 76 |         private:
 77 |             size_t nSize_;
 78 |     };
 79 | 
 80 |     template<typename D, class A>
 81 |     class SignalTemplate: public Signal
 82 |     {
 83 |         public:
 84 |             typedef D tData;
 85 | 
 86 |             SignalTemplate(): aValues_(0)
 87 |             {
 88 |                 ;
 89 |             }
 90 | 
 91 |             SignalTemplate(size_t nSize): Signal(nSize)
 92 |                 , aValues_(0)
 93 |             {
 94 |                 aValues_ = A::Malloc1D(size());
 95 |             }
 96 | 
 97 |             SignalTemplate(const SignalTemplate<D, A> &rSignal): Signal(rSignal)
 98 |                 , aValues_(0)
 99 |             {
100 |                 aValues_ = A::Malloc1D(size());
101 |                 A::Copy1D(aValues_, rSignal.values(), size());
102 |             }
103 | 
104 |             virtual
105 |             ~SignalTemplate()
106 |             {
107 |                 A::Free1D(aValues_);
108 |             }
109 | 
110 |             SignalTemplate &
111 |             operator= (const SignalTemplate<D, A> &rSignal)
112 |             {
113 |                 // in case of self-assignment
114 |                 if (&rSignal == this)
115 |                 {
116 |                     return *this;
117 |                 }
118 | 
119 |                 A::Free1D(aValues_);
120 |                 this->aPixels_ = 0;
121 | 
122 |                 // assign parent class's data fields (width, height)
123 |                 Signal::operator =(rSignal);
124 | 
125 |                 aValues_ = A::Malloc1D(size());
126 |                 A::Copy1D(aValues_, rSignal.value(), size());
127 | 
128 |                 return *this;
129 |             }
130 | 
131 |             /// Get a pointer to the pixel array.
132 |             ///     The result pointer can be offset to pixel at position (x, y) and
133 |             /// even negative offsets are allowed.
134 |             /// \param nX Horizontal pointer/array offset.
135 |             /// \param nY Vertical pointer/array offset.
136 |             /// \return Pointer to the pixel array (or first pixel in array with coordinates (nX, nY).
137 |             tData *
138 |             values(int i = 0)
139 |             {
140 |                 return aValues_ + i;
141 |             }
142 | 
143 |             const
144 |             tData *
145 |             values(int i = 0)
146 |             const
147 |             {
148 |                 return aValues_ + i;
149 |             }
150 | 
151 |             void
152 |             swap(SignalTemplate<D, A> &rSignal)
153 |             {
154 |                 Signal::swap(rSignal);
155 | 
156 |                 tData *aTemp       = this->aValues_;
157 |                 this->aValues_      = rSignal.aValues_;
158 |                 rSignal.aValues_    = aTemp;
159 |             }
160 | 
161 |         private:
162 |             D *aValues_;
163 |     };
164 | 
165 | } // npp namespace
166 | 
167 | 
168 | #endif // NV_UTIL_NPP_SIGNAL_H
169 | 


--------------------------------------------------------------------------------
/labs/CFD/English/C/source_code/p2pBandwidthLatencyTest/Common/helper_cusolver.h:
--------------------------------------------------------------------------------
  1 | /* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
  2 |  *
  3 |  * Redistribution and use in source and binary forms, with or without
  4 |  * modification, are permitted provided that the following conditions
  5 |  * are met:
  6 |  *  * Redistributions of source code must retain the above copyright
  7 |  *    notice, this list of conditions and the following disclaimer.
  8 |  *  * Redistributions in binary form must reproduce the above copyright
  9 |  *    notice, this list of conditions and the following disclaimer in the
 10 |  *    documentation and/or other materials provided with the distribution.
 11 |  *  * Neither the name of NVIDIA CORPORATION nor the names of its
 12 |  *    contributors may be used to endorse or promote products derived
 13 |  *    from this software without specific prior written permission.
 14 |  *
 15 |  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
 16 |  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 17 |  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 18 |  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 19 |  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 20 |  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 21 |  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 22 |  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 23 |  * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 24 |  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 25 |  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 26 |  */
 27 | 
 28 | #ifndef HELPER_CUSOLVER
 29 | #define HELPER_CUSOLVER
 30 | 
 31 | #include <ctype.h>
 32 | #include <cuda_runtime.h>
 33 | #include <math.h>
 34 | #include <stdio.h>
 35 | #include <stdlib.h>
 36 | #include <string.h>
 37 | 
 38 | #include "cusparse.h"
 39 | 
 40 | #define SWITCH_CHAR '-'
 41 | 
 42 | struct testOpts {
 43 |   char *sparse_mat_filename;  // by switch -F<filename>
 44 |   const char *testFunc;       // by switch -R<name>
 45 |   const char *reorder;        // by switch -P<name>
 46 |   int lda;                    // by switch -lda<int>
 47 | };
 48 | 
 49 | double vec_norminf(int n, const double *x) {
 50 |   double norminf = 0;
 51 |   for (int j = 0; j < n; j++) {
 52 |     double x_abs = fabs(x[j]);
 53 |     norminf = (norminf > x_abs) ? norminf : x_abs;
 54 |   }
 55 |   return norminf;
 56 | }
 57 | 
 58 | /*
 59 |  * |A| = max { |A|*ones(m,1) }
 60 |  */
 61 | double mat_norminf(int m, int n, const double *A, int lda) {
 62 |   double norminf = 0;
 63 |   for (int i = 0; i < m; i++) {
 64 |     double sum = 0.0;
 65 |     for (int j = 0; j < n; j++) {
 66 |       double A_abs = fabs(A[i + j * lda]);
 67 |       sum += A_abs;
 68 |     }
 69 |     norminf = (norminf > sum) ? norminf : sum;
 70 |   }
 71 |   return norminf;
 72 | }
 73 | 
 74 | /*
 75 |  * |A| = max { |A|*ones(m,1) }
 76 |  */
 77 | double csr_mat_norminf(int m, int n, int nnzA, const cusparseMatDescr_t descrA,
 78 |                        const double *csrValA, const int *csrRowPtrA,
 79 |                        const int *csrColIndA) {
 80 |   const int baseA =
 81 |       (CUSPARSE_INDEX_BASE_ONE == cusparseGetMatIndexBase(descrA)) ? 1 : 0;
 82 | 
 83 |   double norminf = 0;
 84 |   for (int i = 0; i < m; i++) {
 85 |     double sum = 0.0;
 86 |     const int start = csrRowPtrA[i] - baseA;
 87 |     const int end = csrRowPtrA[i + 1] - baseA;
 88 |     for (int colidx = start; colidx < end; colidx++) {
 89 |       // const int j = csrColIndA[colidx] - baseA;
 90 |       double A_abs = fabs(csrValA[colidx]);
 91 |       sum += A_abs;
 92 |     }
 93 |     norminf = (norminf > sum) ? norminf : sum;
 94 |   }
 95 |   return norminf;
 96 | }
 97 | 
 98 | void display_matrix(int m, int n, int nnzA, const cusparseMatDescr_t descrA,
 99 |                     const double *csrValA, const int *csrRowPtrA,
100 |                     const int *csrColIndA) {
101 |   const int baseA =
102 |       (CUSPARSE_INDEX_BASE_ONE == cusparseGetMatIndexBase(descrA)) ? 1 : 0;
103 | 
104 |   printf("m = %d, n = %d, nnz = %d, matlab base-1\n", m, n, nnzA);
105 | 
106 |   for (int row = 0; row < m; row++) {
107 |     const int start = csrRowPtrA[row] - baseA;
108 |     const int end = csrRowPtrA[row + 1] - baseA;
109 |     for (int colidx = start; colidx < end; colidx++) {
110 |       const int col = csrColIndA[colidx] - baseA;
111 |       double Areg = csrValA[colidx];
112 |       printf("A(%d, %d) = %20.16E\n", row + 1, col + 1, Areg);
113 |     }
114 |   }
115 | }
116 | 
117 | #if defined(_WIN32)
118 | #if !defined(WIN32_LEAN_AND_MEAN)
119 | #define WIN32_LEAN_AND_MEAN
120 | #endif
121 | #include <windows.h>
122 | double second(void) {
123 |   LARGE_INTEGER t;
124 |   static double oofreq;
125 |   static int checkedForHighResTimer;
126 |   static BOOL hasHighResTimer;
127 | 
128 |   if (!checkedForHighResTimer) {
129 |     hasHighResTimer = QueryPerformanceFrequency(&t);
130 |     oofreq = 1.0 / (double)t.QuadPart;
131 |     checkedForHighResTimer = 1;
132 |   }
133 |   if (hasHighResTimer) {
134 |     QueryPerformanceCounter(&t);
135 |     return (double)t.QuadPart * oofreq;
136 |   } else {
137 |     return (double)GetTickCount() / 1000.0;
138 |   }
139 | }
140 | 
141 | #elif defined(__linux__) || defined(__QNX__)
142 | #include <stddef.h>
143 | #include <sys/resource.h>
144 | #include <sys/time.h>
145 | double second(void) {
146 |   struct timeval tv;
147 |   gettimeofday(&tv, NULL);
148 |   return (double)tv.tv_sec + (double)tv.tv_usec / 1000000.0;
149 | }
150 | 
151 | #elif defined(__APPLE__)
152 | #include <stddef.h>
153 | #include <sys/resource.h>
154 | #include <sys/sysctl.h>
155 | #include <sys/time.h>
156 | #include <sys/types.h>
157 | double second(void) {
158 |   struct timeval tv;
159 |   gettimeofday(&tv, NULL);
160 |   return (double)tv.tv_sec + (double)tv.tv_usec / 1000000.0;
161 | }
162 | #else
163 | #error unsupported platform
164 | #endif
165 | 
166 | #endif
167 | 


--------------------------------------------------------------------------------
/labs/CFD/English/C/source_code/p2pBandwidthLatencyTest/Common/UtilNPP/ImageIO.h:
--------------------------------------------------------------------------------
  1 | /* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
  2 |  *
  3 |  * Redistribution and use in source and binary forms, with or without
  4 |  * modification, are permitted provided that the following conditions
  5 |  * are met:
  6 |  *  * Redistributions of source code must retain the above copyright
  7 |  *    notice, this list of conditions and the following disclaimer.
  8 |  *  * Redistributions in binary form must reproduce the above copyright
  9 |  *    notice, this list of conditions and the following disclaimer in the
 10 |  *    documentation and/or other materials provided with the distribution.
 11 |  *  * Neither the name of NVIDIA CORPORATION nor the names of its
 12 |  *    contributors may be used to endorse or promote products derived
 13 |  *    from this software without specific prior written permission.
 14 |  *
 15 |  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
 16 |  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 17 |  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 18 |  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 19 |  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 20 |  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 21 |  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 22 |  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 23 |  * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 24 |  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 25 |  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 26 |  */
 27 | 
 28 | #ifndef NV_UTIL_NPP_IMAGE_IO_H
 29 | #define NV_UTIL_NPP_IMAGE_IO_H
 30 | 
 31 | #include "ImagesCPU.h"
 32 | #include "ImagesNPP.h"
 33 | 
 34 | #include "FreeImage.h"
 35 | #include "Exceptions.h"
 36 | 
 37 | #include <string>
 38 | #include "string.h"
 39 | 
 40 | 
 41 | // Error handler for FreeImage library.
 42 | //  In case this handler is invoked, it throws an NPP exception.
 43 | void
 44 | FreeImageErrorHandler(FREE_IMAGE_FORMAT oFif, const char *zMessage)
 45 | {
 46 |     throw npp::Exception(zMessage);
 47 | }
 48 | 
 49 | namespace npp
 50 | {
 51 |     // Load a gray-scale image from disk.
 52 |     void
 53 |     loadImage(const std::string &rFileName, ImageCPU_8u_C1 &rImage)
 54 |     {
 55 |         // set your own FreeImage error handler
 56 |         FreeImage_SetOutputMessage(FreeImageErrorHandler);
 57 | 
 58 |         FREE_IMAGE_FORMAT eFormat = FreeImage_GetFileType(rFileName.c_str());
 59 | 
 60 |         // no signature? try to guess the file format from the file extension
 61 |         if (eFormat == FIF_UNKNOWN)
 62 |         {
 63 |             eFormat = FreeImage_GetFIFFromFilename(rFileName.c_str());
 64 |         }
 65 | 
 66 |         NPP_ASSERT(eFormat != FIF_UNKNOWN);
 67 |         // check that the plugin has reading capabilities ...
 68 |         FIBITMAP *pBitmap;
 69 | 
 70 |         if (FreeImage_FIFSupportsReading(eFormat))
 71 |         {
 72 |             pBitmap = FreeImage_Load(eFormat, rFileName.c_str());
 73 |         }
 74 | 
 75 |         NPP_ASSERT(pBitmap != 0);
 76 |         // make sure this is an 8-bit single channel image
 77 |         NPP_ASSERT(FreeImage_GetColorType(pBitmap) == FIC_MINISBLACK);
 78 |         NPP_ASSERT(FreeImage_GetBPP(pBitmap) == 8);
 79 | 
 80 |         // create an ImageCPU to receive the loaded image data
 81 |         ImageCPU_8u_C1 oImage(FreeImage_GetWidth(pBitmap), FreeImage_GetHeight(pBitmap));
 82 | 
 83 |         // Copy the FreeImage data into the new ImageCPU
 84 |         unsigned int nSrcPitch = FreeImage_GetPitch(pBitmap);
 85 |         const Npp8u *pSrcLine = FreeImage_GetBits(pBitmap) + nSrcPitch * (FreeImage_GetHeight(pBitmap) -1);
 86 |         Npp8u *pDstLine = oImage.data();
 87 |         unsigned int nDstPitch = oImage.pitch();
 88 | 
 89 |         for (size_t iLine = 0; iLine < oImage.height(); ++iLine)
 90 |         {
 91 |             memcpy(pDstLine, pSrcLine, oImage.width() * sizeof(Npp8u));
 92 |             pSrcLine -= nSrcPitch;
 93 |             pDstLine += nDstPitch;
 94 |         }
 95 | 
 96 |         // swap the user given image with our result image, effecively
 97 |         // moving our newly loaded image data into the user provided shell
 98 |         oImage.swap(rImage);
 99 |     }
100 | 
101 |     // Save an gray-scale image to disk.
102 |     void
103 |     saveImage(const std::string &rFileName, const ImageCPU_8u_C1 &rImage)
104 |     {
105 |         // create the result image storage using FreeImage so we can easily
106 |         // save
107 |         FIBITMAP *pResultBitmap = FreeImage_Allocate(rImage.width(), rImage.height(), 8 /* bits per pixel */);
108 |         NPP_ASSERT_NOT_NULL(pResultBitmap);
109 |         unsigned int nDstPitch   = FreeImage_GetPitch(pResultBitmap);
110 |         Npp8u *pDstLine = FreeImage_GetBits(pResultBitmap) + nDstPitch * (rImage.height()-1);
111 |         const Npp8u *pSrcLine = rImage.data();
112 |         unsigned int nSrcPitch = rImage.pitch();
113 | 
114 |         for (size_t iLine = 0; iLine < rImage.height(); ++iLine)
115 |         {
116 |             memcpy(pDstLine, pSrcLine, rImage.width() * sizeof(Npp8u));
117 |             pSrcLine += nSrcPitch;
118 |             pDstLine -= nDstPitch;
119 |         }
120 | 
121 |         // now save the result image
122 |         bool bSuccess;
123 |         bSuccess = FreeImage_Save(FIF_PGM, pResultBitmap, rFileName.c_str(), 0) == TRUE;
124 |         NPP_ASSERT_MSG(bSuccess, "Failed to save result image.");
125 |     }
126 | 
127 |     // Load a gray-scale image from disk.
128 |     void
129 |     loadImage(const std::string &rFileName, ImageNPP_8u_C1 &rImage)
130 |     {
131 |         ImageCPU_8u_C1 oImage;
132 |         loadImage(rFileName, oImage);
133 |         ImageNPP_8u_C1 oResult(oImage);
134 |         rImage.swap(oResult);
135 |     }
136 | 
137 |     // Save an gray-scale image to disk.
138 |     void
139 |     saveImage(const std::string &rFileName, const ImageNPP_8u_C1 &rImage)
140 |     {
141 |         ImageCPU_8u_C1 oHostImage(rImage.size());
142 |         // copy the device result data
143 |         rImage.copyTo(oHostImage.data(), oHostImage.pitch());
144 |         saveImage(rFileName, oHostImage);
145 |     }
146 | }
147 | 
148 | 
149 | #endif // NV_UTIL_NPP_IMAGE_IO_H
150 | 


--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
  1 | Contributing
  2 | ------------
  3 | 
  4 | Please use the following guidelines when contributing to this project. 
  5 | 
  6 | Before contributing significant changes, please begin a discussion of the desired changes via a GitHub Issue to prevent doing unnecessary or overlapping work.
  7 | 
  8 | ## License
  9 | 
 10 | The preferred license for source code contributed to this project is the Apache License 2.0 (https://www.apache.org/licenses/LICENSE-2.0) and for documentation, including Jupyter notebooks and text documentation, is the Creative Commons Attribution 4.0 International (CC BY 4.0) (https://creativecommons.org/licenses/by/4.0/). Contributions under other, compatible licenses will be considered on a case-by-case basis.
 11 | 
 12 | ## Styling
 13 | 
 14 | Please use the following style guidelines when making contributions.
 15 | 
 16 | ### Source Code
 17 | * Two-space indentation, no tabs
 18 | * To the extent possible, variable names should be descriptive
 19 | * Code should be documentation with detail like what function does and returns making the code readable. The code should also have proper license at the beginning of the file.
 20 | * Fortran codes should use free-form source files
 21 | * Fortran codes should not use implicit variable names and should use implicit none
 22 | * The following file extensions should be used appropriately
 23 | 	* C - .c
 24 | 	* C++ - .cpp
 25 | 	* CUDA C/C++ - .cu
 26 | 	* CUDA Fortran - .cuf
 27 | 	* Fortran - .F90
 28 | 
 29 | ### Jupyter Notebooks & Markdown
 30 | * When they appear inline with the text; directive names, clauses, function or subroutine names, variable names, file names, commands and command-line arguments should appear between two backticks.
 31 | * Code blocks should begin with three backticks and either 'cpp' or 'fortran' to enable appropriate source formatting and end with three backticks.
 32 | * Leave an empty line before and after the codeblock.
 33 | Emphasis, including quotes made for emphasis and introduction of new terms should be highlighted between a single pair of asterisks
 34 | * A level 1 heading should appear at the top of the notebook as the title of the notebook.
 35 | * A horizontal rule should appear between sections that begin with a level 2 heading.
 36 | 
 37 | Please refer to the following template for jupyter notebook styling in the [github](https://github.com/openhackathons-org/gpubootcamp/tree/master/misc).
 38 | 
 39 | ## Contributing Labs/Modules
 40 | 
 41 | ### Directory stucture for Github
 42 | 
 43 | Before starting to work on new lab it is important to follow the recommended git structure as shown below to avoid and reformatting:
 44 | 
 45 | ```
 46 | ├── labs/CFD
 47 | │   ├── LICENSE
 48 | │   └── English
 49 | │       ├── C
 50 | │           ├── images
 51 | │           ├── jupyter_notebook
 52 | │               ├── advanced_concepts
 53 | │               	├── x.ipynb
 54 | │               	└── ...
 55 | │               ├── cuda
 56 | │               ├── mpi
 57 | │               ├── nccl
 58 | │               ├── nvshmem
 59 | │               └── single_gpu
 60 | │           └── source_code
 61 | │               ├── cuda
 62 | │               	├── solutions
 63 | │               	├── x.cu
 64 | │               	└── ...
 65 | │               ├── mpi
 66 | │               ├── nccl
 67 | │               ├── nvshmem
 68 | │               ├── p2pBandwidthLatencyTest
 69 | │               └── single_gpu
 70 | │       ├── Presentations
 71 | │       └── start_here.ipynb  
 72 | ├── slurm_pmi_config
 73 | ├── README.md
 74 | └── Singularity
 75 | ```
 76 | 
 77 | Each lab will have following files/directories consisting of training material for the lab.
 78 | * `jupyter_notebook` folder: Consists of jupyter notebooks for a specific programming language.  
 79 | * `source_code` folder: Source codes are stored in a separate directory for a specific programming language (C/C++ and Fortran). Source code folder may optionally contain Makefile especially for HPC labs. This folder may also contains `SOLUTIONS` folder for all the related solutions to that particular lab.  
 80 | * presentations: Consists of presentations for the labs ( pdf format is preferred )
 81 | * Dockerfile and Singularity: Each lab should have both Docker and Singularity recipes.
 82 |  
 83 | The lab optionally may also add custom license in case of any deviation from the top level directory license ( Apache 2.0 ). 
 84 | 
 85 | ### Git Branching
 86 | 
 87 | Adding a new feature/lab will follow a forking workflow. Which means a feature branch development will happen on a forked repo which later gets merged into our original project (GPUHackathons.org) repository.
 88 | 
 89 | 
 90 | ![Git Branching Workflow](labs/CFD/English/C/images/git_branching.jpg)
 91 | 
 92 | The 5 main steps depicted in image above are as follows:
 93 | 1. Fork: To create a new lab/feature the GPUHackathons.org repository must be forked. Fork will create a snapshot of GPUHackathons.org repository at the time it was forked. Any new feature/lab that will be developed should be based on the develop branch of the repository.
 94 | 2.  Clone: Developer can than clone this new repository to local machine
 95 | Create Feature Branch: Create a new branch with a feature name in which your changes will be done. Recommend naming convention of feature branch is naming convention for branch: multigpu-<feature_name>. The new changes that developer makes can be added, committed and pushed
 96 | 3. Push: After the changes are committed, the developer pushes the changes to the remote branch. Push command helps the local changes to github repository
 97 | 4. Pull: Submit a pull request. Upon receiving pull request a Hackathon team reviewer/owner will review the changes and upon accepting it can be merged into the develop branch of GpuHacakthons.org
 98 | 
 99 | Git Branch details are as follows:
100 | 
101 | * master branch: Consists of the stable branch. 
102 | 	* origin/master to be the main branch where the source code of HEAD always reflects a production-ready state
103 | 	* Merge request is possible through:  develop branch
104 | * develop branch: branched from master branch
105 | 	* Must branch from: master branch
106 | 	* Must merge back into: master branch
107 | 	* It is the main development branch where the source code of HEAD always reflects a state with the latest delivered development changes for the next release.
108 | 	* When the source code in the develop branch reaches a stable point and is ready to be released, all of the changes should be merged back into master somehow and then tagged with a release number
109 | 	* All feature development should happen by forking GPUHackathons.org and branching from develop branch only.
110 | 


--------------------------------------------------------------------------------
/labs/CFD/English/C/source_code/p2pBandwidthLatencyTest/Common/UtilNPP/ImagePacked.h:
--------------------------------------------------------------------------------
  1 | /* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
  2 |  *
  3 |  * Redistribution and use in source and binary forms, with or without
  4 |  * modification, are permitted provided that the following conditions
  5 |  * are met:
  6 |  *  * Redistributions of source code must retain the above copyright
  7 |  *    notice, this list of conditions and the following disclaimer.
  8 |  *  * Redistributions in binary form must reproduce the above copyright
  9 |  *    notice, this list of conditions and the following disclaimer in the
 10 |  *    documentation and/or other materials provided with the distribution.
 11 |  *  * Neither the name of NVIDIA CORPORATION nor the names of its
 12 |  *    contributors may be used to endorse or promote products derived
 13 |  *    from this software without specific prior written permission.
 14 |  *
 15 |  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
 16 |  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 17 |  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 18 |  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 19 |  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 20 |  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 21 |  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 22 |  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 23 |  * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 24 |  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 25 |  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 26 |  */
 27 | 
 28 | #ifndef NV_UTIL_NPP_IMAGE_PACKED_H
 29 | #define NV_UTIL_NPP_IMAGE_PACKED_H
 30 | 
 31 | #include "Image.h"
 32 | #include "Pixel.h"
 33 | 
 34 | namespace npp
 35 | {
 36 |     template<typename D, size_t N, class A>
 37 |     class ImagePacked: public npp::Image
 38 |     {
 39 |         public:
 40 |             typedef npp::Pixel<D, N>    tPixel;
 41 |             typedef D                   tData;
 42 |             static const size_t         gnChannels = N;
 43 |             typedef npp::Image::Size    tSize;
 44 | 
 45 |             ImagePacked(): aPixels_(0)
 46 |                 , nPitch_(0)
 47 |             {
 48 |                 ;
 49 |             }
 50 | 
 51 |             ImagePacked(unsigned int nWidth, unsigned int nHeight): Image(nWidth, nHeight)
 52 |                 , aPixels_(0)
 53 |                 , nPitch_(0)
 54 |             {
 55 |                 aPixels_ = A::Malloc2D(width(), height(), &nPitch_);
 56 |             }
 57 | 
 58 |             ImagePacked(unsigned int nWidth, unsigned int nHeight, bool bTight): Image(nWidth, nHeight)
 59 |                 , aPixels_(0)
 60 |                 , nPitch_(0)
 61 |             {
 62 |                 aPixels_ = A::Malloc2D(width(), height(), &nPitch_, bTight);
 63 |             }
 64 | 
 65 |             ImagePacked(const tSize &rSize): Image(rSize)
 66 |                 , aPixels_(0)
 67 |                 , nPitch_(0)
 68 |             {
 69 |                 aPixels_ = A::Malloc2D(width(), height(), &nPitch_);
 70 |             }
 71 | 
 72 |             ImagePacked(const ImagePacked<D, N, A> &rImage): Image(rImage)
 73 |                 , aPixels_(0)
 74 |                 , nPitch_(rImage.pitch())
 75 |             {
 76 |                 aPixels_ = A::Malloc2D(width(), height(), &nPitch_);
 77 |                 A::Copy2D(aPixels_, nPitch_, rImage.pixels(), rImage.pitch(), width(), height());
 78 |             }
 79 | 
 80 |             virtual
 81 |             ~ImagePacked()
 82 |             {
 83 |                 A::Free2D(aPixels_);
 84 |             }
 85 | 
 86 |             ImagePacked &
 87 |             operator= (const ImagePacked<D, N, A> &rImage)
 88 |             {
 89 |                 // in case of self-assignment
 90 |                 if (&rImage == this)
 91 |                 {
 92 |                     return *this;
 93 |                 }
 94 | 
 95 |                 A::Free2D(aPixels_);
 96 |                 aPixels_ = 0;
 97 |                 nPitch_ = 0;
 98 | 
 99 |                 // assign parent class's data fields (width, height)
100 |                 Image::operator =(rImage);
101 | 
102 |                 aPixels_ = A::Malloc2D(width(), height(), &nPitch_);
103 |                 A::Copy2D(aPixels_, nPitch_, rImage.data(), rImage.pitch(), width(), height());
104 | 
105 |                 return *this;
106 |             }
107 | 
108 |             unsigned int
109 |             pitch()
110 |             const
111 |             {
112 |                 return nPitch_;
113 |             }
114 | 
115 |             /// Get a pointer to the pixel array.
116 |             ///     The result pointer can be offset to pixel at position (x, y) and
117 |             /// even negative offsets are allowed.
118 |             /// \param nX Horizontal pointer/array offset.
119 |             /// \param nY Vertical pointer/array offset.
120 |             /// \return Pointer to the pixel array (or first pixel in array with coordinates (nX, nY).
121 |             tPixel *
122 |             pixels(int nX = 0, int nY = 0)
123 |             {
124 |                 return reinterpret_cast<tPixel *>(reinterpret_cast<unsigned char *>(aPixels_) + nY * pitch() + nX * gnChannels * sizeof(D));
125 |             }
126 | 
127 |             const
128 |             tPixel *
129 |             pixels(int nX = 0, int nY = 0)
130 |             const
131 |             {
132 |                 return reinterpret_cast<const tPixel *>(reinterpret_cast<unsigned char *>(aPixels_) + nY * pitch() + nX * gnChannels * sizeof(D));
133 |             }
134 | 
135 |             D *
136 |             data(int nX = 0, int nY = 0)
137 |             {
138 |                 return reinterpret_cast<D *>(pixels(nX, nY));
139 |             }
140 | 
141 |             const
142 |             D *
143 |             data(int nX = 0, int nY = 0)
144 |             const
145 |             {
146 |                 return reinterpret_cast<const D *>(pixels(nX, nY));
147 |             }
148 | 
149 |             void
150 |             swap(ImagePacked<D, N, A> &rImage)
151 |             {
152 |                 Image::swap(rImage);
153 | 
154 |                 tData *aTemp   = aPixels_;
155 |                 aPixels_        = rImage.aPixels_;
156 |                 rImage.aPixels_ = aTemp;
157 | 
158 |                 unsigned int nTemp = nPitch_;
159 |                 nPitch_            = rImage.nPitch_;
160 |                 rImage.nPitch_     = nTemp;
161 |             }
162 | 
163 |         private:
164 |             D *aPixels_;
165 |             unsigned int nPitch_;
166 |     };
167 | 
168 | } // npp namespace
169 | 
170 | 
171 | #endif // NV_IMAGE_IPP_H
172 | 


--------------------------------------------------------------------------------
/labs/CFD/English/C/source_code/p2pBandwidthLatencyTest/Common/exception.h:
--------------------------------------------------------------------------------
  1 | /* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
  2 |  *
  3 |  * Redistribution and use in source and binary forms, with or without
  4 |  * modification, are permitted provided that the following conditions
  5 |  * are met:
  6 |  *  * Redistributions of source code must retain the above copyright
  7 |  *    notice, this list of conditions and the following disclaimer.
  8 |  *  * Redistributions in binary form must reproduce the above copyright
  9 |  *    notice, this list of conditions and the following disclaimer in the
 10 |  *    documentation and/or other materials provided with the distribution.
 11 |  *  * Neither the name of NVIDIA CORPORATION nor the names of its
 12 |  *    contributors may be used to endorse or promote products derived
 13 |  *    from this software without specific prior written permission.
 14 |  *
 15 |  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
 16 |  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 17 |  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 18 |  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 19 |  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 20 |  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 21 |  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 22 |  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 23 |  * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 24 |  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 25 |  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 26 |  */
 27 | 
 28 | /* CUda UTility Library */
 29 | #ifndef COMMON_EXCEPTION_H_
 30 | #define COMMON_EXCEPTION_H_
 31 | 
 32 | // includes, system
 33 | #include <stdlib.h>
 34 | #include <exception>
 35 | #include <iostream>
 36 | #include <stdexcept>
 37 | #include <string>
 38 | 
 39 | //! Exception wrapper.
 40 | //! @param Std_Exception Exception out of namespace std for easy typing.
 41 | template <class Std_Exception>
 42 | class Exception : public Std_Exception {
 43 |  public:
 44 |   //! @brief Static construction interface
 45 |   //! @return Alwayss throws ( Located_Exception<Exception>)
 46 |   //! @param file file in which the Exception occurs
 47 |   //! @param line line in which the Exception occurs
 48 |   //! @param detailed details on the code fragment causing the Exception
 49 |   static void throw_it(const char *file, const int line,
 50 |                        const char *detailed = "-");
 51 | 
 52 |   //! Static construction interface
 53 |   //! @return Alwayss throws ( Located_Exception<Exception>)
 54 |   //! @param file file in which the Exception occurs
 55 |   //! @param line line in which the Exception occurs
 56 |   //! @param detailed details on the code fragment causing the Exception
 57 |   static void throw_it(const char *file, const int line,
 58 |                        const std::string &detailed);
 59 | 
 60 |   //! Destructor
 61 |   virtual ~Exception() throw();
 62 | 
 63 |  private:
 64 |   //! Constructor, default (private)
 65 |   Exception();
 66 | 
 67 |   //! Constructor, standard
 68 |   //! @param str string returned by what()
 69 |   explicit Exception(const std::string &str);
 70 | };
 71 | 
 72 | ////////////////////////////////////////////////////////////////////////////////
 73 | //! Exception handler function for arbitrary exceptions
 74 | //! @param ex exception to handle
 75 | ////////////////////////////////////////////////////////////////////////////////
 76 | template <class Exception_Typ>
 77 | inline void handleException(const Exception_Typ &ex) {
 78 |   std::cerr << ex.what() << std::endl;
 79 | 
 80 |   exit(EXIT_FAILURE);
 81 | }
 82 | 
 83 | //! Convenience macros
 84 | 
 85 | //! Exception caused by dynamic program behavior, e.g. file does not exist
 86 | #define RUNTIME_EXCEPTION(msg) \
 87 |   Exception<std::runtime_error>::throw_it(__FILE__, __LINE__, msg)
 88 | 
 89 | //! Logic exception in program, e.g. an assert failed
 90 | #define LOGIC_EXCEPTION(msg) \
 91 |   Exception<std::logic_error>::throw_it(__FILE__, __LINE__, msg)
 92 | 
 93 | //! Out of range exception
 94 | #define RANGE_EXCEPTION(msg) \
 95 |   Exception<std::range_error>::throw_it(__FILE__, __LINE__, msg)
 96 | 
 97 | ////////////////////////////////////////////////////////////////////////////////
 98 | //! Implementation
 99 | 
100 | // includes, system
101 | #include <sstream>
102 | 
103 | ////////////////////////////////////////////////////////////////////////////////
104 | //! Static construction interface.
105 | //! @param  Exception causing code fragment (file and line) and detailed infos.
106 | ////////////////////////////////////////////////////////////////////////////////
107 | /*static*/ template <class Std_Exception>
108 | void Exception<Std_Exception>::throw_it(const char *file, const int line,
109 |                                         const char *detailed) {
110 |   std::stringstream s;
111 | 
112 |   // Quiet heavy-weight but exceptions are not for
113 |   // performance / release versions
114 |   s << "Exception in file '" << file << "' in line " << line << "\n"
115 |     << "Detailed description: " << detailed << "\n";
116 | 
117 |   throw Exception(s.str());
118 | }
119 | 
120 | ////////////////////////////////////////////////////////////////////////////////
121 | //! Static construction interface.
122 | //! @param  Exception causing code fragment (file and line) and detailed infos.
123 | ////////////////////////////////////////////////////////////////////////////////
124 | /*static*/ template <class Std_Exception>
125 | void Exception<Std_Exception>::throw_it(const char *file, const int line,
126 |                                         const std::string &msg) {
127 |   throw_it(file, line, msg.c_str());
128 | }
129 | 
130 | ////////////////////////////////////////////////////////////////////////////////
131 | //! Constructor, default (private).
132 | ////////////////////////////////////////////////////////////////////////////////
133 | template <class Std_Exception>
134 | Exception<Std_Exception>::Exception() : Std_Exception("Unknown Exception.\n") {}
135 | 
136 | ////////////////////////////////////////////////////////////////////////////////
137 | //! Constructor, standard (private).
138 | //! String returned by what().
139 | ////////////////////////////////////////////////////////////////////////////////
140 | template <class Std_Exception>
141 | Exception<Std_Exception>::Exception(const std::string &s) : Std_Exception(s) {}
142 | 
143 | ////////////////////////////////////////////////////////////////////////////////
144 | //! Destructor
145 | ////////////////////////////////////////////////////////////////////////////////
146 | template <class Std_Exception>
147 | Exception<Std_Exception>::~Exception() throw() {}
148 | 
149 |   // functions, exported
150 | 
151 | #endif  // COMMON_EXCEPTION_H_
152 | 


--------------------------------------------------------------------------------
/labs/CFD/English/C/source_code/p2pBandwidthLatencyTest/Common/dynlink_d3d11.h:
--------------------------------------------------------------------------------
  1 | /* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
  2 |  *
  3 |  * Redistribution and use in source and binary forms, with or without
  4 |  * modification, are permitted provided that the following conditions
  5 |  * are met:
  6 |  *  * Redistributions of source code must retain the above copyright
  7 |  *    notice, this list of conditions and the following disclaimer.
  8 |  *  * Redistributions in binary form must reproduce the above copyright
  9 |  *    notice, this list of conditions and the following disclaimer in the
 10 |  *    documentation and/or other materials provided with the distribution.
 11 |  *  * Neither the name of NVIDIA CORPORATION nor the names of its
 12 |  *    contributors may be used to endorse or promote products derived
 13 |  *    from this software without specific prior written permission.
 14 |  *
 15 |  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
 16 |  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 17 |  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 18 |  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 19 |  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 20 |  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 21 |  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 22 |  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 23 |  * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 24 |  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 25 |  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 26 |  */
 27 | 
 28 | //--------------------------------------------------------------------------------------
 29 | // File: dynlink_d3d11.h
 30 | //
 31 | // Shortcut macros and functions for using DX objects
 32 | //
 33 | // Copyright (c) Microsoft Corporation. All rights reserved
 34 | //--------------------------------------------------------------------------------------
 35 | 
 36 | #ifndef _DYNLINK_D3D11_H_
 37 | #define _DYNLINK_D3D11_H_
 38 | 
 39 | // Standard Windows includes
 40 | #include <windows.h>
 41 | #include <initguid.h>
 42 | #include <assert.h>
 43 | #include <wchar.h>
 44 | #include <mmsystem.h>
 45 | #include <commctrl.h> // for InitCommonControls() 
 46 | #include <shellapi.h> // for ExtractIcon()
 47 | #include <new.h>      // for placement new
 48 | #include <shlobj.h>
 49 | #include <math.h>
 50 | #include <limits.h>
 51 | #include <stdio.h>
 52 | 
 53 | // CRT's memory leak detection
 54 | #if defined(DEBUG) || defined(_DEBUG)
 55 | #include <crtdbg.h>
 56 | #endif
 57 | 
 58 | // Direct3D10 includes
 59 | #include <dxgi.h>
 60 | #include <d3d11.h>
 61 | // #include <..\Samples\C++\Effects11\Inc\d3dx11effect.h>
 62 | 
 63 | // XInput includes
 64 | #include <xinput.h>
 65 | 
 66 | // strsafe.h deprecates old unsecure string functions.  If you
 67 | // really do not want to it to (not recommended), then uncomment the next line
 68 | //#define STRSAFE_NO_DEPRECATE
 69 | 
 70 | #ifndef STRSAFE_NO_DEPRECATE
 71 | #pragma deprecated("strncpy")
 72 | #pragma deprecated("wcsncpy")
 73 | #pragma deprecated("_tcsncpy")
 74 | #pragma deprecated("wcsncat")
 75 | #pragma deprecated("strncat")
 76 | #pragma deprecated("_tcsncat")
 77 | #endif
 78 | 
 79 | #pragma warning( disable : 4996 ) // disable deprecated warning 
 80 | #include <strsafe.h>
 81 | #pragma warning( default : 4996 )
 82 | 
 83 | typedef HRESULT(WINAPI *LPCREATEDXGIFACTORY)(REFIID, void **);
 84 | typedef HRESULT(WINAPI *LPD3D11CREATEDEVICEANDSWAPCHAIN)(__in_opt IDXGIAdapter *pAdapter, D3D_DRIVER_TYPE DriverType, HMODULE Software, UINT Flags, __in_ecount_opt(FeatureLevels) CONST D3D_FEATURE_LEVEL *pFeatureLevels, UINT FeatureLevels, UINT SDKVersion, __in_opt CONST DXGI_SWAP_CHAIN_DESC *pSwapChainDesc, __out_opt IDXGISwapChain **ppSwapChain, __out_opt ID3D11Device **ppDevice, __out_opt D3D_FEATURE_LEVEL *pFeatureLevel, __out_opt ID3D11DeviceContext **ppImmediateContext);
 85 | typedef HRESULT(WINAPI *LPD3D11CREATEDEVICE)(IDXGIAdapter *, D3D_DRIVER_TYPE, HMODULE, UINT32, D3D_FEATURE_LEVEL *, UINT, UINT32, ID3D11Device **, D3D_FEATURE_LEVEL *, ID3D11DeviceContext **);
 86 | 
 87 | static HMODULE                              s_hModDXGI = NULL;
 88 | static LPCREATEDXGIFACTORY                  sFnPtr_CreateDXGIFactory = NULL;
 89 | static HMODULE                              s_hModD3D11 = NULL;
 90 | static LPD3D11CREATEDEVICE                  sFnPtr_D3D11CreateDevice = NULL;
 91 | static LPD3D11CREATEDEVICEANDSWAPCHAIN      sFnPtr_D3D11CreateDeviceAndSwapChain = NULL;
 92 | 
 93 | // unload the D3D10 DLLs
 94 | static bool dynlinkUnloadD3D11API(void)
 95 | {
 96 |     if (s_hModDXGI)
 97 |     {
 98 |         FreeLibrary(s_hModDXGI);
 99 |         s_hModDXGI = NULL;
100 |     }
101 | 
102 |     if (s_hModD3D11)
103 |     {
104 |         FreeLibrary(s_hModD3D11);
105 |         s_hModD3D11 = NULL;
106 |     }
107 | 
108 |     return true;
109 | }
110 | 
111 | // Dynamically load the D3D11 DLLs loaded and map the function pointers
112 | static bool dynlinkLoadD3D11API(void)
113 | {
114 |     // If both modules are non-NULL, this function has already been called.  Note
115 |     // that this doesn't guarantee that all ProcAddresses were found.
116 |     if (s_hModD3D11 != NULL && s_hModDXGI != NULL)
117 |     {
118 |         return true;
119 |     }
120 | 
121 | #if 1
122 |     // This may fail if Direct3D 11 isn't installed
123 |     s_hModD3D11 = LoadLibrary("d3d11.dll");
124 | 
125 |     if (s_hModD3D11 != NULL)
126 |     {
127 |         sFnPtr_D3D11CreateDevice = (LPD3D11CREATEDEVICE)GetProcAddress(s_hModD3D11, "D3D11CreateDevice");
128 |         sFnPtr_D3D11CreateDeviceAndSwapChain = (LPD3D11CREATEDEVICEANDSWAPCHAIN)GetProcAddress(s_hModD3D11, "D3D11CreateDeviceAndSwapChain");
129 |     }
130 |     else
131 |     {
132 |         printf("\nLoad d3d11.dll failed\n");
133 |         fflush(0);
134 |     }
135 | 
136 |     if (!sFnPtr_CreateDXGIFactory)
137 |     {
138 |         s_hModDXGI = LoadLibrary("dxgi.dll");
139 | 
140 |         if (s_hModDXGI)
141 |         {
142 |             sFnPtr_CreateDXGIFactory = (LPCREATEDXGIFACTORY)GetProcAddress(s_hModDXGI, "CreateDXGIFactory1");
143 |         }
144 | 
145 |         return (s_hModDXGI != NULL) && (s_hModD3D11 != NULL);
146 |     }
147 | 
148 |     return (s_hModD3D11 != NULL);
149 | #else
150 |     sFnPtr_D3D11CreateDevice = (LPD3D11CREATEDEVICE)D3D11CreateDeviceAndSwapChain;
151 |     sFnPtr_D3D11CreateDeviceAndSwapChain = (LPD3D11CREATEDEVICEANDSWAPCHAIN)D3D11CreateDeviceAndSwapChain;
152 |     //sFnPtr_D3DX11CreateEffectFromMemory  = ( LPD3DX11CREATEEFFECTFROMMEMORY )D3DX11CreateEffectFromMemory;
153 |     sFnPtr_D3DX11CompileFromMemory = (LPD3DX11COMPILEFROMMEMORY)D3DX11CompileFromMemory;
154 |     sFnPtr_CreateDXGIFactory = (LPCREATEDXGIFACTORY)CreateDXGIFactory;
155 |     return true;
156 | #endif
157 |     return true;
158 | }
159 | 
160 | #endif
161 | 


--------------------------------------------------------------------------------
/labs/CFD/English/C/source_code/p2pBandwidthLatencyTest/Common/UtilNPP/ImagesNPP.h:
--------------------------------------------------------------------------------
  1 | /* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
  2 |  *
  3 |  * Redistribution and use in source and binary forms, with or without
  4 |  * modification, are permitted provided that the following conditions
  5 |  * are met:
  6 |  *  * Redistributions of source code must retain the above copyright
  7 |  *    notice, this list of conditions and the following disclaimer.
  8 |  *  * Redistributions in binary form must reproduce the above copyright
  9 |  *    notice, this list of conditions and the following disclaimer in the
 10 |  *    documentation and/or other materials provided with the distribution.
 11 |  *  * Neither the name of NVIDIA CORPORATION nor the names of its
 12 |  *    contributors may be used to endorse or promote products derived
 13 |  *    from this software without specific prior written permission.
 14 |  *
 15 |  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
 16 |  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 17 |  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 18 |  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 19 |  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 20 |  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 21 |  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 22 |  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 23 |  * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 24 |  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 25 |  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 26 |  */
 27 | 
 28 | 
 29 | #ifndef NV_UTIL_NPP_IMAGES_NPP_H
 30 | #define NV_UTIL_NPP_IMAGES_NPP_H
 31 | 
 32 | #include "Exceptions.h"
 33 | #include "ImagePacked.h"
 34 | 
 35 | #include "ImageAllocatorsNPP.h"
 36 | #include <cuda_runtime.h>
 37 | 
 38 | namespace npp
 39 | {
 40 |     // forward declaration
 41 |     template<typename D, unsigned int N, class A> class ImageCPU;
 42 | 
 43 |     template<typename D, unsigned int N>
 44 |     class ImageNPP: public npp::ImagePacked<D, N, npp::ImageAllocator<D, N> >
 45 |     {
 46 |         public:
 47 |             ImageNPP()
 48 |             {
 49 |                 ;
 50 |             }
 51 | 
 52 |             ImageNPP(unsigned int nWidth, unsigned int nHeight, bool bTight = false): ImagePacked<D, N, npp::ImageAllocator<D, N> >(nWidth, nHeight, bTight)
 53 |             {
 54 |                 ;
 55 |             }
 56 | 
 57 |             ImageNPP(const npp::Image::Size &rSize): ImagePacked<D, N, npp::ImageAllocator<D, N> >(rSize)
 58 |             {
 59 |                 ;
 60 |             }
 61 | 
 62 |             ImageNPP(const ImageNPP<D, N> &rImage): Image(rImage)
 63 |             {
 64 |                 ;
 65 |             }
 66 | 
 67 |             template<class X>
 68 |             explicit
 69 |             ImageNPP(const ImageCPU<D, N, X> &rImage, bool bTight = false): ImagePacked<D, N, npp::ImageAllocator<D, N> >(rImage.width(), rImage.height(), bTight)
 70 |             {
 71 |                 npp::ImageAllocator<D, N>::HostToDeviceCopy2D(ImagePacked<D, N, npp::ImageAllocator<D, N> >::data(),
 72 |                                                               ImagePacked<D, N, npp::ImageAllocator<D, N> >::pitch(),
 73 |                                                               rImage.data(),
 74 |                                                               rImage.pitch(),
 75 |                                                               ImagePacked<D, N, npp::ImageAllocator<D, N> >::width(),
 76 |                                                               ImagePacked<D, N, npp::ImageAllocator<D, N> >::height());
 77 |             }
 78 | 
 79 |             virtual
 80 |             ~ImageNPP()
 81 |             {
 82 |                 ;
 83 |             }
 84 | 
 85 |             ImageNPP &
 86 |             operator= (const ImageNPP<D, N> &rImage)
 87 |             {
 88 |                 ImagePacked<D, N, npp::ImageAllocator<D, N> >::operator= (rImage);
 89 | 
 90 |                 return *this;
 91 |             }
 92 | 
 93 |             void
 94 |             copyTo(D *pData, unsigned int nPitch)
 95 |             const
 96 |             {
 97 |                 NPP_ASSERT((ImagePacked<D, N, npp::ImageAllocator<D, N> >::width() * sizeof(npp::Pixel<D, N>) <= nPitch));
 98 |                 npp::ImageAllocator<D, N>::DeviceToHostCopy2D(pData,
 99 |                                                               nPitch,
100 |                                                               ImagePacked<D, N, npp::ImageAllocator<D, N> >::data(),
101 |                                                               ImagePacked<D, N, npp::ImageAllocator<D, N> >::pitch(),
102 |                                                               ImagePacked<D, N, npp::ImageAllocator<D, N> >::width(),
103 |                                                               ImagePacked<D, N, npp::ImageAllocator<D, N> >::height());
104 |             }
105 | 
106 |             void
107 |             copyFrom(D *pData, unsigned int nPitch)
108 |             {
109 |                 NPP_ASSERT((ImagePacked<D, N, npp::ImageAllocator<D, N> >::width() * sizeof(npp::Pixel<D, N>) <= nPitch));
110 |                 npp::ImageAllocator<D, N>::HostToDeviceCopy2D(ImagePacked<D, N, npp::ImageAllocator<D, N> >::data(),
111 |                                                               ImagePacked<D, N, npp::ImageAllocator<D, N> >::pitch(),
112 |                                                               pData,
113 |                                                               nPitch,
114 |                                                               ImagePacked<D, N, npp::ImageAllocator<D, N> >::width(),
115 |                                                               ImagePacked<D, N, npp::ImageAllocator<D, N> >::height());
116 |             }
117 |     };
118 | 
119 |     typedef ImageNPP<Npp8u,  1>   ImageNPP_8u_C1;
120 |     typedef ImageNPP<Npp8u,  2>   ImageNPP_8u_C2;
121 |     typedef ImageNPP<Npp8u,  3>   ImageNPP_8u_C3;
122 |     typedef ImageNPP<Npp8u,  4>   ImageNPP_8u_C4;
123 | 
124 |     typedef ImageNPP<Npp16u, 1>  ImageNPP_16u_C1;
125 |     typedef ImageNPP<Npp16u, 2>  ImageNPP_16u_C2;
126 |     typedef ImageNPP<Npp16u, 3>  ImageNPP_16u_C3;
127 |     typedef ImageNPP<Npp16u, 4>  ImageNPP_16u_C4;
128 | 
129 |     typedef ImageNPP<Npp16s, 1>  ImageNPP_16s_C1;
130 |     typedef ImageNPP<Npp16s, 3>  ImageNPP_16s_C3;
131 |     typedef ImageNPP<Npp16s, 4>  ImageNPP_16s_C4;
132 | 
133 |     typedef ImageNPP<Npp32s, 1>  ImageNPP_32s_C1;
134 |     typedef ImageNPP<Npp32s, 3>  ImageNPP_32s_C3;
135 |     typedef ImageNPP<Npp32s, 4>  ImageNPP_32s_C4;
136 | 
137 |     typedef ImageNPP<Npp32f, 1>  ImageNPP_32f_C1;
138 |     typedef ImageNPP<Npp32f, 2>  ImageNPP_32f_C2;
139 |     typedef ImageNPP<Npp32f, 3>  ImageNPP_32f_C3;
140 |     typedef ImageNPP<Npp32f, 4>  ImageNPP_32f_C4;
141 | 
142 |     typedef ImageNPP<Npp64f, 1>  ImageNPP_64f_C1;
143 |     typedef ImageNPP<Npp64f, 2>  ImageNPP_64f_C2;
144 |     typedef ImageNPP<Npp64f, 3>  ImageNPP_64f_C3;
145 |     typedef ImageNPP<Npp64f, 4>  ImageNPP_64f_C4;
146 | 
147 | } // npp namespace
148 | 
149 | #endif // NV_UTIL_NPP_IMAGES_NPP_H
150 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | ** Please note this repository is archived and no more actively maintained ** 
  2 | 
  3 | # N-Ways to Multi-GPU Programming
  4 | 
  5 | This repository contains mini applications for GPU Bootcamps. This bootcamp focuses on multi-GPU programming models.
  6 | 
  7 | Scaling applications to multiple GPUs across multiple nodes requires one to be adept at not just the programming models and optimization techniques, but also at performing root-cause analysis using in-depth profiling to identify and minimize bottlenecks. In this bootcamp, participants will learn to improve the performance of an application step-by-step, taking cues from profilers along the way. Moreover, understanding of the underlying technologies and communication topology will help us utilize high-performance NVIDIA libraries to extract more performance out of the system.
  8 | 
  9 | ## Bootcamp Outline
 10 | 
 11 | * Overview of single-GPU code and Nsight Systems Profiler
 12 | * Single Node Multi-GPU:
 13 |   - CUDA Memcpy and Peer-to-Peer Memory Access
 14 |   - Intra-node topology
 15 |   - CUDA Streams and Events
 16 | * Multi-Node Multi-GPU:
 17 |   - Introduction to MPI and Multi-Node execution overview
 18 |   - MPI with CUDA Memcpy
 19 |   - CUDA-aware MPI
 20 |   - Supplemental: Configuring MPI in a containerized environment
 21 | * NVIDIA Collectives Communications Library (NCCL)
 22 | * NVHSMEM Library
 23 | 
 24 | ## Prerequisites
 25 | 
 26 | This bootcamp requires a multi-node system with multiple GPUs in each node (atleast 2 GPUs/ node).
 27 | 
 28 | ## Tutorial Duration
 29 | 
 30 | The total bootcamp material  would take approximately 8 hours .
 31 | 
 32 | ### Using NVIDIA HPC SDK
 33 | 
 34 | A multi-node installation of [NVIDIA's HPC SDK](https://developer.nvidia.com/hpc-sdk) is desired. Refer to [NVIDIA HPC SDK Installation Guide](https://docs.nvidia.com/hpc-sdk/hpc-sdk-install-guide/index.html) for detailed instructions. Ensure that your installation contains HPCX with UCX. 
 35 | 
 36 | After installation, make sure to add HPC SDK to the environment as follows(For example the PATH highlighted below is for HPC SDK 21.5):
 37 | 
 38 | ```bash
 39 | # Add HPC-SDK to PATH:
 40 | export PATH="<path-to-nvidia-hpc-sdk>/Linux_x86_64/21.5/compilers/bin:<path-to-nvidia-hpc-sdk>/Linux_x86_64/21.5/cuda/bin:$PATH"
 41 | # Add HPC-SDK to LD_LIBRARY_PATH:
 42 | export LD_LIBRARY_PATH="<path-to-nvidia-hpc-sdk>/Linux_x86_64/21.5/comm_libs/nvshmem/lib:<path-to-nvidia-hpc-sdk>/Linux_x86_64/21.5/comm_libs/nccl/lib:<path-to-nvidia-hpc-sdk>/Linux_x86_64/21.5/comm_libs/mpi/lib:<path-to-nvidia-hpc-sdk>/Linux_x86_64/21.5/math_libs/lib64:<path-to-nvidia-hpc-sdk>/Linux_x86_64/21.5/compilers/lib:<path-to-nvidia-hpc-sdk>/Linux_x86_64/21.5/cuda/extras/CUPTI/lib64:<path-nvidia-hpc-sdk>>/Linux_x86_64/21.5/cuda/lib64:$LD_LIBRARY_PATH"
 43 | #ADD NVSHMEM HOME DIRECTORY PATH
 44 | export CUDA_HOME=<path-to-nvidia-hpc-sdk>/Linux_x86_64/21.5/cuda
 45 | export NVSHMEM_HOME=<path-to-nvidia-hpc-sdk>/Linux_x86_64/21.5/comm_libs/nvshmem
 46 | ```
 47 | **Note:** If you don't use Slurm workload manager, remove `--with-slurm` flag.
 48 | 
 49 | Then, install OpenMPI as follows:
 50 | 
 51 | ```bash
 52 | # Download and extract OpenMPI Tarfile
 53 | wget https://download.open-mpi.org/release/open-mpi/v4.1/openmpi-4.1.1.tar.gz
 54 | tar -xvzf openmpi-4.1.1.tar.gz
 55 | cd openmpi-4.1.1/
 56 | mkdir -p build
 57 | # Configure OpenMPI
 58 | ./configure --prefix=$PWD/build --with-libevent=internal --with-xpmem --with-cuda=<path-to-nvidia-hpc-sdk>/Linux_x86_64/21.5/cuda/ --with-slurm --enable-mpi1-compatibility --with-verbs --with-hcoll=<path-to-nvidia-hpc-sdk>/Linux_x86_64/21.5/comm_libs/hpcx/hpcx-2.8.1/hcoll/lib --with-ucx=<path-to-nvidia-hpc-sdk>/Linux_x86_64/21.5/comm_libs/hpcx/hpcx-2.8.1/ucx/
 59 | # Install OpenMPI
 60 | make all install
 61 | ```
 62 | 
 63 | Now, add OpenMPI to the environment:
 64 | 
 65 | ```bash
 66 | export PATH="<path-to-openmpi>/build/bin/:$PATH"
 67 | export LD_LIBRARY_PATH="<path-to-openmpi/build/lib:$LD_LIBRARY_PATH"
 68 | ```
 69 | 
 70 | Ensure that the custom-built OpenMPI is in use by running `which mpirun` which should point the `mpirun` binary in `<path-to-openmpi>/build/bin` directory.
 71 | 
 72 | ### Without Using NVIDIA HPC SDK
 73 | 
 74 | Multi-node compatible versions of the following are required:
 75 | 
 76 | * [OpenMPI](https://www.open-mpi.org/)
 77 | * [HPCX](https://developer.nvidia.com/networking/hpc-x)
 78 | * [CUDA Toolkit](https://developer.nvidia.com/cuda-toolkit)
 79 | * [NCCL](https://developer.nvidia.com/nccl)
 80 | * [NVSHMEM](https://developer.nvidia.com/nvshmem)
 81 | 
 82 | ## Testing
 83 | 
 84 | We have tested all the codes with CUDA drivers 460.32.03 with CUDA 11.3.0.0, OpenMPI 4.1.1, HPCX 2.8.1, Singularity 3.6.1, NCCL 2.9.9.1, and NVSHMEM 2.1.2. Note that OpenMPI in our cluster was compiled with CUDA, HCOLL, and UCX support.
 85 | 
 86 | ## Running Jupyter Lab
 87 | 
 88 | As this bootcamp covers multi-node CUDA-aware MPI concepts, it is primarily designed to run without any containers. After the prerequisite softwares have been installed, follow these steps to install and run Jupyter Lab:
 89 | 
 90 | ```bash
 91 | # Install Anaconda3
 92 | wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh 
 93 | bash Miniconda3-latest-Linux-x86_64.sh -b -p <my_dir>
 94 | # Add conda to PATH
 95 | export PATH=$PATH:<my_dir>/bin/
 96 | # Install Jupyter Lab
 97 | conda install -c conda-forge jupyterlab
 98 | # Run Jupyter Lab
 99 | jupyter lab --notebook-dir=<path-to-gpubootcamp-repo>/hpc/multi_gpu_nways/labs/ --port=8000 --ip=0.0.0.0 --no-browser --NotebookApp.token=""
100 | ```
101 | 
102 | After running Jupyter Lab, open [http://localhost:8888](http://localhost:8888/) in a web browser and start the `introduction.ipynb` notebook.
103 | 
104 | ## Optional: Containerized Build with Singularity
105 | 
106 | This material is designed to primarily run in containerless environments, that is, directly on the cluster. Thus, building the Singularity container is OPTIONAL.
107 | 
108 | If containerization is desired, follow the steps outlined in the notebook [MPI in Containerized Environments](labs/CFD/English/C/jupyter_notebook/mpi/containers_and_mpi.ipynb).
109 | 
110 | Follow the steps below to build the Singularity container image and run Jupyter Lab:
111 | 
112 | ```bash
113 | # Build the container
114 | singularity build multi_gpu_nways.simg Singularity
115 | # Run Jupyter Lab
116 | singularity run --nv multi_gpu_nways.simg jupyter lab --notebook-dir=<path-to-gpubootcamp-repo>/hpc/multi_gpu_nways/labs/ --port=8000 --ip=0.0.0.0 --no-browser --NotebookApp.token="" 
117 | ```
118 | 
119 | Then, access Jupyter Lab on [http://localhost:8888](http://localhost:8888/).
120 | 
121 | 
122 | ## Known issues
123 | 
124 | #### Compiler throws errors
125 | 
126 | If compiling any program throws an error related to CUDA/ NCCL/ NVHSMEM/ MPI libraries or header files being not found, ensure that `LD_LIBRARY_PATH` is correctly set. Moreover, make sure environment variables `CUDA_HOME`, `NCCL_HOME`, and `NVSHMEM_HOME` are set either during installation or manually inside each `Makefile`.
127 | 
128 | - Please go through the list of exisiting bugs/issues or file a new issue at [Github](https://github.com/gpuhackathons-org/gpubootcamp/issues).
129 | 
130 | 
131 | ## Questions?
132 | 
133 | Please join [OpenACC Slack Channel](https://openacclang.slack.com/messages/openaccusergroup) to raise questions.
134 | 
135 | If you observe any errors or issues, please file an issue on [GPUBootcamp GitHuB repository](https://github.com/gpuhackathons-org/gpubootcamp).
136 | 


--------------------------------------------------------------------------------
/labs/CFD/English/C/source_code/p2pBandwidthLatencyTest/Common/nvrtc_helper.h:
--------------------------------------------------------------------------------
  1 | /* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
  2 |  *
  3 |  * Redistribution and use in source and binary forms, with or without
  4 |  * modification, are permitted provided that the following conditions
  5 |  * are met:
  6 |  *  * Redistributions of source code must retain the above copyright
  7 |  *    notice, this list of conditions and the following disclaimer.
  8 |  *  * Redistributions in binary form must reproduce the above copyright
  9 |  *    notice, this list of conditions and the following disclaimer in the
 10 |  *    documentation and/or other materials provided with the distribution.
 11 |  *  * Neither the name of NVIDIA CORPORATION nor the names of its
 12 |  *    contributors may be used to endorse or promote products derived
 13 |  *    from this software without specific prior written permission.
 14 |  *
 15 |  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
 16 |  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 17 |  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 18 |  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 19 |  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 20 |  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 21 |  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 22 |  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 23 |  * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 24 |  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 25 |  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 26 |  */
 27 | 
 28 | #ifndef COMMON_NVRTC_HELPER_H_
 29 | 
 30 | #define COMMON_NVRTC_HELPER_H_ 1
 31 | 
 32 | #include <cuda.h>
 33 | #include <helper_cuda_drvapi.h>
 34 | #include <nvrtc.h>
 35 | #include <fstream>
 36 | #include <iostream>
 37 | #include <sstream>
 38 | #include <string>
 39 | 
 40 | #define NVRTC_SAFE_CALL(Name, x)                                \
 41 |   do {                                                          \
 42 |     nvrtcResult result = x;                                     \
 43 |     if (result != NVRTC_SUCCESS) {                              \
 44 |       std::cerr << "\nerror: " << Name << " failed with error " \
 45 |                 << nvrtcGetErrorString(result);                 \
 46 |       exit(1);                                                  \
 47 |     }                                                           \
 48 |   } while (0)
 49 | 
 50 | void compileFileToCUBIN(char *filename, int argc, char **argv, char **cubinResult,
 51 |                       size_t *cubinResultSize, int requiresCGheaders) {
 52 |   std::ifstream inputFile(filename,
 53 |                           std::ios::in | std::ios::binary | std::ios::ate);
 54 | 
 55 |   if (!inputFile.is_open()) {
 56 |     std::cerr << "\nerror: unable to open " << filename << " for reading!\n";
 57 |     exit(1);
 58 |   }
 59 | 
 60 |   std::streampos pos = inputFile.tellg();
 61 |   size_t inputSize = (size_t)pos;
 62 |   char *memBlock = new char[inputSize + 1];
 63 | 
 64 |   inputFile.seekg(0, std::ios::beg);
 65 |   inputFile.read(memBlock, inputSize);
 66 |   inputFile.close();
 67 |   memBlock[inputSize] = '\x0';
 68 | 
 69 |   int numCompileOptions = 0;
 70 | 
 71 |   char *compileParams[2];
 72 | 
 73 |   int major = 0, minor = 0;
 74 |   char deviceName[256];
 75 | 
 76 |   // Picks the best CUDA device available
 77 |   CUdevice cuDevice = findCudaDeviceDRV(argc, (const char **)argv);
 78 | 
 79 |   // get compute capabilities and the devicename
 80 |   checkCudaErrors(cuDeviceGetAttribute(
 81 |       &major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevice));
 82 |   checkCudaErrors(cuDeviceGetAttribute(
 83 |       &minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevice));
 84 |   
 85 |   {
 86 |   // Compile cubin for the GPU arch on which are going to run cuda kernel.
 87 |   std::string compileOptions;
 88 |   compileOptions = "--gpu-architecture=sm_";
 89 | 
 90 |   compileParams[numCompileOptions] = reinterpret_cast<char *>(
 91 |                   malloc(sizeof(char) * (compileOptions.length() + 10)));
 92 | #if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
 93 |   sprintf_s(compileParams[numCompileOptions], sizeof(char) * (compileOptions.length() + 10),
 94 |             "%s%d%d", compileOptions.c_str(), major, minor);
 95 | #else
 96 |   snprintf(compileParams[numCompileOptions], compileOptions.size() + 10, "%s%d%d",
 97 |            compileOptions.c_str(), major, minor);
 98 | #endif
 99 |   }
100 | 
101 |   numCompileOptions++;
102 | 
103 |   if (requiresCGheaders) {
104 |     std::string compileOptions;
105 |     char HeaderNames[256];
106 | #if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
107 |     sprintf_s(HeaderNames, sizeof(HeaderNames), "%s", "cooperative_groups.h");
108 | #else
109 |     snprintf(HeaderNames, sizeof(HeaderNames), "%s", "cooperative_groups.h");
110 | #endif
111 | 
112 |     compileOptions = "--include-path=";
113 | 
114 |     std::string path = sdkFindFilePath(HeaderNames, argv[0]);
115 |     if (!path.empty()) {
116 |       std::size_t found = path.find(HeaderNames);
117 |       path.erase(found);
118 |     } else {
119 |       printf(
120 |           "\nCooperativeGroups headers not found, please install it in %s "
121 |           "sample directory..\n Exiting..\n",
122 |           argv[0]);
123 |     }
124 |     compileOptions += path.c_str();
125 |     compileParams[numCompileOptions] = reinterpret_cast<char *>(
126 |         malloc(sizeof(char) * (compileOptions.length() + 1)));
127 | #if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
128 |     sprintf_s(compileParams[numCompileOptions], sizeof(char) * (compileOptions.length() + 1),
129 |               "%s", compileOptions.c_str());
130 | #else
131 |     snprintf(compileParams[numCompileOptions], compileOptions.size(), "%s",
132 |              compileOptions.c_str());
133 | #endif
134 |     numCompileOptions++;
135 |   }
136 | 
137 |   // compile
138 |   nvrtcProgram prog;
139 |   NVRTC_SAFE_CALL("nvrtcCreateProgram",
140 |                   nvrtcCreateProgram(&prog, memBlock, filename, 0, NULL, NULL));
141 | 
142 |   nvrtcResult res = nvrtcCompileProgram(prog, numCompileOptions, compileParams);
143 | 
144 |   // dump log
145 |   size_t logSize;
146 |   NVRTC_SAFE_CALL("nvrtcGetProgramLogSize",
147 |                   nvrtcGetProgramLogSize(prog, &logSize));
148 |   char *log = reinterpret_cast<char *>(malloc(sizeof(char) * logSize + 1));
149 |   NVRTC_SAFE_CALL("nvrtcGetProgramLog", nvrtcGetProgramLog(prog, log));
150 |   log[logSize] = '\x0';
151 | 
152 |   if (strlen(log) >= 2) {
153 |     std::cerr << "\n compilation log ---\n";
154 |     std::cerr << log;
155 |     std::cerr << "\n end log ---\n";
156 |   }
157 | 
158 |   free(log);
159 | 
160 |   NVRTC_SAFE_CALL("nvrtcCompileProgram", res);
161 | 
162 |   size_t codeSize;
163 |   NVRTC_SAFE_CALL("nvrtcGetCUBINSize", nvrtcGetCUBINSize(prog, &codeSize));
164 |   char *code = new char[codeSize];
165 |   NVRTC_SAFE_CALL("nvrtcGetCUBIN", nvrtcGetCUBIN(prog, code));
166 |   *cubinResult = code;
167 |   *cubinResultSize = codeSize;
168 | 
169 |   for (int i = 0; i < numCompileOptions; i++) {
170 |     free(compileParams[i]);
171 |   }
172 | }
173 | 
174 | CUmodule loadCUBIN(char *cubin, int argc, char **argv) {
175 |   CUmodule module;
176 |   CUcontext context;
177 |   int major = 0, minor = 0;
178 |   char deviceName[256];
179 | 
180 |   // Picks the best CUDA device available
181 |   CUdevice cuDevice = findCudaDeviceDRV(argc, (const char **)argv);
182 | 
183 |   // get compute capabilities and the devicename
184 |   checkCudaErrors(cuDeviceGetAttribute(
185 |       &major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevice));
186 |   checkCudaErrors(cuDeviceGetAttribute(
187 |       &minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevice));
188 |   checkCudaErrors(cuDeviceGetName(deviceName, 256, cuDevice));
189 |   printf("> GPU Device has SM %d.%d compute capability\n", major, minor);
190 | 
191 |   checkCudaErrors(cuInit(0));
192 |   checkCudaErrors(cuCtxCreate(&context, 0, cuDevice));
193 | 
194 |   checkCudaErrors(cuModuleLoadData(&module, cubin));
195 |   free(cubin);
196 | 
197 |   return module;
198 | }
199 | 
200 | #endif  // COMMON_NVRTC_HELPER_H_
201 | 


--------------------------------------------------------------------------------
/labs/CFD/English/C/source_code/single_gpu/jacobi.cu:
--------------------------------------------------------------------------------
  1 | /* Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
  2 |  *
  3 |  * Redistribution and use in source and binary forms, with or without
  4 |  * modification, are permitted provided that the following conditions
  5 |  * are met:
  6 |  *  * Redistributions of source code must retain the above copyright
  7 |  *    notice, this list of conditions and the following disclaimer.
  8 |  *  * Redistributions in binary form must reproduce the above copyright
  9 |  *    notice, this list of conditions and the following disclaimer in the
 10 |  *    documentation and/or other materials provided with the distribution.
 11 |  *  * Neither the name of NVIDIA CORPORATION nor the names of its
 12 |  *    contributors may be used to endorse or promote products derived
 13 |  *    from this software without specific prior written permission.
 14 |  *
 15 |  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
 16 |  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 17 |  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 18 |  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 19 |  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 20 |  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 21 |  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 22 |  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 23 |  * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 24 |  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 25 |  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 26 |  */
 27 | #include <algorithm>
 28 | #include <array>
 29 | #include <climits>
 30 | #include <cmath>
 31 | #include <cstdio>
 32 | #include <iostream>
 33 | #include <iterator>
 34 | #include <sstream>
 35 | 
 36 | #include <omp.h>
 37 | #include <nvToolsExt.h>
 38 | 
 39 | #define BLOCK_DIM_X 32
 40 | #define BLOCK_DIM_Y 32
 41 | 
 42 | #define CUDA_RT_CALL(call)                                                                  \
 43 |     {                                                                                       \
 44 |         cudaError_t cudaStatus = call;                                                      \
 45 |         if (cudaSuccess != cudaStatus)                                                      \
 46 |             fprintf(stderr,                                                                 \
 47 |                     "ERROR: CUDA RT call \"%s\" in line %d of file %s failed "              \
 48 |                     "with "                                                                 \
 49 |                     "%s (%d).\n",                                                           \
 50 |                     #call, __LINE__, __FILE__, cudaGetErrorString(cudaStatus), cudaStatus); \
 51 |     }
 52 | 
 53 | constexpr float tol = 1.0e-8;
 54 | 
 55 | const float PI = 2.0 * std::asin(1.0);
 56 | 
 57 | __global__ void initialize_boundaries(float*  a_new, float*  a, const float pi, const int offset, 
 58 | 					const int nx, const int my_ny, const int ny) {
 59 |     for (int iy = blockIdx.x * blockDim.x + threadIdx.x; iy < my_ny; iy += blockDim.x * gridDim.x) {
 60 |         const float y0 = sin(2.0 * pi * (offset + iy) / (ny - 1));
 61 |         a[iy * nx + 0] = y0;
 62 |         a[iy * nx + (nx - 1)] = y0;
 63 |         a_new[iy * nx + 0] = y0;
 64 |         a_new[iy * nx + (nx - 1)] = y0;
 65 |     }
 66 | }
 67 | 
 68 | __global__ void jacobi_kernel(float*  a_new, const float*  a, float*  l2_norm, const int iy_start,
 69 |                               const int iy_end, const int nx) {
 70 |     int iy = blockIdx.y * blockDim.y + threadIdx.y + iy_start;
 71 |     int ix = blockIdx.x * blockDim.x + threadIdx.x + 1;
 72 |     __shared__ float block_l2_sum[BLOCK_DIM_X*BLOCK_DIM_Y];
 73 |     unsigned thread_index = threadIdx.y*BLOCK_DIM_X + threadIdx.x;
 74 | 
 75 |     if (iy < iy_end && ix < (nx - 1)) {
 76 | 	// Update grid point
 77 |         const float new_val = 0.25 * (a[iy * nx + ix + 1] + a[iy * nx + ix - 1] +
 78 |                                      a[(iy + 1) * nx + ix] + a[(iy - 1) * nx + ix]);
 79 |         a_new[iy * nx + ix] = new_val;
 80 | 	float residue = new_val - a[iy * nx + ix];
 81 | 	// Set block-level L2 norm value for this grid point
 82 | 	block_l2_sum[thread_index] = residue * residue;
 83 |     }
 84 |     else {
 85 | 	block_l2_sum[thread_index] = 0;
 86 |     }
 87 |     // Reduce L2 norm for the block in parallel
 88 |     for (unsigned stride = 1; stride < BLOCK_DIM_X*BLOCK_DIM_Y; stride *= 2) {
 89 | 	__syncthreads();
 90 | 	if ((thread_index) % (2*stride) == 0) {
 91 |     	    block_l2_sum[thread_index] += block_l2_sum[thread_index + stride];
 92 | 	}
 93 |     }
 94 |     // Atomically update global L2 norm with block-reduced L2 norm
 95 |     if (thread_index == 0) {
 96 | 	atomicAdd(l2_norm, block_l2_sum[0]);
 97 |     }
 98 | }
 99 | 
100 | int get_argval(char** begin, char** end, const std::string& arg, const int default_val) {
101 |     int argval = default_val;
102 |     char** itr = std::find(begin, end, arg);
103 |     if (itr != end && ++itr != end) {
104 |         std::istringstream inbuf(*itr);
105 |         inbuf >> argval;
106 |     }
107 |     return argval;
108 | }
109 | 
110 | double single_gpu(const int nx, const int ny, const int iter_max, float* const a_ref_h);
111 | 
112 | int main(int argc, char* argv[]) {
113 |     const int iter_max = get_argval(argv, argv + argc, "-niter", 1000);
114 |     const int nx = get_argval(argv, argv + argc, "-nx", 16384);
115 |     const int ny = get_argval(argv, argv + argc, "-ny", 16384);
116 | 
117 |     CUDA_RT_CALL(cudaSetDevice(0));
118 |     CUDA_RT_CALL(cudaFree(0));
119 | 
120 |     float* a_ref_h;
121 |     CUDA_RT_CALL(cudaMallocHost(&a_ref_h, nx * ny * sizeof(float)));
122 |     
123 |     double runtime_serial = single_gpu(nx, ny, iter_max, a_ref_h);
124 | 
125 |     printf("%dx%d: 1 GPU: %8.4f s\n", nx, ny, runtime_serial);
126 | 
127 |     return 0;
128 | }
129 | 
130 | double single_gpu(const int nx, const int ny, const int iter_max, float* const a_ref_h) {
131 |     float* a;
132 |     float* a_new;
133 | 
134 |     float* l2_norm_d;
135 |     float* l2_norm_h;
136 | 
137 |     int iy_start = 1;
138 |     int iy_end = (ny - 1);
139 | 
140 |     CUDA_RT_CALL(cudaMalloc(&a, nx * ny * sizeof(float)));
141 |     CUDA_RT_CALL(cudaMalloc(&a_new, nx * ny * sizeof(float)));
142 | 
143 |     CUDA_RT_CALL(cudaMemset(a, 0, nx * ny * sizeof(float)));
144 |     CUDA_RT_CALL(cudaMemset(a_new, 0, nx * ny * sizeof(float)));
145 | 
146 |     // Set diriclet boundary conditions on left and right boarder
147 |     nvtxRangePush("Init boundaries");
148 |     initialize_boundaries<<<ny / 128 + 1, 128>>>(a, a_new, PI, 0, nx, ny, ny);
149 |     CUDA_RT_CALL(cudaGetLastError());
150 |     CUDA_RT_CALL(cudaDeviceSynchronize());
151 |     nvtxRangePop();
152 | 
153 |     CUDA_RT_CALL(cudaMalloc(&l2_norm_d, sizeof(float)));
154 |     CUDA_RT_CALL(cudaMallocHost(&l2_norm_h, sizeof(float)));
155 | 
156 |     CUDA_RT_CALL(cudaDeviceSynchronize());
157 | 
158 |     printf("Single GPU jacobi relaxation: %d iterations on %d x %d mesh\n", iter_max, nx, ny);
159 | 
160 |     dim3 dim_grid((nx + BLOCK_DIM_X - 1) / BLOCK_DIM_X, (ny + BLOCK_DIM_Y - 1) / BLOCK_DIM_Y, 1);
161 |     dim3 dim_block(BLOCK_DIM_X, BLOCK_DIM_Y, 1);
162 | 
163 |     int iter = 0;
164 |     float l2_norm = 1.0;
165 | 
166 |     double start = omp_get_wtime();
167 |     nvtxRangePush("Jacobi Solve");
168 |     while (l2_norm > tol && iter < iter_max) {
169 |         CUDA_RT_CALL(cudaMemset(l2_norm_d, 0, sizeof(float)));
170 | 
171 | 	// Compute grid points for this iteration
172 |         jacobi_kernel<<<dim_grid, dim_block>>>(a_new, a, l2_norm_d, iy_start, iy_end, nx);
173 |        	CUDA_RT_CALL(cudaGetLastError());
174 |         CUDA_RT_CALL(cudaMemcpy(l2_norm_h, l2_norm_d, sizeof(float), cudaMemcpyDeviceToHost));
175 | 
176 |         // Apply periodic boundary conditions
177 | 
178 |         CUDA_RT_CALL(cudaMemcpy(a_new, a_new + (iy_end - 1) * nx, nx * sizeof(float),
179 |                                      cudaMemcpyDeviceToDevice));
180 |         CUDA_RT_CALL(cudaMemcpy(a_new + iy_end * nx, a_new + iy_start * nx, nx * sizeof(float),
181 |                                      cudaMemcpyDeviceToDevice));
182 | 
183 | 	CUDA_RT_CALL(cudaDeviceSynchronize());
184 | 	l2_norm = *l2_norm_h;
185 | 	l2_norm = std::sqrt(l2_norm);
186 | 
187 |         iter++;
188 | 	if ((iter % 100) == 0) printf("%5d, %0.6f\n", iter, l2_norm);
189 | 
190 |         std::swap(a_new, a);
191 |     }
192 |     nvtxRangePop();
193 |     double stop = omp_get_wtime();
194 | 
195 |     CUDA_RT_CALL(cudaMemcpy(a_ref_h, a, nx * ny * sizeof(float), cudaMemcpyDeviceToHost));
196 | 
197 |     CUDA_RT_CALL(cudaFreeHost(l2_norm_h));
198 |     CUDA_RT_CALL(cudaFree(l2_norm_d));
199 | 
200 |     CUDA_RT_CALL(cudaFree(a_new));
201 |     CUDA_RT_CALL(cudaFree(a));
202 |     return (stop - start);
203 | }
204 | 
205 | 


--------------------------------------------------------------------------------
/labs/CFD/English/C/source_code/p2pBandwidthLatencyTest/Common/UtilNPP/Exceptions.h:
--------------------------------------------------------------------------------
  1 | /* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
  2 |  *
  3 |  * Redistribution and use in source and binary forms, with or without
  4 |  * modification, are permitted provided that the following conditions
  5 |  * are met:
  6 |  *  * Redistributions of source code must retain the above copyright
  7 |  *    notice, this list of conditions and the following disclaimer.
  8 |  *  * Redistributions in binary form must reproduce the above copyright
  9 |  *    notice, this list of conditions and the following disclaimer in the
 10 |  *    documentation and/or other materials provided with the distribution.
 11 |  *  * Neither the name of NVIDIA CORPORATION nor the names of its
 12 |  *    contributors may be used to endorse or promote products derived
 13 |  *    from this software without specific prior written permission.
 14 |  *
 15 |  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
 16 |  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 17 |  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 18 |  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 19 |  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 20 |  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 21 |  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 22 |  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 23 |  * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 24 |  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 25 |  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 26 |  */
 27 | 
 28 | #ifndef NV_UTIL_NPP_EXCEPTIONS_H
 29 | #define NV_UTIL_NPP_EXCEPTIONS_H
 30 | 
 31 | 
 32 | #include <string>
 33 | #include <sstream>
 34 | #include <iostream>
 35 | 
 36 | /// All npp related C++ classes are put into the npp namespace.
 37 | namespace npp
 38 | {
 39 | 
 40 |     /// Exception base class.
 41 |     ///     This exception base class will be used for everything C++ throught
 42 |     /// the NPP project.
 43 |     ///     The exception contains a string message, as well as data fields for a string
 44 |     /// containing the name of the file as well as the line number where the exception was thrown.
 45 |     ///     The easiest way of throwing exceptions and providing filename and line number is
 46 |     /// to use one of the ASSERT macros defined for that purpose.
 47 |     class Exception
 48 |     {
 49 |         public:
 50 |             /// Constructor.
 51 |             /// \param rMessage A message with information as to why the exception was thrown.
 52 |             /// \param rFileName The name of the file where the exception was thrown.
 53 |             /// \param nLineNumber Line number in the file where the exception was thrown.
 54 |             explicit
 55 |             Exception(const std::string &rMessage = "", const std::string &rFileName = "", unsigned int nLineNumber = 0)
 56 |                 : sMessage_(rMessage), sFileName_(rFileName), nLineNumber_(nLineNumber)
 57 |             { };
 58 | 
 59 |             Exception(const Exception &rException)
 60 |                 : sMessage_(rException.sMessage_), sFileName_(rException.sFileName_), nLineNumber_(rException.nLineNumber_)
 61 |             { };
 62 | 
 63 |             virtual
 64 |             ~Exception()
 65 |             { };
 66 | 
 67 |             /// Get the exception's message.
 68 |             const
 69 |             std::string &
 70 |             message()
 71 |             const
 72 |             {
 73 |                 return sMessage_;
 74 |             }
 75 | 
 76 |             /// Get the exception's file info.
 77 |             const
 78 |             std::string &
 79 |             fileName()
 80 |             const
 81 |             {
 82 |                 return sFileName_;
 83 |             }
 84 | 
 85 |             /// Get the exceptions's line info.
 86 |             unsigned int
 87 |             lineNumber()
 88 |             const
 89 |             {
 90 |                 return nLineNumber_;
 91 |             }
 92 | 
 93 | 
 94 |             /// Create a clone of this exception.
 95 |             ///      This creates a new Exception object on the heap. It is
 96 |             /// the responsibility of the user of this function to free this memory
 97 |             /// (delete x).
 98 |             virtual
 99 |             Exception *
100 |             clone()
101 |             const
102 |             {
103 |                 return new Exception(*this);
104 |             }
105 | 
106 |             /// Create a single string with all the exceptions information.
107 |             ///     The virtual toString() method is used by the operator<<()
108 |             /// so that all exceptions derived from this base-class can print
109 |             /// their full information correctly even if a reference to their
110 |             /// exact type is not had at the time of printing (i.e. the basic
111 |             /// operator<<() is used).
112 |             virtual
113 |             std::string
114 |             toString()
115 |             const
116 |             {
117 |                 std::ostringstream oOutputString;
118 |                 oOutputString << fileName() << ":" << lineNumber() << ": " << message();
119 |                 return oOutputString.str();
120 |             }
121 | 
122 |         private:
123 |             std::string sMessage_;      ///< Message regarding the cause of the exception.
124 |             std::string sFileName_;     ///< Name of the file where the exception was thrown.
125 |             unsigned int nLineNumber_;  ///< Line number in the file where the exception was thrown
126 |     };
127 | 
128 |     /// Output stream inserter for Exception.
129 |     /// \param rOutputStream The stream the exception information is written to.
130 |     /// \param rException The exception that's being written.
131 |     /// \return Reference to the output stream being used.
132 |     std::ostream &
133 |     operator << (std::ostream &rOutputStream, const Exception &rException)
134 |     {
135 |         rOutputStream << rException.toString();
136 |         return rOutputStream;
137 |     }
138 | 
139 |     /// Basic assert macro.
140 |     ///     This macro should be used to enforce any kind of pre or post conditions.
141 |     /// Unlike the C-runtime assert macro, this macro does not abort execution, but throws
142 |     /// a C++ exception. The exception is automatically filled with information about the failing
143 |     /// condition, the filename and line number where the exception was thrown.
144 |     /// \note The macro is written in such a way that omitting a semicolon after its usage
145 |     ///     causes a compiler error. The correct way to invoke this macro is:
146 |     /// NPP_ASSERT(n < MAX);
147 | #define NPP_ASSERT(C) do {if (!(C)) throw npp::Exception(#C " assertion faild!", __FILE__, __LINE__);} while(false)
148 | 
149 |     // ASSERT macro.
150 |     //  Same functionality as the basic assert macro with the added ability to pass
151 |     //  a message M. M should be a string literal.
152 |     //  Note: Never use code inside ASSERT() that causes a side-effect ASSERT macros may get compiled
153 |     //      out in release mode.
154 | #define NPP_ASSERT_MSG(C, M) do {if (!(C)) throw npp::Exception(#C " assertion faild! Message: " M, __FILE__, __LINE__);} while(false)
155 | 
156 | #ifdef _DEBUG
157 |     /// Basic debug assert macro.
158 |     ///     This macro is identical in every respect to NPP_ASSERT(C) but it does get compiled to a
159 |     /// no-op in release builds. It is therefor of utmost importance to not put statements into
160 |     /// this macro that cause side effects required for correct program execution.
161 | #define NPP_DEBUG_ASSERT(C) do {if (!(C)) throw npp::Exception(#C " debug assertion faild!", __FILE__, __LINE__);} while(false)
162 | #else
163 | #define NPP_DEBUG_ASSERT(C)
164 | #endif
165 | 
166 |     /// ASSERT for null-pointer test.
167 |     /// It is safe to put code with side effects into this macro. Also: This macro never
168 |     /// gets compiled to a no-op because resource allocation may fail based on external causes not under
169 |     /// control of a software developer.
170 | #define NPP_ASSERT_NOT_NULL(P) do {if ((P) == 0) throw npp::Exception(#P " not null assertion faild!", __FILE__, __LINE__);} while(false)
171 | 
172 |     /// Macro for flagging methods as not implemented.
173 |     /// The macro throws an exception with a message that an implementation was missing
174 | #define NPP_NOT_IMPLEMENTED() do {throw npp::Exception("Implementation missing!", __FILE__, __LINE__);} while(false)
175 | 
176 |     /// Macro for checking error return code of CUDA (runtime) calls.
177 |     /// This macro never gets disabled.
178 | #define NPP_CHECK_CUDA(S) do {cudaError_t eCUDAResult; \
179 |         eCUDAResult = S; \
180 |         if (eCUDAResult != cudaSuccess) std::cout << "NPP_CHECK_CUDA - eCUDAResult = " << eCUDAResult << std::endl; \
181 |         NPP_ASSERT(eCUDAResult == cudaSuccess);} while (false)
182 | 
183 |     /// Macro for checking error return code for NPP calls.
184 | #define NPP_CHECK_NPP(S) do {NppStatus eStatusNPP; \
185 |         eStatusNPP = S; \
186 |         if (eStatusNPP != NPP_SUCCESS) std::cout << "NPP_CHECK_NPP - eStatusNPP = " << _cudaGetErrorEnum(eStatusNPP) << "("<< eStatusNPP << ")" << std::endl; \
187 |         NPP_ASSERT(eStatusNPP == NPP_SUCCESS);} while (false)
188 | 
189 |     /// Macro for checking error return codes from cuFFT calls.
190 | #define NPP_CHECK_CUFFT(S) do {cufftResult eCUFFTResult; \
191 |         eCUFFTResult = S; \
192 |         if (eCUFFTResult != NPP_SUCCESS) std::cout << "NPP_CHECK_CUFFT - eCUFFTResult = " << eCUFFTResult << std::endl; \
193 |         NPP_ASSERT(eCUFFTResult == CUFFT_SUCCESS);} while (false)
194 | 
195 | } // npp namespace
196 | 
197 | #endif // NV_UTIL_NPP_EXCEPTIONS_H
198 | 


--------------------------------------------------------------------------------
/slurm_pmi_config/include/slurm_errno.h:
--------------------------------------------------------------------------------
  1 | /*****************************************************************************\
  2 |  *  slurm_errno.h - error codes and functions for slurm
  3 |  ******************************************************************************
  4 |  *  Copyright (C) 2002-2007 The Regents of the University of California.
  5 |  *  Copyright (C) 2008-2009 Lawrence Livermore National Security.
  6 |  *  Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
  7 |  *  Written by Kevin Tew <tew1@llnl.gov>,
  8 |  *	Jim Garlick <garlick@llnl.gov>, et. al.
  9 |  *  CODE-OCEC-09-009. All rights reserved.
 10 |  *
 11 |  *  This file is part of Slurm, a resource management program.
 12 |  *  For details, see <https://slurm.schedmd.com/>.
 13 |  *  Please also read the included file: DISCLAIMER.
 14 |  *
 15 |  *  Slurm is free software; you can redistribute it and/or modify it under
 16 |  *  the terms of the GNU General Public License as published by the Free
 17 |  *  Software Foundation; either version 2 of the License, or (at your option)
 18 |  *  any later version.
 19 |  *
 20 |  *  In addition, as a special exception, the copyright holders give permission
 21 |  *  to link the code of portions of this program with the OpenSSL library under
 22 |  *  certain conditions as described in each individual source file, and
 23 |  *  distribute linked combinations including the two. You must obey the GNU
 24 |  *  General Public License in all respects for all of the code used other than
 25 |  *  OpenSSL. If you modify file(s) with this exception, you may extend this
 26 |  *  exception to your version of the file(s), but you are not obligated to do
 27 |  *  so. If you do not wish to do so, delete this exception statement from your
 28 |  *  version.  If you delete this exception statement from all source files in
 29 |  *  the program, then also delete it here.
 30 |  *
 31 |  *  Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
 32 |  *  WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 33 |  *  FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
 34 |  *  details.
 35 |  *
 36 |  *  You should have received a copy of the GNU General Public License along
 37 |  *  with Slurm; if not, write to the Free Software Foundation, Inc.,
 38 |  *  51 Franklin Street, Fifth Floor, Boston, MA 02110-1301  USA.
 39 | \*****************************************************************************/
 40 | #ifndef _SLURM_ERRNO_H
 41 | #define _SLURM_ERRNO_H
 42 | 
 43 | #ifdef __cplusplus
 44 | extern "C" {
 45 | #endif
 46 | 
 47 | #include <errno.h>
 48 | 
 49 | /* set errno to the specified value - then return -1 */
 50 | #define slurm_seterrno_ret(errnum) do { \
 51 | 	slurm_seterrno(errnum);         \
 52 | 	return (errnum ? -1 : 0);       \
 53 | 	} while (0)
 54 | 
 55 | /* general return codes */
 56 | #define SLURM_SUCCESS   0
 57 | #define SLURM_ERROR    -1
 58 | 
 59 | enum {
 60 | 	/* General Message error codes */
 61 | 	SLURM_UNEXPECTED_MSG_ERROR = 			1000,
 62 | 	SLURM_COMMUNICATIONS_CONNECTION_ERROR,
 63 | 	SLURM_COMMUNICATIONS_SEND_ERROR,
 64 | 	SLURM_COMMUNICATIONS_RECEIVE_ERROR,
 65 | 	SLURM_COMMUNICATIONS_SHUTDOWN_ERROR,
 66 | 	SLURM_PROTOCOL_VERSION_ERROR,
 67 | 	SLURM_PROTOCOL_IO_STREAM_VERSION_ERROR,
 68 | 	SLURM_PROTOCOL_AUTHENTICATION_ERROR,
 69 | 	SLURM_PROTOCOL_INSANE_MSG_LENGTH,
 70 | 	SLURM_MPI_PLUGIN_NAME_INVALID,
 71 | 	SLURM_MPI_PLUGIN_PRELAUNCH_SETUP_FAILED,
 72 | 	SLURM_PLUGIN_NAME_INVALID,
 73 | 	SLURM_UNKNOWN_FORWARD_ADDR,
 74 | 
 75 | 	/* communication failures to/from slurmctld */
 76 | 	SLURMCTLD_COMMUNICATIONS_CONNECTION_ERROR =     1800,
 77 | 	SLURMCTLD_COMMUNICATIONS_SEND_ERROR,
 78 | 	SLURMCTLD_COMMUNICATIONS_RECEIVE_ERROR,
 79 | 	SLURMCTLD_COMMUNICATIONS_SHUTDOWN_ERROR,
 80 | 
 81 | 	/* _info.c/communication layer RESPONSE_SLURM_RC message codes */
 82 | 	SLURM_NO_CHANGE_IN_DATA =			1900,
 83 | 
 84 | 	/* slurmctld error codes */
 85 | 	ESLURM_INVALID_PARTITION_NAME = 		2000,
 86 | 	ESLURM_DEFAULT_PARTITION_NOT_SET,
 87 | 	ESLURM_ACCESS_DENIED,
 88 | 	ESLURM_JOB_MISSING_REQUIRED_PARTITION_GROUP,
 89 | 	ESLURM_REQUESTED_NODES_NOT_IN_PARTITION,
 90 | 	ESLURM_TOO_MANY_REQUESTED_CPUS,
 91 | 	ESLURM_INVALID_NODE_COUNT,
 92 | 	ESLURM_ERROR_ON_DESC_TO_RECORD_COPY,
 93 | 	ESLURM_JOB_MISSING_SIZE_SPECIFICATION,
 94 | 	ESLURM_JOB_SCRIPT_MISSING,
 95 | 	ESLURM_USER_ID_MISSING =			2010,
 96 | 	ESLURM_DUPLICATE_JOB_ID,
 97 | 	ESLURM_PATHNAME_TOO_LONG,
 98 | 	ESLURM_NOT_TOP_PRIORITY,
 99 | 	ESLURM_REQUESTED_NODE_CONFIG_UNAVAILABLE,
100 | 	ESLURM_REQUESTED_PART_CONFIG_UNAVAILABLE,
101 | 	ESLURM_NODES_BUSY,
102 | 	ESLURM_INVALID_JOB_ID,
103 | 	ESLURM_INVALID_NODE_NAME,
104 | 	ESLURM_WRITING_TO_FILE,
105 | 	ESLURM_TRANSITION_STATE_NO_UPDATE =		2020,
106 | 	ESLURM_ALREADY_DONE,
107 | 	ESLURM_INTERCONNECT_FAILURE,
108 | 	ESLURM_BAD_DIST,
109 | 	ESLURM_JOB_PENDING,
110 | 	ESLURM_BAD_TASK_COUNT,
111 | 	ESLURM_INVALID_JOB_CREDENTIAL,
112 | 	ESLURM_IN_STANDBY_MODE,
113 | 	ESLURM_INVALID_NODE_STATE,
114 | 	ESLURM_INVALID_FEATURE,
115 | 	ESLURM_INVALID_AUTHTYPE_CHANGE =		2030,
116 | 	ESLURM_ACTIVE_FEATURE_NOT_SUBSET,
117 | 	ESLURM_INVALID_SCHEDTYPE_CHANGE,
118 | 	ESLURM_INVALID_SELECTTYPE_CHANGE,
119 | 	ESLURM_INVALID_SWITCHTYPE_CHANGE,
120 | 	ESLURM_FRAGMENTATION,
121 | 	ESLURM_NOT_SUPPORTED,
122 | 	ESLURM_DISABLED,
123 | 	ESLURM_DEPENDENCY,
124 | 	ESLURM_BATCH_ONLY,
125 | 	ESLURM_TASKDIST_ARBITRARY_UNSUPPORTED =		2040,
126 | 	ESLURM_TASKDIST_REQUIRES_OVERCOMMIT,
127 | 	ESLURM_JOB_HELD,
128 | 	ESLURM_INVALID_CRED_TYPE_CHANGE,
129 | 	ESLURM_INVALID_TASK_MEMORY,
130 | 	ESLURM_INVALID_ACCOUNT,
131 | 	ESLURM_INVALID_PARENT_ACCOUNT,
132 | 	ESLURM_SAME_PARENT_ACCOUNT,
133 | 	ESLURM_INVALID_LICENSES,
134 | 	ESLURM_NEED_RESTART,
135 | 	ESLURM_ACCOUNTING_POLICY =			2050,
136 | 	ESLURM_INVALID_TIME_LIMIT,
137 | 	ESLURM_RESERVATION_ACCESS,
138 | 	ESLURM_RESERVATION_INVALID,
139 | 	ESLURM_INVALID_TIME_VALUE,
140 | 	ESLURM_RESERVATION_BUSY,
141 | 	ESLURM_RESERVATION_NOT_USABLE,
142 | 	ESLURM_INVALID_WCKEY,
143 | 	ESLURM_RESERVATION_OVERLAP,
144 | 	ESLURM_PORTS_BUSY,
145 | 	ESLURM_PORTS_INVALID =				2060,
146 | 	ESLURM_PROLOG_RUNNING,
147 | 	ESLURM_NO_STEPS,
148 | 	ESLURM_INVALID_BLOCK_STATE,
149 | 	ESLURM_INVALID_BLOCK_LAYOUT,
150 | 	ESLURM_INVALID_BLOCK_NAME,
151 | 	ESLURM_INVALID_QOS,
152 | 	ESLURM_QOS_PREEMPTION_LOOP,
153 | 	ESLURM_NODE_NOT_AVAIL,
154 | 	ESLURM_INVALID_CPU_COUNT,
155 | 	ESLURM_PARTITION_NOT_AVAIL =			2070,
156 | 	ESLURM_CIRCULAR_DEPENDENCY,
157 | 	ESLURM_INVALID_GRES,
158 | 	ESLURM_JOB_NOT_PENDING,
159 | 	ESLURM_QOS_THRES,
160 | 	ESLURM_PARTITION_IN_USE,
161 | 	ESLURM_STEP_LIMIT,
162 | 	ESLURM_JOB_SUSPENDED,
163 | 	ESLURM_CAN_NOT_START_IMMEDIATELY,
164 | 	ESLURM_INTERCONNECT_BUSY,
165 | 	ESLURM_RESERVATION_EMPTY =			2080,
166 | 	ESLURM_INVALID_ARRAY,
167 | 	ESLURM_RESERVATION_NAME_DUP,
168 | 	ESLURM_JOB_STARTED,
169 | 	ESLURM_JOB_FINISHED,
170 | 	ESLURM_JOB_NOT_RUNNING,
171 | 	ESLURM_JOB_NOT_PENDING_NOR_RUNNING,
172 | 	ESLURM_JOB_NOT_SUSPENDED,
173 | 	ESLURM_JOB_NOT_FINISHED,
174 | 	ESLURM_TRIGGER_DUP,
175 | 	ESLURM_INTERNAL =				2090,
176 | 	ESLURM_INVALID_BURST_BUFFER_CHANGE,
177 | 	ESLURM_BURST_BUFFER_PERMISSION,
178 | 	ESLURM_BURST_BUFFER_LIMIT,
179 | 	ESLURM_INVALID_BURST_BUFFER_REQUEST,
180 | 	ESLURM_PRIO_RESET_FAIL,
181 | 	ESLURM_POWER_NOT_AVAIL,
182 | 	ESLURM_POWER_RESERVED,
183 | 	ESLURM_INVALID_POWERCAP,
184 | 	ESLURM_INVALID_MCS_LABEL,
185 | 	ESLURM_BURST_BUFFER_WAIT =			2100,
186 | 	ESLURM_PARTITION_DOWN,
187 | 	ESLURM_DUPLICATE_GRES,
188 | 	ESLURM_JOB_SETTING_DB_INX,
189 | 	ESLURM_RSV_ALREADY_STARTED,
190 | 	ESLURM_SUBMISSIONS_DISABLED,
191 | 	ESLURM_NOT_HET_JOB,
192 | 	ESLURM_NOT_HET_JOB_LEADER,
193 | 	ESLURM_NOT_WHOLE_HET_JOB,
194 | 	ESLURM_CORE_RESERVATION_UPDATE,
195 | 	ESLURM_DUPLICATE_STEP_ID =			2110,
196 | 	ESLURM_INVALID_CORE_CNT,
197 | 	ESLURM_X11_NOT_AVAIL,
198 | 	ESLURM_GROUP_ID_MISSING,
199 | 	ESLURM_BATCH_CONSTRAINT,
200 | 	ESLURM_INVALID_TRES,
201 | 	ESLURM_INVALID_TRES_BILLING_WEIGHTS,
202 | 	ESLURM_INVALID_JOB_DEFAULTS,
203 | 	ESLURM_RESERVATION_MAINT,
204 | 	ESLURM_INVALID_GRES_TYPE,
205 | 	ESLURM_REBOOT_IN_PROGRESS =			2120,
206 | 	ESLURM_MULTI_KNL_CONSTRAINT,
207 | 	ESLURM_UNSUPPORTED_GRES,
208 | 	ESLURM_INVALID_NICE,
209 | 	ESLURM_INVALID_TIME_MIN_LIMIT,
210 | 	ESLURM_DEFER,
211 | 	ESLURM_CONFIGLESS_DISABLED,
212 | 	ESLURM_ENVIRONMENT_MISSING,
213 | 
214 | 	/* slurmd error codes */
215 | 	ESLURMD_PIPE_ERROR_ON_TASK_SPAWN =		4000,
216 | 	ESLURMD_KILL_TASK_FAILED,
217 | 	ESLURMD_KILL_JOB_ALREADY_COMPLETE,
218 | 	ESLURMD_INVALID_ACCT_FREQ,
219 | 	ESLURMD_INVALID_JOB_CREDENTIAL,
220 | 	ESLURMD_UID_NOT_FOUND,
221 | 	ESLURMD_GID_NOT_FOUND,
222 | 	ESLURMD_CREDENTIAL_EXPIRED,
223 | 	ESLURMD_CREDENTIAL_REVOKED,
224 | 	ESLURMD_CREDENTIAL_REPLAYED,
225 | 	ESLURMD_CREATE_BATCH_DIR_ERROR =		4010,
226 | 	ESLURMD_MODIFY_BATCH_DIR_ERROR,
227 | 	ESLURMD_CREATE_BATCH_SCRIPT_ERROR,
228 | 	ESLURMD_MODIFY_BATCH_SCRIPT_ERROR,
229 | 	ESLURMD_SETUP_ENVIRONMENT_ERROR,
230 | 	ESLURMD_SHARED_MEMORY_ERROR,
231 | 	ESLURMD_SET_UID_OR_GID_ERROR,
232 | 	ESLURMD_SET_SID_ERROR,
233 | 	ESLURMD_CANNOT_SPAWN_IO_THREAD,
234 | 	ESLURMD_FORK_FAILED,
235 | 	ESLURMD_EXECVE_FAILED =				4020,
236 | 	ESLURMD_IO_ERROR,
237 | 	ESLURMD_PROLOG_FAILED,
238 | 	ESLURMD_EPILOG_FAILED,
239 | 	ESLURMD_SESSION_KILLED,
240 | 	ESLURMD_TOOMANYSTEPS,
241 | 	ESLURMD_STEP_EXISTS,
242 | 	ESLURMD_JOB_NOTRUNNING,
243 | 	ESLURMD_STEP_SUSPENDED,
244 | 	ESLURMD_STEP_NOTSUSPENDED,
245 | 	ESLURMD_INVALID_SOCKET_NAME_LEN =		4030,
246 | 
247 | 	/* slurmd errors in user batch job */
248 | 	ESCRIPT_CHDIR_FAILED =			4100,
249 | 	ESCRIPT_OPEN_OUTPUT_FAILED,
250 | 	ESCRIPT_NON_ZERO_RETURN,
251 | 
252 | 	/* socket specific Slurm communications error */
253 | 	SLURM_PROTOCOL_SOCKET_IMPL_ZERO_RECV_LENGTH =	5000,
254 | 	SLURM_PROTOCOL_SOCKET_IMPL_NEGATIVE_RECV_LENGTH,
255 | 	SLURM_PROTOCOL_SOCKET_IMPL_NOT_ALL_DATA_SENT,
256 | 	ESLURM_PROTOCOL_INCOMPLETE_PACKET ,
257 | 	SLURM_PROTOCOL_SOCKET_IMPL_TIMEOUT ,
258 | 	SLURM_PROTOCOL_SOCKET_ZERO_BYTES_SENT,
259 | 
260 | 	/* slurm_auth errors */
261 | 	ESLURM_AUTH_CRED_INVALID	= 6000,
262 | 	ESLURM_AUTH_FOPEN_ERROR,
263 | 	ESLURM_AUTH_NET_ERROR,
264 | 	ESLURM_AUTH_UNABLE_TO_SIGN,
265 | 	ESLURM_AUTH_BADARG,
266 | 	ESLURM_AUTH_MEMORY,
267 | 	ESLURM_AUTH_INVALID,
268 | 	ESLURM_AUTH_UNPACK,
269 | 
270 | 	/* accounting errors */
271 | 	ESLURM_DB_CONNECTION            = 7000,
272 | 	ESLURM_JOBS_RUNNING_ON_ASSOC,
273 | 	ESLURM_CLUSTER_DELETED,
274 | 	ESLURM_ONE_CHANGE,
275 | 	ESLURM_BAD_NAME,
276 | 	ESLURM_OVER_ALLOCATE,
277 | 	ESLURM_RESULT_TOO_LARGE,
278 | 	ESLURM_DB_QUERY_TOO_WIDE,
279 | 
280 | 	/* Federation Errors */
281 | 	ESLURM_FED_CLUSTER_MAX_CNT              = 7100,
282 | 	ESLURM_FED_CLUSTER_MULTIPLE_ASSIGNMENT,
283 | 	ESLURM_INVALID_CLUSTER_FEATURE,
284 | 	ESLURM_JOB_NOT_FEDERATED,
285 | 	ESLURM_INVALID_CLUSTER_NAME,
286 | 	ESLURM_FED_JOB_LOCK,
287 | 	ESLURM_FED_NO_VALID_CLUSTERS,
288 | 
289 | 	/* plugin and custom errors */
290 | 	ESLURM_MISSING_TIME_LIMIT       = 8000,
291 | 	ESLURM_INVALID_KNL
292 | };
293 | 
294 | /* look up an errno value */
295 | char * slurm_strerror(int errnum);
296 | 
297 | /* set an errno value */
298 | void slurm_seterrno(int errnum);
299 | 
300 | /* get an errno value */
301 | int slurm_get_errno(void);
302 | 
303 | /* print message: error string for current errno value */
304 | void slurm_perror(const char *msg);
305 | 
306 | #ifdef __cplusplus
307 | }
308 | #endif
309 | 
310 | #endif /* !_SLURM_ERRNO_H */
311 | 


--------------------------------------------------------------------------------
/labs/CFD/English/C/jupyter_notebook/mpi/multi_node_intro.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "# Learning objectives\n",
  8 |     "\n",
  9 |     "In this lab we will learn about:\n",
 10 |     "\n",
 11 |     "* Multi-node Multi-GPU programming and importance of inter-process communication frameworks.\n",
 12 |     "* Introduction to MPI specification and APIs.\n",
 13 |     "* Execution of Hello World MPI binary on single as well as multiple nodes.\n",
 14 |     "\n",
 15 |     "# Multi-Node Multi-GPU Programming\n",
 16 |     "\n",
 17 |     "As we move from a single node to multiple nodes, the basic multi-GPU programming concepts like domain decomposition and application-specific concepts like halo exchange remain the same. However, the communication becomes complex.\n",
 18 |     "\n",
 19 |     "A single process can spawn threads that can be spread within a node (potentially on multiple sockets) but it cannot cross the node boundary. Thus, scalable multi-node programming requires the use of multiple processes.\n",
 20 |     "\n",
 21 |     "Inter-process communication is usually done by libraries like OpenMPI. They expose communication APIs, synchronization constructs, etc. to the user. Let us now learn about programming in MPI.\n",
 22 |     "\n",
 23 |     "## MPI\n",
 24 |     "\n",
 25 |     "MPI is a specification for the developers and users of message passing libraries. By itself, it is not a library - but rather the specification of what such a library should be. An example of MPI-compliant library is OpenMPI.\n",
 26 |     "\n",
 27 |     "It primarily addresses the message-passing parallel programming model: data is moved from the address space of one process to that of another process through cooperative operations on each process.\n",
 28 |     "\n",
 29 |     "MPI is widely used in practice for HPC applications, in academia, government agencies, and industry alike. In this lab, while we will introduce its APIs, a working understanding of MPI is highly desirable.\n",
 30 |     "\n",
 31 |     "### A Hello World Example\n",
 32 |     "\n",
 33 |     "A C-based Hello World program is shown below:\n",
 34 |     "\n",
 35 |     "```c\n",
 36 |     "#include <mpi.h>\n",
 37 |     "#include <stdio.h>\n",
 38 |     "\n",
 39 |     "int main(int argc, char** argv) {\n",
 40 |     "    // Initialize the MPI environment\n",
 41 |     "    MPI_Init(NULL, NULL);\n",
 42 |     "    // Get the number of processes\n",
 43 |     "    int size;\n",
 44 |     "    MPI_Comm_size(MPI_COMM_WORLD, &size);\n",
 45 |     "    // Get the rank of the process\n",
 46 |     "    int rank;\n",
 47 |     "    MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n",
 48 |     "    // Get the name of the processor\n",
 49 |     "    char processor_name[MPI_MAX_PROCESSOR_NAME];\n",
 50 |     "    int name_len;\n",
 51 |     "    MPI_Get_processor_name(processor_name, &name_len);\n",
 52 |     "    // Print a hello world message\n",
 53 |     "    printf(\"Hello world from processor %s, rank %d out of %d processors\\n\",\n",
 54 |     "           processor_name, rank, size);\n",
 55 |     "    // Finalize the MPI environment.\n",
 56 |     "    MPI_Finalize();\n",
 57 |     "}\n",
 58 |     "```\n",
 59 |     "\n",
 60 |     "To access the program, open the [hello_world.c](../../source_code/mpi/hello_world.c) file. Alternatively, you can navigate to `CFD/English/C/source_code/mpi/` directory in Jupyter's file browser in the left pane. Then, click to open the `hello_world.c` file.\n",
 61 |     "\n",
 62 |     "The MPI environment is initialized with `MPI_Init` through which all of MPI’s global and internal variables are constructed. A \"communicator\" is created between all processes that are spawned, and unique ranks are assigned to each process. \n",
 63 |     "\n",
 64 |     "`MPI_Comm_size` returns the size of a communicator, that is, the number of processes within that communicator. In our example, this call will return the number of processes requested for the job.\n",
 65 |     "\n",
 66 |     "`MPI_Comm_rank` returns the rank of a process in a communicator. Each process inside of a communicator is assigned an incremental rank starting from zero. The ranks of the processes are primarily used for identification purposes when sending and receiving messages.\n",
 67 |     "\n",
 68 |     "`MPI_Get_processor_name` obtains the name of the processor on which the process is executing and `MPI_Finalize` is used to clean up the MPI environment. No more MPI calls can be made after this call.\n",
 69 |     "\n",
 70 |     "## Running MPI with or without containers\n",
 71 |     "\n",
 72 |     "**We will run MPI directly on compute nodes without using containers.** The subsequent sections assume that atleast 2 compute nodes with multiple GPUs in each node are available to the user. All our codes have been tested with CUDA-aware OpenMPI v4.1.1 with supporting libraries HPCX v2.8.1 (for UCX and HCOLL) and CUDA v11.3.0.0 on DGX-1 8 Tesla V100 compute nodes. \n",
 73 |     "\n",
 74 |     "CUDA-awareness as a concept in MPI will be explained in subsequent labs.\n",
 75 |     "\n",
 76 |     "Usually, a cluster workload manager like Slurm or PBS is present and integrated with MPI installation to launch multi-node jobs. We use `mpirun` command to run MPI assuming that the user is logged into an interactive shell with multiple nodes allocated. The other common way is to use workload manager commands like `srun` (for Slurm) directly to run MPI jobs as they are integrated with MPI internally. \n",
 77 |     "\n",
 78 |     "**Note:** We do outline the method to build and run containerized MPI using Singularity in tandem with host MPI implementation in our supplemental notebook: [MPI in a containerized environment](./containers_and_mpi.ipynb). \n",
 79 |     "\n",
 80 |     "### Compilation\n",
 81 |     "\n",
 82 |     "The `mpicc` and `mpic++` (or `mpicxx`) compilers are used to compile and link programs with MPI. We can compile the Hello World program with the command:\n",
 83 |     "\n",
 84 |     "```bash\n",
 85 |     "mpicc -o hello_world hello_world.c\n",
 86 |     "```\n",
 87 |     "\n",
 88 |     "Ensure that MPI is installed (for exmaple, if it is built from source) and available (for example, if loaded as a module) using the folllowing command:"
 89 |    ]
 90 |   },
 91 |   {
 92 |    "cell_type": "code",
 93 |    "execution_count": null,
 94 |    "metadata": {},
 95 |    "outputs": [],
 96 |    "source": [
 97 |     "! mpirun --version"
 98 |    ]
 99 |   },
100 |   {
101 |    "cell_type": "markdown",
102 |    "metadata": {},
103 |    "source": [
104 |     "Now, let us compile the program:"
105 |    ]
106 |   },
107 |   {
108 |    "cell_type": "code",
109 |    "execution_count": null,
110 |    "metadata": {},
111 |    "outputs": [],
112 |    "source": [
113 |     "! cd ../../source_code/mpi && make clean && make hello_world"
114 |    ]
115 |   },
116 |   {
117 |    "cell_type": "markdown",
118 |    "metadata": {},
119 |    "source": [
120 |     "### Execution\n",
121 |     "\n",
122 |     "We run the program using the `mpirun` command as follows:\n",
123 |     "\n",
124 |     "```bash\n",
125 |     "mpirun -np <procs> -npersocket <procs_per_socket> -hostfile <host_file> ./hello_world\n",
126 |     "```\n",
127 |     "\n",
128 |     "The `-np` option specifies the total number of processes spawned by MPI runtime and `-npersocket` option specifies the number of processes to be spawned on each socket. The `-hostfile` option allows us to specify which hosts (compute nodes) to start MPI processes on. The file is a newline-separated list of hostnames which must be accessible to each other so that MPI processes can communicate.\n",
129 |     "\n",
130 |     "Note that DGX-1V is a dual-socket system and `<procs_per_socket>` should be less than or equal to number of cores in that socket. Clearly, `<procs>`$\\div$(`procs_per_socket`$\\times$`<sockets_per_node>`) is the number of nodes used. There are several other options available to specify `<procs_per_socket>` that will be discussed in subsequent labs. As we are using an OpenMPI implementation in a workload manager-based environment, the `<host_file>` will be provided by Slurm and we don't need to specify this option.\n",
131 |     "\n",
132 |     "There are numerous other configuration options that one can overview using the `mpirun --help` command. You can check the number of sockets and cores per socket in your machine (the whole node) with the command `lscpu | grep -E 'Socket|Core'`. \n",
133 |     "\n",
134 |     "### Single Node\n",
135 |     "\n",
136 |     "Run the program binary on a single node:"
137 |    ]
138 |   },
139 |   {
140 |    "cell_type": "code",
141 |    "execution_count": null,
142 |    "metadata": {},
143 |    "outputs": [],
144 |    "source": [
145 |     "! cd ../../source_code/mpi && mpirun -np 2 -npersocket 1 ./hello_world"
146 |    ]
147 |   },
148 |   {
149 |    "cell_type": "markdown",
150 |    "metadata": {},
151 |    "source": [
152 |     "You may see some warnings. As long as the output is printed, you can ignore the warnings. In the output, you should see 2 unique ranks ranging (0 and 1) and the node's name like below:\n",
153 |     "\n",
154 |     "```bash\n",
155 |     "Hello world from processor <host_name_0>, rank 0 out of 2 processors\n",
156 |     "Hello world from processor <host_name_1>, rank 1 out of 2 processors\n",
157 |     "```\n",
158 |     "\n",
159 |     "### Multiple Nodes\n",
160 |     "\n",
161 |     "Let us now run the Hello World program on 2 nodes with the following command:"
162 |    ]
163 |   },
164 |   {
165 |    "cell_type": "code",
166 |    "execution_count": null,
167 |    "metadata": {},
168 |    "outputs": [],
169 |    "source": [
170 |     "! cd ../../source_code/mpi && mpirun -np 4 -npersocket 1 ./hello_world"
171 |    ]
172 |   },
173 |   {
174 |    "cell_type": "markdown",
175 |    "metadata": {},
176 |    "source": [
177 |     "The output, excluding warnings, should be as follows (the order of output lines is not important):\n",
178 |     "\n",
179 |     "```bash\n",
180 |     "Hello world from processor <node_0_name>, rank 1 out of 4 processors\n",
181 |     "Hello world from processor <node_1_name>, rank 0 out of 4 processors\n",
182 |     "Hello world from processor <node_1_name>, rank 3 out of 4 processors\n",
183 |     "Hello world from processor <node_0_name>, rank 2 out of 4 processors\n",
184 |     "```\n",
185 |     "\n",
186 |     "**Note:** Subsequent labs will assume the reader understands how to run a multi-node MPI job.\n",
187 |     "\n",
188 |     "Now, let us learn more MPI concepts and code a CUDA Memcpy and MPI-based Jacobi solver. Click below to move to the next lab:\n",
189 |     "\n",
190 |     "# [Next: CUDA Memcpy with MPI](../mpi/memcpy.ipynb)\n",
191 |     "\n",
192 |     "Here's a link to the home notebook through which all other notebooks are accessible:\n",
193 |     "\n",
194 |     "# [HOME](../../../start_here.ipynb)\n",
195 |     "\n",
196 |     "---\n",
197 |     "## Links and Resources\n",
198 |     "\n",
199 |     "* [Programming: MPI Hello World Tutorial](https://mpitutorial.com/tutorials/mpi-hello-world/)\n",
200 |     "* [Programming: OpenMPI Library](https://www.open-mpi.org/)\n",
201 |     "* [Concepts: Singularity Containers with MPI](https://sylabs.io/guides/3.6/user-guide/mpi.html)\n",
202 |     "* [Documentation: mpirun Command](https://www.open-mpi.org/doc/current/man1/mpirun.1.php)\n",
203 |     "* [Code: Multi-GPU Programming Models](https://github.com/NVIDIA/multi-gpu-programming-models)\n",
204 |     "* [Code: GPU Bootcamp](https://github.com/gpuhackathons-org/gpubootcamp/)\n",
205 |     "\n",
206 |     "Don't forget to check out additional [Open Hackathons Resources](https://www.openhackathons.org/s/technical-resources) and join our [OpenACC and Hackathons Slack Channel](https://www.openacc.org/community#slack) to share your experience and get more help from the community.\n",
207 |     "\n",
208 |     "## Licensing\n",
209 |     "Copyright © 2022 OpenACC-Standard.org.  This material is released by OpenACC-Standard.org, in collaboration with NVIDIA Corporation, under the Creative Commons Attribution 4.0 International (CC BY 4.0). These materials may include references to hardware and software developed by other entities; all applicable licensing and copyrights apply.\n"
210 |    ]
211 |   }
212 |  ],
213 |  "metadata": {
214 |   "kernelspec": {
215 |    "display_name": "Python 3",
216 |    "language": "python",
217 |    "name": "python3"
218 |   },
219 |   "language_info": {
220 |    "codemirror_mode": {
221 |     "name": "ipython",
222 |     "version": 3
223 |    },
224 |    "file_extension": ".py",
225 |    "mimetype": "text/x-python",
226 |    "name": "python",
227 |    "nbconvert_exporter": "python",
228 |    "pygments_lexer": "ipython3",
229 |    "version": "3.7.4"
230 |   }
231 |  },
232 |  "nbformat": 4,
233 |  "nbformat_minor": 5
234 | }
235 | 


--------------------------------------------------------------------------------