├── .clang-format ├── .gitignore ├── LICENSE.md ├── README.md ├── bench.sbatch.sh ├── bench.sh ├── mpi ├── Makefile ├── jacobi.cpp └── jacobi_kernels.cu ├── mpi_overlap ├── Makefile ├── jacobi.cpp └── jacobi_kernels.cu ├── multi_node_p2p ├── Makefile ├── jacobi.cpp └── jacobi_kernels.cu ├── multi_threaded_copy ├── Makefile └── jacobi.cu ├── multi_threaded_copy_overlap ├── Makefile └── jacobi.cu ├── multi_threaded_p2p ├── Makefile └── jacobi.cu ├── multi_threaded_p2p_opt ├── Makefile └── jacobi.cu ├── multi_threaded_um ├── Makefile └── jacobi.cu ├── nccl ├── Makefile ├── jacobi.cpp └── jacobi_kernels.cu ├── nccl_graphs ├── Makefile ├── jacobi.cpp └── jacobi_kernels.cu ├── nccl_overlap ├── Makefile ├── jacobi.cpp └── jacobi_kernels.cu ├── nvshmem ├── Makefile └── jacobi.cu ├── single_gpu ├── Makefile └── jacobi.cu ├── single_threaded_copy ├── Makefile └── jacobi.cu └── test.sh /.clang-format: -------------------------------------------------------------------------------- 1 | --- 2 | Language: Cpp 3 | # BasedOnStyle: Google 4 | AccessModifierOffset: -1 5 | AlignAfterOpenBracket: Align 6 | AlignConsecutiveAssignments: false 7 | AlignConsecutiveDeclarations: false 8 | AlignEscapedNewlines: Left 9 | AlignOperands: true 10 | AlignTrailingComments: true 11 | AllowAllParametersOfDeclarationOnNextLine: true 12 | AllowShortBlocksOnASingleLine: false 13 | AllowShortCaseLabelsOnASingleLine: false 14 | AllowShortFunctionsOnASingleLine: All 15 | AllowShortIfStatementsOnASingleLine: true 16 | AllowShortLoopsOnASingleLine: true 17 | AlwaysBreakAfterDefinitionReturnType: None 18 | AlwaysBreakAfterReturnType: None 19 | AlwaysBreakBeforeMultilineStrings: true 20 | AlwaysBreakTemplateDeclarations: Yes 21 | BinPackArguments: true 22 | BinPackParameters: true 23 | BraceWrapping: 24 | AfterClass: false 25 | AfterControlStatement: false 26 | AfterEnum: false 27 | AfterFunction: false 28 | AfterNamespace: false 29 | AfterObjCDeclaration: false 30 | AfterStruct: false 31 | AfterUnion: false 32 | AfterExternBlock: false 33 | BeforeCatch: false 34 | BeforeElse: false 35 | IndentBraces: false 36 | SplitEmptyFunction: true 37 | SplitEmptyRecord: true 38 | SplitEmptyNamespace: true 39 | BreakBeforeBinaryOperators: None 40 | BreakBeforeBraces: Attach 41 | BreakBeforeInheritanceComma: false 42 | BreakInheritanceList: BeforeColon 43 | BreakBeforeTernaryOperators: true 44 | BreakConstructorInitializersBeforeComma: false 45 | BreakConstructorInitializers: BeforeColon 46 | BreakAfterJavaFieldAnnotations: false 47 | BreakStringLiterals: true 48 | ColumnLimit: 100 49 | CommentPragmas: '^ IWYU pragma:' 50 | CompactNamespaces: false 51 | ConstructorInitializerAllOnOneLineOrOnePerLine: true 52 | ConstructorInitializerIndentWidth: 4 53 | ContinuationIndentWidth: 4 54 | Cpp11BracedListStyle: true 55 | DerivePointerAlignment: true 56 | DisableFormat: false 57 | ExperimentalAutoDetectBinPacking: false 58 | FixNamespaceComments: true 59 | ForEachMacros: 60 | - foreach 61 | - Q_FOREACH 62 | - BOOST_FOREACH 63 | IncludeBlocks: Preserve 64 | IncludeCategories: 65 | - Regex: '^' 66 | Priority: 2 67 | - Regex: '^<.*\.h>' 68 | Priority: 1 69 | - Regex: '^<.*' 70 | Priority: 2 71 | - Regex: '.*' 72 | Priority: 3 73 | IncludeIsMainRegex: '([-_](test|unittest))?$' 74 | IndentCaseLabels: true 75 | IndentPPDirectives: None 76 | IndentWidth: 4 77 | IndentWrappedFunctionNames: false 78 | JavaScriptQuotes: Leave 79 | JavaScriptWrapImports: true 80 | KeepEmptyLinesAtTheStartOfBlocks: false 81 | MacroBlockBegin: '' 82 | MacroBlockEnd: '' 83 | 
MaxEmptyLinesToKeep: 1 84 | NamespaceIndentation: None 85 | ObjCBinPackProtocolList: Never 86 | ObjCBlockIndentWidth: 4 87 | ObjCSpaceAfterProperty: false 88 | ObjCSpaceBeforeProtocolList: true 89 | PenaltyBreakAssignment: 2 90 | PenaltyBreakBeforeFirstCallParameter: 1 91 | PenaltyBreakComment: 300 92 | PenaltyBreakFirstLessLess: 120 93 | PenaltyBreakString: 1000 94 | PenaltyBreakTemplateDeclaration: 10 95 | PenaltyExcessCharacter: 1000000 96 | PenaltyReturnTypeOnItsOwnLine: 200 97 | PointerAlignment: Left 98 | RawStringFormats: 99 | - Language: Cpp 100 | Delimiters: 101 | - cc 102 | - CC 103 | - cpp 104 | - Cpp 105 | - CPP 106 | - 'c++' 107 | - 'C++' 108 | CanonicalDelimiter: '' 109 | BasedOnStyle: google 110 | - Language: TextProto 111 | Delimiters: 112 | - pb 113 | - PB 114 | - proto 115 | - PROTO 116 | EnclosingFunctions: 117 | - EqualsProto 118 | - EquivToProto 119 | - PARSE_PARTIAL_TEXT_PROTO 120 | - PARSE_TEST_PROTO 121 | - PARSE_TEXT_PROTO 122 | - ParseTextOrDie 123 | - ParseTextProtoOrDie 124 | CanonicalDelimiter: '' 125 | BasedOnStyle: google 126 | ReflowComments: true 127 | SortIncludes: true 128 | SortUsingDeclarations: true 129 | SpaceAfterCStyleCast: false 130 | SpaceAfterTemplateKeyword: true 131 | SpaceBeforeAssignmentOperators: true 132 | SpaceBeforeCpp11BracedList: false 133 | SpaceBeforeCtorInitializerColon: true 134 | SpaceBeforeInheritanceColon: true 135 | SpaceBeforeParens: ControlStatements 136 | SpaceBeforeRangeBasedForLoopColon: true 137 | SpaceInEmptyParentheses: false 138 | SpacesBeforeTrailingComments: 2 139 | SpacesInAngles: false 140 | SpacesInContainerLiterals: true 141 | SpacesInCStyleCastParentheses: false 142 | SpacesInParentheses: false 143 | SpacesInSquareBrackets: false 144 | Standard: Auto 145 | TabWidth: 8 146 | UseTab: Never 147 | ... 148 | 149 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.o 2 | jacobi 3 | test-*.log 4 | .vscode 5 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved. 2 | 3 | Redistribution and use in source and binary forms, with or without 4 | modification, are permitted provided that the following conditions 5 | are met: 6 | * Redistributions of source code must retain the above copyright 7 | notice, this list of conditions and the following disclaimer. 8 | * Redistributions in binary form must reproduce the above copyright 9 | notice, this list of conditions and the following disclaimer in the 10 | documentation and/or other materials provided with the distribution. 11 | * Neither the name of NVIDIA CORPORATION nor the names of its 12 | contributors may be used to endorse or promote products derived 13 | from this software without specific prior written permission. 14 | 15 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | 27 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Multi GPU Programming Models 2 | This project implements the well known multi GPU Jacobi solver with different multi GPU Programming Models: 3 | * `single_threaded_copy` Single Threaded using cudaMemcpy for inter GPU communication 4 | * `multi_threaded_copy` Multi Threaded with OpenMP using cudaMemcpy for inter GPU communication 5 | * `multi_threaded_copy_overlap` Multi Threaded with OpenMP using cudaMemcpy for inter GPU communication with overlapping communication 6 | * `multi_threaded_p2p` Multi Threaded with OpenMP using GPUDirect P2P mappings for inter GPU communication 7 | * `multi_threaded_p2p_opt` Multi Threaded with OpenMP using GPUDirect P2P mappings for inter GPU communication with delayed norm execution 8 | * `multi_threaded_um` Multi Threaded with OpenMP relying on transparent peer mappings with Unified Memory for inter GPU communication 9 | * `mpi` Multi Process with MPI using CUDA-aware MPI for inter GPU communication 10 | * `mpi_overlap` Multi Process with MPI using CUDA-aware MPI for inter GPU communication with overlapping communication 11 | * `nccl` Multi Process with MPI and NCCL using NCCL for inter GPU communication 12 | * `nccl_overlap` Multi Process with MPI and NCCL using NCCL for inter GPU communication with overlapping communication 13 | * `nccl_graphs` Multi Process with MPI and NCCL using NCCL for inter GPU communication with overlapping communication combined with CUDA Graphs 14 | * `nvshmem` Multi Process with MPI and NVSHMEM using NVSHMEM for inter GPU communication. 15 | * `multi_node_p2p` Multi Process Multi Node variant using the low level CUDA Driver [Virtual Memory Management](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#virtual-memory-management) and [Multicast Object Management](https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MULTICAST.html#group__CUDA__MULTICAST) APIs. This example is for developers of libraries like NCCL or NVSHMEM. It shows how higher-level programming models like NVSHMEM work internally within a (multinode) NVLINK domain. Application developers generally should use the higher-level MPI, NCCL, or NVSHMEM interfaces instead of this API. 
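The multi-GPU variants all use the same 1-D domain decomposition into horizontal strips of rows and differ mainly in how the halo rows at the strip boundaries are exchanged between GPUs. The fragment below is an illustrative sketch only, not code taken verbatim from any variant: names such as `a_new_top`, `top`, `bottom`, `top_iy`, `nccl_comm` and the streams are placeholders, and each snippet shows just the exchange of one boundary row.

```cpp
// cudaMemcpy-based variants: push the boundary row into the neighboring GPU's buffer
// with an asynchronous device-to-device copy (peer-to-peer where available).
cudaMemcpyAsync(a_new_top, a_new + iy_start * nx, nx * sizeof(real),
                cudaMemcpyDeviceToDevice, push_top_stream);

// mpi / mpi_overlap: CUDA-aware MPI_Sendrecv operating directly on device pointers.
MPI_Sendrecv(a_new + iy_start * nx, nx, MPI_REAL_TYPE, top, 0,
             a_new + iy_end * nx, nx, MPI_REAL_TYPE, bottom, 0,
             MPI_COMM_WORLD, MPI_STATUS_IGNORE);

// nccl variants: the same exchange expressed as grouped, stream-ordered send/recv calls.
ncclGroupStart();
ncclSend(a_new + iy_start * nx, nx, ncclFloat, top, nccl_comm, stream);
ncclRecv(a_new + iy_end * nx, nx, ncclFloat, bottom, nccl_comm, stream);
ncclGroupEnd();

// nvshmem: a one-sided put issued from inside the Jacobi kernel into the neighbor's
// symmetric memory, one element per thread (or per block with nvshmemx_float_put_nbi_block).
nvshmem_float_p(a_new + top_iy * nx + ix, new_val, top);
```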
16 | 17 | Each variant is a stand-alone `Makefile` project and most variants have been discussed in various GTC Talks, e.g.: 18 | * `single_threaded_copy`, `multi_threaded_copy`, `multi_threaded_copy_overlap`, `multi_threaded_p2p`, `multi_threaded_p2p_opt`, `mpi`, `mpi_overlap` and `nvshmem` on DGX-1V at GTC Europe 2017 in 23031 - Multi GPU Programming Models 19 | * `single_threaded_copy`, `multi_threaded_copy`, `multi_threaded_copy_overlap`, `multi_threaded_p2p`, `multi_threaded_p2p_opt`, `mpi`, `mpi_overlap` and `nvshmem` on DGX-2 at GTC 2019 in S9139 - Multi GPU Programming Models 20 | * `multi_threaded_copy`, `multi_threaded_copy_overlap`, `multi_threaded_p2p`, `multi_threaded_p2p_opt`, `mpi`, `mpi_overlap`, `nccl`, `nccl_overlap` and `nvshmem` on DGX A100 at GTC 2021 in [A31140 - Multi-GPU Programming Models](https://www.nvidia.com/en-us/on-demand/session/gtcfall21-a31140/) 21 | 22 | Some examples in this repository are the basis for an interactive tutorial: [FZJ-JSC/tutorial-multi-gpu](https://github.com/FZJ-JSC/tutorial-multi-gpu). 23 | 24 | # Requirements 25 | * CUDA: version 11.0 (9.2 if built with `DISABLE_CUB=1`) or later is required by all variants. 26 | * `nccl_graphs` requires NCCL 2.15.1, CUDA 11.7 and CUDA Driver 515.65.01 or newer 27 | * `multi_node_p2p` requires CUDA 12.4, a CUDA Driver 550.54.14 or newer and the NVIDIA IMEX daemon running. 28 | * OpenMP-capable compiler: Required by the Multi Threaded variants. The examples have been developed and tested with gcc. 29 | * MPI: The `mpi` and `mpi_overlap` variants require a CUDA-aware[^1] implementation. For NVSHMEM, NCCL and `multi_node_p2p`, a non-CUDA-aware MPI is sufficient. The examples have been developed and tested with OpenMPI. 30 | * NVSHMEM (version 0.4.1 or later): Required by the NVSHMEM variant. 31 | * NCCL (version 2.8 or later): Required by the NCCL variants. 32 | 33 | # Building 34 | Each variant comes with a `Makefile` and can be built by simply issuing `make`, e.g. 35 | ```sh 36 | multi-gpu-programming-models$ cd multi_threaded_copy 37 | multi_threaded_copy$ make 38 | nvcc -DHAVE_CUB -Xcompiler -fopenmp -lineinfo -DUSE_NVTX -ldl -gencode arch=compute_70,code=sm_70 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_90,code=sm_90 -gencode arch=compute_90,code=compute_90 -std=c++14 jacobi.cu -o jacobi 39 | multi_threaded_copy$ ls jacobi 40 | jacobi 41 | ``` 42 | 43 | # Run instructions 44 | All variants have the following command-line options: 45 | * `-niter`: How many iterations to carry out (default 1000) 46 | * `-nccheck`: How often to check for convergence (default 1) 47 | * `-nx`: Size of the domain in x direction (default 16384) 48 | * `-ny`: Size of the domain in y direction (default 16384) 49 | * `-csv`: Print performance results in CSV format 50 | * `-use_hp_streams`: In `mpi_overlap` use high-priority streams to hide kernel launch latencies of boundary kernels. 51 | 52 | The `nvshmem` variant additionally provides 53 | * `-use_block_comm`: Use the block-cooperative `nvshmemx_float_put_nbi_block` instead of `nvshmem_float_p` for communication.
54 | * `-norm_overlap`: Enable delayed norm execution as also implemented in `multi_threaded_p2p_opt` 55 | * `-neighborhood_sync`: Use custom neighbor only sync instead of `nvshmemx_barrier_all_on_stream` 56 | 57 | The `multi_node_p2p` variant additionally provides 58 | * `-use_mc_red`: Use a device side barrier and allreduce leveraging Multicast Objects instead of MPI primitives 59 | 60 | The `nccl` variants additionally provide 61 | * `-user_buffer_reg`: Avoid extra internal copies in NCCL communication with [User Buffer Registration](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/usage/bufferreg.html#user-buffer-registration). Required NCCL APIs are available with NCCL 2.19.1 or later. NCCL 2.23.4 added support for the used communication pattern. 62 | 63 | The provided script `bench.sh` contains some examples executing all the benchmarks presented in the GTC Talks referenced above. 64 | 65 | # Developers guide 66 | The code applies the style guide implemented in [`.clang-format`](.clang-format) file. [`clang-format`](https://clang.llvm.org/docs/ClangFormat.html) version 7 or later should be used to format the code prior to submitting it. E.g. with 67 | ```sh 68 | multi-gpu-programming-models$ cd multi_threaded_copy 69 | multi_threaded_copy$ clang-format -style=file -i jacobi.cu 70 | ``` 71 | 72 | [^1]: A check for CUDA-aware support is done at compile and run time (see [the OpenMPI FAQ](https://www.open-mpi.org/faq/?category=runcuda#mpi-cuda-aware-support) for details). If your CUDA-aware MPI implementation does not support this check, which requires `MPIX_CUDA_AWARE_SUPPORT` and `MPIX_Query_cuda_support()` to be defined in `mpi-ext.h`, it can be skipped by setting `SKIP_CUDA_AWARENESS_CHECK=1`. 73 | -------------------------------------------------------------------------------- /bench.sbatch.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #SBATCH -p batch 4 | #SBATCH -N 1 5 | #SBATCH -n 8 6 | #SBATCH -t 02:00:00 7 | 8 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 9 | # 10 | # Redistribution and use in source and binary forms, with or without 11 | # modification, are permitted provided that the following conditions 12 | # are met: 13 | # * Redistributions of source code must retain the above copyright 14 | # notice, this list of conditions and the following disclaimer. 15 | # * Redistributions in binary form must reproduce the above copyright 16 | # notice, this list of conditions and the following disclaimer in the 17 | # documentation and/or other materials provided with the distribution. 18 | # * Neither the name of NVIDIA CORPORATION nor the names of its 19 | # contributors may be used to endorse or promote products derived 20 | # from this software without specific prior written permission. 21 | # 22 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 23 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 24 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 25 | # PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR 26 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 27 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 28 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 29 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 30 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 31 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 32 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 33 | 34 | : "${ENROOT_IMG_PATH:=.}" 35 | : "${LUSTRE:=.}" 36 | 37 | IMG=nvcr.io/nvidia/nvhpc:24.1-devel-cuda12.3-ubuntu22.04 38 | SQUASHFS_IMG=$ENROOT_IMG_PATH/`echo "$IMG" | md5sum | cut -f1 -d " "` 39 | CONTAINER_NAME=HPCSDK-CONTAINER 40 | 41 | CONTAINER_MNTS=$LUSTRE/workspace/multi-gpu-programming-models:/mnt 42 | 43 | start=`date` 44 | 45 | if [[ -f "$SQUASHFS_IMG" ]]; then 46 | echo "Using: $SQUASHFS_IMG" 47 | else 48 | echo "Fetching $IMG to $SQUASHFS_IMG" 49 | srun -n 1 -N 1 --ntasks-per-node=1 enroot import -o $SQUASHFS_IMG docker://$IMG 50 | echo "$IMG" > "${SQUASHFS_IMG}.url" 51 | fi 52 | 53 | CONTAINER_IMG=$SQUASHFS_IMG 54 | 55 | if [[ ! -f "$CONTAINER_IMG" ]]; then 56 | echo "Falling back to $IMG" 57 | CONTAINER_IMG=$IMG 58 | fi 59 | 60 | # Pulling container on all nodes 61 | srun -N ${SLURM_JOB_NUM_NODES} \ 62 | -n ${SLURM_JOB_NUM_NODES} \ 63 | --ntasks-per-node=1 \ 64 | --container-image=$CONTAINER_IMG \ 65 | --container-name=$CONTAINER_NAME \ 66 | true 67 | 68 | export SRUN_ARGS="--cpu-bind=none --mpi=none --no-container-remap-root --container-mounts=$CONTAINER_MNTS --container-workdir=/mnt --container-name=$CONTAINER_NAME" 69 | 70 | # HCOLL is not used silence HCOLL warnings when running on a node without a IB HCA 71 | export OMPI_MCA_coll_hcoll_enable=0 72 | 73 | export MPIRUN_ARGS="--oversubscribe" 74 | 75 | #rebuild executables 76 | srun $SRUN_ARGS -n 1 /bin/bash -c "./test.sh clean; sleep 1; ./test.sh" 77 | 78 | srun -n 1 /bin/bash -c "sudo nvidia-smi -lgc 1980,1980" 79 | 80 | srun $SRUN_ARGS -n 1 ./bench.sh 81 | 82 | srun $SRUN_ARGS -n 1 /bin/bash -c "nvidia-smi; modinfo gdrdrv; env; nvcc --version; mpicxx --version" 83 | 84 | -------------------------------------------------------------------------------- /bench.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright (c) 2017-2019 NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Redistribution and use in source and binary forms, with or without 5 | # modification, are permitted provided that the following conditions 6 | # are met: 7 | # * Redistributions of source code must retain the above copyright 8 | # notice, this list of conditions and the following disclaimer. 9 | # * Redistributions in binary form must reproduce the above copyright 10 | # notice, this list of conditions and the following disclaimer in the 11 | # documentation and/or other materials provided with the distribution. 12 | # * Neither the name of NVIDIA CORPORATION nor the names of its 13 | # contributors may be used to endorse or promote products derived 14 | # from this software without specific prior written permission. 15 | # 16 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 17 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 19 | # PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR 20 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 21 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 22 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 23 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 24 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 25 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 26 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 27 | 28 | NREP=5 29 | NXNY="20480" 30 | 31 | #DGX-1V 32 | #CPUID=0-19 33 | #FIRST_CORE=0 34 | #MAX_NUM_GPUS=8 35 | #CUDA_VISIBLE_DEVICES_SETTING=("0" "0" "0,3" "0,3,2" "0,3,2,1" "3,2,1,5,7" "0,3,2,1,5,4" "0,4,7,6,5,1,2" "0,3,2,1,5,6,7,4" ) 36 | #MPI_CPU_BINDING_OPT=("--bind-to" "core" "--map-by" "core") 37 | 38 | #DGX-2 39 | #CPUID=0-23 40 | #FIRST_CORE=0 41 | #MAX_NUM_GPUS=16 42 | #CUDA_VISIBLE_DEVICES_SETTING=("0" "0" "0,1" "0,1,2" "0,1,2,3" "0,1,2,3,4" "0,1,2,3,4,5" "0,1,2,3,4,5,6" "0,1,2,3,4,5,6,7" "0,1,2,3,4,5,6,7,8" "0,1,2,3,4,5,6,7,8,9" "0,1,2,3,4,5,6,7,8,9,10" "0,1,2,3,4,5,6,7,8,9,10,11" "0,1,2,3,4,5,6,7,8,9,10,11,12" "0,1,2,3,4,5,6,7,8,9,10,11,12,13" "0,1,2,3,4,5,6,7,8,9,10,11,12,13,14" "0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15" ) 43 | #MPI_CPU_BINDING_OPT=("--bind-to" "core" "--map-by" "core") 44 | 45 | #DGX-A100 46 | #CPUID=48-63 47 | #FIRST_CORE=48 48 | #MAX_NUM_GPUS=8 49 | #CUDA_VISIBLE_DEVICES_SETTING=("0" "0" "0,1" "0,1,2" "0,1,2,3" "0,1,2,3,4" "0,1,2,3,4,5" "0,1,2,3,4,5,6" "0,1,2,3,4,5,6,7" ) 50 | #MPI_CPU_BINDING_OPT=("--bind-to" "cpu-list:ordered" "--cpu-list" "48,49,50,51,52,53,54,55") 51 | 52 | #DGX-H100 53 | CPUID=0-55 54 | FIRST_CORE=0 55 | MAX_NUM_GPUS=8 56 | CUDA_VISIBLE_DEVICES_SETTING=("0" "0" "0,1" "0,1,2" "0,1,2,3" "0,1,2,3,4" "0,1,2,3,4,5" "0,1,2,3,4,5,6" "0,1,2,3,4,5,6,7" ) 57 | MPI_CPU_BINDING_OPT=("--bind-to" "core" "--map-by" "core") 58 | 59 | IFS=$'\n' 60 | function find_best () { 61 | declare -a RESULTS 62 | for ((i=0; i<$NREP; i++)); do 63 | RESULTS+=($("$@")) 64 | done 65 | printf '%s\n' "${RESULTS[@]}" | sort -k8 -b -t',' | head -1 66 | unset RESULTS 67 | } 68 | 69 | #Single GPU 70 | if true; then 71 | echo "type, nx, ny, iter_max, nccheck, runtime" 72 | export CUDA_VISIBLE_DEVICES="0" 73 | for (( nx=1024; nx <= 20*1024; nx+=1024 )); do 74 | find_best taskset -c ${CPUID} ./single_gpu/jacobi -csv -nx $nx -ny $nx 75 | done 76 | fi 77 | 78 | if false; then 79 | echo "type, nx, ny, iter_max, nccheck, runtime" 80 | export CUDA_VISIBLE_DEVICES="0" 81 | find_best taskset -c ${CPUID} ./single_gpu/jacobi -csv -nx ${NXNY} -ny ${NXNY} 82 | fi 83 | 84 | echo "type, nx, ny, iter_max, nccheck, num_devices, p2p, runtime, runtime_serial" 85 | 86 | #Single threaded copy - no P2P 87 | if false; then 88 | 89 | for (( NUM_GPUS=1; NUM_GPUS <= ${MAX_NUM_GPUS}; NUM_GPUS+=1 )); do 90 | export CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES_SETTING[${NUM_GPUS}]} 91 | find_best taskset -c ${CPUID} ./single_threaded_copy/jacobi -csv -nx ${NXNY} -ny ${NXNY} -nop2p 92 | done 93 | 94 | fi 95 | 96 | # Single threaded copy - P2P 97 | if false; then 98 | 99 | for (( NUM_GPUS=1; NUM_GPUS <= ${MAX_NUM_GPUS}; NUM_GPUS+=1 )); do 100 | export CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES_SETTING[${NUM_GPUS}]} 101 | find_best taskset -c ${CPUID} ./single_threaded_copy/jacobi -csv -nx ${NXNY} -ny ${NXNY} 102 | done 103 | 104 | fi 105 | 106 | #multi threaded copy without thread pinning 107 | if false; then 108 | 109 | export OMP_PROC_BIND=FALSE 110 
| unset OMP_PLACES 111 | 112 | for (( NUM_GPUS=1; NUM_GPUS <= ${MAX_NUM_GPUS}; NUM_GPUS+=1 )); do 113 | export CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES_SETTING[${NUM_GPUS}]} 114 | find_best ./multi_threaded_copy/jacobi -csv -nx ${NXNY} -ny ${NXNY} 115 | done 116 | 117 | fi 118 | 119 | export OMP_PROC_BIND=TRUE 120 | 121 | #multi threaded copy 122 | if false; then 123 | 124 | NEXT_CORE=${FIRST_CORE} 125 | OMP_PLACES="{$((NEXT_CORE))}" 126 | NEXT_CORE=$((NEXT_CORE+1)) 127 | for (( NUM_GPUS=1; NUM_GPUS <= ${MAX_NUM_GPUS}; NUM_GPUS+=1 )); do 128 | if (( NUM_GPUS > 1 )); then 129 | OMP_PLACES="${OMP_PLACES},{$((NEXT_CORE))}" 130 | NEXT_CORE=$((NEXT_CORE+1)) 131 | fi 132 | export CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES_SETTING[${NUM_GPUS}]} 133 | export OMP_PLACES 134 | find_best ./multi_threaded_copy/jacobi -csv -nx ${NXNY} -ny ${NXNY} 135 | done 136 | 137 | fi 138 | 139 | #multi threaded copy overlap 140 | if false; then 141 | 142 | NEXT_CORE=${FIRST_CORE} 143 | OMP_PLACES="{$((NEXT_CORE))}" 144 | NEXT_CORE=$((NEXT_CORE+1)) 145 | for (( NUM_GPUS=1; NUM_GPUS <= ${MAX_NUM_GPUS}; NUM_GPUS+=1 )); do 146 | if (( NUM_GPUS > 1 )); then 147 | OMP_PLACES="${OMP_PLACES},{$((NEXT_CORE))}" 148 | NEXT_CORE=$((NEXT_CORE+1)) 149 | fi 150 | export CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES_SETTING[${NUM_GPUS}]} 151 | export OMP_PLACES 152 | find_best ./multi_threaded_copy_overlap/jacobi -csv -nx ${NXNY} -ny ${NXNY} 153 | done 154 | 155 | fi 156 | 157 | #multi threaded p2p 158 | if false; then 159 | 160 | NEXT_CORE=${FIRST_CORE} 161 | OMP_PLACES="{$((NEXT_CORE))}" 162 | NEXT_CORE=$((NEXT_CORE+1)) 163 | for (( NUM_GPUS=1; NUM_GPUS <= ${MAX_NUM_GPUS}; NUM_GPUS+=1 )); do 164 | if (( NUM_GPUS > 1 )); then 165 | OMP_PLACES="${OMP_PLACES},{$((NEXT_CORE))}" 166 | NEXT_CORE=$((NEXT_CORE+1)) 167 | fi 168 | export CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES_SETTING[${NUM_GPUS}]} 169 | export OMP_PLACES 170 | find_best ./multi_threaded_p2p/jacobi -csv -nx ${NXNY} -ny ${NXNY} 171 | done 172 | 173 | fi 174 | 175 | #multi threaded p2p with delayed check 176 | if false; then 177 | 178 | NEXT_CORE=${FIRST_CORE} 179 | OMP_PLACES="{$((NEXT_CORE))}" 180 | NEXT_CORE=$((NEXT_CORE+1)) 181 | for (( NUM_GPUS=1; NUM_GPUS <= ${MAX_NUM_GPUS}; NUM_GPUS+=1 )); do 182 | if (( NUM_GPUS > 1 )); then 183 | OMP_PLACES="${OMP_PLACES},{$((NEXT_CORE))}" 184 | NEXT_CORE=$((NEXT_CORE+1)) 185 | fi 186 | export CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES_SETTING[${NUM_GPUS}]} 187 | export OMP_PLACES 188 | find_best ./multi_threaded_p2p_opt/jacobi -csv -nx ${NXNY} -ny ${NXNY} 189 | done 190 | 191 | fi 192 | 193 | if true; then 194 | 195 | for (( NUM_GPUS=1; NUM_GPUS <= ${MAX_NUM_GPUS}; NUM_GPUS+=1 )); do 196 | export CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES_SETTING[${NUM_GPUS}]} 197 | find_best mpirun ${MPIRUN_ARGS} -np ${NUM_GPUS} -x CUDA_VISIBLE_DEVICES "${MPI_CPU_BINDING_OPT[@]}" ./mpi/jacobi -csv -nx ${NXNY} -ny ${NXNY} 198 | done 199 | 200 | fi 201 | 202 | if true; then 203 | 204 | for (( NUM_GPUS=1; NUM_GPUS <= ${MAX_NUM_GPUS}; NUM_GPUS+=1 )); do 205 | export CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES_SETTING[${NUM_GPUS}]} 206 | find_best mpirun ${MPIRUN_ARGS} -np ${NUM_GPUS} -x CUDA_VISIBLE_DEVICES "${MPI_CPU_BINDING_OPT[@]}" ./mpi_overlap/jacobi -csv -nx ${NXNY} -ny ${NXNY} 207 | done 208 | 209 | fi 210 | 211 | if true; then 212 | 213 | for (( NUM_GPUS=1; NUM_GPUS <= ${MAX_NUM_GPUS}; NUM_GPUS+=1 )); do 214 | export CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES_SETTING[${NUM_GPUS}]} 215 | find_best mpirun ${MPIRUN_ARGS} -np 
${NUM_GPUS} -x CUDA_VISIBLE_DEVICES "${MPI_CPU_BINDING_OPT[@]}" ./nccl/jacobi -csv -nx ${NXNY} -ny ${NXNY} 216 | done 217 | 218 | fi 219 | 220 | if true; then 221 | 222 | for (( NUM_GPUS=1; NUM_GPUS <= ${MAX_NUM_GPUS}; NUM_GPUS+=1 )); do 223 | export CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES_SETTING[${NUM_GPUS}]} 224 | find_best mpirun ${MPIRUN_ARGS} -np ${NUM_GPUS} -x CUDA_VISIBLE_DEVICES "${MPI_CPU_BINDING_OPT[@]}" ./nccl_overlap/jacobi -csv -nx ${NXNY} -ny ${NXNY} 225 | done 226 | 227 | fi 228 | 229 | if true; then 230 | 231 | for (( NUM_GPUS=1; NUM_GPUS <= ${MAX_NUM_GPUS}; NUM_GPUS+=1 )); do 232 | export CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES_SETTING[${NUM_GPUS}]} 233 | find_best mpirun ${MPIRUN_ARGS} -np ${NUM_GPUS} -x CUDA_VISIBLE_DEVICES "${MPI_CPU_BINDING_OPT[@]}" ./nccl_graphs/jacobi -csv -nx ${NXNY} -ny ${NXNY} 234 | done 235 | 236 | fi 237 | 238 | if true; then 239 | 240 | export NVSHMEM_SYMMETRIC_SIZE=3690987520 241 | for (( NUM_GPUS=1; NUM_GPUS <= ${MAX_NUM_GPUS}; NUM_GPUS+=1 )); do 242 | export CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES_SETTING[${NUM_GPUS}]} 243 | find_best mpirun ${MPIRUN_ARGS} -np ${NUM_GPUS} -x CUDA_VISIBLE_DEVICES -x NVSHMEM_SYMMETRIC_SIZE "${MPI_CPU_BINDING_OPT[@]}" ./nvshmem/jacobi -csv -nx ${NXNY} -ny ${NXNY} 244 | done 245 | 246 | export NVSHMEM_SYMMETRIC_SIZE=3690987520 247 | for (( NUM_GPUS=1; NUM_GPUS <= ${MAX_NUM_GPUS}; NUM_GPUS+=1 )); do 248 | export CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES_SETTING[${NUM_GPUS}]} 249 | find_best mpirun ${MPIRUN_ARGS} -np ${NUM_GPUS} -x CUDA_VISIBLE_DEVICES -x NVSHMEM_SYMMETRIC_SIZE "${MPI_CPU_BINDING_OPT[@]}" ./nvshmem/jacobi -csv -neighborhood_sync -nx ${NXNY} -ny ${NXNY} 250 | done 251 | 252 | export NVSHMEM_SYMMETRIC_SIZE=3690987520 253 | for (( NUM_GPUS=1; NUM_GPUS <= ${MAX_NUM_GPUS}; NUM_GPUS+=1 )); do 254 | export CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES_SETTING[${NUM_GPUS}]} 255 | find_best mpirun ${MPIRUN_ARGS} -np ${NUM_GPUS} -x CUDA_VISIBLE_DEVICES -x NVSHMEM_SYMMETRIC_SIZE "${MPI_CPU_BINDING_OPT[@]}" ./nvshmem/jacobi -csv -neighborhood_sync -norm_overlap -nx ${NXNY} -ny ${NXNY} 256 | done 257 | 258 | fi 259 | -------------------------------------------------------------------------------- /mpi/Makefile: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved. 
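# Typical usage: `make` builds the `jacobi` binary, `make run NP=<ranks>` launches it through
# $(MPIRUN), and DISABLE_CUB=1 or SKIP_CUDA_AWARENESS_CHECK=1 can be passed on the make
# command line to toggle the corresponding conditionals below.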
2 | NP ?= 1 3 | NVCC=nvcc 4 | MPICXX=mpicxx 5 | MPIRUN ?= mpirun 6 | CUDA_HOME ?= /usr/local/cuda 7 | GENCODE_SM30 := -gencode arch=compute_30,code=sm_30 8 | GENCODE_SM35 := -gencode arch=compute_35,code=sm_35 9 | GENCODE_SM37 := -gencode arch=compute_37,code=sm_37 10 | GENCODE_SM50 := -gencode arch=compute_50,code=sm_50 11 | GENCODE_SM52 := -gencode arch=compute_52,code=sm_52 12 | GENCODE_SM60 := -gencode arch=compute_60,code=sm_60 13 | GENCODE_SM70 := -gencode arch=compute_70,code=sm_70 14 | GENCODE_SM80 := -gencode arch=compute_80,code=sm_80 15 | GENCODE_SM90 := -gencode arch=compute_90,code=sm_90 -gencode arch=compute_90,code=compute_90 16 | GENCODE_FLAGS := $(GENCODE_SM70) $(GENCODE_SM80) $(GENCODE_SM90) 17 | ifdef DISABLE_CUB 18 | NVCC_FLAGS = -Xptxas --optimize-float-atomics 19 | else 20 | NVCC_FLAGS = -DHAVE_CUB 21 | endif 22 | ifdef SKIP_CUDA_AWARENESS_CHECK 23 | MPICXX_FLAGS = -DSKIP_CUDA_AWARENESS_CHECK 24 | endif 25 | NVCC_FLAGS += -lineinfo $(GENCODE_FLAGS) -std=c++14 26 | MPICXX_FLAGS += -DUSE_NVTX -I$(CUDA_HOME)/include -std=c++14 27 | LD_FLAGS = -L$(CUDA_HOME)/lib64 -lcudart -ldl 28 | jacobi: Makefile jacobi.cpp jacobi_kernels.o 29 | $(MPICXX) $(MPICXX_FLAGS) jacobi.cpp jacobi_kernels.o $(LD_FLAGS) -o jacobi 30 | 31 | jacobi_kernels.o: Makefile jacobi_kernels.cu 32 | $(NVCC) $(NVCC_FLAGS) jacobi_kernels.cu -c 33 | 34 | .PHONY.: clean 35 | clean: 36 | rm -f jacobi jacobi_kernels.o *.nsys-rep jacobi.*.compute-sanitizer.log 37 | 38 | sanitize: jacobi 39 | $(MPIRUN) -np $(NP) compute-sanitizer --log-file jacobi.%q{OMPI_COMM_WORLD_RANK}.compute-sanitizer.log ./jacobi -niter 10 40 | 41 | run: jacobi 42 | $(MPIRUN) -np $(NP) ./jacobi 43 | 44 | profile: jacobi 45 | $(MPIRUN) -np $(NP) nsys profile --trace=mpi,cuda,nvtx -o jacobi.%q{OMPI_COMM_WORLD_RANK} ./jacobi -niter 10 46 | -------------------------------------------------------------------------------- /mpi/jacobi.cpp: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved. 2 | * 3 | * Redistribution and use in source and binary forms, with or without 4 | * modification, are permitted provided that the following conditions 5 | * are met: 6 | * * Redistributions of source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer. 8 | * * Redistributions in binary form must reproduce the above copyright 9 | * notice, this list of conditions and the following disclaimer in the 10 | * documentation and/or other materials provided with the distribution. 11 | * * Neither the name of NVIDIA CORPORATION nor the names of its 12 | * contributors may be used to endorse or promote products derived 13 | * from this software without specific prior written permission. 14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | */ 27 | #include 28 | #include 29 | #include 30 | #include 31 | #include 32 | #include 33 | 34 | #include 35 | 36 | #ifndef SKIP_CUDA_AWARENESS_CHECK 37 | #include 38 | #if !defined(MPIX_CUDA_AWARE_SUPPORT) || !MPIX_CUDA_AWARE_SUPPORT 39 | #error "The used MPI Implementation does not have CUDA-aware support or CUDA-aware \ 40 | support can't be determined. Define SKIP_CUDA_AWARENESS_CHECK to skip this check." 41 | #endif 42 | #endif 43 | 44 | #define MPI_CALL(call) \ 45 | { \ 46 | int mpi_status = call; \ 47 | if (MPI_SUCCESS != mpi_status) { \ 48 | char mpi_error_string[MPI_MAX_ERROR_STRING]; \ 49 | int mpi_error_string_length = 0; \ 50 | MPI_Error_string(mpi_status, mpi_error_string, &mpi_error_string_length); \ 51 | if (NULL != mpi_error_string) \ 52 | fprintf(stderr, \ 53 | "ERROR: MPI call \"%s\" in line %d of file %s failed " \ 54 | "with %s " \ 55 | "(%d).\n", \ 56 | #call, __LINE__, __FILE__, mpi_error_string, mpi_status); \ 57 | else \ 58 | fprintf(stderr, \ 59 | "ERROR: MPI call \"%s\" in line %d of file %s failed " \ 60 | "with %d.\n", \ 61 | #call, __LINE__, __FILE__, mpi_status); \ 62 | exit( mpi_status ); \ 63 | } \ 64 | } 65 | 66 | #include 67 | 68 | #ifdef USE_NVTX 69 | #include 70 | 71 | const uint32_t colors[] = {0x0000ff00, 0x000000ff, 0x00ffff00, 0x00ff00ff, 72 | 0x0000ffff, 0x00ff0000, 0x00ffffff}; 73 | const int num_colors = sizeof(colors) / sizeof(uint32_t); 74 | 75 | #define PUSH_RANGE(name, cid) \ 76 | { \ 77 | int color_id = cid; \ 78 | color_id = color_id % num_colors; \ 79 | nvtxEventAttributes_t eventAttrib = {0}; \ 80 | eventAttrib.version = NVTX_VERSION; \ 81 | eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE; \ 82 | eventAttrib.colorType = NVTX_COLOR_ARGB; \ 83 | eventAttrib.color = colors[color_id]; \ 84 | eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII; \ 85 | eventAttrib.message.ascii = name; \ 86 | nvtxRangePushEx(&eventAttrib); \ 87 | } 88 | #define POP_RANGE nvtxRangePop(); 89 | #else 90 | #define PUSH_RANGE(name, cid) 91 | #define POP_RANGE 92 | #endif 93 | 94 | #define CUDA_RT_CALL(call) \ 95 | { \ 96 | cudaError_t cudaStatus = call; \ 97 | if (cudaSuccess != cudaStatus) { \ 98 | fprintf(stderr, \ 99 | "ERROR: CUDA RT call \"%s\" in line %d of file %s failed " \ 100 | "with " \ 101 | "%s (%d).\n", \ 102 | #call, __LINE__, __FILE__, cudaGetErrorString(cudaStatus), cudaStatus); \ 103 | exit( cudaStatus ); \ 104 | } \ 105 | } 106 | 107 | #ifdef USE_DOUBLE 108 | typedef double real; 109 | #define MPI_REAL_TYPE MPI_DOUBLE 110 | #else 111 | typedef float real; 112 | #define MPI_REAL_TYPE MPI_FLOAT 113 | #endif 114 | 115 | constexpr real tol = 1.0e-8; 116 | 117 | const real PI = 2.0 * std::asin(1.0); 118 | 119 | void launch_initialize_boundaries(real* __restrict__ const a_new, real* __restrict__ const a, 120 | const real pi, const int offset, const int nx, const int my_ny, 121 | const int ny); 122 | 123 | void launch_jacobi_kernel(real* __restrict__ const a_new, const real* __restrict__ const a, 
124 | real* __restrict__ const l2_norm, const int iy_start, const int iy_end, 125 | const int nx, const bool calculate_norm, cudaStream_t stream); 126 | 127 | double single_gpu(const int nx, const int ny, const int iter_max, real* const a_ref_h, 128 | const int nccheck, const bool print); 129 | 130 | template 131 | T get_argval(char** begin, char** end, const std::string& arg, const T default_val) { 132 | T argval = default_val; 133 | char** itr = std::find(begin, end, arg); 134 | if (itr != end && ++itr != end) { 135 | std::istringstream inbuf(*itr); 136 | inbuf >> argval; 137 | } 138 | return argval; 139 | } 140 | 141 | bool get_arg(char** begin, char** end, const std::string& arg) { 142 | char** itr = std::find(begin, end, arg); 143 | if (itr != end) { 144 | return true; 145 | } 146 | return false; 147 | } 148 | 149 | int main(int argc, char* argv[]) { 150 | MPI_CALL(MPI_Init(&argc, &argv)); 151 | #if !defined(SKIP_CUDA_AWARENESS_CHECK) && defined(MPIX_CUDA_AWARE_SUPPORT) 152 | if (1 != MPIX_Query_cuda_support()) { 153 | fprintf(stderr, "The used MPI Implementation does not have CUDA-aware support enabled!\n"); 154 | MPI_CALL(MPI_Finalize()); 155 | return -1; 156 | } 157 | #endif 158 | int rank; 159 | MPI_CALL(MPI_Comm_rank(MPI_COMM_WORLD, &rank)); 160 | int size; 161 | MPI_CALL(MPI_Comm_size(MPI_COMM_WORLD, &size)); 162 | int num_devices = 0; 163 | CUDA_RT_CALL(cudaGetDeviceCount(&num_devices)); 164 | 165 | const int iter_max = get_argval(argv, argv + argc, "-niter", 1000); 166 | const int nccheck = get_argval(argv, argv + argc, "-nccheck", 1); 167 | const int nx = get_argval(argv, argv + argc, "-nx", 16384); 168 | const int ny = get_argval(argv, argv + argc, "-ny", 16384); 169 | const bool csv = get_arg(argv, argv + argc, "-csv"); 170 | 171 | int local_rank = -1; 172 | { 173 | MPI_Comm local_comm; 174 | MPI_CALL(MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, rank, MPI_INFO_NULL, 175 | &local_comm)); 176 | 177 | MPI_CALL(MPI_Comm_rank(local_comm, &local_rank)); 178 | 179 | MPI_CALL(MPI_Comm_free(&local_comm)); 180 | } 181 | 182 | CUDA_RT_CALL(cudaSetDevice(local_rank%num_devices)); 183 | CUDA_RT_CALL(cudaFree(0)); 184 | 185 | real* a_ref_h; 186 | CUDA_RT_CALL(cudaMallocHost(&a_ref_h, nx * ny * sizeof(real))); 187 | real* a_h; 188 | CUDA_RT_CALL(cudaMallocHost(&a_h, nx * ny * sizeof(real))); 189 | double runtime_serial = single_gpu(nx, ny, iter_max, a_ref_h, nccheck, !csv && (0 == rank)); 190 | 191 | // ny - 2 rows are distributed amongst `size` ranks in such a way 192 | // that each rank gets either (ny - 2) / size or (ny - 2) / size + 1 rows. 
193 | // This optimizes load balancing when (ny - 2) % size != 0 194 | int chunk_size; 195 | int chunk_size_low = (ny - 2) / size; 196 | int chunk_size_high = chunk_size_low + 1; 197 | // To calculate the number of ranks that need to compute an extra row, 198 | // the following formula is derived from this equation: 199 | // num_ranks_low * chunk_size_low + (size - num_ranks_low) * (chunk_size_low + 1) = ny - 2 200 | int num_ranks_low = size * chunk_size_low + size - 201 | (ny - 2); // Number of ranks with chunk_size = chunk_size_low 202 | if (rank < num_ranks_low) 203 | chunk_size = chunk_size_low; 204 | else 205 | chunk_size = chunk_size_high; 206 | 207 | real* a; 208 | CUDA_RT_CALL(cudaMalloc(&a, nx * (chunk_size + 2) * sizeof(real))); 209 | real* a_new; 210 | CUDA_RT_CALL(cudaMalloc(&a_new, nx * (chunk_size + 2) * sizeof(real))); 211 | 212 | CUDA_RT_CALL(cudaMemset(a, 0, nx * (chunk_size + 2) * sizeof(real))); 213 | CUDA_RT_CALL(cudaMemset(a_new, 0, nx * (chunk_size + 2) * sizeof(real))); 214 | 215 | // Calculate local domain boundaries 216 | int iy_start_global; // My start index in the global array 217 | if (rank < num_ranks_low) { 218 | iy_start_global = rank * chunk_size_low + 1; 219 | } else { 220 | iy_start_global = 221 | num_ranks_low * chunk_size_low + (rank - num_ranks_low) * chunk_size_high + 1; 222 | } 223 | int iy_end_global = iy_start_global + chunk_size - 1; // My last index in the global array 224 | 225 | int iy_start = 1; 226 | int iy_end = iy_start + chunk_size; 227 | 228 | // Set Dirichlet boundary conditions on left and right border 229 | launch_initialize_boundaries(a, a_new, PI, iy_start_global - 1, nx, (chunk_size + 2), ny); 230 | CUDA_RT_CALL(cudaDeviceSynchronize()); 231 | 232 | cudaStream_t compute_stream; 233 | CUDA_RT_CALL(cudaStreamCreate(&compute_stream)); 234 | cudaEvent_t compute_done; 235 | CUDA_RT_CALL(cudaEventCreateWithFlags(&compute_done, cudaEventDisableTiming)); 236 | 237 | real* l2_norm_d; 238 | CUDA_RT_CALL(cudaMalloc(&l2_norm_d, sizeof(real))); 239 | real* l2_norm_h; 240 | CUDA_RT_CALL(cudaMallocHost(&l2_norm_h, sizeof(real))); 241 | 242 | PUSH_RANGE("MPI_Warmup", 5) 243 | for (int i = 0; i < 10; ++i) { 244 | const int top = rank > 0 ?
rank - 1 : (size - 1); 245 | const int bottom = (rank + 1) % size; 246 | MPI_CALL(MPI_Sendrecv(a_new + iy_start * nx, nx, MPI_REAL_TYPE, top, 0, 247 | a_new + (iy_end * nx), nx, MPI_REAL_TYPE, bottom, 0, MPI_COMM_WORLD, 248 | MPI_STATUS_IGNORE)); 249 | MPI_CALL(MPI_Sendrecv(a_new + (iy_end - 1) * nx, nx, MPI_REAL_TYPE, bottom, 0, a_new, nx, 250 | MPI_REAL_TYPE, top, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE)); 251 | std::swap(a_new, a); 252 | } 253 | POP_RANGE 254 | 255 | CUDA_RT_CALL(cudaDeviceSynchronize()); 256 | 257 | if (!csv && 0 == rank) { 258 | printf( 259 | "Jacobi relaxation: %d iterations on %d x %d mesh with norm check " 260 | "every %d iterations\n", 261 | iter_max, ny, nx, nccheck); 262 | } 263 | 264 | int iter = 0; 265 | real l2_norm = 1.0; 266 | bool calculate_norm = true; // boolean to store whether l2 norm will be calculated in 267 | // an iteration or not 268 | 269 | MPI_CALL(MPI_Barrier(MPI_COMM_WORLD)); 270 | double start = MPI_Wtime(); 271 | PUSH_RANGE("Jacobi solve", 0) 272 | while (l2_norm > tol && iter < iter_max) { 273 | CUDA_RT_CALL(cudaMemsetAsync(l2_norm_d, 0, sizeof(real), compute_stream)); 274 | 275 | calculate_norm = (iter % nccheck) == 0 || (!csv && (iter % 100) == 0); 276 | 277 | launch_jacobi_kernel(a_new, a, l2_norm_d, iy_start, iy_end, nx, calculate_norm, 278 | compute_stream); 279 | CUDA_RT_CALL(cudaEventRecord(compute_done, compute_stream)); 280 | 281 | if (calculate_norm) { 282 | CUDA_RT_CALL(cudaMemcpyAsync(l2_norm_h, l2_norm_d, sizeof(real), cudaMemcpyDeviceToHost, 283 | compute_stream)); 284 | } 285 | 286 | const int top = rank > 0 ? rank - 1 : (size - 1); 287 | const int bottom = (rank + 1) % size; 288 | 289 | // Apply periodic boundary conditions 290 | CUDA_RT_CALL(cudaEventSynchronize(compute_done)); 291 | PUSH_RANGE("MPI", 5) 292 | MPI_CALL(MPI_Sendrecv(a_new + iy_start * nx, nx, MPI_REAL_TYPE, top, 0, 293 | a_new + (iy_end * nx), nx, MPI_REAL_TYPE, bottom, 0, MPI_COMM_WORLD, 294 | MPI_STATUS_IGNORE)); 295 | MPI_CALL(MPI_Sendrecv(a_new + (iy_end - 1) * nx, nx, MPI_REAL_TYPE, bottom, 0, a_new, nx, 296 | MPI_REAL_TYPE, top, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE)); 297 | POP_RANGE 298 | 299 | if (calculate_norm) { 300 | CUDA_RT_CALL(cudaStreamSynchronize(compute_stream)); 301 | MPI_CALL(MPI_Allreduce(l2_norm_h, &l2_norm, 1, MPI_REAL_TYPE, MPI_SUM, MPI_COMM_WORLD)); 302 | l2_norm = std::sqrt(l2_norm); 303 | 304 | if (!csv && 0 == rank && (iter % 100) == 0) { 305 | printf("%5d, %0.6f\n", iter, l2_norm); 306 | } 307 | } 308 | 309 | std::swap(a_new, a); 310 | iter++; 311 | } 312 | double stop = MPI_Wtime(); 313 | POP_RANGE 314 | 315 | CUDA_RT_CALL(cudaMemcpy(a_h + iy_start_global * nx, a + nx, 316 | std::min((ny - iy_start_global) * nx, chunk_size * nx) * sizeof(real), 317 | cudaMemcpyDeviceToHost)); 318 | 319 | int result_correct = 1; 320 | for (int iy = iy_start_global; result_correct && (iy < iy_end_global); ++iy) { 321 | for (int ix = 1; result_correct && (ix < (nx - 1)); ++ix) { 322 | if (std::fabs(a_ref_h[iy * nx + ix] - a_h[iy * nx + ix]) > tol) { 323 | fprintf(stderr, 324 | "ERROR on rank %d: a[%d * %d + %d] = %f does not match %f " 325 | "(reference)\n", 326 | rank, iy, nx, ix, a_h[iy * nx + ix], a_ref_h[iy * nx + ix]); 327 | result_correct = 0; 328 | } 329 | } 330 | } 331 | 332 | int global_result_correct = 1; 333 | MPI_CALL(MPI_Allreduce(&result_correct, &global_result_correct, 1, MPI_INT, MPI_MIN, 334 | MPI_COMM_WORLD)); 335 | result_correct = global_result_correct; 336 | 337 | if (rank == 0 && result_correct) { 338 | if (csv) { 339 | 
printf("mpi, %d, %d, %d, %d, %d, 1, %f, %f\n", nx, ny, iter_max, nccheck, size, 340 | (stop - start), runtime_serial); 341 | } else { 342 | printf("Num GPUs: %d.\n", size); 343 | printf( 344 | "%dx%d: 1 GPU: %8.4f s, %d GPUs: %8.4f s, speedup: %8.2f, " 345 | "efficiency: %8.2f \n", 346 | ny, nx, runtime_serial, size, (stop - start), runtime_serial / (stop - start), 347 | runtime_serial / (size * (stop - start)) * 100); 348 | } 349 | } 350 | CUDA_RT_CALL(cudaEventDestroy(compute_done)); 351 | CUDA_RT_CALL(cudaStreamDestroy(compute_stream)); 352 | 353 | CUDA_RT_CALL(cudaFreeHost(l2_norm_h)); 354 | CUDA_RT_CALL(cudaFree(l2_norm_d)); 355 | 356 | CUDA_RT_CALL(cudaFree(a_new)); 357 | CUDA_RT_CALL(cudaFree(a)); 358 | 359 | CUDA_RT_CALL(cudaFreeHost(a_h)); 360 | CUDA_RT_CALL(cudaFreeHost(a_ref_h)); 361 | 362 | MPI_CALL(MPI_Finalize()); 363 | return (result_correct == 1) ? 0 : 1; 364 | } 365 | 366 | double single_gpu(const int nx, const int ny, const int iter_max, real* const a_ref_h, 367 | const int nccheck, const bool print) { 368 | real* a; 369 | real* a_new; 370 | 371 | cudaStream_t compute_stream; 372 | cudaStream_t push_top_stream; 373 | cudaStream_t push_bottom_stream; 374 | cudaEvent_t compute_done; 375 | cudaEvent_t push_top_done; 376 | cudaEvent_t push_bottom_done; 377 | 378 | real* l2_norm_d; 379 | real* l2_norm_h; 380 | 381 | int iy_start = 1; 382 | int iy_end = (ny - 1); 383 | 384 | CUDA_RT_CALL(cudaMalloc(&a, nx * ny * sizeof(real))); 385 | CUDA_RT_CALL(cudaMalloc(&a_new, nx * ny * sizeof(real))); 386 | 387 | CUDA_RT_CALL(cudaMemset(a, 0, nx * ny * sizeof(real))); 388 | CUDA_RT_CALL(cudaMemset(a_new, 0, nx * ny * sizeof(real))); 389 | 390 | // Set diriclet boundary conditions on left and right boarder 391 | launch_initialize_boundaries(a, a_new, PI, 0, nx, ny, ny); 392 | CUDA_RT_CALL(cudaDeviceSynchronize()); 393 | 394 | CUDA_RT_CALL(cudaStreamCreate(&compute_stream)); 395 | CUDA_RT_CALL(cudaStreamCreate(&push_top_stream)); 396 | CUDA_RT_CALL(cudaStreamCreate(&push_bottom_stream)); 397 | CUDA_RT_CALL(cudaEventCreateWithFlags(&compute_done, cudaEventDisableTiming)); 398 | CUDA_RT_CALL(cudaEventCreateWithFlags(&push_top_done, cudaEventDisableTiming)); 399 | CUDA_RT_CALL(cudaEventCreateWithFlags(&push_bottom_done, cudaEventDisableTiming)); 400 | 401 | CUDA_RT_CALL(cudaMalloc(&l2_norm_d, sizeof(real))); 402 | CUDA_RT_CALL(cudaMallocHost(&l2_norm_h, sizeof(real))); 403 | 404 | CUDA_RT_CALL(cudaDeviceSynchronize()); 405 | 406 | if (print) 407 | printf( 408 | "Single GPU jacobi relaxation: %d iterations on %d x %d mesh with " 409 | "norm " 410 | "check every %d iterations\n", 411 | iter_max, ny, nx, nccheck); 412 | 413 | int iter = 0; 414 | real l2_norm = 1.0; 415 | bool calculate_norm = true; 416 | 417 | double start = MPI_Wtime(); 418 | PUSH_RANGE("Jacobi solve", 0) 419 | while (l2_norm > tol && iter < iter_max) { 420 | CUDA_RT_CALL(cudaMemsetAsync(l2_norm_d, 0, sizeof(real), compute_stream)); 421 | 422 | CUDA_RT_CALL(cudaStreamWaitEvent(compute_stream, push_top_done, 0)); 423 | CUDA_RT_CALL(cudaStreamWaitEvent(compute_stream, push_bottom_done, 0)); 424 | 425 | calculate_norm = (iter % nccheck) == 0 || (iter % 100) == 0; 426 | launch_jacobi_kernel(a_new, a, l2_norm_d, iy_start, iy_end, nx, calculate_norm, 427 | compute_stream); 428 | CUDA_RT_CALL(cudaEventRecord(compute_done, compute_stream)); 429 | 430 | if (calculate_norm) { 431 | CUDA_RT_CALL(cudaMemcpyAsync(l2_norm_h, l2_norm_d, sizeof(real), cudaMemcpyDeviceToHost, 432 | compute_stream)); 433 | } 434 | 435 | // Apply periodic boundary 
conditions 436 | 437 | CUDA_RT_CALL(cudaStreamWaitEvent(push_top_stream, compute_done, 0)); 438 | CUDA_RT_CALL(cudaMemcpyAsync(a_new, a_new + (iy_end - 1) * nx, nx * sizeof(real), 439 | cudaMemcpyDeviceToDevice, push_top_stream)); 440 | CUDA_RT_CALL(cudaEventRecord(push_top_done, push_top_stream)); 441 | 442 | CUDA_RT_CALL(cudaStreamWaitEvent(push_bottom_stream, compute_done, 0)); 443 | CUDA_RT_CALL(cudaMemcpyAsync(a_new + iy_end * nx, a_new + iy_start * nx, nx * sizeof(real), 444 | cudaMemcpyDeviceToDevice, compute_stream)); 445 | CUDA_RT_CALL(cudaEventRecord(push_bottom_done, push_bottom_stream)); 446 | 447 | if (calculate_norm) { 448 | CUDA_RT_CALL(cudaStreamSynchronize(compute_stream)); 449 | l2_norm = *l2_norm_h; 450 | l2_norm = std::sqrt(l2_norm); 451 | if (print && (iter % 100) == 0) printf("%5d, %0.6f\n", iter, l2_norm); 452 | } 453 | 454 | std::swap(a_new, a); 455 | iter++; 456 | } 457 | POP_RANGE 458 | double stop = MPI_Wtime(); 459 | 460 | CUDA_RT_CALL(cudaMemcpy(a_ref_h, a, nx * ny * sizeof(real), cudaMemcpyDeviceToHost)); 461 | 462 | CUDA_RT_CALL(cudaEventDestroy(push_bottom_done)); 463 | CUDA_RT_CALL(cudaEventDestroy(push_top_done)); 464 | CUDA_RT_CALL(cudaEventDestroy(compute_done)); 465 | CUDA_RT_CALL(cudaStreamDestroy(push_bottom_stream)); 466 | CUDA_RT_CALL(cudaStreamDestroy(push_top_stream)); 467 | CUDA_RT_CALL(cudaStreamDestroy(compute_stream)); 468 | 469 | CUDA_RT_CALL(cudaFreeHost(l2_norm_h)); 470 | CUDA_RT_CALL(cudaFree(l2_norm_d)); 471 | 472 | CUDA_RT_CALL(cudaFree(a_new)); 473 | CUDA_RT_CALL(cudaFree(a)); 474 | return (stop - start); 475 | } 476 | -------------------------------------------------------------------------------- /mpi/jacobi_kernels.cu: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved. 2 | * 3 | * Redistribution and use in source and binary forms, with or without 4 | * modification, are permitted provided that the following conditions 5 | * are met: 6 | * * Redistributions of source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer. 8 | * * Redistributions in binary form must reproduce the above copyright 9 | * notice, this list of conditions and the following disclaimer in the 10 | * documentation and/or other materials provided with the distribution. 11 | * * Neither the name of NVIDIA CORPORATION nor the names of its 12 | * contributors may be used to endorse or promote products derived 13 | * from this software without specific prior written permission. 14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
26 | */ 27 | #include <cstdio> 28 | #include <cstdlib> 29 | 30 | #ifdef HAVE_CUB 31 | #include <cub/block/block_reduce.cuh> 32 | #endif // HAVE_CUB 33 | 34 | #define CUDA_RT_CALL(call) \ 35 | { \ 36 | cudaError_t cudaStatus = call; \ 37 | if (cudaSuccess != cudaStatus) { \ 38 | fprintf(stderr, \ 39 | "ERROR: CUDA RT call \"%s\" in line %d of file %s failed " \ 40 | "with " \ 41 | "%s (%d).\n", \ 42 | #call, __LINE__, __FILE__, cudaGetErrorString(cudaStatus), cudaStatus); \ 43 | exit( cudaStatus ); \ 44 | } \ 45 | } 46 | 47 | #ifdef USE_DOUBLE 48 | typedef double real; 49 | #define MPI_REAL_TYPE MPI_DOUBLE 50 | #else 51 | typedef float real; 52 | #define MPI_REAL_TYPE MPI_FLOAT 53 | #endif 54 | 55 | __global__ void initialize_boundaries(real* __restrict__ const a_new, real* __restrict__ const a, 56 | const real pi, const int offset, const int nx, 57 | const int my_ny, const int ny) { 58 | for (int iy = blockIdx.x * blockDim.x + threadIdx.x; iy < my_ny; iy += blockDim.x * gridDim.x) { 59 | const real y0 = sin(2.0 * pi * (offset + iy) / (ny - 1)); 60 | a[iy * nx + 0] = y0; 61 | a[iy * nx + (nx - 1)] = y0; 62 | a_new[iy * nx + 0] = y0; 63 | a_new[iy * nx + (nx - 1)] = y0; 64 | } 65 | } 66 | 67 | void launch_initialize_boundaries(real* __restrict__ const a_new, real* __restrict__ const a, 68 | const real pi, const int offset, const int nx, const int my_ny, 69 | const int ny) { 70 | initialize_boundaries<<<my_ny / 128 + 1, 128>>>(a_new, a, pi, offset, nx, my_ny, ny); 71 | CUDA_RT_CALL(cudaGetLastError()); 72 | } 73 | 74 | template <int BLOCK_DIM_X, int BLOCK_DIM_Y> 75 | __global__ void jacobi_kernel(real* __restrict__ const a_new, const real* __restrict__ const a, 76 | real* __restrict__ const l2_norm, const int iy_start, 77 | const int iy_end, const int nx, const bool calculate_norm) { 78 | #ifdef HAVE_CUB 79 | typedef cub::BlockReduce<real, BLOCK_DIM_X, cub::BLOCK_REDUCE_WARP_REDUCTIONS, BLOCK_DIM_Y> 80 | BlockReduce; 81 | __shared__ typename BlockReduce::TempStorage temp_storage; 82 | #endif // HAVE_CUB 83 | int iy = blockIdx.y * blockDim.y + threadIdx.y + iy_start; 84 | int ix = blockIdx.x * blockDim.x + threadIdx.x + 1; 85 | real local_l2_norm = 0.0; 86 | 87 | if (iy < iy_end && ix < (nx - 1)) { 88 | const real new_val = 0.25 * (a[iy * nx + ix + 1] + a[iy * nx + ix - 1] + 89 | a[(iy + 1) * nx + ix] + a[(iy - 1) * nx + ix]); 90 | a_new[iy * nx + ix] = new_val; 91 | if (calculate_norm) { 92 | real residue = new_val - a[iy * nx + ix]; 93 | local_l2_norm += residue * residue; 94 | } 95 | } 96 | if (calculate_norm) { 97 | #ifdef HAVE_CUB 98 | real block_l2_norm = BlockReduce(temp_storage).Sum(local_l2_norm); 99 | if (0 == threadIdx.y && 0 == threadIdx.x) atomicAdd(l2_norm, block_l2_norm); 100 | #else 101 | atomicAdd(l2_norm, local_l2_norm); 102 | #endif // HAVE_CUB 103 | } 104 | } 105 | 106 | void launch_jacobi_kernel(real* __restrict__ const a_new, const real* __restrict__ const a, 107 | real* __restrict__ const l2_norm, const int iy_start, const int iy_end, 108 | const int nx, const bool calculate_norm, cudaStream_t stream) { 109 | constexpr int dim_block_x = 32; 110 | constexpr int dim_block_y = 32; 111 | dim3 dim_grid((nx + dim_block_x - 1) / dim_block_x, 112 | ((iy_end - iy_start) + dim_block_y - 1) / dim_block_y, 1); 113 | jacobi_kernel<dim_block_x, dim_block_y><<<dim_grid, {dim_block_x, dim_block_y, 1}, 0, stream>>>( 114 | a_new, a, l2_norm, iy_start, iy_end, nx, calculate_norm); 115 | CUDA_RT_CALL(cudaGetLastError()); 116 | } 117 | -------------------------------------------------------------------------------- /mpi_overlap/Makefile: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
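# Typical usage: `make` builds the `jacobi` binary, `make run NP=<ranks>` launches it through
# $(MPIRUN), and DISABLE_CUB=1 or SKIP_CUDA_AWARENESS_CHECK=1 can be passed on the make
# command line to toggle the corresponding conditionals below.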
2 | NP ?= 1 3 | NVCC=nvcc 4 | MPICXX=mpicxx 5 | MPIRUN ?= mpirun 6 | CUDA_HOME ?= /usr/local/cuda 7 | GENCODE_SM30 := -gencode arch=compute_30,code=sm_30 8 | GENCODE_SM35 := -gencode arch=compute_35,code=sm_35 9 | GENCODE_SM37 := -gencode arch=compute_37,code=sm_37 10 | GENCODE_SM50 := -gencode arch=compute_50,code=sm_50 11 | GENCODE_SM52 := -gencode arch=compute_52,code=sm_52 12 | GENCODE_SM60 := -gencode arch=compute_60,code=sm_60 13 | GENCODE_SM70 := -gencode arch=compute_70,code=sm_70 14 | GENCODE_SM80 := -gencode arch=compute_80,code=sm_80 15 | GENCODE_SM90 := -gencode arch=compute_90,code=sm_90 -gencode arch=compute_90,code=compute_90 16 | GENCODE_FLAGS := $(GENCODE_SM70) $(GENCODE_SM80) $(GENCODE_SM90) 17 | ifdef DISABLE_CUB 18 | NVCC_FLAGS = -Xptxas --optimize-float-atomics 19 | else 20 | NVCC_FLAGS = -DHAVE_CUB 21 | endif 22 | ifdef SKIP_CUDA_AWARENESS_CHECK 23 | MPICXX_FLAGS = -DSKIP_CUDA_AWARENESS_CHECK 24 | endif 25 | NVCC_FLAGS += -lineinfo $(GENCODE_FLAGS) -std=c++14 26 | MPICXX_FLAGS += -DUSE_NVTX -I$(CUDA_HOME)/include -std=c++14 27 | LD_FLAGS = -L$(CUDA_HOME)/lib64 -lcudart -ldl 28 | jacobi: Makefile jacobi.cpp jacobi_kernels.o 29 | $(MPICXX) $(MPICXX_FLAGS) jacobi.cpp jacobi_kernels.o $(LD_FLAGS) -o jacobi 30 | 31 | jacobi_kernels.o: Makefile jacobi_kernels.cu 32 | $(NVCC) $(NVCC_FLAGS) jacobi_kernels.cu -c 33 | 34 | .PHONY.: clean 35 | clean: 36 | rm -f jacobi jacobi_kernels.o *.nsys-rep jacobi.*.compute-sanitizer.log 37 | 38 | sanitize: jacobi 39 | $(MPIRUN) -np $(NP) compute-sanitizer --log-file jacobi.%q{OMPI_COMM_WORLD_RANK}.compute-sanitizer.log ./jacobi -niter 10 40 | 41 | run: jacobi 42 | $(MPIRUN) -np $(NP) ./jacobi 43 | 44 | profile: jacobi 45 | $(MPIRUN) -np $(NP) nsys profile --trace=mpi,cuda,nvtx -o jacobi.%q{OMPI_COMM_WORLD_RANK} ./jacobi -niter 10 46 | -------------------------------------------------------------------------------- /mpi_overlap/jacobi.cpp: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved. 2 | * 3 | * Redistribution and use in source and binary forms, with or without 4 | * modification, are permitted provided that the following conditions 5 | * are met: 6 | * * Redistributions of source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer. 8 | * * Redistributions in binary form must reproduce the above copyright 9 | * notice, this list of conditions and the following disclaimer in the 10 | * documentation and/or other materials provided with the distribution. 11 | * * Neither the name of NVIDIA CORPORATION nor the names of its 12 | * contributors may be used to endorse or promote products derived 13 | * from this software without specific prior written permission. 14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | */ 27 | #include 28 | #include 29 | #include 30 | #include 31 | #include 32 | #include 33 | 34 | #include 35 | 36 | #ifndef SKIP_CUDA_AWARENESS_CHECK 37 | #include 38 | #if !defined(MPIX_CUDA_AWARE_SUPPORT) || !MPIX_CUDA_AWARE_SUPPORT 39 | #error "The used MPI Implementation does not have CUDA-aware support or CUDA-aware \ 40 | support can't be determined. Define SKIP_CUDA_AWARENESS_CHECK to skip this check." 41 | #endif 42 | #endif 43 | 44 | #define MPI_CALL(call) \ 45 | { \ 46 | int mpi_status = call; \ 47 | if (MPI_SUCCESS != mpi_status) { \ 48 | char mpi_error_string[MPI_MAX_ERROR_STRING]; \ 49 | int mpi_error_string_length = 0; \ 50 | MPI_Error_string(mpi_status, mpi_error_string, &mpi_error_string_length); \ 51 | if (NULL != mpi_error_string) \ 52 | fprintf(stderr, \ 53 | "ERROR: MPI call \"%s\" in line %d of file %s failed " \ 54 | "with %s " \ 55 | "(%d).\n", \ 56 | #call, __LINE__, __FILE__, mpi_error_string, mpi_status); \ 57 | else \ 58 | fprintf(stderr, \ 59 | "ERROR: MPI call \"%s\" in line %d of file %s failed " \ 60 | "with %d.\n", \ 61 | #call, __LINE__, __FILE__, mpi_status); \ 62 | exit( mpi_status ); \ 63 | } \ 64 | } 65 | 66 | #include 67 | 68 | #ifdef USE_NVTX 69 | #include 70 | 71 | const uint32_t colors[] = {0x0000ff00, 0x000000ff, 0x00ffff00, 0x00ff00ff, 72 | 0x0000ffff, 0x00ff0000, 0x00ffffff}; 73 | const int num_colors = sizeof(colors) / sizeof(uint32_t); 74 | 75 | #define PUSH_RANGE(name, cid) \ 76 | { \ 77 | int color_id = cid; \ 78 | color_id = color_id % num_colors; \ 79 | nvtxEventAttributes_t eventAttrib = {0}; \ 80 | eventAttrib.version = NVTX_VERSION; \ 81 | eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE; \ 82 | eventAttrib.colorType = NVTX_COLOR_ARGB; \ 83 | eventAttrib.color = colors[color_id]; \ 84 | eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII; \ 85 | eventAttrib.message.ascii = name; \ 86 | nvtxRangePushEx(&eventAttrib); \ 87 | } 88 | #define POP_RANGE nvtxRangePop(); 89 | #else 90 | #define PUSH_RANGE(name, cid) 91 | #define POP_RANGE 92 | #endif 93 | 94 | #define CUDA_RT_CALL(call) \ 95 | { \ 96 | cudaError_t cudaStatus = call; \ 97 | if (cudaSuccess != cudaStatus) { \ 98 | fprintf(stderr, \ 99 | "ERROR: CUDA RT call \"%s\" in line %d of file %s failed " \ 100 | "with " \ 101 | "%s (%d).\n", \ 102 | #call, __LINE__, __FILE__, cudaGetErrorString(cudaStatus), cudaStatus); \ 103 | exit( cudaStatus ); \ 104 | } \ 105 | } 106 | 107 | #ifdef USE_DOUBLE 108 | typedef double real; 109 | #define MPI_REAL_TYPE MPI_DOUBLE 110 | #else 111 | typedef float real; 112 | #define MPI_REAL_TYPE MPI_FLOAT 113 | #endif 114 | 115 | constexpr real tol = 1.0e-8; 116 | 117 | const real PI = 2.0 * std::asin(1.0); 118 | 119 | void launch_initialize_boundaries(real* __restrict__ const a_new, real* __restrict__ const a, 120 | const real pi, const int offset, const int nx, const int my_ny, 121 | const int ny); 122 | 123 | void launch_jacobi_kernel(real* __restrict__ const a_new, const real* __restrict__ const a, 
124 | real* __restrict__ const l2_norm, const int iy_start, const int iy_end, 125 | const int nx, const bool calculate_norm, cudaStream_t stream); 126 | 127 | double single_gpu(const int nx, const int ny, const int iter_max, real* const a_ref_h, 128 | const int nccheck, const bool print); 129 | 130 | template 131 | T get_argval(char** begin, char** end, const std::string& arg, const T default_val) { 132 | T argval = default_val; 133 | char** itr = std::find(begin, end, arg); 134 | if (itr != end && ++itr != end) { 135 | std::istringstream inbuf(*itr); 136 | inbuf >> argval; 137 | } 138 | return argval; 139 | } 140 | 141 | bool get_arg(char** begin, char** end, const std::string& arg) { 142 | char** itr = std::find(begin, end, arg); 143 | if (itr != end) { 144 | return true; 145 | } 146 | return false; 147 | } 148 | 149 | int main(int argc, char* argv[]) { 150 | MPI_CALL(MPI_Init(&argc, &argv)); 151 | #if !defined(SKIP_CUDA_AWARENESS_CHECK) && defined(MPIX_CUDA_AWARE_SUPPORT) 152 | if (1 != MPIX_Query_cuda_support()) { 153 | fprintf(stderr, "The used MPI Implementation does not have CUDA-aware support enabled!\n"); 154 | MPI_CALL(MPI_Finalize()); 155 | return -1; 156 | } 157 | #endif 158 | int rank; 159 | MPI_CALL(MPI_Comm_rank(MPI_COMM_WORLD, &rank)); 160 | int size; 161 | MPI_CALL(MPI_Comm_size(MPI_COMM_WORLD, &size)); 162 | int num_devices = 0; 163 | CUDA_RT_CALL(cudaGetDeviceCount(&num_devices)); 164 | 165 | const int iter_max = get_argval(argv, argv + argc, "-niter", 1000); 166 | const int nccheck = get_argval(argv, argv + argc, "-nccheck", 1); 167 | const int nx = get_argval(argv, argv + argc, "-nx", 16384); 168 | const int ny = get_argval(argv, argv + argc, "-ny", 16384); 169 | const bool csv = get_arg(argv, argv + argc, "-csv"); 170 | const bool use_hp_streams = get_arg(argv, argv + argc, "-use_hp_streams"); 171 | 172 | if ( nccheck > 1 && !use_hp_streams && 0 == rank ) 173 | { 174 | fprintf(stderr, 175 | "WARN: When not calculating the norm in every iteration kernels might be executed in " 176 | "an order that breaks communication computation overlap. Also enable -use_hp_streams " 177 | "to avoid this issue.\n"); 178 | } 179 | 180 | int local_rank = -1; 181 | { 182 | MPI_Comm local_comm; 183 | MPI_CALL(MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, rank, MPI_INFO_NULL, 184 | &local_comm)); 185 | 186 | MPI_CALL(MPI_Comm_rank(local_comm, &local_rank)); 187 | 188 | MPI_CALL(MPI_Comm_free(&local_comm)); 189 | } 190 | 191 | CUDA_RT_CALL(cudaSetDevice(local_rank%num_devices)); 192 | CUDA_RT_CALL(cudaFree(0)); 193 | 194 | real* a_ref_h; 195 | CUDA_RT_CALL(cudaMallocHost(&a_ref_h, nx * ny * sizeof(real))); 196 | real* a_h; 197 | CUDA_RT_CALL(cudaMallocHost(&a_h, nx * ny * sizeof(real))); 198 | double runtime_serial = single_gpu(nx, ny, iter_max, a_ref_h, nccheck, !csv && (0 == rank)); 199 | 200 | // ny - 2 rows are distributed amongst `size` ranks in such a way 201 | // that each rank gets either (ny - 2) / size or (ny - 2) / size + 1 rows. 
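    // (Worked example added for illustration; it is not in the original source. With the
    //  default ny = 16384 and size = 4 ranks there are ny - 2 = 16382 interior rows, so
    //  chunk_size_low = 4095 and num_ranks_low = 4 * 4095 + 4 - 16382 = 2: ranks 0 and 1
    //  compute 4095 rows, ranks 2 and 3 compute 4096 rows, and 2 * 4095 + 2 * 4096 = 16382.)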
202 | // This optimizes load balancing when (ny - 2) % size != 0 203 | int chunk_size; 204 | int chunk_size_low = (ny - 2) / size; 205 | int chunk_size_high = chunk_size_low + 1; 206 | // To calculate the number of ranks that need to compute an extra row, 207 | // the following formula is derived from this equation: 208 | // num_ranks_low * chunk_size_low + (size - num_ranks_low) * (chunk_size_low + 1) = ny - 2 209 | int num_ranks_low = size * chunk_size_low + size - 210 | (ny - 2); // Number of ranks with chunk_size = chunk_size_low 211 | if (rank < num_ranks_low) 212 | chunk_size = chunk_size_low; 213 | else 214 | chunk_size = chunk_size_high; 215 | 216 | real* a; 217 | CUDA_RT_CALL(cudaMalloc(&a, nx * (chunk_size + 2) * sizeof(real))); 218 | real* a_new; 219 | CUDA_RT_CALL(cudaMalloc(&a_new, nx * (chunk_size + 2) * sizeof(real))); 220 | 221 | CUDA_RT_CALL(cudaMemset(a, 0, nx * (chunk_size + 2) * sizeof(real))); 222 | CUDA_RT_CALL(cudaMemset(a_new, 0, nx * (chunk_size + 2) * sizeof(real))); 223 | 224 | // Calculate local domain boundaries 225 | int iy_start_global; // My start index in the global array 226 | if (rank < num_ranks_low) { 227 | iy_start_global = rank * chunk_size_low + 1; 228 | } else { 229 | iy_start_global = 230 | num_ranks_low * chunk_size_low + (rank - num_ranks_low) * chunk_size_high + 1; 231 | } 232 | int iy_end_global = iy_start_global + chunk_size - 1; // My last index in the global array 233 | 234 | int iy_start = 1; 235 | int iy_end = iy_start + chunk_size; 236 | 237 | // Set diriclet boundary conditions on left and right boarder 238 | launch_initialize_boundaries(a, a_new, PI, iy_start_global - 1, nx, (chunk_size + 2), ny); 239 | CUDA_RT_CALL(cudaDeviceSynchronize()); 240 | 241 | int leastPriority = 0; 242 | int greatestPriority = leastPriority; 243 | CUDA_RT_CALL(cudaDeviceGetStreamPriorityRange(&leastPriority, &greatestPriority)); 244 | cudaStream_t compute_stream; 245 | cudaStream_t push_top_stream; 246 | cudaStream_t push_bottom_stream; 247 | if (use_hp_streams) { 248 | CUDA_RT_CALL(cudaStreamCreateWithPriority(&compute_stream, cudaStreamDefault, leastPriority)); 249 | CUDA_RT_CALL( 250 | cudaStreamCreateWithPriority(&push_top_stream, cudaStreamDefault, greatestPriority)); 251 | CUDA_RT_CALL( 252 | cudaStreamCreateWithPriority(&push_bottom_stream, cudaStreamDefault, greatestPriority)); 253 | } else { 254 | CUDA_RT_CALL(cudaStreamCreate(&compute_stream)); 255 | CUDA_RT_CALL(cudaStreamCreate(&push_top_stream)); 256 | CUDA_RT_CALL(cudaStreamCreate(&push_bottom_stream)); 257 | } 258 | 259 | cudaEvent_t push_top_done; 260 | CUDA_RT_CALL(cudaEventCreateWithFlags(&push_top_done, cudaEventDisableTiming)); 261 | cudaEvent_t push_bottom_done; 262 | CUDA_RT_CALL(cudaEventCreateWithFlags(&push_bottom_done, cudaEventDisableTiming)); 263 | cudaEvent_t reset_l2norm_done; 264 | CUDA_RT_CALL(cudaEventCreateWithFlags(&reset_l2norm_done, cudaEventDisableTiming)); 265 | 266 | real* l2_norm_d; 267 | CUDA_RT_CALL(cudaMalloc(&l2_norm_d, sizeof(real))); 268 | real* l2_norm_h; 269 | CUDA_RT_CALL(cudaMallocHost(&l2_norm_h, sizeof(real))); 270 | 271 | PUSH_RANGE("MPI_Warmup", 5) 272 | for (int i = 0; i < 10; ++i) { 273 | const int top = rank > 0 ? 
rank - 1 : (size - 1); 274 | const int bottom = (rank + 1) % size; 275 | MPI_CALL(MPI_Sendrecv(a_new + iy_start * nx, nx, MPI_REAL_TYPE, top, 0, 276 | a_new + (iy_end * nx), nx, MPI_REAL_TYPE, bottom, 0, MPI_COMM_WORLD, 277 | MPI_STATUS_IGNORE)); 278 | MPI_CALL(MPI_Sendrecv(a_new + (iy_end - 1) * nx, nx, MPI_REAL_TYPE, bottom, 0, a_new, nx, 279 | MPI_REAL_TYPE, top, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE)); 280 | std::swap(a_new, a); 281 | } 282 | POP_RANGE 283 | 284 | CUDA_RT_CALL(cudaDeviceSynchronize()); 285 | 286 | if (!csv && 0 == rank) { 287 | printf( 288 | "Jacobi relaxation: %d iterations on %d x %d mesh with norm check " 289 | "every %d iterations\n", 290 | iter_max, ny, nx, nccheck); 291 | } 292 | 293 | int iter = 0; 294 | bool calculate_norm = true; 295 | real l2_norm = 1.0; 296 | 297 | MPI_CALL(MPI_Barrier(MPI_COMM_WORLD)); 298 | double start = MPI_Wtime(); 299 | PUSH_RANGE("Jacobi solve", 0) 300 | while (l2_norm > tol && iter < iter_max) { 301 | CUDA_RT_CALL(cudaMemsetAsync(l2_norm_d, 0, sizeof(real), compute_stream)); 302 | CUDA_RT_CALL(cudaEventRecord(reset_l2norm_done, compute_stream)); 303 | 304 | if (use_hp_streams) { 305 | launch_jacobi_kernel(a_new, a, l2_norm_d, (iy_start + 1), (iy_end - 1), nx, 306 | calculate_norm, compute_stream); 307 | } 308 | 309 | CUDA_RT_CALL(cudaStreamWaitEvent(push_top_stream, reset_l2norm_done, 0)); 310 | calculate_norm = (iter % nccheck) == 0 || (!csv && (iter % 100) == 0); 311 | launch_jacobi_kernel(a_new, a, l2_norm_d, iy_start, (iy_start + 1), nx, calculate_norm, 312 | push_top_stream); 313 | CUDA_RT_CALL(cudaEventRecord(push_top_done, push_top_stream)); 314 | 315 | CUDA_RT_CALL(cudaStreamWaitEvent(push_bottom_stream, reset_l2norm_done, 0)); 316 | launch_jacobi_kernel(a_new, a, l2_norm_d, (iy_end - 1), iy_end, nx, calculate_norm, 317 | push_bottom_stream); 318 | CUDA_RT_CALL(cudaEventRecord(push_bottom_done, push_bottom_stream)); 319 | 320 | if (!use_hp_streams) { 321 | launch_jacobi_kernel(a_new, a, l2_norm_d, (iy_start + 1), (iy_end - 1), nx, 322 | calculate_norm, compute_stream); 323 | } 324 | 325 | if (calculate_norm) { 326 | CUDA_RT_CALL(cudaStreamWaitEvent(compute_stream, push_top_done, 0)); 327 | CUDA_RT_CALL(cudaStreamWaitEvent(compute_stream, push_bottom_done, 0)); 328 | CUDA_RT_CALL(cudaMemcpyAsync(l2_norm_h, l2_norm_d, sizeof(real), cudaMemcpyDeviceToHost, 329 | compute_stream)); 330 | } 331 | 332 | const int top = rank > 0 ? 
rank - 1 : (size - 1); 333 | const int bottom = (rank + 1) % size; 334 | 335 | // Apply periodic boundary conditions 336 | CUDA_RT_CALL(cudaStreamSynchronize(push_top_stream)); 337 | PUSH_RANGE("MPI", 5) 338 | MPI_CALL(MPI_Sendrecv(a_new + iy_start * nx, nx, MPI_REAL_TYPE, top, 0, 339 | a_new + (iy_end * nx), nx, MPI_REAL_TYPE, bottom, 0, MPI_COMM_WORLD, 340 | MPI_STATUS_IGNORE)); 341 | CUDA_RT_CALL(cudaStreamSynchronize(push_bottom_stream)); 342 | MPI_CALL(MPI_Sendrecv(a_new + (iy_end - 1) * nx, nx, MPI_REAL_TYPE, bottom, 0, a_new, nx, 343 | MPI_REAL_TYPE, top, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE)); 344 | POP_RANGE 345 | 346 | if (calculate_norm) { 347 | CUDA_RT_CALL(cudaStreamSynchronize(compute_stream)); 348 | MPI_CALL(MPI_Allreduce(l2_norm_h, &l2_norm, 1, MPI_REAL_TYPE, MPI_SUM, MPI_COMM_WORLD)); 349 | l2_norm = std::sqrt(l2_norm); 350 | 351 | if (!csv && 0 == rank && (iter % 100) == 0) { 352 | printf("%5d, %0.6f\n", iter, l2_norm); 353 | } 354 | } 355 | 356 | std::swap(a_new, a); 357 | iter++; 358 | } 359 | double stop = MPI_Wtime(); 360 | POP_RANGE 361 | 362 | CUDA_RT_CALL(cudaMemcpy(a_h + iy_start_global * nx, a + nx, 363 | std::min((ny - iy_start_global) * nx, chunk_size * nx) * sizeof(real), 364 | cudaMemcpyDeviceToHost)); 365 | 366 | int result_correct = 1; 367 | for (int iy = iy_start_global; result_correct && (iy < iy_end_global); ++iy) { 368 | for (int ix = 1; result_correct && (ix < (nx - 1)); ++ix) { 369 | if (std::fabs(a_ref_h[iy * nx + ix] - a_h[iy * nx + ix]) > tol) { 370 | fprintf(stderr, 371 | "ERROR on rank %d: a[%d * %d + %d] = %f does not match %f " 372 | "(reference)\n", 373 | rank, iy, nx, ix, a_h[iy * nx + ix], a_ref_h[iy * nx + ix]); 374 | result_correct = 0; 375 | } 376 | } 377 | } 378 | 379 | int global_result_correct = 1; 380 | MPI_CALL(MPI_Allreduce(&result_correct, &global_result_correct, 1, MPI_INT, MPI_MIN, 381 | MPI_COMM_WORLD)); 382 | result_correct = global_result_correct; 383 | 384 | if (rank == 0 && result_correct) { 385 | if (csv) { 386 | printf("mpi_overlap, %d, %d, %d, %d, %d, 1, %f, %f\n", nx, ny, iter_max, nccheck, size, 387 | (stop - start), runtime_serial); 388 | } else { 389 | printf("Num GPUs: %d.\n", size); 390 | printf( 391 | "%dx%d: 1 GPU: %8.4f s, %d GPUs: %8.4f s, speedup: %8.2f, " 392 | "efficiency: %8.2f \n", 393 | ny, nx, runtime_serial, size, (stop - start), runtime_serial / (stop - start), 394 | runtime_serial / (size * (stop - start)) * 100); 395 | } 396 | } 397 | CUDA_RT_CALL(cudaEventDestroy(reset_l2norm_done)); 398 | CUDA_RT_CALL(cudaEventDestroy(push_bottom_done)); 399 | CUDA_RT_CALL(cudaEventDestroy(push_top_done)); 400 | CUDA_RT_CALL(cudaStreamDestroy(push_bottom_stream)); 401 | CUDA_RT_CALL(cudaStreamDestroy(push_top_stream)); 402 | CUDA_RT_CALL(cudaStreamDestroy(compute_stream)); 403 | 404 | CUDA_RT_CALL(cudaFreeHost(l2_norm_h)); 405 | CUDA_RT_CALL(cudaFree(l2_norm_d)); 406 | 407 | CUDA_RT_CALL(cudaFree(a_new)); 408 | CUDA_RT_CALL(cudaFree(a)); 409 | 410 | CUDA_RT_CALL(cudaFreeHost(a_h)); 411 | CUDA_RT_CALL(cudaFreeHost(a_ref_h)); 412 | 413 | MPI_CALL(MPI_Finalize()); 414 | return (result_correct == 1) ? 
0 : 1; 415 | } 416 | 417 | double single_gpu(const int nx, const int ny, const int iter_max, real* const a_ref_h, 418 | const int nccheck, const bool print) { 419 | real* a; 420 | real* a_new; 421 | 422 | cudaStream_t compute_stream; 423 | cudaStream_t push_top_stream; 424 | cudaStream_t push_bottom_stream; 425 | cudaEvent_t compute_done; 426 | cudaEvent_t push_top_done; 427 | cudaEvent_t push_bottom_done; 428 | 429 | real* l2_norm_d; 430 | real* l2_norm_h; 431 | 432 | int iy_start = 1; 433 | int iy_end = (ny - 1); 434 | 435 | CUDA_RT_CALL(cudaMalloc(&a, nx * ny * sizeof(real))); 436 | CUDA_RT_CALL(cudaMalloc(&a_new, nx * ny * sizeof(real))); 437 | 438 | CUDA_RT_CALL(cudaMemset(a, 0, nx * ny * sizeof(real))); 439 | CUDA_RT_CALL(cudaMemset(a_new, 0, nx * ny * sizeof(real))); 440 | 441 | // Set diriclet boundary conditions on left and right boarder 442 | launch_initialize_boundaries(a, a_new, PI, 0, nx, ny, ny); 443 | CUDA_RT_CALL(cudaDeviceSynchronize()); 444 | 445 | CUDA_RT_CALL(cudaStreamCreate(&compute_stream)); 446 | CUDA_RT_CALL(cudaStreamCreate(&push_top_stream)); 447 | CUDA_RT_CALL(cudaStreamCreate(&push_bottom_stream)); 448 | CUDA_RT_CALL(cudaEventCreateWithFlags(&compute_done, cudaEventDisableTiming)); 449 | CUDA_RT_CALL(cudaEventCreateWithFlags(&push_top_done, cudaEventDisableTiming)); 450 | CUDA_RT_CALL(cudaEventCreateWithFlags(&push_bottom_done, cudaEventDisableTiming)); 451 | 452 | CUDA_RT_CALL(cudaMalloc(&l2_norm_d, sizeof(real))); 453 | CUDA_RT_CALL(cudaMallocHost(&l2_norm_h, sizeof(real))); 454 | 455 | CUDA_RT_CALL(cudaDeviceSynchronize()); 456 | 457 | if (print) 458 | printf( 459 | "Single GPU jacobi relaxation: %d iterations on %d x %d mesh with " 460 | "norm " 461 | "check every %d iterations\n", 462 | iter_max, ny, nx, nccheck); 463 | 464 | int iter = 0; 465 | bool calculate_norm = true; 466 | real l2_norm = 1.0; 467 | 468 | double start = MPI_Wtime(); 469 | PUSH_RANGE("Jacobi solve", 0) 470 | while (l2_norm > tol && iter < iter_max) { 471 | CUDA_RT_CALL(cudaMemsetAsync(l2_norm_d, 0, sizeof(real), compute_stream)); 472 | 473 | CUDA_RT_CALL(cudaStreamWaitEvent(compute_stream, push_top_done, 0)); 474 | CUDA_RT_CALL(cudaStreamWaitEvent(compute_stream, push_bottom_done, 0)); 475 | 476 | calculate_norm = (iter % nccheck) == 0 || (iter % 100) == 0; 477 | launch_jacobi_kernel(a_new, a, l2_norm_d, iy_start, iy_end, nx, calculate_norm, 478 | compute_stream); 479 | CUDA_RT_CALL(cudaEventRecord(compute_done, compute_stream)); 480 | 481 | if (calculate_norm) { 482 | CUDA_RT_CALL(cudaMemcpyAsync(l2_norm_h, l2_norm_d, sizeof(real), cudaMemcpyDeviceToHost, 483 | compute_stream)); 484 | } 485 | 486 | // Apply periodic boundary conditions 487 | 488 | CUDA_RT_CALL(cudaStreamWaitEvent(push_top_stream, compute_done, 0)); 489 | CUDA_RT_CALL(cudaMemcpyAsync(a_new, a_new + (iy_end - 1) * nx, nx * sizeof(real), 490 | cudaMemcpyDeviceToDevice, push_top_stream)); 491 | CUDA_RT_CALL(cudaEventRecord(push_top_done, push_top_stream)); 492 | 493 | CUDA_RT_CALL(cudaStreamWaitEvent(push_bottom_stream, compute_done, 0)); 494 | CUDA_RT_CALL(cudaMemcpyAsync(a_new + iy_end * nx, a_new + iy_start * nx, nx * sizeof(real), 495 | cudaMemcpyDeviceToDevice, compute_stream)); 496 | CUDA_RT_CALL(cudaEventRecord(push_bottom_done, push_bottom_stream)); 497 | 498 | if (calculate_norm) { 499 | CUDA_RT_CALL(cudaStreamSynchronize(compute_stream)); 500 | l2_norm = *l2_norm_h; 501 | l2_norm = std::sqrt(l2_norm); 502 | if (print && (iter % 100) == 0) printf("%5d, %0.6f\n", iter, l2_norm); 503 | } 504 | 505 | 
std::swap(a_new, a); 506 | iter++; 507 | } 508 | POP_RANGE 509 | double stop = MPI_Wtime(); 510 | 511 | CUDA_RT_CALL(cudaMemcpy(a_ref_h, a, nx * ny * sizeof(real), cudaMemcpyDeviceToHost)); 512 | 513 | CUDA_RT_CALL(cudaEventDestroy(push_bottom_done)); 514 | CUDA_RT_CALL(cudaEventDestroy(push_top_done)); 515 | CUDA_RT_CALL(cudaEventDestroy(compute_done)); 516 | CUDA_RT_CALL(cudaStreamDestroy(push_bottom_stream)); 517 | CUDA_RT_CALL(cudaStreamDestroy(push_top_stream)); 518 | CUDA_RT_CALL(cudaStreamDestroy(compute_stream)); 519 | 520 | CUDA_RT_CALL(cudaFreeHost(l2_norm_h)); 521 | CUDA_RT_CALL(cudaFree(l2_norm_d)); 522 | 523 | CUDA_RT_CALL(cudaFree(a_new)); 524 | CUDA_RT_CALL(cudaFree(a)); 525 | return (stop - start); 526 | } 527 | -------------------------------------------------------------------------------- /mpi_overlap/jacobi_kernels.cu: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved. 2 | * 3 | * Redistribution and use in source and binary forms, with or without 4 | * modification, are permitted provided that the following conditions 5 | * are met: 6 | * * Redistributions of source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer. 8 | * * Redistributions in binary form must reproduce the above copyright 9 | * notice, this list of conditions and the following disclaimer in the 10 | * documentation and/or other materials provided with the distribution. 11 | * * Neither the name of NVIDIA CORPORATION nor the names of its 12 | * contributors may be used to endorse or promote products derived 13 | * from this software without specific prior written permission. 14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
26 | */ 27 | #include 28 | #include 29 | 30 | #ifdef HAVE_CUB 31 | #include 32 | #endif // HAVE_CUB 33 | 34 | #define CUDA_RT_CALL(call) \ 35 | { \ 36 | cudaError_t cudaStatus = call; \ 37 | if (cudaSuccess != cudaStatus) { \ 38 | fprintf(stderr, \ 39 | "ERROR: CUDA RT call \"%s\" in line %d of file %s failed " \ 40 | "with " \ 41 | "%s (%d).\n", \ 42 | #call, __LINE__, __FILE__, cudaGetErrorString(cudaStatus), cudaStatus); \ 43 | exit( cudaStatus ); \ 44 | } \ 45 | } 46 | 47 | #ifdef USE_DOUBLE 48 | typedef double real; 49 | #define MPI_REAL_TYPE MPI_DOUBLE 50 | #else 51 | typedef float real; 52 | #define MPI_REAL_TYPE MPI_FLOAT 53 | #endif 54 | 55 | __global__ void initialize_boundaries(real* __restrict__ const a_new, real* __restrict__ const a, 56 | const real pi, const int offset, const int nx, 57 | const int my_ny, const int ny) { 58 | for (int iy = blockIdx.x * blockDim.x + threadIdx.x; iy < my_ny; iy += blockDim.x * gridDim.x) { 59 | const real y0 = sin(2.0 * pi * (offset + iy) / (ny - 1)); 60 | a[iy * nx + 0] = y0; 61 | a[iy * nx + (nx - 1)] = y0; 62 | a_new[iy * nx + 0] = y0; 63 | a_new[iy * nx + (nx - 1)] = y0; 64 | } 65 | } 66 | 67 | void launch_initialize_boundaries(real* __restrict__ const a_new, real* __restrict__ const a, 68 | const real pi, const int offset, const int nx, const int my_ny, 69 | const int ny) { 70 | initialize_boundaries<<>>(a_new, a, pi, offset, nx, my_ny, ny); 71 | CUDA_RT_CALL(cudaGetLastError()); 72 | } 73 | 74 | template 75 | __global__ void jacobi_kernel(real* __restrict__ const a_new, const real* __restrict__ const a, 76 | real* __restrict__ const l2_norm, const int iy_start, 77 | const int iy_end, const int nx, const bool calculate_norm) { 78 | #ifdef HAVE_CUB 79 | typedef cub::BlockReduce 80 | BlockReduce; 81 | __shared__ typename BlockReduce::TempStorage temp_storage; 82 | #endif // HAVE_CUB 83 | int iy = blockIdx.y * blockDim.y + threadIdx.y + iy_start; 84 | int ix = blockIdx.x * blockDim.x + threadIdx.x + 1; 85 | real local_l2_norm = 0.0; 86 | 87 | if (iy < iy_end && ix < (nx - 1)) { 88 | const real new_val = 0.25 * (a[iy * nx + ix + 1] + a[iy * nx + ix - 1] + 89 | a[(iy + 1) * nx + ix] + a[(iy - 1) * nx + ix]); 90 | a_new[iy * nx + ix] = new_val; 91 | if (calculate_norm) { 92 | real residue = new_val - a[iy * nx + ix]; 93 | local_l2_norm += residue * residue; 94 | } 95 | } 96 | if (calculate_norm) { 97 | #ifdef HAVE_CUB 98 | real block_l2_norm = BlockReduce(temp_storage).Sum(local_l2_norm); 99 | if (0 == threadIdx.y && 0 == threadIdx.x) atomicAdd(l2_norm, block_l2_norm); 100 | #else 101 | atomicAdd(l2_norm, local_l2_norm); 102 | #endif // HAVE_CUB 103 | } 104 | } 105 | 106 | void launch_jacobi_kernel(real* __restrict__ const a_new, const real* __restrict__ const a, 107 | real* __restrict__ const l2_norm, const int iy_start, const int iy_end, 108 | const int nx, const bool calculate_norm, cudaStream_t stream) { 109 | constexpr int dim_block_x = 32; 110 | constexpr int dim_block_y = 32; 111 | dim3 dim_grid((nx + dim_block_x - 1) / dim_block_x, 112 | ((iy_end - iy_start) + dim_block_y - 1) / dim_block_y, 1); 113 | jacobi_kernel<<>>( 114 | a_new, a, l2_norm, iy_start, iy_end, nx, calculate_norm); 115 | CUDA_RT_CALL(cudaGetLastError()); 116 | } 117 | -------------------------------------------------------------------------------- /multi_node_p2p/Makefile: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved. 
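# Illustrative launch sketch, added as commentary and not part of the original Makefile.
# The option names below mirror those parsed by the jacobi sources shown in this
# repository; the rank count and values are arbitrary examples:
#   mpirun -np 8 ./jacobi -nx 16384 -ny 16384 -niter 1000 -nccheck 100 -csv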
2 | NP ?= 1 3 | NVCC=nvcc 4 | MPICXX=mpicxx 5 | MPIRUN ?= mpirun 6 | CUDA_HOME ?= /usr/local/cuda 7 | GENCODE_SM30 := -gencode arch=compute_30,code=sm_30 8 | GENCODE_SM35 := -gencode arch=compute_35,code=sm_35 9 | GENCODE_SM37 := -gencode arch=compute_37,code=sm_37 10 | GENCODE_SM50 := -gencode arch=compute_50,code=sm_50 11 | GENCODE_SM52 := -gencode arch=compute_52,code=sm_52 12 | GENCODE_SM60 := -gencode arch=compute_60,code=sm_60 13 | GENCODE_SM70 := -gencode arch=compute_70,code=sm_70 14 | GENCODE_SM80 := -gencode arch=compute_80,code=sm_80 15 | GENCODE_SM90 := -gencode arch=compute_90,code=sm_90 -gencode arch=compute_90,code=compute_90 16 | GENCODE_FLAGS := $(GENCODE_SM70) $(GENCODE_SM80) $(GENCODE_SM90) 17 | ifdef DISABLE_CUB 18 | NVCC_FLAGS = -Xptxas --optimize-float-atomics 19 | else 20 | NVCC_FLAGS = -DHAVE_CUB 21 | endif 22 | ifdef SKIP_CUDA_AWARENESS_CHECK 23 | MPICXX_FLAGS = -DSKIP_CUDA_AWARENESS_CHECK 24 | endif 25 | NVCC_FLAGS += -lineinfo $(GENCODE_FLAGS) -std=c++14 26 | MPICXX_FLAGS += -DUSE_NVTX -I$(CUDA_HOME)/include -std=c++14 27 | LD_FLAGS = -L$(CUDA_HOME)/lib64 -lcudart -lcuda -ldl 28 | jacobi: Makefile jacobi.cpp jacobi_kernels.o 29 | $(MPICXX) $(MPICXX_FLAGS) jacobi.cpp jacobi_kernels.o $(LD_FLAGS) -o jacobi 30 | 31 | jacobi_kernels.o: Makefile jacobi_kernels.cu 32 | $(NVCC) $(NVCC_FLAGS) jacobi_kernels.cu -c 33 | 34 | .PHONY.: clean 35 | clean: 36 | rm -f jacobi jacobi_kernels.o *.nsys-rep jacobi.*.compute-sanitizer.log 37 | 38 | sanitize: jacobi 39 | $(MPIRUN) -np $(NP) compute-sanitizer --log-file jacobi.%q{OMPI_COMM_WORLD_RANK}.compute-sanitizer.log ./jacobi -niter 10 40 | 41 | run: jacobi 42 | $(MPIRUN) -np $(NP) ./jacobi 43 | 44 | profile: jacobi 45 | $(MPIRUN) -np $(NP) nsys profile --trace=mpi,cuda,nvtx -o jacobi.%q{OMPI_COMM_WORLD_RANK} ./jacobi -niter 10 46 | -------------------------------------------------------------------------------- /multi_node_p2p/jacobi_kernels.cu: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2017-2018, 2024, NVIDIA CORPORATION. All rights reserved. 2 | * 3 | * Redistribution and use in source and binary forms, with or without 4 | * modification, are permitted provided that the following conditions 5 | * are met: 6 | * * Redistributions of source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer. 8 | * * Redistributions in binary form must reproduce the above copyright 9 | * notice, this list of conditions and the following disclaimer in the 10 | * documentation and/or other materials provided with the distribution. 11 | * * Neither the name of NVIDIA CORPORATION nor the names of its 12 | * contributors may be used to endorse or promote products derived 13 | * from this software without specific prior written permission. 14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | */ 27 | #include 28 | #include 29 | #include 30 | #include 31 | #include 32 | 33 | #ifdef HAVE_CUB 34 | #include 35 | #endif // HAVE_CUB 36 | 37 | #define CUDA_RT_CALL(call) \ 38 | { \ 39 | cudaError_t cudaStatus = call; \ 40 | if (cudaSuccess != cudaStatus) { \ 41 | fprintf(stderr, \ 42 | "ERROR: CUDA RT call \"%s\" in line %d of file %s failed " \ 43 | "with " \ 44 | "%s (%d).\n", \ 45 | #call, __LINE__, __FILE__, cudaGetErrorString(cudaStatus), cudaStatus); \ 46 | exit(cudaStatus); \ 47 | } \ 48 | } 49 | 50 | #ifdef USE_DOUBLE 51 | typedef double real; 52 | #define MPI_REAL_TYPE MPI_DOUBLE 53 | #else 54 | typedef float real; 55 | #define MPI_REAL_TYPE MPI_FLOAT 56 | #endif 57 | 58 | struct real_int_pair { 59 | real value; 60 | unsigned int arrival_counter; 61 | }; 62 | 63 | __global__ void initialize_boundaries(real* __restrict__ const a_new, real* __restrict__ const a, 64 | const real pi, const int offset, const int nx, 65 | const int my_ny, const int ny) { 66 | for (int iy = blockIdx.x * blockDim.x + threadIdx.x; iy < my_ny; iy += blockDim.x * gridDim.x) { 67 | const real y0 = sin(2.0 * pi * (offset + iy) / (ny - 1)); 68 | a[iy * nx + 0] = y0; 69 | a[iy * nx + (nx - 1)] = y0; 70 | a_new[iy * nx + 0] = y0; 71 | a_new[iy * nx + (nx - 1)] = y0; 72 | } 73 | } 74 | 75 | void launch_initialize_boundaries(real* __restrict__ const a_new, real* __restrict__ const a, 76 | const real pi, const int offset, const int nx, const int my_ny, 77 | const int ny) { 78 | initialize_boundaries<<>>(a_new, a, pi, offset, nx, my_ny, ny); 79 | CUDA_RT_CALL(cudaGetLastError()); 80 | } 81 | 82 | template 83 | __global__ void jacobi_kernel(real* __restrict__ const a_new, const real* __restrict__ const a, 84 | real* __restrict__ const l2_norm, const int iy_start, 85 | const int iy_end, const int nx, const bool calculate_norm) { 86 | #ifdef HAVE_CUB 87 | typedef cub::BlockReduce 88 | BlockReduce; 89 | __shared__ typename BlockReduce::TempStorage temp_storage; 90 | #endif // HAVE_CUB 91 | int iy = blockIdx.y * blockDim.y + threadIdx.y + iy_start; 92 | int ix = blockIdx.x * blockDim.x + threadIdx.x + 1; 93 | real local_l2_norm = 0.0; 94 | 95 | if (iy < iy_end && ix < (nx - 1)) { 96 | const real new_val = 0.25 * (a[iy * nx + ix + 1] + a[iy * nx + ix - 1] + 97 | a[(iy + 1) * nx + ix] + a[(iy - 1) * nx + ix]); 98 | a_new[iy * nx + ix] = new_val; 99 | if (calculate_norm) { 100 | real residue = new_val - a[iy * nx + ix]; 101 | local_l2_norm += residue * residue; 102 | } 103 | } 104 | if (calculate_norm) { 105 | #ifdef HAVE_CUB 106 | real block_l2_norm = BlockReduce(temp_storage).Sum(local_l2_norm); 107 | if (0 == threadIdx.y && 0 == threadIdx.x) atomicAdd(l2_norm, block_l2_norm); 108 | #else 109 | atomicAdd(l2_norm, local_l2_norm); 110 | #endif // HAVE_CUB 111 | } 112 | } 113 | 114 | void launch_jacobi_kernel(real* __restrict__ const a_new, const real* __restrict__ const a, 115 | real* __restrict__ const l2_norm, const int iy_start, const int iy_end, 116 | const 
int nx, const bool calculate_norm, cudaStream_t stream) { 117 | constexpr int dim_block_x = 32; 118 | constexpr int dim_block_y = 32; 119 | dim3 dim_grid((nx + dim_block_x - 1) / dim_block_x, 120 | ((iy_end - iy_start) + dim_block_y - 1) / dim_block_y, 1); 121 | jacobi_kernel<<>>( 122 | a_new, a, l2_norm, iy_start, iy_end, nx, calculate_norm); 123 | CUDA_RT_CALL(cudaGetLastError()); 124 | } 125 | 126 | template 127 | __global__ void jacobi_p2p_kernel(real* __restrict__ const a_new, const real* __restrict__ const a, 128 | real* __restrict__ const l2_norm, const int iy_start, 129 | const int iy_end, const int nx, 130 | real* __restrict__ const a_new_top, const int top_iy, 131 | real* __restrict__ const a_new_bottom, const int bottom_iy, 132 | const bool calculate_norm) { 133 | #ifdef HAVE_CUB 134 | typedef cub::BlockReduce 135 | BlockReduce; 136 | __shared__ typename BlockReduce::TempStorage temp_storage; 137 | #endif // HAVE_CUB 138 | int iy = blockIdx.y * blockDim.y + threadIdx.y + iy_start; 139 | int ix = blockIdx.x * blockDim.x + threadIdx.x + 1; 140 | real local_l2_norm = 0.0; 141 | 142 | if (iy < iy_end && ix < (nx - 1)) { 143 | const real new_val = 0.25 * (a[iy * nx + ix + 1] + a[iy * nx + ix - 1] + 144 | a[(iy + 1) * nx + ix] + a[(iy - 1) * nx + ix]); 145 | a_new[iy * nx + ix] = new_val; 146 | 147 | if (iy_start == iy) { 148 | a_new_top[top_iy * nx + ix] = new_val; 149 | } 150 | 151 | if ((iy_end - 1) == iy) { 152 | a_new_bottom[bottom_iy * nx + ix] = new_val; 153 | } 154 | 155 | if (calculate_norm) { 156 | real residue = new_val - a[iy * nx + ix]; 157 | local_l2_norm += residue * residue; 158 | } 159 | } 160 | if (calculate_norm) { 161 | #ifdef HAVE_CUB 162 | real block_l2_norm = BlockReduce(temp_storage).Sum(local_l2_norm); 163 | if (0 == threadIdx.y && 0 == threadIdx.x) atomicAdd(l2_norm, block_l2_norm); 164 | #else 165 | atomicAdd(l2_norm, local_l2_norm); 166 | #endif // HAVE_CUB 167 | } 168 | } 169 | 170 | void launch_jacobi_p2p_kernel(real* __restrict__ const a_new, const real* __restrict__ const a, 171 | real* __restrict__ const l2_norm, const int iy_start, 172 | const int iy_end, const int nx, real* __restrict__ const a_new_top, 173 | const int top_iy, real* __restrict__ const a_new_bottom, 174 | const int bottom_iy, const bool calculate_norm, cudaStream_t stream) { 175 | constexpr int dim_block_x = 32; 176 | constexpr int dim_block_y = 32; 177 | dim3 dim_grid((nx + dim_block_x - 1) / dim_block_x, 178 | ((iy_end - iy_start) + dim_block_y - 1) / dim_block_y, 1); 179 | jacobi_p2p_kernel 180 | <<>>( 181 | a_new, a, l2_norm, iy_start, iy_end, nx, a_new_top, top_iy, a_new_bottom, bottom_iy,calculate_norm); 182 | CUDA_RT_CALL(cudaGetLastError()); 183 | } 184 | 185 | __global__ void all_reduce_norm_barrier_kernel(real* const l2_norm, 186 | real_int_pair* partial_l2_norm_uc, 187 | real_int_pair* partial_l2_norm_mc, 188 | const unsigned int expected_count) { 189 | assert(1 == blockDim.x * blockDim.y * blockDim.z * gridDim.x * gridDim.y * gridDim.z); 190 | real l2_norm_sum = 0.0; 191 | #if __CUDA_ARCH__ >= 900 192 | // atomic reduction to all replicas 193 | // this can be conceptually thought of as __threadfence_system(); atomicAdd_system(arrival_counter_mc, 1); 194 | // See https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-multimem-ld-reduce-multimem-st-multimem-red 195 | // for multimem PTX doc 196 | asm volatile ("multimem.red.release.sys.global.add.u32 [%0], %1;" ::"l"(&(partial_l2_norm_mc->arrival_counter)), "n"(1) : 
"memory"); 197 | 198 | // Need a fence between MC and UC access to the same memory: 199 | // - fence.proxy instructions establish an ordering between memory accesses that may happen through different proxies 200 | // - Value .alias of the .proxykind qualifier refers to memory accesses performed using virtually aliased addresses to the same memory location. 201 | // from https://docs.nvidia.com/cuda/parallel-thread-execution/#parallel-synchronization-and-communication-instructions-membar 202 | asm volatile ("fence.proxy.alias;" ::: "memory"); 203 | 204 | // spin wait with acquire ordering on UC mapping till all peers have arrived in this iteration 205 | // Note: all ranks reach an MPI_Barrier after this kernel, such that it is not possible for the barrier to be unblocked by an 206 | // arrival of a rank for the next iteration if some other rank is slow. 207 | cuda::atomic_ref ac(partial_l2_norm_uc->arrival_counter); 208 | while (expected_count > ac.load(cuda::memory_order_acquire)); 209 | 210 | // Atomic load reduction from all replicas. It does not provide ordering so it can be relaxed. 211 | #ifdef USE_DOUBLE 212 | asm volatile ("multimem.ld_reduce.relaxed.sys.global.add.f64 %0, [%1];" : "=d"(l2_norm_sum) : "l"(&(partial_l2_norm_mc->value)) : "memory"); 213 | #else 214 | asm volatile ("multimem.ld_reduce.relaxed.sys.global.add.f32 %0, [%1];" : "=f"(l2_norm_sum) : "l"(&(partial_l2_norm_mc->value)) : "memory"); 215 | #endif 216 | #endif 217 | *l2_norm = std::sqrt(l2_norm_sum); 218 | } 219 | 220 | void launch_all_reduce_norm_barrier_kernel(real* __restrict__ const l2_norm, 221 | real_int_pair* __restrict__ partial_l2_norm_uc, 222 | real_int_pair* __restrict__ partial_l2_norm_mc, 223 | const int num_gpus, const int iter, 224 | cudaStream_t stream) { 225 | // calculating expected count as unsigned for well defined overflow to correctly handle large 226 | // iteration counts with many GPUs 227 | unsigned int expected_count = num_gpus; 228 | // iter starts at 0 so need to scale with iter+1 229 | expected_count *= (iter + 1); 230 | all_reduce_norm_barrier_kernel<<<1, 1, 0, stream>>>(l2_norm, partial_l2_norm_uc, 231 | partial_l2_norm_mc, expected_count); 232 | CUDA_RT_CALL(cudaGetLastError()); 233 | } 234 | -------------------------------------------------------------------------------- /multi_threaded_copy/Makefile: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved. 
2 | NVCC=nvcc 3 | GENCODE_SM30 := -gencode arch=compute_30,code=sm_30 4 | GENCODE_SM35 := -gencode arch=compute_35,code=sm_35 5 | GENCODE_SM37 := -gencode arch=compute_37,code=sm_37 6 | GENCODE_SM50 := -gencode arch=compute_50,code=sm_50 7 | GENCODE_SM52 := -gencode arch=compute_52,code=sm_52 8 | GENCODE_SM60 := -gencode arch=compute_60,code=sm_60 9 | GENCODE_SM70 := -gencode arch=compute_70,code=sm_70 10 | GENCODE_SM80 := -gencode arch=compute_80,code=sm_80 11 | GENCODE_SM90 := -gencode arch=compute_90,code=sm_90 -gencode arch=compute_90,code=compute_90 12 | GENCODE_FLAGS := $(GENCODE_SM70) $(GENCODE_SM80) $(GENCODE_SM90) 13 | ifdef DISABLE_CUB 14 | NVCC_FLAGS = -Xptxas --optimize-float-atomics 15 | else 16 | NVCC_FLAGS = -DHAVE_CUB 17 | endif 18 | NVCC_FLAGS += -Xcompiler -fopenmp -lineinfo -DUSE_NVTX -ldl $(GENCODE_FLAGS) -std=c++14 19 | jacobi: Makefile jacobi.cu 20 | $(NVCC) $(NVCC_FLAGS) jacobi.cu -o jacobi 21 | 22 | .PHONY.: clean 23 | clean: 24 | rm -f jacobi jacobi.nsys-rep 25 | 26 | sanitize: jacobi 27 | compute-sanitizer ./jacobi -niter 10 28 | 29 | run: jacobi 30 | ./jacobi 31 | 32 | profile: jacobi 33 | nsys profile --trace=cuda,nvtx -o jacobi ./jacobi -niter 10 34 | -------------------------------------------------------------------------------- /multi_threaded_copy/jacobi.cu: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved. 2 | * 3 | * Redistribution and use in source and binary forms, with or without 4 | * modification, are permitted provided that the following conditions 5 | * are met: 6 | * * Redistributions of source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer. 8 | * * Redistributions in binary form must reproduce the above copyright 9 | * notice, this list of conditions and the following disclaimer in the 10 | * documentation and/or other materials provided with the distribution. 11 | * * Neither the name of NVIDIA CORPORATION nor the names of its 12 | * contributors may be used to endorse or promote products derived 13 | * from this software without specific prior written permission. 14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
26 | */ 27 | #include 28 | #include 29 | #include 30 | #include 31 | #include 32 | #include 33 | 34 | #include 35 | 36 | #ifdef HAVE_CUB 37 | #include 38 | #endif // HAVE_CUB 39 | 40 | #ifdef USE_NVTX 41 | #include 42 | 43 | const uint32_t colors[] = {0x0000ff00, 0x000000ff, 0x00ffff00, 0x00ff00ff, 44 | 0x0000ffff, 0x00ff0000, 0x00ffffff}; 45 | const int num_colors = sizeof(colors) / sizeof(uint32_t); 46 | 47 | #define PUSH_RANGE(name, cid) \ 48 | { \ 49 | int color_id = cid; \ 50 | color_id = color_id % num_colors; \ 51 | nvtxEventAttributes_t eventAttrib = {0}; \ 52 | eventAttrib.version = NVTX_VERSION; \ 53 | eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE; \ 54 | eventAttrib.colorType = NVTX_COLOR_ARGB; \ 55 | eventAttrib.color = colors[color_id]; \ 56 | eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII; \ 57 | eventAttrib.message.ascii = name; \ 58 | nvtxRangePushEx(&eventAttrib); \ 59 | } 60 | #define POP_RANGE nvtxRangePop(); 61 | #else 62 | #define PUSH_RANGE(name, cid) 63 | #define POP_RANGE 64 | #endif 65 | 66 | #define CUDA_RT_CALL(call) \ 67 | { \ 68 | cudaError_t cudaStatus = call; \ 69 | if (cudaSuccess != cudaStatus) { \ 70 | fprintf(stderr, \ 71 | "ERROR: CUDA RT call \"%s\" in line %d of file %s failed " \ 72 | "with " \ 73 | "%s (%d).\n", \ 74 | #call, __LINE__, __FILE__, cudaGetErrorString(cudaStatus), cudaStatus); \ 75 | exit( cudaStatus ); \ 76 | } \ 77 | } 78 | 79 | constexpr int MAX_NUM_DEVICES = 32; 80 | 81 | typedef float real; 82 | constexpr real tol = 1.0e-8; 83 | 84 | const real PI = 2.0 * std::asin(1.0); 85 | 86 | __global__ void initialize_boundaries(real* __restrict__ const a_new, real* __restrict__ const a, 87 | const real pi, const int offset, const int nx, 88 | const int my_ny, const int ny) { 89 | for (int iy = blockIdx.x * blockDim.x + threadIdx.x; iy < my_ny; iy += blockDim.x * gridDim.x) { 90 | const real y0 = sin(2.0 * pi * (offset + iy) / (ny - 1)); 91 | a[iy * nx + 0] = y0; 92 | a[iy * nx + (nx - 1)] = y0; 93 | a_new[iy * nx + 0] = y0; 94 | a_new[iy * nx + (nx - 1)] = y0; 95 | } 96 | } 97 | 98 | template 99 | __global__ void jacobi_kernel(real* __restrict__ const a_new, const real* __restrict__ const a, 100 | real* __restrict__ const l2_norm, const int iy_start, 101 | const int iy_end, const int nx, const bool calculate_norm) { 102 | #ifdef HAVE_CUB 103 | typedef cub::BlockReduce 104 | BlockReduce; 105 | __shared__ typename BlockReduce::TempStorage temp_storage; 106 | #endif // HAVE_CUB 107 | int iy = blockIdx.y * blockDim.y + threadIdx.y + iy_start; 108 | int ix = blockIdx.x * blockDim.x + threadIdx.x + 1; 109 | real local_l2_norm = 0.0; 110 | 111 | if (iy < iy_end && ix < (nx - 1)) { 112 | const real new_val = 0.25 * (a[iy * nx + ix + 1] + a[iy * nx + ix - 1] + 113 | a[(iy + 1) * nx + ix] + a[(iy - 1) * nx + ix]); 114 | a_new[iy * nx + ix] = new_val; 115 | if (calculate_norm) { 116 | real residue = new_val - a[iy * nx + ix]; 117 | local_l2_norm += residue * residue; 118 | } 119 | } 120 | if (calculate_norm) { 121 | #ifdef HAVE_CUB 122 | real block_l2_norm = BlockReduce(temp_storage).Sum(local_l2_norm); 123 | if (0 == threadIdx.y && 0 == threadIdx.x) atomicAdd(l2_norm, block_l2_norm); 124 | #else 125 | atomicAdd(l2_norm, local_l2_norm); 126 | #endif // HAVE_CUB 127 | } 128 | } 129 | 130 | double single_gpu(const int nx, const int ny, const int iter_max, real* const a_ref_h, 131 | const int nccheck, const bool print); 132 | 133 | template 134 | T get_argval(char** begin, char** end, const std::string& arg, const T default_val) { 135 | T argval = 
default_val; 136 | char** itr = std::find(begin, end, arg); 137 | if (itr != end && ++itr != end) { 138 | std::istringstream inbuf(*itr); 139 | inbuf >> argval; 140 | } 141 | return argval; 142 | } 143 | 144 | bool get_arg(char** begin, char** end, const std::string& arg) { 145 | char** itr = std::find(begin, end, arg); 146 | if (itr != end) { 147 | return true; 148 | } 149 | return false; 150 | } 151 | 152 | int main(int argc, char* argv[]) { 153 | const int iter_max = get_argval(argv, argv + argc, "-niter", 1000); 154 | const int nccheck = get_argval(argv, argv + argc, "-nccheck", 1); 155 | const int nx = get_argval(argv, argv + argc, "-nx", 16384); 156 | const int ny = get_argval(argv, argv + argc, "-ny", 16384); 157 | const bool csv = get_arg(argv, argv + argc, "-csv"); 158 | 159 | real* a_new[MAX_NUM_DEVICES]; 160 | 161 | real* a_ref_h; 162 | real* a_h; 163 | double runtime_serial = 0.0; 164 | 165 | int iy_end[MAX_NUM_DEVICES]; 166 | 167 | cudaEvent_t push_top_done[2][MAX_NUM_DEVICES]; 168 | cudaEvent_t push_bottom_done[2][MAX_NUM_DEVICES]; 169 | 170 | bool result_correct = true; 171 | int num_devices = 0; 172 | CUDA_RT_CALL(cudaGetDeviceCount(&num_devices)); 173 | real l2_norm = 1.0; 174 | #pragma omp parallel num_threads(num_devices) shared(l2_norm) 175 | { 176 | real* a; 177 | 178 | cudaStream_t compute_stream; 179 | cudaStream_t push_top_stream; 180 | cudaStream_t push_bottom_stream; 181 | cudaEvent_t compute_done; 182 | 183 | real* l2_norm_d; 184 | real* l2_norm_h; 185 | 186 | int dev_id = omp_get_thread_num(); 187 | 188 | CUDA_RT_CALL(cudaSetDevice(dev_id)); 189 | CUDA_RT_CALL(cudaFree(0)); 190 | 191 | if (0 == dev_id) { 192 | CUDA_RT_CALL(cudaMallocHost(&a_ref_h, nx * ny * sizeof(real))); 193 | CUDA_RT_CALL(cudaMallocHost(&a_h, nx * ny * sizeof(real))); 194 | runtime_serial = single_gpu(nx, ny, iter_max, a_ref_h, nccheck, !csv); 195 | } 196 | #pragma omp barrier 197 | // ny - 2 rows are distributed amongst `size` ranks in such a way 198 | // that each rank gets either (ny - 2) / size or (ny - 2) / size + 1 rows. 
199 | // This optimizes load balancing when (ny - 2) % size != 0 200 | int chunk_size; 201 | int chunk_size_low = (ny - 2) / num_devices; 202 | int chunk_size_high = chunk_size_low + 1; 203 | // To calculate the number of ranks that need to compute an extra row, 204 | // the following formula is derived from this equation: 205 | // num_ranks_low * chunk_size_low + (size - num_ranks_low) * (chunk_size_low + 1) = ny - 2 206 | int num_ranks_low = num_devices * chunk_size_low + num_devices - 207 | (ny - 2); // Number of ranks with chunk_size = chunk_size_low 208 | if (dev_id < num_ranks_low) 209 | chunk_size = chunk_size_low; 210 | else 211 | chunk_size = chunk_size_high; 212 | 213 | CUDA_RT_CALL(cudaMalloc(&a, nx * (chunk_size + 2) * sizeof(real))); 214 | CUDA_RT_CALL(cudaMalloc(a_new + dev_id, nx * (chunk_size + 2) * sizeof(real))); 215 | 216 | CUDA_RT_CALL(cudaMemset(a, 0, nx * (chunk_size + 2) * sizeof(real))); 217 | CUDA_RT_CALL(cudaMemset(a_new[dev_id], 0, nx * (chunk_size + 2) * sizeof(real))); 218 | 219 | // Calculate local domain boundaries 220 | int iy_start_global; // My start index in the global array 221 | if (dev_id < num_ranks_low) { 222 | iy_start_global = dev_id * chunk_size_low + 1; 223 | } else { 224 | iy_start_global = 225 | num_ranks_low * chunk_size_low + (dev_id - num_ranks_low) * chunk_size_high + 1; 226 | } 227 | 228 | int iy_start = 1; 229 | iy_end[dev_id] = iy_start + chunk_size; 230 | 231 | // Set diriclet boundary conditions on left and right boarder 232 | initialize_boundaries<<<(ny / num_devices) / 128 + 1, 128>>>( 233 | a, a_new[dev_id], PI, iy_start_global - 1, nx, (chunk_size + 2), ny); 234 | CUDA_RT_CALL(cudaGetLastError()); 235 | CUDA_RT_CALL(cudaDeviceSynchronize()); 236 | 237 | CUDA_RT_CALL(cudaStreamCreate(&compute_stream)); 238 | CUDA_RT_CALL(cudaStreamCreate(&push_top_stream)); 239 | CUDA_RT_CALL(cudaStreamCreate(&push_bottom_stream)); 240 | CUDA_RT_CALL(cudaEventCreateWithFlags(&compute_done, cudaEventDisableTiming)); 241 | CUDA_RT_CALL(cudaEventCreateWithFlags(push_top_done[0] + dev_id, cudaEventDisableTiming)); 242 | CUDA_RT_CALL( 243 | cudaEventCreateWithFlags(push_bottom_done[0] + dev_id, cudaEventDisableTiming)); 244 | CUDA_RT_CALL(cudaEventCreateWithFlags(push_top_done[1] + dev_id, cudaEventDisableTiming)); 245 | CUDA_RT_CALL( 246 | cudaEventCreateWithFlags(push_bottom_done[1] + dev_id, cudaEventDisableTiming)); 247 | 248 | CUDA_RT_CALL(cudaMalloc(&l2_norm_d, sizeof(real))); 249 | CUDA_RT_CALL(cudaMallocHost(&l2_norm_h, sizeof(real))); 250 | 251 | const int top = dev_id > 0 ? 
dev_id - 1 : (num_devices - 1); 252 | int canAccessPeer = 0; 253 | CUDA_RT_CALL(cudaDeviceCanAccessPeer(&canAccessPeer, dev_id, top)); 254 | if (canAccessPeer) { 255 | CUDA_RT_CALL(cudaDeviceEnablePeerAccess(top, 0)); 256 | } 257 | const int bottom = (dev_id + 1) % num_devices; 258 | if (top != bottom) { 259 | canAccessPeer = 0; 260 | CUDA_RT_CALL(cudaDeviceCanAccessPeer(&canAccessPeer, dev_id, bottom)); 261 | if (canAccessPeer) { 262 | CUDA_RT_CALL(cudaDeviceEnablePeerAccess(bottom, 0)); 263 | } 264 | } 265 | 266 | for (int i = 0; i < 4; ++i) { 267 | CUDA_RT_CALL(cudaMemcpyAsync(a_new[top] + (iy_end[top] * nx), 268 | a_new[dev_id] + iy_start * nx, nx * sizeof(real), 269 | cudaMemcpyDeviceToDevice, push_top_stream)); 270 | CUDA_RT_CALL(cudaMemcpyAsync(a_new[bottom], a_new[dev_id] + (iy_end[dev_id] - 1) * nx, 271 | nx * sizeof(real), cudaMemcpyDeviceToDevice, 272 | push_bottom_stream)); 273 | CUDA_RT_CALL(cudaStreamSynchronize(push_top_stream)); 274 | CUDA_RT_CALL(cudaStreamSynchronize(push_bottom_stream)); 275 | std::swap(a_new[dev_id], a); 276 | } 277 | 278 | CUDA_RT_CALL(cudaDeviceSynchronize()); 279 | 280 | #pragma omp master 281 | { 282 | if (!csv) 283 | printf( 284 | "Jacobi relaxation: %d iterations on %d x %d mesh with " 285 | "norm check " 286 | "every %d iterations\n", 287 | iter_max, ny, nx, nccheck); 288 | } 289 | 290 | constexpr int dim_block_x = 32; 291 | constexpr int dim_block_y = 32; 292 | dim3 dim_grid((nx + dim_block_x - 1) / dim_block_x, 293 | (ny + (num_devices * dim_block_y) - 1) / (num_devices * dim_block_y), 1); 294 | 295 | int iter = 0; 296 | bool calculate_norm = true; 297 | #pragma omp master 298 | { l2_norm = 1.0; } 299 | 300 | CUDA_RT_CALL(cudaDeviceSynchronize()); 301 | #pragma omp barrier 302 | double start = omp_get_wtime(); 303 | PUSH_RANGE("Jacobi solve", 0) 304 | while (l2_norm > tol && iter < iter_max) { 305 | CUDA_RT_CALL(cudaMemsetAsync(l2_norm_d, 0, sizeof(real), compute_stream)); 306 | 307 | int top = dev_id > 0 ? 
dev_id - 1 : (num_devices - 1); 308 | int bottom = (dev_id + 1) % num_devices; 309 | calculate_norm = (iter % nccheck) == 0 || (!csv && (iter % 100) == 0); 310 | 311 | CUDA_RT_CALL(cudaStreamWaitEvent(compute_stream, push_top_done[(iter % 2)][bottom], 0)); 312 | CUDA_RT_CALL(cudaStreamWaitEvent(compute_stream, push_bottom_done[(iter % 2)][top], 0)); 313 | 314 | jacobi_kernel 315 | <<>>( 316 | a_new[dev_id], a, l2_norm_d, iy_start, iy_end[dev_id], nx, calculate_norm); 317 | CUDA_RT_CALL(cudaGetLastError()); 318 | CUDA_RT_CALL(cudaEventRecord(compute_done, compute_stream)); 319 | 320 | if (calculate_norm) { 321 | CUDA_RT_CALL(cudaMemcpyAsync(l2_norm_h, l2_norm_d, sizeof(real), 322 | cudaMemcpyDeviceToHost, compute_stream)); 323 | } 324 | 325 | // Apply periodic boundary conditions need to wait for other threads due to 326 | // std::swap(a_new[dev_id],a); 327 | #pragma omp barrier 328 | CUDA_RT_CALL(cudaStreamWaitEvent(push_top_stream, compute_done, 0)); 329 | CUDA_RT_CALL(cudaMemcpyAsync(a_new[top] + (iy_end[top] * nx), 330 | a_new[dev_id] + iy_start * nx, nx * sizeof(real), 331 | cudaMemcpyDeviceToDevice, push_top_stream)); 332 | CUDA_RT_CALL(cudaEventRecord(push_top_done[((iter + 1) % 2)][dev_id], push_top_stream)); 333 | 334 | CUDA_RT_CALL(cudaStreamWaitEvent(push_bottom_stream, compute_done, 0)); 335 | CUDA_RT_CALL(cudaMemcpyAsync(a_new[bottom], a_new[dev_id] + (iy_end[dev_id] - 1) * nx, 336 | nx * sizeof(real), cudaMemcpyDeviceToDevice, 337 | push_bottom_stream)); 338 | CUDA_RT_CALL( 339 | cudaEventRecord(push_bottom_done[((iter + 1) % 2)][dev_id], push_bottom_stream)); 340 | // Need to wait for other threads as they are reading push_top_done and 341 | // push_bottom_done 342 | #pragma omp barrier 343 | 344 | if (calculate_norm) { 345 | #pragma omp single 346 | { l2_norm = 0.0; } 347 | #pragma omp barrier 348 | CUDA_RT_CALL(cudaStreamSynchronize(compute_stream)); 349 | #pragma omp atomic 350 | l2_norm += *(l2_norm_h); 351 | #pragma omp barrier 352 | #pragma omp single 353 | { l2_norm = std::sqrt(l2_norm); } 354 | #pragma omp barrier 355 | 356 | if (!csv && (iter % 100) == 0) { 357 | #pragma omp master 358 | printf("%5d, %0.6f\n", iter, l2_norm); 359 | } 360 | } 361 | 362 | std::swap(a_new[dev_id], a); 363 | iter++; 364 | } 365 | CUDA_RT_CALL(cudaDeviceSynchronize()); 366 | #pragma omp barrier 367 | double stop = omp_get_wtime(); 368 | POP_RANGE 369 | 370 | CUDA_RT_CALL( 371 | cudaMemcpy(a_h + iy_start_global * nx, a + nx, 372 | std::min((ny - iy_start_global) * nx, chunk_size * nx) * sizeof(real), 373 | cudaMemcpyDeviceToHost)); 374 | #pragma omp barrier 375 | 376 | #pragma omp master 377 | { 378 | result_correct = true; 379 | for (int iy = 1; result_correct && (iy < (ny - 1)); ++iy) { 380 | for (int ix = 1; result_correct && (ix < (nx - 1)); ++ix) { 381 | if (std::fabs(a_ref_h[iy * nx + ix] - a_h[iy * nx + ix]) > tol) { 382 | fprintf(stderr, 383 | "ERROR: a[%d * %d + %d] = %f does not match %f " 384 | "(reference)\n", 385 | iy, nx, ix, a_h[iy * nx + ix], a_ref_h[iy * nx + ix]); 386 | result_correct = false; 387 | } 388 | } 389 | } 390 | if (result_correct) { 391 | if (csv) { 392 | printf("multi_threaded_copy, %d, %d, %d, %d, %d, 1, %f, %f\n", nx, ny, iter_max, 393 | nccheck, num_devices, (stop - start), runtime_serial); 394 | } else { 395 | printf("Num GPUs: %d.\n", num_devices); 396 | printf( 397 | "%dx%d: 1 GPU: %8.4f s, %d GPUs: %8.4f s, speedup: " 398 | "%8.2f, " 399 | "efficiency: %8.2f \n", 400 | ny, nx, runtime_serial, num_devices, (stop - start), 401 | runtime_serial / (stop - 
start), 402 | runtime_serial / (num_devices * (stop - start)) * 100); 403 | } 404 | } 405 | } 406 | 407 | CUDA_RT_CALL(cudaEventDestroy(push_bottom_done[1][dev_id])); 408 | CUDA_RT_CALL(cudaEventDestroy(push_top_done[1][dev_id])); 409 | CUDA_RT_CALL(cudaEventDestroy(push_bottom_done[0][dev_id])); 410 | CUDA_RT_CALL(cudaEventDestroy(push_top_done[0][dev_id])); 411 | CUDA_RT_CALL(cudaEventDestroy(compute_done)); 412 | CUDA_RT_CALL(cudaStreamDestroy(push_bottom_stream)); 413 | CUDA_RT_CALL(cudaStreamDestroy(push_top_stream)); 414 | CUDA_RT_CALL(cudaStreamDestroy(compute_stream)); 415 | 416 | CUDA_RT_CALL(cudaFreeHost(l2_norm_h)); 417 | CUDA_RT_CALL(cudaFree(l2_norm_d)); 418 | 419 | CUDA_RT_CALL(cudaFree(a_new[dev_id])); 420 | CUDA_RT_CALL(cudaFree(a)); 421 | 422 | if (0 == dev_id) { 423 | CUDA_RT_CALL(cudaFreeHost(a_h)); 424 | CUDA_RT_CALL(cudaFreeHost(a_ref_h)); 425 | } 426 | } 427 | 428 | return result_correct ? 0 : 1; 429 | } 430 | 431 | double single_gpu(const int nx, const int ny, const int iter_max, real* const a_ref_h, 432 | const int nccheck, const bool print) { 433 | real* a; 434 | real* a_new; 435 | 436 | cudaStream_t compute_stream; 437 | cudaStream_t push_top_stream; 438 | cudaStream_t push_bottom_stream; 439 | cudaEvent_t compute_done; 440 | cudaEvent_t push_top_done; 441 | cudaEvent_t push_bottom_done; 442 | 443 | real* l2_norm_d; 444 | real* l2_norm_h; 445 | 446 | int iy_start = 1; 447 | int iy_end = (ny - 1); 448 | 449 | CUDA_RT_CALL(cudaMalloc(&a, nx * ny * sizeof(real))); 450 | CUDA_RT_CALL(cudaMalloc(&a_new, nx * ny * sizeof(real))); 451 | 452 | CUDA_RT_CALL(cudaMemset(a, 0, nx * ny * sizeof(real))); 453 | CUDA_RT_CALL(cudaMemset(a_new, 0, nx * ny * sizeof(real))); 454 | 455 | // Set diriclet boundary conditions on left and right boarder 456 | initialize_boundaries<<>>(a, a_new, PI, 0, nx, ny, ny); 457 | CUDA_RT_CALL(cudaGetLastError()); 458 | CUDA_RT_CALL(cudaDeviceSynchronize()); 459 | 460 | CUDA_RT_CALL(cudaStreamCreate(&compute_stream)); 461 | CUDA_RT_CALL(cudaStreamCreate(&push_top_stream)); 462 | CUDA_RT_CALL(cudaStreamCreate(&push_bottom_stream)); 463 | CUDA_RT_CALL(cudaEventCreateWithFlags(&compute_done, cudaEventDisableTiming)); 464 | CUDA_RT_CALL(cudaEventCreateWithFlags(&push_top_done, cudaEventDisableTiming)); 465 | CUDA_RT_CALL(cudaEventCreateWithFlags(&push_bottom_done, cudaEventDisableTiming)); 466 | 467 | CUDA_RT_CALL(cudaMalloc(&l2_norm_d, sizeof(real))); 468 | CUDA_RT_CALL(cudaMallocHost(&l2_norm_h, sizeof(real))); 469 | 470 | CUDA_RT_CALL(cudaDeviceSynchronize()); 471 | 472 | if (print) 473 | printf( 474 | "Single GPU jacobi relaxation: %d iterations on %d x %d mesh with " 475 | "norm " 476 | "check every %d iterations\n", 477 | iter_max, ny, nx, nccheck); 478 | 479 | constexpr int dim_block_x = 32; 480 | constexpr int dim_block_y = 32; 481 | dim3 dim_grid((nx + dim_block_x - 1) / dim_block_x, (ny + dim_block_y - 1) / dim_block_y, 1); 482 | 483 | int iter = 0; 484 | bool calculate_norm = true; 485 | real l2_norm = 1.0; 486 | 487 | double start = omp_get_wtime(); 488 | PUSH_RANGE("Jacobi solve", 0) 489 | while (l2_norm > tol && iter < iter_max) { 490 | CUDA_RT_CALL(cudaMemsetAsync(l2_norm_d, 0, sizeof(real), compute_stream)); 491 | 492 | CUDA_RT_CALL(cudaStreamWaitEvent(compute_stream, push_top_done, 0)); 493 | CUDA_RT_CALL(cudaStreamWaitEvent(compute_stream, push_bottom_done, 0)); 494 | 495 | calculate_norm = (iter % nccheck) == 0 || (print && ((iter % 100) == 0)); 496 | jacobi_kernel 497 | <<>>( 498 | a_new, a, l2_norm_d, iy_start, iy_end, nx, 
calculate_norm); 499 | CUDA_RT_CALL(cudaGetLastError()); 500 | CUDA_RT_CALL(cudaEventRecord(compute_done, compute_stream)); 501 | 502 | if (calculate_norm) { 503 | CUDA_RT_CALL(cudaMemcpyAsync(l2_norm_h, l2_norm_d, sizeof(real), cudaMemcpyDeviceToHost, 504 | compute_stream)); 505 | } 506 | 507 | // Apply periodic boundary conditions 508 | 509 | CUDA_RT_CALL(cudaStreamWaitEvent(push_top_stream, compute_done, 0)); 510 | CUDA_RT_CALL(cudaMemcpyAsync(a_new, a_new + (iy_end - 1) * nx, nx * sizeof(real), 511 | cudaMemcpyDeviceToDevice, push_top_stream)); 512 | CUDA_RT_CALL(cudaEventRecord(push_top_done, push_top_stream)); 513 | 514 | CUDA_RT_CALL(cudaStreamWaitEvent(push_bottom_stream, compute_done, 0)); 515 | CUDA_RT_CALL(cudaMemcpyAsync(a_new + iy_end * nx, a_new + iy_start * nx, nx * sizeof(real), 516 | cudaMemcpyDeviceToDevice, compute_stream)); 517 | CUDA_RT_CALL(cudaEventRecord(push_bottom_done, push_bottom_stream)); 518 | 519 | if (calculate_norm) { 520 | CUDA_RT_CALL(cudaStreamSynchronize(compute_stream)); 521 | l2_norm = *l2_norm_h; 522 | l2_norm = std::sqrt(l2_norm); 523 | if (print && (iter % 100) == 0) printf("%5d, %0.6f\n", iter, l2_norm); 524 | } 525 | 526 | std::swap(a_new, a); 527 | iter++; 528 | } 529 | POP_RANGE 530 | double stop = omp_get_wtime(); 531 | 532 | CUDA_RT_CALL(cudaMemcpy(a_ref_h, a, nx * ny * sizeof(real), cudaMemcpyDeviceToHost)); 533 | 534 | CUDA_RT_CALL(cudaEventDestroy(push_bottom_done)); 535 | CUDA_RT_CALL(cudaEventDestroy(push_top_done)); 536 | CUDA_RT_CALL(cudaEventDestroy(compute_done)); 537 | CUDA_RT_CALL(cudaStreamDestroy(push_bottom_stream)); 538 | CUDA_RT_CALL(cudaStreamDestroy(push_top_stream)); 539 | CUDA_RT_CALL(cudaStreamDestroy(compute_stream)); 540 | 541 | CUDA_RT_CALL(cudaFreeHost(l2_norm_h)); 542 | CUDA_RT_CALL(cudaFree(l2_norm_d)); 543 | 544 | CUDA_RT_CALL(cudaFree(a_new)); 545 | CUDA_RT_CALL(cudaFree(a)); 546 | return (stop - start); 547 | } 548 | -------------------------------------------------------------------------------- /multi_threaded_copy_overlap/Makefile: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved. 
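# Build notes: GENCODE_FLAGS enables only SM 7.0/8.0/9.0 by default; the older
# GENCODE_SM30-SM60 definitions below are not used unless added to GENCODE_FLAGS.
# Building with `make DISABLE_CUB=1` omits -DHAVE_CUB, so jacobi.cu falls back to
# plain atomicAdd (with -Xptxas --optimize-float-atomics) instead of a cub::BlockReduce
# for the L2-norm reduction.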
2 | NVCC=nvcc 3 | GENCODE_SM30 := -gencode arch=compute_30,code=sm_30 4 | GENCODE_SM35 := -gencode arch=compute_35,code=sm_35 5 | GENCODE_SM37 := -gencode arch=compute_37,code=sm_37 6 | GENCODE_SM50 := -gencode arch=compute_50,code=sm_50 7 | GENCODE_SM52 := -gencode arch=compute_52,code=sm_52 8 | GENCODE_SM60 := -gencode arch=compute_60,code=sm_60 9 | GENCODE_SM70 := -gencode arch=compute_70,code=sm_70 10 | GENCODE_SM80 := -gencode arch=compute_80,code=sm_80 11 | GENCODE_SM90 := -gencode arch=compute_90,code=sm_90 -gencode arch=compute_90,code=compute_90 12 | GENCODE_FLAGS := $(GENCODE_SM70) $(GENCODE_SM80) $(GENCODE_SM90) 13 | ifdef DISABLE_CUB 14 | NVCC_FLAGS = -Xptxas --optimize-float-atomics 15 | else 16 | NVCC_FLAGS = -DHAVE_CUB 17 | endif 18 | NVCC_FLAGS += -Xcompiler -fopenmp -lineinfo -DUSE_NVTX -ldl $(GENCODE_FLAGS) -std=c++14 19 | jacobi: Makefile jacobi.cu 20 | $(NVCC) $(NVCC_FLAGS) jacobi.cu -o jacobi 21 | 22 | .PHONY.: clean 23 | clean: 24 | rm -f jacobi jacobi.nsys-rep 25 | 26 | sanitize: jacobi 27 | compute-sanitizer ./jacobi -niter 10 28 | 29 | run: jacobi 30 | ./jacobi 31 | 32 | profile: jacobi 33 | nsys profile --trace=cuda,nvtx -o jacobi ./jacobi -niter 10 34 | -------------------------------------------------------------------------------- /multi_threaded_p2p/Makefile: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved. 2 | NVCC=nvcc 3 | GENCODE_SM30 := -gencode arch=compute_30,code=sm_30 4 | GENCODE_SM35 := -gencode arch=compute_35,code=sm_35 5 | GENCODE_SM37 := -gencode arch=compute_37,code=sm_37 6 | GENCODE_SM50 := -gencode arch=compute_50,code=sm_50 7 | GENCODE_SM52 := -gencode arch=compute_52,code=sm_52 8 | GENCODE_SM60 := -gencode arch=compute_60,code=sm_60 9 | GENCODE_SM70 := -gencode arch=compute_70,code=sm_70 10 | GENCODE_SM80 := -gencode arch=compute_80,code=sm_80 11 | GENCODE_SM90 := -gencode arch=compute_90,code=sm_90 -gencode arch=compute_90,code=compute_90 12 | GENCODE_FLAGS := $(GENCODE_SM70) $(GENCODE_SM80) $(GENCODE_SM90) 13 | ifdef DISABLE_CUB 14 | NVCC_FLAGS = -Xptxas --optimize-float-atomics 15 | else 16 | NVCC_FLAGS = -DHAVE_CUB 17 | endif 18 | NVCC_FLAGS += -Xcompiler -fopenmp -lineinfo -DUSE_NVTX -ldl $(GENCODE_FLAGS) -std=c++14 19 | jacobi: Makefile jacobi.cu 20 | $(NVCC) $(NVCC_FLAGS) jacobi.cu -o jacobi 21 | 22 | .PHONY.: clean 23 | clean: 24 | rm -f jacobi jacobi.nsys-rep 25 | 26 | sanitize: jacobi 27 | compute-sanitizer ./jacobi -niter 10 28 | 29 | run: jacobi 30 | ./jacobi 31 | 32 | profile: jacobi 33 | nsys profile --trace=cuda,nvtx -o jacobi ./jacobi -niter 10 34 | -------------------------------------------------------------------------------- /multi_threaded_p2p/jacobi.cu: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved. 2 | * 3 | * Redistribution and use in source and binary forms, with or without 4 | * modification, are permitted provided that the following conditions 5 | * are met: 6 | * * Redistributions of source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer. 8 | * * Redistributions in binary form must reproduce the above copyright 9 | * notice, this list of conditions and the following disclaimer in the 10 | * documentation and/or other materials provided with the distribution. 
11 | * * Neither the name of NVIDIA CORPORATION nor the names of its 12 | * contributors may be used to endorse or promote products derived 13 | * from this software without specific prior written permission. 14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | */ 27 | #include 28 | #include 29 | #include 30 | #include 31 | #include 32 | #include 33 | 34 | #include 35 | 36 | #ifdef HAVE_CUB 37 | #include 38 | #endif // HAVE_CUB 39 | 40 | #ifdef USE_NVTX 41 | #include 42 | 43 | const uint32_t colors[] = {0x0000ff00, 0x000000ff, 0x00ffff00, 0x00ff00ff, 44 | 0x0000ffff, 0x00ff0000, 0x00ffffff}; 45 | const int num_colors = sizeof(colors) / sizeof(uint32_t); 46 | 47 | #define PUSH_RANGE(name, cid) \ 48 | { \ 49 | int color_id = cid; \ 50 | color_id = color_id % num_colors; \ 51 | nvtxEventAttributes_t eventAttrib = {0}; \ 52 | eventAttrib.version = NVTX_VERSION; \ 53 | eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE; \ 54 | eventAttrib.colorType = NVTX_COLOR_ARGB; \ 55 | eventAttrib.color = colors[color_id]; \ 56 | eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII; \ 57 | eventAttrib.message.ascii = name; \ 58 | nvtxRangePushEx(&eventAttrib); \ 59 | } 60 | #define POP_RANGE nvtxRangePop(); 61 | #else 62 | #define PUSH_RANGE(name, cid) 63 | #define POP_RANGE 64 | #endif 65 | 66 | #define CUDA_RT_CALL(call) \ 67 | { \ 68 | cudaError_t cudaStatus = call; \ 69 | if (cudaSuccess != cudaStatus) { \ 70 | fprintf(stderr, \ 71 | "ERROR: CUDA RT call \"%s\" in line %d of file %s failed " \ 72 | "with " \ 73 | "%s (%d).\n", \ 74 | #call, __LINE__, __FILE__, cudaGetErrorString(cudaStatus), cudaStatus); \ 75 | exit( cudaStatus ); \ 76 | } \ 77 | } 78 | 79 | constexpr int MAX_NUM_DEVICES = 32; 80 | 81 | typedef float real; 82 | constexpr real tol = 1.0e-8; 83 | 84 | const real PI = 2.0 * std::asin(1.0); 85 | 86 | __global__ void initialize_boundaries(real* __restrict__ const a_new, real* __restrict__ const a, 87 | const real pi, const int offset, const int nx, 88 | const int my_ny, const int ny) { 89 | for (int iy = blockIdx.x * blockDim.x + threadIdx.x; iy < my_ny; iy += blockDim.x * gridDim.x) { 90 | const real y0 = sin(2.0 * pi * (offset + iy) / (ny - 1)); 91 | a[iy * nx + 0] = y0; 92 | a[iy * nx + (nx - 1)] = y0; 93 | a_new[iy * nx + 0] = y0; 94 | a_new[iy * nx + (nx - 1)] = y0; 95 | } 96 | } 97 | 98 | template 99 | __global__ void jacobi_kernel(real* __restrict__ const a_new, const real* __restrict__ const a, 100 | real* __restrict__ const l2_norm, const int iy_start, 101 | const int iy_end, const int nx, real* __restrict__ const a_new_top, 102 | const int top_iy, real* __restrict__ const a_new_bottom, 103 | const int bottom_iy, const bool calculate_norm) { 104 | #ifdef HAVE_CUB 105 | typedef cub::BlockReduce 106 | BlockReduce; 107 | 
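    // Shared scratch for the per-block CUB reduction of each thread's local L2-norm
    // contribution; only one atomicAdd per block reaches global memory further down.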
__shared__ typename BlockReduce::TempStorage temp_storage; 108 | #endif // HAVE_CUB 109 | int iy = blockIdx.y * blockDim.y + threadIdx.y + iy_start; 110 | int ix = blockIdx.x * blockDim.x + threadIdx.x + 1; 111 | real local_l2_norm = 0.0; 112 | 113 | if (iy < iy_end && ix < (nx - 1)) { 114 | const real new_val = 0.25 * (a[iy * nx + ix + 1] + a[iy * nx + ix - 1] + 115 | a[(iy + 1) * nx + ix] + a[(iy - 1) * nx + ix]); 116 | a_new[iy * nx + ix] = new_val; 117 | 118 | if (iy_start == iy) { 119 | a_new_top[top_iy * nx + ix] = new_val; 120 | } 121 | 122 | if ((iy_end - 1) == iy) { 123 | a_new_bottom[bottom_iy * nx + ix] = new_val; 124 | } 125 | 126 | if (calculate_norm) { 127 | real residue = new_val - a[iy * nx + ix]; 128 | local_l2_norm += residue * residue; 129 | } 130 | } 131 | if (calculate_norm) { 132 | #ifdef HAVE_CUB 133 | real block_l2_norm = BlockReduce(temp_storage).Sum(local_l2_norm); 134 | if (0 == threadIdx.y && 0 == threadIdx.x) atomicAdd(l2_norm, block_l2_norm); 135 | #else 136 | atomicAdd(l2_norm, local_l2_norm); 137 | #endif // HAVE_CUB 138 | } 139 | } 140 | 141 | double single_gpu(const int nx, const int ny, const int iter_max, real* const a_ref_h, 142 | const int nccheck, const bool print); 143 | 144 | template 145 | T get_argval(char** begin, char** end, const std::string& arg, const T default_val) { 146 | T argval = default_val; 147 | char** itr = std::find(begin, end, arg); 148 | if (itr != end && ++itr != end) { 149 | std::istringstream inbuf(*itr); 150 | inbuf >> argval; 151 | } 152 | return argval; 153 | } 154 | 155 | bool get_arg(char** begin, char** end, const std::string& arg) { 156 | char** itr = std::find(begin, end, arg); 157 | if (itr != end) { 158 | return true; 159 | } 160 | return false; 161 | } 162 | 163 | int main(int argc, char* argv[]) { 164 | const int iter_max = get_argval(argv, argv + argc, "-niter", 1000); 165 | const int nccheck = get_argval(argv, argv + argc, "-nccheck", 1); 166 | const int nx = get_argval(argv, argv + argc, "-nx", 16384); 167 | const int ny = get_argval(argv, argv + argc, "-ny", 16384); 168 | const bool csv = get_arg(argv, argv + argc, "-csv"); 169 | 170 | real* a_new[MAX_NUM_DEVICES]; 171 | 172 | real* a_ref_h; 173 | real* a_h; 174 | double runtime_serial = 0.0; 175 | 176 | int iy_end[MAX_NUM_DEVICES]; 177 | 178 | cudaEvent_t compute_done[2][MAX_NUM_DEVICES]; 179 | 180 | bool result_correct = true; 181 | bool p2p_works = true; 182 | int num_devices = 0; 183 | CUDA_RT_CALL(cudaGetDeviceCount(&num_devices)); 184 | real l2_norm = 1.0; 185 | #pragma omp parallel num_threads(num_devices) shared(l2_norm) 186 | { 187 | real* a; 188 | 189 | cudaStream_t compute_stream; 190 | cudaStream_t push_top_stream; 191 | cudaStream_t push_bottom_stream; 192 | cudaEvent_t push_top_done; 193 | cudaEvent_t push_bottom_done; 194 | 195 | real* l2_norm_d; 196 | real* l2_norm_h; 197 | 198 | int dev_id = omp_get_thread_num(); 199 | 200 | CUDA_RT_CALL(cudaSetDevice(dev_id)); 201 | CUDA_RT_CALL(cudaFree(0)); 202 | 203 | if (0 == dev_id) { 204 | CUDA_RT_CALL(cudaMallocHost(&a_ref_h, nx * ny * sizeof(real))); 205 | CUDA_RT_CALL(cudaMallocHost(&a_h, nx * ny * sizeof(real))); 206 | runtime_serial = single_gpu(nx, ny, iter_max, a_ref_h, nccheck, !csv); 207 | } 208 | #pragma omp barrier 209 | // ny - 2 rows are distributed amongst `size` ranks in such a way 210 | // that each rank gets either (ny - 2) / size or (ny - 2) / size + 1 rows. 
211 | // This optimizes load balancing when (ny - 2) % size != 0 212 | int chunk_size; 213 | int chunk_size_low = (ny - 2) / num_devices; 214 | int chunk_size_high = chunk_size_low + 1; 215 | // To calculate the number of ranks that need to compute an extra row, 216 | // the following formula is derived from this equation: 217 | // num_ranks_low * chunk_size_low + (size - num_ranks_low) * (chunk_size_low + 1) = ny - 2 218 | int num_ranks_low = num_devices * chunk_size_low + num_devices - 219 | (ny - 2); // Number of ranks with chunk_size = chunk_size_low 220 | if (dev_id < num_ranks_low) 221 | chunk_size = chunk_size_low; 222 | else 223 | chunk_size = chunk_size_high; 224 | 225 | const int top = dev_id > 0 ? dev_id - 1 : (num_devices - 1); 226 | const int bottom = (dev_id + 1) % num_devices; 227 | if (top != dev_id) { 228 | int canAccessPeer = 0; 229 | CUDA_RT_CALL(cudaDeviceCanAccessPeer(&canAccessPeer, dev_id, top)); 230 | if (canAccessPeer) { 231 | CUDA_RT_CALL(cudaDeviceEnablePeerAccess(top, 0)); 232 | } else { 233 | std::cerr << "P2P access required from " << dev_id << " to " << top << std::endl; 234 | #pragma omp critical 235 | { 236 | if (p2p_works) p2p_works = false; 237 | } 238 | } 239 | if (top != bottom) { 240 | canAccessPeer = 0; 241 | CUDA_RT_CALL(cudaDeviceCanAccessPeer(&canAccessPeer, dev_id, bottom)); 242 | if (canAccessPeer) { 243 | CUDA_RT_CALL(cudaDeviceEnablePeerAccess(bottom, 0)); 244 | } else { 245 | std::cerr << "P2P access required from " << dev_id << " to " << bottom 246 | << std::endl; 247 | #pragma omp critical 248 | { 249 | if (p2p_works) p2p_works = false; 250 | } 251 | } 252 | } 253 | } 254 | 255 | #pragma omp barrier 256 | 257 | if (p2p_works) { 258 | CUDA_RT_CALL(cudaMalloc(&a, nx * (chunk_size + 2) * sizeof(real))); 259 | CUDA_RT_CALL(cudaMalloc(a_new + dev_id, nx * (chunk_size + 2) * sizeof(real))); 260 | 261 | CUDA_RT_CALL(cudaMemset(a, 0, nx * (chunk_size + 2) * sizeof(real))); 262 | CUDA_RT_CALL(cudaMemset(a_new[dev_id], 0, nx * (chunk_size + 2) * sizeof(real))); 263 | 264 | // Calculate local domain boundaries 265 | int iy_start_global; // My start index in the global array 266 | if (dev_id < num_ranks_low) { 267 | iy_start_global = dev_id * chunk_size_low + 1; 268 | } else { 269 | iy_start_global = 270 | num_ranks_low * chunk_size_low + (dev_id - num_ranks_low) * chunk_size_high + 1; 271 | } 272 | int iy_end_global = 273 | iy_start_global + chunk_size - 1; // My last index in the global array 274 | 275 | int iy_start = 1; 276 | iy_end[dev_id] = (iy_end_global - iy_start_global + 1) + iy_start; 277 | 278 | // Set diriclet boundary conditions on left and right boarder 279 | initialize_boundaries<<<(ny / num_devices) / 128 + 1, 128>>>( 280 | a, a_new[dev_id], PI, iy_start_global - 1, nx, (chunk_size + 2), ny); 281 | CUDA_RT_CALL(cudaGetLastError()); 282 | CUDA_RT_CALL(cudaDeviceSynchronize()); 283 | 284 | CUDA_RT_CALL(cudaStreamCreate(&compute_stream)); 285 | CUDA_RT_CALL(cudaStreamCreate(&push_top_stream)); 286 | CUDA_RT_CALL(cudaStreamCreate(&push_bottom_stream)); 287 | CUDA_RT_CALL( 288 | cudaEventCreateWithFlags(compute_done[0] + dev_id, cudaEventDisableTiming)); 289 | CUDA_RT_CALL( 290 | cudaEventCreateWithFlags(compute_done[1] + dev_id, cudaEventDisableTiming)); 291 | CUDA_RT_CALL(cudaEventCreateWithFlags(&push_top_done, cudaEventDisableTiming)); 292 | CUDA_RT_CALL(cudaEventCreateWithFlags(&push_bottom_done, cudaEventDisableTiming)); 293 | 294 | CUDA_RT_CALL(cudaMalloc(&l2_norm_d, sizeof(real))); 295 | CUDA_RT_CALL(cudaMallocHost(&l2_norm_h, 
sizeof(real))); 296 | 297 | CUDA_RT_CALL(cudaDeviceSynchronize()); 298 | 299 | #pragma omp master 300 | { 301 | if (!csv) 302 | printf( 303 | "Jacobi relaxation: %d iterations on %d x %d mesh with " 304 | "norm " 305 | "check every %d iterations\n", 306 | iter_max, ny, nx, nccheck); 307 | } 308 | 309 | constexpr int dim_block_x = 32; 310 | constexpr int dim_block_y = 32; 311 | dim3 dim_grid((nx + dim_block_x - 1) / dim_block_x, 312 | (ny + (num_devices * dim_block_y) - 1) / (num_devices * dim_block_y), 1); 313 | 314 | int iter = 0; 315 | bool calculate_norm = true; 316 | #pragma omp master 317 | { l2_norm = 1.0; } 318 | 319 | CUDA_RT_CALL(cudaDeviceSynchronize()); 320 | #pragma omp barrier 321 | double start = omp_get_wtime(); 322 | PUSH_RANGE("Jacobi solve", 0) 323 | while (l2_norm > tol && iter < iter_max) { 324 | CUDA_RT_CALL(cudaMemsetAsync(l2_norm_d, 0, sizeof(real), compute_stream)); 325 | 326 | // need to wait for other threads due to sharing of a_new and compute_done 327 | // between threads 328 | #pragma omp barrier 329 | CUDA_RT_CALL(cudaStreamWaitEvent(compute_stream, compute_done[iter % 2][top], 0)); 330 | CUDA_RT_CALL( 331 | cudaStreamWaitEvent(compute_stream, compute_done[iter % 2][bottom], 0)); 332 | 333 | calculate_norm = (iter % nccheck) == 0 || (!csv && (iter % 100) == 0); 334 | jacobi_kernel 335 | <<>>( 336 | a_new[dev_id], a, l2_norm_d, iy_start, iy_end[dev_id], nx, a_new[top], 337 | iy_end[top], a_new[bottom], 0, calculate_norm); 338 | CUDA_RT_CALL(cudaGetLastError()); 339 | CUDA_RT_CALL(cudaEventRecord(compute_done[(iter + 1) % 2][dev_id], compute_stream)); 340 | 341 | if (calculate_norm) { 342 | CUDA_RT_CALL(cudaMemcpyAsync(l2_norm_h, l2_norm_d, sizeof(real), 343 | cudaMemcpyDeviceToHost, compute_stream)); 344 | #pragma omp barrier 345 | #pragma omp single 346 | { l2_norm = 0.0; } 347 | #pragma omp barrier 348 | CUDA_RT_CALL(cudaStreamSynchronize(compute_stream)); 349 | #pragma omp atomic 350 | l2_norm += *(l2_norm_h); 351 | #pragma omp barrier 352 | #pragma omp single 353 | { l2_norm = std::sqrt(l2_norm); } 354 | #pragma omp barrier 355 | if (!csv && (iter % 100) == 0) { 356 | #pragma omp master 357 | printf("%5d, %0.6f\n", iter, l2_norm); 358 | } 359 | } 360 | 361 | #pragma omp barrier 362 | std::swap(a_new[dev_id], a); 363 | iter++; 364 | } 365 | CUDA_RT_CALL(cudaDeviceSynchronize()); 366 | #pragma omp barrier 367 | double stop = omp_get_wtime(); 368 | POP_RANGE 369 | 370 | CUDA_RT_CALL( 371 | cudaMemcpy(a_h + iy_start_global * nx, a + nx, 372 | std::min((ny - iy_start_global) * nx, chunk_size * nx) * sizeof(real), 373 | cudaMemcpyDeviceToHost)); 374 | #pragma omp barrier 375 | 376 | #pragma omp master 377 | { 378 | result_correct = true; 379 | for (int iy = 1; result_correct && (iy < (ny - 1)); ++iy) { 380 | for (int ix = 1; result_correct && (ix < (nx - 1)); ++ix) { 381 | if (std::fabs(a_ref_h[iy * nx + ix] - a_h[iy * nx + ix]) > tol) { 382 | fprintf(stderr, 383 | "ERROR: a[%d * %d + %d] = %f does not " 384 | "match %f (reference)\n", 385 | iy, nx, ix, a_h[iy * nx + ix], a_ref_h[iy * nx + ix]); 386 | result_correct = false; 387 | } 388 | } 389 | } 390 | if (result_correct) { 391 | if (csv) { 392 | printf( 393 | "multi_threaded_p2p, %d, %d, %d, %d, %d, 1, %f, " 394 | "%f\n", 395 | nx, ny, iter_max, nccheck, num_devices, (stop - start), runtime_serial); 396 | } else { 397 | printf("Num GPUs: %d.\n", num_devices); 398 | printf( 399 | "%dx%d: 1 GPU: %8.4f s, %d GPUs: %8.4f s, speedup: " 400 | "%8.2f, " 401 | "efficiency: %8.2f \n", 402 | ny, nx, runtime_serial, 
num_devices, (stop - start), 403 | runtime_serial / (stop - start), 404 | runtime_serial / (num_devices * (stop - start)) * 100); 405 | } 406 | } 407 | } 408 | 409 | CUDA_RT_CALL(cudaEventDestroy(push_bottom_done)); 410 | CUDA_RT_CALL(cudaEventDestroy(push_top_done)); 411 | CUDA_RT_CALL(cudaEventDestroy(compute_done[1][dev_id])); 412 | CUDA_RT_CALL(cudaEventDestroy(compute_done[0][dev_id])); 413 | CUDA_RT_CALL(cudaStreamDestroy(push_bottom_stream)); 414 | CUDA_RT_CALL(cudaStreamDestroy(push_top_stream)); 415 | CUDA_RT_CALL(cudaStreamDestroy(compute_stream)); 416 | 417 | CUDA_RT_CALL(cudaFreeHost(l2_norm_h)); 418 | CUDA_RT_CALL(cudaFree(l2_norm_d)); 419 | 420 | CUDA_RT_CALL(cudaFree(a_new[dev_id])); 421 | CUDA_RT_CALL(cudaFree(a)); 422 | if (0 == dev_id) { 423 | CUDA_RT_CALL(cudaFreeHost(a_h)); 424 | CUDA_RT_CALL(cudaFreeHost(a_ref_h)); 425 | } 426 | } 427 | } 428 | 429 | return result_correct ? 0 : 1; 430 | } 431 | 432 | double single_gpu(const int nx, const int ny, const int iter_max, real* const a_ref_h, 433 | const int nccheck, const bool print) { 434 | real* a; 435 | real* a_new; 436 | 437 | cudaStream_t compute_stream; 438 | cudaStream_t push_top_stream; 439 | cudaStream_t push_bottom_stream; 440 | cudaEvent_t compute_done; 441 | cudaEvent_t push_top_done; 442 | cudaEvent_t push_bottom_done; 443 | 444 | real* l2_norm_d; 445 | real* l2_norm_h; 446 | 447 | int iy_start = 1; 448 | int iy_end = (ny - 1); 449 | 450 | CUDA_RT_CALL(cudaMalloc(&a, nx * ny * sizeof(real))); 451 | CUDA_RT_CALL(cudaMalloc(&a_new, nx * ny * sizeof(real))); 452 | 453 | CUDA_RT_CALL(cudaMemset(a, 0, nx * ny * sizeof(real))); 454 | CUDA_RT_CALL(cudaMemset(a_new, 0, nx * ny * sizeof(real))); 455 | 456 | // Set diriclet boundary conditions on left and right boarder 457 | initialize_boundaries<<>>(a, a_new, PI, 0, nx, ny, ny); 458 | CUDA_RT_CALL(cudaGetLastError()); 459 | CUDA_RT_CALL(cudaDeviceSynchronize()); 460 | 461 | CUDA_RT_CALL(cudaStreamCreate(&compute_stream)); 462 | CUDA_RT_CALL(cudaStreamCreate(&push_top_stream)); 463 | CUDA_RT_CALL(cudaStreamCreate(&push_bottom_stream)); 464 | CUDA_RT_CALL(cudaEventCreateWithFlags(&compute_done, cudaEventDisableTiming)); 465 | CUDA_RT_CALL(cudaEventCreateWithFlags(&push_top_done, cudaEventDisableTiming)); 466 | CUDA_RT_CALL(cudaEventCreateWithFlags(&push_bottom_done, cudaEventDisableTiming)); 467 | 468 | CUDA_RT_CALL(cudaMalloc(&l2_norm_d, sizeof(real))); 469 | CUDA_RT_CALL(cudaMallocHost(&l2_norm_h, sizeof(real))); 470 | 471 | CUDA_RT_CALL(cudaDeviceSynchronize()); 472 | 473 | if (print) 474 | printf( 475 | "Single GPU jacobi relaxation: %d iterations on %d x %d mesh with " 476 | "norm " 477 | "check every %d iterations\n", 478 | iter_max, ny, nx, nccheck); 479 | 480 | constexpr int dim_block_x = 32; 481 | constexpr int dim_block_y = 32; 482 | dim3 dim_grid((nx + dim_block_x - 1) / dim_block_x, (ny + dim_block_y - 1) / dim_block_y, 1); 483 | 484 | int iter = 0; 485 | bool calculate_norm = true; 486 | real l2_norm = 1.0; 487 | 488 | CUDA_RT_CALL(cudaDeviceSynchronize()); 489 | double start = omp_get_wtime(); 490 | PUSH_RANGE("Jacobi solve", 0) 491 | while (l2_norm > tol && iter < iter_max) { 492 | CUDA_RT_CALL(cudaMemsetAsync(l2_norm_d, 0, sizeof(real), compute_stream)); 493 | 494 | CUDA_RT_CALL(cudaStreamWaitEvent(compute_stream, push_top_done, 0)); 495 | CUDA_RT_CALL(cudaStreamWaitEvent(compute_stream, push_bottom_done, 0)); 496 | 497 | calculate_norm = (iter % nccheck) == 0 || (print && ((iter % 100) == 0)); 498 | jacobi_kernel 499 | <<>>( 500 | a_new, a, l2_norm_d, 
iy_start, iy_end, nx, a_new, iy_start, a_new, (iy_end - 1), 501 | calculate_norm); 502 | CUDA_RT_CALL(cudaGetLastError()); 503 | CUDA_RT_CALL(cudaEventRecord(compute_done, compute_stream)); 504 | 505 | if (calculate_norm) { 506 | CUDA_RT_CALL(cudaMemcpyAsync(l2_norm_h, l2_norm_d, sizeof(real), cudaMemcpyDeviceToHost, 507 | compute_stream)); 508 | } 509 | 510 | // Apply periodic boundary conditions 511 | 512 | CUDA_RT_CALL(cudaStreamWaitEvent(push_top_stream, compute_done, 0)); 513 | CUDA_RT_CALL(cudaMemcpyAsync(a_new, a_new + (iy_end - 1) * nx, nx * sizeof(real), 514 | cudaMemcpyDeviceToDevice, push_top_stream)); 515 | CUDA_RT_CALL(cudaEventRecord(push_top_done, push_top_stream)); 516 | 517 | CUDA_RT_CALL(cudaStreamWaitEvent(push_bottom_stream, compute_done, 0)); 518 | CUDA_RT_CALL(cudaMemcpyAsync(a_new + iy_end * nx, a_new + iy_start * nx, nx * sizeof(real), 519 | cudaMemcpyDeviceToDevice, compute_stream)); 520 | CUDA_RT_CALL(cudaEventRecord(push_bottom_done, push_bottom_stream)); 521 | 522 | if (calculate_norm) { 523 | CUDA_RT_CALL(cudaStreamSynchronize(compute_stream)); 524 | l2_norm = *l2_norm_h; 525 | l2_norm = std::sqrt(l2_norm); 526 | if (print && (iter % 100) == 0) printf("%5d, %0.6f\n", iter, l2_norm); 527 | } 528 | 529 | std::swap(a_new, a); 530 | iter++; 531 | } 532 | CUDA_RT_CALL(cudaDeviceSynchronize()); 533 | POP_RANGE 534 | double stop = omp_get_wtime(); 535 | 536 | CUDA_RT_CALL(cudaMemcpy(a_ref_h, a, nx * ny * sizeof(real), cudaMemcpyDeviceToHost)); 537 | 538 | CUDA_RT_CALL(cudaEventDestroy(push_bottom_done)); 539 | CUDA_RT_CALL(cudaEventDestroy(push_top_done)); 540 | CUDA_RT_CALL(cudaEventDestroy(compute_done)); 541 | CUDA_RT_CALL(cudaStreamDestroy(push_bottom_stream)); 542 | CUDA_RT_CALL(cudaStreamDestroy(push_top_stream)); 543 | CUDA_RT_CALL(cudaStreamDestroy(compute_stream)); 544 | 545 | CUDA_RT_CALL(cudaFreeHost(l2_norm_h)); 546 | CUDA_RT_CALL(cudaFree(l2_norm_d)); 547 | 548 | CUDA_RT_CALL(cudaFree(a_new)); 549 | CUDA_RT_CALL(cudaFree(a)); 550 | return (stop - start); 551 | } 552 | -------------------------------------------------------------------------------- /multi_threaded_p2p_opt/Makefile: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved. 
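# Typical usage (assuming nvcc and an OpenMP-capable host compiler are on PATH):
#   make            # builds ./jacobi
#   make run        # runs it; like the other multi_threaded_* samples it uses one
#                   # OpenMP thread per visible GPU
#   make profile    # short 10-iteration run under Nsight Systems (nsys)
#   make sanitize   # short run under compute-sanitizer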
2 | NVCC=nvcc 3 | GENCODE_SM30 := -gencode arch=compute_30,code=sm_30 4 | GENCODE_SM35 := -gencode arch=compute_35,code=sm_35 5 | GENCODE_SM37 := -gencode arch=compute_37,code=sm_37 6 | GENCODE_SM50 := -gencode arch=compute_50,code=sm_50 7 | GENCODE_SM52 := -gencode arch=compute_52,code=sm_52 8 | GENCODE_SM60 := -gencode arch=compute_60,code=sm_60 9 | GENCODE_SM70 := -gencode arch=compute_70,code=sm_70 10 | GENCODE_SM80 := -gencode arch=compute_80,code=sm_80 11 | GENCODE_SM90 := -gencode arch=compute_90,code=sm_90 -gencode arch=compute_90,code=compute_90 12 | GENCODE_FLAGS := $(GENCODE_SM70) $(GENCODE_SM80) $(GENCODE_SM90) 13 | ifdef DISABLE_CUB 14 | NVCC_FLAGS = -Xptxas --optimize-float-atomics 15 | else 16 | NVCC_FLAGS = -DHAVE_CUB 17 | endif 18 | NVCC_FLAGS += -Xcompiler -fopenmp -lineinfo -DUSE_NVTX -ldl $(GENCODE_FLAGS) -std=c++14 19 | jacobi: Makefile jacobi.cu 20 | $(NVCC) $(NVCC_FLAGS) jacobi.cu -o jacobi 21 | 22 | .PHONY.: clean 23 | clean: 24 | rm -f jacobi jacobi.nsys-rep 25 | 26 | sanitize: jacobi 27 | compute-sanitizer ./jacobi -niter 10 28 | 29 | run: jacobi 30 | ./jacobi 31 | 32 | profile: jacobi 33 | nsys profile --trace=cuda,nvtx -o jacobi ./jacobi -niter 10 34 | -------------------------------------------------------------------------------- /multi_threaded_um/Makefile: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved. 2 | NVCC=nvcc 3 | GENCODE_SM30 := -gencode arch=compute_30,code=sm_30 4 | GENCODE_SM35 := -gencode arch=compute_35,code=sm_35 5 | GENCODE_SM37 := -gencode arch=compute_37,code=sm_37 6 | GENCODE_SM50 := -gencode arch=compute_50,code=sm_50 7 | GENCODE_SM52 := -gencode arch=compute_52,code=sm_52 8 | GENCODE_SM60 := -gencode arch=compute_60,code=sm_60 9 | GENCODE_SM70 := -gencode arch=compute_70,code=sm_70 10 | GENCODE_SM80 := -gencode arch=compute_80,code=sm_80 11 | GENCODE_SM90 := -gencode arch=compute_90,code=sm_90 -gencode arch=compute_90,code=compute_90 12 | GENCODE_FLAGS := $(GENCODE_SM70) $(GENCODE_SM80) $(GENCODE_SM90) 13 | ifdef DISABLE_CUB 14 | NVCC_FLAGS = -Xptxas --optimize-float-atomics 15 | else 16 | NVCC_FLAGS = -DHAVE_CUB 17 | endif 18 | NVCC_FLAGS += -Xcompiler -fopenmp -lineinfo -DUSE_NVTX -ldl $(GENCODE_FLAGS) -std=c++14 19 | jacobi: Makefile jacobi.cu 20 | $(NVCC) $(NVCC_FLAGS) jacobi.cu -o jacobi 21 | 22 | .PHONY.: clean 23 | clean: 24 | rm -f jacobi jacobi.nsys-rep 25 | 26 | sanitize: jacobi 27 | compute-sanitizer ./jacobi -niter 10 28 | 29 | run: jacobi 30 | ./jacobi 31 | 32 | profile: jacobi 33 | nsys profile --trace=cuda,nvtx -o jacobi ./jacobi -niter 10 34 | -------------------------------------------------------------------------------- /multi_threaded_um/jacobi.cu: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved. 2 | * 3 | * Redistribution and use in source and binary forms, with or without 4 | * modification, are permitted provided that the following conditions 5 | * are met: 6 | * * Redistributions of source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer. 8 | * * Redistributions in binary form must reproduce the above copyright 9 | * notice, this list of conditions and the following disclaimer in the 10 | * documentation and/or other materials provided with the distribution. 
11 | * * Neither the name of NVIDIA CORPORATION nor the names of its
12 | * contributors may be used to endorse or promote products derived
13 | * from this software without specific prior written permission.
14 | *
15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
16 | * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
18 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
19 | * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20 | * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21 | * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22 | * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
23 | * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25 | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 | */
27 | #include <algorithm>  // std::find, std::min, std::max
28 | #include <cmath>      // std::asin, std::fabs, std::sqrt
29 | #include <cstdio>     // printf, fprintf
30 | #include <cstdlib>    // exit
31 | #include <iostream>
32 | #include <sstream>    // std::istringstream
33 | 
34 | #include <omp.h>      // omp_get_thread_num, omp_get_wtime
35 | 
36 | #ifdef HAVE_CUB
37 | #include <cub/block/block_reduce.cuh>
38 | #endif  // HAVE_CUB
39 | 
40 | #ifdef USE_NVTX
41 | #include <nvtx3/nvToolsExt.h>  // NVTX v3; older CUDA toolkits ship this as <nvToolsExt.h>
42 | 
43 | const uint32_t colors[] = {0x0000ff00, 0x000000ff, 0x00ffff00, 0x00ff00ff,
44 |                            0x0000ffff, 0x00ff0000, 0x00ffffff};
45 | const int num_colors = sizeof(colors) / sizeof(uint32_t);
46 | 
47 | #define PUSH_RANGE(name, cid) \
48 |     { \
49 |         int color_id = cid; \
50 |         color_id = color_id % num_colors; \
51 |         nvtxEventAttributes_t eventAttrib = {0}; \
52 |         eventAttrib.version = NVTX_VERSION; \
53 |         eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE; \
54 |         eventAttrib.colorType = NVTX_COLOR_ARGB; \
55 |         eventAttrib.color = colors[color_id]; \
56 |         eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII; \
57 |         eventAttrib.message.ascii = name; \
58 |         nvtxRangePushEx(&eventAttrib); \
59 |     }
60 | #define POP_RANGE nvtxRangePop();
61 | #else
62 | #define PUSH_RANGE(name, cid)
63 | #define POP_RANGE
64 | #endif
65 | 
66 | #define CUDA_RT_CALL(call) \
67 |     { \
68 |         cudaError_t cudaStatus = call; \
69 |         if (cudaSuccess != cudaStatus) { \
70 |             fprintf(stderr, \
71 |                     "ERROR: CUDA RT call \"%s\" in line %d of file %s failed " \
72 |                     "with " \
73 |                     "%s (%d).\n", \
74 |                     #call, __LINE__, __FILE__, cudaGetErrorString(cudaStatus), cudaStatus); \
75 |             exit( cudaStatus ); \
76 |         } \
77 |     }
78 | 
79 | constexpr int MAX_NUM_DEVICES = 32;
80 | 
81 | typedef float real;
82 | constexpr real tol = 1.0e-8;
83 | 
84 | const real PI = 2.0 * std::asin(1.0);
85 | 
86 | __global__ void initialize_boundaries(real* __restrict__ const a_new, real* __restrict__ const a,
87 |                                       const real pi, const int nx, const int ny) {
88 |     for (int iy = blockIdx.x * blockDim.x + threadIdx.x; iy < ny; iy += blockDim.x * gridDim.x) {
89 |         const real y0 = sin(2.0 * pi * iy / (ny - 1));
90 |         a[iy * nx + 0] = y0;
91 |         a[iy * nx + (nx - 1)] = y0;
92 |         a_new[iy * nx + 0] = y0;
93 |         a_new[iy * nx + (nx - 1)] = y0;
94 |     }
95 | }
96 | 
97 | template <int BLOCK_DIM_X, int BLOCK_DIM_Y>
98 | __global__ void jacobi_kernel(real* __restrict__ const a_new, const real* __restrict__ const a,
99 |                               real* __restrict__ const l2_norm, const int iy_start,
100 |                               const int iy_end, const int nx, const int ny,
101 |                               const bool calculate_norm) {
102 | #ifdef HAVE_CUB
103 |     typedef cub::BlockReduce<real, BLOCK_DIM_X, cub::BLOCK_REDUCE_WARP_REDUCTIONS, BLOCK_DIM_Y>
104 |         BlockReduce;
105 |     __shared__ typename BlockReduce::TempStorage temp_storage;
106 | #endif  // HAVE_CUB
107 |     int iy = blockIdx.y * blockDim.y + threadIdx.y + iy_start;
108 |     int ix =
blockIdx.x * blockDim.x + threadIdx.x + 1; 109 | real local_l2_norm = 0.0; 110 | 111 | if (iy < iy_end && ix < (nx - 1)) { 112 | const real new_val = 0.25 * (a[iy * nx + ix + 1] + a[iy * nx + ix - 1] + 113 | a[(iy + 1) * nx + ix] + a[(iy - 1) * nx + ix]); 114 | a_new[iy * nx + ix] = new_val; 115 | if (1 == iy) { 116 | a_new[(ny - 1) * nx + ix] = new_val; 117 | } 118 | if ((ny - 2) == iy) { 119 | a_new[0 * nx + ix] = new_val; 120 | } 121 | 122 | if (calculate_norm) { 123 | real residue = new_val - a[iy * nx + ix]; 124 | local_l2_norm += residue * residue; 125 | } 126 | } 127 | if (calculate_norm) { 128 | #ifdef HAVE_CUB 129 | real block_l2_norm = BlockReduce(temp_storage).Sum(local_l2_norm); 130 | if (0 == threadIdx.y && 0 == threadIdx.x) atomicAdd(l2_norm, block_l2_norm); 131 | #else 132 | atomicAdd(l2_norm, local_l2_norm); 133 | #endif // HAVE_CUB 134 | } 135 | } 136 | 137 | double single_gpu(const int nx, const int ny, const int iter_max, real* const a_ref, 138 | const int nccheck, const bool print); 139 | 140 | template 141 | T get_argval(char** begin, char** end, const std::string& arg, const T default_val) { 142 | T argval = default_val; 143 | char** itr = std::find(begin, end, arg); 144 | if (itr != end && ++itr != end) { 145 | std::istringstream inbuf(*itr); 146 | inbuf >> argval; 147 | } 148 | return argval; 149 | } 150 | 151 | bool get_arg(char** begin, char** end, const std::string& arg) { 152 | char** itr = std::find(begin, end, arg); 153 | if (itr != end) { 154 | return true; 155 | } 156 | return false; 157 | } 158 | 159 | int main(int argc, char* argv[]) { 160 | const int iter_max = get_argval(argv, argv + argc, "-niter", 1000); 161 | const int nccheck = get_argval(argv, argv + argc, "-nccheck", 1); 162 | const int nx = get_argval(argv, argv + argc, "-nx", 16384); 163 | const int ny = get_argval(argv, argv + argc, "-ny", 16384); 164 | const bool csv = get_arg(argv, argv + argc, "-csv"); 165 | 166 | real* a; 167 | real* a_new; 168 | 169 | real* a_ref; 170 | double runtime_serial = 0.0; 171 | 172 | CUDA_RT_CALL(cudaSetDevice(0)); 173 | CUDA_RT_CALL(cudaFree(0)); 174 | 175 | CUDA_RT_CALL(cudaMallocManaged(&a_ref, nx * ny * sizeof(real))); 176 | runtime_serial = single_gpu(nx, ny, iter_max, a_ref, nccheck, !csv); 177 | 178 | CUDA_RT_CALL(cudaMallocManaged(&a, nx * ny * sizeof(real))); 179 | CUDA_RT_CALL(cudaMallocManaged(&a_new, nx * ny * sizeof(real))); 180 | 181 | CUDA_RT_CALL(cudaMemset(a, 0, nx * ny * sizeof(real))); 182 | CUDA_RT_CALL(cudaMemset(a_new, 0, nx * ny * sizeof(real))); 183 | 184 | // Set diriclet boundary conditions on left and right boarder 185 | initialize_boundaries<<>>(a, a_new, PI, nx, ny); 186 | CUDA_RT_CALL(cudaGetLastError()); 187 | CUDA_RT_CALL(cudaDeviceSynchronize()); 188 | 189 | if (!csv) 190 | printf( 191 | "Jacobi relaxation: %d iterations on %d x %d mesh with norm check " 192 | "every %d iterations\n", 193 | iter_max, ny, nx, nccheck); 194 | 195 | real l2_norm = 1.0; 196 | 197 | cudaEvent_t compute_done[2][MAX_NUM_DEVICES]; 198 | 199 | bool result_correct = true; 200 | int num_devices = 0; 201 | CUDA_RT_CALL(cudaGetDeviceCount(&num_devices)); 202 | #pragma omp parallel num_threads(num_devices) firstprivate(a, a_new) 203 | { 204 | int dev_id = omp_get_thread_num(); 205 | 206 | CUDA_RT_CALL(cudaSetDevice(dev_id)); 207 | CUDA_RT_CALL(cudaFree(0)); 208 | 209 | // ny - 2 rows are distributed amongst `size` ranks in such a way 210 | // that each rank gets either (ny - 2) / size or (ny - 2) / size + 1 rows. 
211 | // This optimizes load balancing when (ny - 2) % size != 0 212 | int chunk_size; 213 | int chunk_size_low = ny / num_devices; 214 | int chunk_size_high = chunk_size_low + 1; 215 | // To calculate the number of ranks that need to compute an extra row, 216 | // the following formula is derived from this equation: 217 | // num_ranks_low * chunk_size_low + (size - num_ranks_low) * (chunk_size_low + 1) = ny - 2 218 | int num_ranks_low = num_devices * chunk_size_low + num_devices - 219 | ny; // Number of ranks with chunk_size = chunk_size_low 220 | if (dev_id < num_ranks_low) 221 | chunk_size = chunk_size_low; 222 | else 223 | chunk_size = chunk_size_high; 224 | 225 | // Calculate local domain boundaries 226 | int iy_start; 227 | if (dev_id < num_ranks_low) { 228 | iy_start = dev_id * chunk_size_low; 229 | } else { 230 | iy_start = num_ranks_low * chunk_size_low + (dev_id - num_ranks_low) * chunk_size_high; 231 | } 232 | int iy_end = iy_start + chunk_size; 233 | // Do not process boundaries 234 | iy_start = std::max(iy_start, 1); 235 | iy_end = std::min(iy_end, ny - 1); 236 | 237 | const int top = dev_id > 0 ? dev_id - 1 : (num_devices - 1); 238 | int canAccessPeer = 0; 239 | CUDA_RT_CALL(cudaDeviceCanAccessPeer(&canAccessPeer, dev_id, top)); 240 | if (canAccessPeer) { 241 | CUDA_RT_CALL(cudaDeviceEnablePeerAccess(top, 0)); 242 | } 243 | const int bottom = (dev_id + 1) % num_devices; 244 | canAccessPeer = 0; 245 | CUDA_RT_CALL(cudaDeviceCanAccessPeer(&canAccessPeer, dev_id, bottom)); 246 | if (top != bottom && canAccessPeer) { 247 | CUDA_RT_CALL(cudaDeviceEnablePeerAccess(bottom, 0)); 248 | } 249 | 250 | #ifdef UM_HINTS 251 | CUDA_RT_CALL(cudaMemAdvise(a + iy_start * nx, (iy_end - iy_start) * nx * sizeof(real), 252 | cudaMemAdviseSetPreferredLocation, dev_id)); 253 | CUDA_RT_CALL(cudaMemAdvise(a + (iy_start - 1) * nx, nx * sizeof(real), 254 | cudaMemAdviseSetAccessedBy, dev_id)); 255 | CUDA_RT_CALL( 256 | cudaMemAdvise(a + iy_end * nx, nx * sizeof(real), cudaMemAdviseSetAccessedBy, dev_id)); 257 | CUDA_RT_CALL(cudaMemAdvise(a_new + iy_start * nx, (iy_end - iy_start) * nx * sizeof(real), 258 | cudaMemAdviseSetPreferredLocation, dev_id)); 259 | CUDA_RT_CALL(cudaMemAdvise(a_new + (iy_start - 1) * nx, nx * sizeof(real), 260 | cudaMemAdviseSetAccessedBy, dev_id)); 261 | CUDA_RT_CALL(cudaMemAdvise(a_new + iy_end * nx, nx * sizeof(real), 262 | cudaMemAdviseSetAccessedBy, dev_id)); 263 | #endif // UM_HINTS 264 | 265 | constexpr int dim_block_x = 32; 266 | constexpr int dim_block_y = 32; 267 | dim3 dim_grid((nx + dim_block_x - 1) / dim_block_x, 268 | (ny + (num_devices * dim_block_y) - 1) / (num_devices * dim_block_y), 1); 269 | 270 | real* l2_norm_d; 271 | real* l2_norm_h; 272 | 273 | CUDA_RT_CALL(cudaMalloc(&l2_norm_d, sizeof(real))); 274 | CUDA_RT_CALL(cudaMallocHost(&l2_norm_h, sizeof(real))); 275 | 276 | CUDA_RT_CALL(cudaEventCreateWithFlags(compute_done[0] + dev_id, cudaEventDisableTiming)); 277 | CUDA_RT_CALL(cudaEventCreateWithFlags(compute_done[1] + dev_id, cudaEventDisableTiming)); 278 | 279 | CUDA_RT_CALL(cudaDeviceSynchronize()); 280 | 281 | int iter = 0; 282 | bool calculate_norm = true; 283 | #pragma omp master 284 | { l2_norm = 1.0; } 285 | 286 | #pragma omp barrier 287 | double start = omp_get_wtime(); 288 | PUSH_RANGE("Jacobi solve", 0) 289 | while (l2_norm > tol && iter < iter_max) { 290 | CUDA_RT_CALL(cudaMemsetAsync(l2_norm_d, 0, sizeof(real), 0)); 291 | 292 | // need to wait for other threads due to sharing of a, a_new and compute_done 293 | // between threads 294 | #pragma omp 
barrier 295 | CUDA_RT_CALL(cudaStreamWaitEvent(0, compute_done[iter % 2][top], 0)); 296 | CUDA_RT_CALL(cudaStreamWaitEvent(0, compute_done[iter % 2][bottom], 0)); 297 | 298 | calculate_norm = (iter % nccheck) == 0 || (!csv && (iter % 100) == 0); 299 | jacobi_kernel<<>>( 300 | a_new, a, l2_norm_d, iy_start, iy_end, nx, ny, calculate_norm); 301 | CUDA_RT_CALL(cudaGetLastError()); 302 | CUDA_RT_CALL(cudaEventRecord(compute_done[(iter + 1) % 2][dev_id], 0)); 303 | #pragma omp barrier 304 | 305 | if (calculate_norm) { 306 | CUDA_RT_CALL( 307 | cudaMemcpyAsync(l2_norm_h, l2_norm_d, sizeof(real), cudaMemcpyDeviceToHost, 0)); 308 | #pragma omp barrier 309 | #pragma omp single 310 | { l2_norm = 0.0; } 311 | #pragma omp barrier 312 | CUDA_RT_CALL(cudaStreamSynchronize(0)); 313 | #pragma omp atomic 314 | l2_norm += *(l2_norm_h); 315 | #pragma omp barrier 316 | #pragma omp single 317 | { l2_norm = std::sqrt(l2_norm); } 318 | #pragma omp barrier 319 | if (!csv && (iter % 100) == 0) { 320 | #pragma omp master 321 | printf("%5d, %0.6f\n", iter, l2_norm); 322 | } 323 | } 324 | 325 | std::swap(a_new, a); 326 | iter++; 327 | } 328 | CUDA_RT_CALL(cudaDeviceSynchronize()); 329 | #pragma omp barrier 330 | POP_RANGE 331 | double stop = omp_get_wtime(); 332 | 333 | #pragma omp barrier 334 | 335 | #pragma omp master 336 | { 337 | result_correct = true; 338 | for (int iy = 1; result_correct && (iy < (ny - 1)); ++iy) { 339 | for (int ix = 1; result_correct && (ix < (nx - 1)); ++ix) { 340 | if (std::fabs(a_ref[iy * nx + ix] - a[iy * nx + ix]) > tol) { 341 | fprintf(stderr, 342 | "ERROR: a[%d * %d + %d] = %f does not match %f " 343 | "(reference)\n", 344 | iy, nx, ix, a[iy * nx + ix], a_ref[iy * nx + ix]); 345 | result_correct = false; 346 | } 347 | } 348 | } 349 | if (result_correct) { 350 | if (csv) { 351 | printf("multi_threaded_um, %d, %d, %d, %d, %d, 1, %f, %f\n", nx, ny, iter_max, 352 | nccheck, num_devices, (stop - start), runtime_serial); 353 | } else { 354 | printf("Num GPUs: %d.\n", num_devices); 355 | printf( 356 | "%dx%d: 1 GPU: %8.4f s, %d GPUs: %8.4f s, speedup: " 357 | "%8.2f, " 358 | "efficiency: %8.2f \n", 359 | ny, nx, runtime_serial, num_devices, (stop - start), 360 | runtime_serial / (stop - start), 361 | runtime_serial / (num_devices * (stop - start)) * 100); 362 | } 363 | } 364 | } 365 | 366 | CUDA_RT_CALL(cudaEventDestroy(compute_done[1][dev_id])); 367 | CUDA_RT_CALL(cudaEventDestroy(compute_done[0][dev_id])); 368 | 369 | CUDA_RT_CALL(cudaFreeHost(l2_norm_h)); 370 | CUDA_RT_CALL(cudaFree(l2_norm_d)); 371 | CUDA_RT_CALL(cudaDeviceSynchronize()); 372 | } 373 | 374 | CUDA_RT_CALL(cudaFree(a_new)); 375 | CUDA_RT_CALL(cudaFree(a)); 376 | 377 | CUDA_RT_CALL(cudaFree(a_ref)); 378 | 379 | return result_correct ? 
0 : 1; 380 | } 381 | 382 | template 383 | __global__ void jacobi_kernel(real* __restrict__ const a_new, const real* __restrict__ const a, 384 | real* __restrict__ const l2_norm, const int iy_start, 385 | const int iy_end, const int nx, const bool calculate_norm) { 386 | #ifdef HAVE_CUB 387 | typedef cub::BlockReduce 388 | BlockReduce; 389 | __shared__ typename BlockReduce::TempStorage temp_storage; 390 | #endif // HAVE_CUB 391 | int iy = blockIdx.y * blockDim.y + threadIdx.y + iy_start; 392 | int ix = blockIdx.x * blockDim.x + threadIdx.x + 1; 393 | real local_l2_norm = 0.0; 394 | 395 | if (iy < iy_end && ix < (nx - 1)) { 396 | const real new_val = 0.25 * (a[iy * nx + ix + 1] + a[iy * nx + ix - 1] + 397 | a[(iy + 1) * nx + ix] + a[(iy - 1) * nx + ix]); 398 | a_new[iy * nx + ix] = new_val; 399 | 400 | if (calculate_norm) { 401 | real residue = new_val - a[iy * nx + ix]; 402 | local_l2_norm += residue * residue; 403 | } 404 | } 405 | if (calculate_norm) { 406 | #ifdef HAVE_CUB 407 | real block_l2_norm = BlockReduce(temp_storage).Sum(local_l2_norm); 408 | if (0 == threadIdx.y && 0 == threadIdx.x) atomicAdd(l2_norm, block_l2_norm); 409 | #else 410 | atomicAdd(l2_norm, local_l2_norm); 411 | #endif // HAVE_CUB 412 | } 413 | } 414 | 415 | double single_gpu(const int nx, const int ny, const int iter_max, real* const a_ref, 416 | const int nccheck, const bool print) { 417 | real* a; 418 | real* a_new; 419 | 420 | cudaStream_t compute_stream; 421 | cudaStream_t push_top_stream; 422 | cudaStream_t push_bottom_stream; 423 | cudaEvent_t compute_done; 424 | cudaEvent_t push_top_done; 425 | cudaEvent_t push_bottom_done; 426 | 427 | real* l2_norm_d; 428 | real* l2_norm_h; 429 | 430 | int iy_start = 1; 431 | int iy_end = (ny - 1); 432 | 433 | CUDA_RT_CALL(cudaMalloc(&a, nx * ny * sizeof(real))); 434 | CUDA_RT_CALL(cudaMalloc(&a_new, nx * ny * sizeof(real))); 435 | 436 | CUDA_RT_CALL(cudaMemset(a, 0, nx * ny * sizeof(real))); 437 | CUDA_RT_CALL(cudaMemset(a_new, 0, nx * ny * sizeof(real))); 438 | 439 | // Set diriclet boundary conditions on left and right boarder 440 | initialize_boundaries<<>>(a, a_new, PI, nx, ny); 441 | CUDA_RT_CALL(cudaGetLastError()); 442 | CUDA_RT_CALL(cudaDeviceSynchronize()); 443 | 444 | CUDA_RT_CALL(cudaStreamCreate(&compute_stream)); 445 | CUDA_RT_CALL(cudaStreamCreate(&push_top_stream)); 446 | CUDA_RT_CALL(cudaStreamCreate(&push_bottom_stream)); 447 | CUDA_RT_CALL(cudaEventCreateWithFlags(&compute_done, cudaEventDisableTiming)); 448 | CUDA_RT_CALL(cudaEventCreateWithFlags(&push_top_done, cudaEventDisableTiming)); 449 | CUDA_RT_CALL(cudaEventCreateWithFlags(&push_bottom_done, cudaEventDisableTiming)); 450 | 451 | CUDA_RT_CALL(cudaMalloc(&l2_norm_d, sizeof(real))); 452 | CUDA_RT_CALL(cudaMallocHost(&l2_norm_h, sizeof(real))); 453 | 454 | CUDA_RT_CALL(cudaDeviceSynchronize()); 455 | 456 | if (print) 457 | printf( 458 | "Single GPU jacobi relaxation: %d iterations on %d x %d mesh with " 459 | "norm " 460 | "check every %d iterations\n", 461 | iter_max, ny, nx, nccheck); 462 | 463 | constexpr int dim_block_x = 32; 464 | constexpr int dim_block_y = 32; 465 | dim3 dim_grid((nx + dim_block_x - 1) / dim_block_x, (ny + dim_block_y - 1) / dim_block_y, 1); 466 | 467 | int iter = 0; 468 | bool calculate_norm = true; 469 | real l2_norm = 1.0; 470 | 471 | double start = omp_get_wtime(); 472 | PUSH_RANGE("Jacobi solve", 0) 473 | while (l2_norm > tol && iter < iter_max) { 474 | CUDA_RT_CALL(cudaMemsetAsync(l2_norm_d, 0, sizeof(real), compute_stream)); 475 | 476 | 
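        // Wait for the previous iteration's periodic-boundary copies (push_top_done /
        // push_bottom_done, recorded on the copy streams) before the stencil reads the
        // halo rows of what became `a` after the last std::swap(a_new, a).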
CUDA_RT_CALL(cudaStreamWaitEvent(compute_stream, push_top_done, 0)); 477 | CUDA_RT_CALL(cudaStreamWaitEvent(compute_stream, push_bottom_done, 0)); 478 | 479 | calculate_norm = (iter % nccheck) == 0 || (print && ((iter % 100) == 0)); 480 | 481 | jacobi_kernel 482 | <<>>( 483 | a_new, a, l2_norm_d, iy_start, iy_end, nx, calculate_norm); 484 | CUDA_RT_CALL(cudaGetLastError()); 485 | CUDA_RT_CALL(cudaEventRecord(compute_done, compute_stream)); 486 | 487 | if (calculate_norm) { 488 | CUDA_RT_CALL(cudaMemcpyAsync(l2_norm_h, l2_norm_d, sizeof(real), cudaMemcpyDeviceToHost, 489 | compute_stream)); 490 | } 491 | 492 | // Apply periodic boundary conditions 493 | 494 | CUDA_RT_CALL(cudaStreamWaitEvent(push_top_stream, compute_done, 0)); 495 | CUDA_RT_CALL(cudaMemcpyAsync(a_new, a_new + (iy_end - 1) * nx, nx * sizeof(real), 496 | cudaMemcpyDeviceToDevice, push_top_stream)); 497 | CUDA_RT_CALL(cudaEventRecord(push_top_done, push_top_stream)); 498 | 499 | CUDA_RT_CALL(cudaStreamWaitEvent(push_bottom_stream, compute_done, 0)); 500 | CUDA_RT_CALL(cudaMemcpyAsync(a_new + iy_end * nx, a_new + iy_start * nx, nx * sizeof(real), 501 | cudaMemcpyDeviceToDevice, compute_stream)); 502 | CUDA_RT_CALL(cudaEventRecord(push_bottom_done, push_bottom_stream)); 503 | 504 | if (calculate_norm) { 505 | CUDA_RT_CALL(cudaStreamSynchronize(compute_stream)); 506 | l2_norm = *l2_norm_h; 507 | l2_norm = std::sqrt(l2_norm); 508 | if (print && (iter % 100) == 0) printf("%5d, %0.6f\n", iter, l2_norm); 509 | } 510 | 511 | std::swap(a_new, a); 512 | iter++; 513 | } 514 | POP_RANGE 515 | double stop = omp_get_wtime(); 516 | 517 | CUDA_RT_CALL(cudaMemcpy(a_ref, a, nx * ny * sizeof(real), cudaMemcpyDeviceToHost)); 518 | 519 | CUDA_RT_CALL(cudaEventDestroy(push_bottom_done)); 520 | CUDA_RT_CALL(cudaEventDestroy(push_top_done)); 521 | CUDA_RT_CALL(cudaEventDestroy(compute_done)); 522 | CUDA_RT_CALL(cudaStreamDestroy(push_bottom_stream)); 523 | CUDA_RT_CALL(cudaStreamDestroy(push_top_stream)); 524 | CUDA_RT_CALL(cudaStreamDestroy(compute_stream)); 525 | 526 | CUDA_RT_CALL(cudaFreeHost(l2_norm_h)); 527 | CUDA_RT_CALL(cudaFree(l2_norm_d)); 528 | 529 | CUDA_RT_CALL(cudaFree(a_new)); 530 | CUDA_RT_CALL(cudaFree(a)); 531 | return (stop - start); 532 | } 533 | -------------------------------------------------------------------------------- /nccl/Makefile: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 
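# NP sets how many MPI ranks the run/sanitize/profile targets launch with $(MPIRUN)
# (typically one rank per GPU); override CUDA_HOME and NCCL_HOME if CUDA or NCCL live
# in non-default locations, e.g. `make run NP=4 NCCL_HOME=/opt/nccl` (example path only).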
2 | NP ?= 1 3 | NVCC=nvcc 4 | MPICXX=mpicxx 5 | MPIRUN ?= mpirun 6 | CUDA_HOME ?= /usr/local/cuda 7 | NCCL_HOME ?= /usr 8 | GENCODE_SM30 := -gencode arch=compute_30,code=sm_30 9 | GENCODE_SM35 := -gencode arch=compute_35,code=sm_35 10 | GENCODE_SM37 := -gencode arch=compute_37,code=sm_37 11 | GENCODE_SM50 := -gencode arch=compute_50,code=sm_50 12 | GENCODE_SM52 := -gencode arch=compute_52,code=sm_52 13 | GENCODE_SM60 := -gencode arch=compute_60,code=sm_60 14 | GENCODE_SM70 := -gencode arch=compute_70,code=sm_70 15 | GENCODE_SM80 := -gencode arch=compute_80,code=sm_80 16 | GENCODE_SM90 := -gencode arch=compute_90,code=sm_90 -gencode arch=compute_90,code=compute_90 17 | GENCODE_FLAGS := $(GENCODE_SM70) $(GENCODE_SM80) $(GENCODE_SM90) 18 | ifdef DISABLE_CUB 19 | NVCC_FLAGS = -Xptxas --optimize-float-atomics 20 | else 21 | NVCC_FLAGS = -DHAVE_CUB 22 | endif 23 | NVCC_FLAGS += -lineinfo $(GENCODE_FLAGS) -std=c++14 24 | MPICXX_FLAGS = -DUSE_NVTX -I$(CUDA_HOME)/include -I$(NCCL_HOME)/include -std=c++14 25 | LD_FLAGS = -L$(CUDA_HOME)/lib64 -L$(NCCL_HOME)/lib -lcudart -ldl -lnccl 26 | jacobi: Makefile jacobi.cpp jacobi_kernels.o 27 | $(MPICXX) $(MPICXX_FLAGS) jacobi.cpp jacobi_kernels.o $(LD_FLAGS) -o jacobi 28 | 29 | jacobi_kernels.o: Makefile jacobi_kernels.cu 30 | $(NVCC) $(NVCC_FLAGS) jacobi_kernels.cu -c 31 | 32 | .PHONY.: clean 33 | clean: 34 | rm -f jacobi jacobi_kernels.o *.nsys-rep jacobi.*.compute-sanitizer.log 35 | 36 | sanitize: jacobi 37 | $(MPIRUN) -np $(NP) compute-sanitizer --log-file jacobi.%q{OMPI_COMM_WORLD_RANK}.compute-sanitizer.log ./jacobi -niter 10 38 | 39 | run: jacobi 40 | $(MPIRUN) -np $(NP) ./jacobi 41 | 42 | profile: jacobi 43 | $(MPIRUN) -np $(NP) nsys profile --trace=mpi,cuda,nvtx -o jacobi.%q{OMPI_COMM_WORLD_RANK} ./jacobi -niter 10 44 | -------------------------------------------------------------------------------- /nccl/jacobi_kernels.cu: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 2 | * 3 | * Redistribution and use in source and binary forms, with or without 4 | * modification, are permitted provided that the following conditions 5 | * are met: 6 | * * Redistributions of source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer. 8 | * * Redistributions in binary form must reproduce the above copyright 9 | * notice, this list of conditions and the following disclaimer in the 10 | * documentation and/or other materials provided with the distribution. 11 | * * Neither the name of NVIDIA CORPORATION nor the names of its 12 | * contributors may be used to endorse or promote products derived 13 | * from this software without specific prior written permission. 14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR
19 | * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20 | * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21 | * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22 | * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
23 | * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25 | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 | */
27 | #include <cstdio>   // fprintf, stderr (used by CUDA_RT_CALL)
28 | #include <cstdlib>  // exit (used by CUDA_RT_CALL)
29 | 
30 | #ifdef HAVE_CUB
31 | #include <cub/block/block_reduce.cuh>
32 | #endif  // HAVE_CUB
33 | 
34 | #define CUDA_RT_CALL(call) \
35 |     { \
36 |         cudaError_t cudaStatus = call; \
37 |         if (cudaSuccess != cudaStatus) { \
38 |             fprintf(stderr, \
39 |                     "ERROR: CUDA RT call \"%s\" in line %d of file %s failed " \
40 |                     "with " \
41 |                     "%s (%d).\n", \
42 |                     #call, __LINE__, __FILE__, cudaGetErrorString(cudaStatus), cudaStatus); \
43 |             exit( cudaStatus ); \
44 |         } \
45 |     }
46 | 
47 | #ifdef USE_DOUBLE
48 | typedef double real;
49 | #define MPI_REAL_TYPE MPI_DOUBLE
50 | #else
51 | typedef float real;
52 | #define MPI_REAL_TYPE MPI_FLOAT
53 | #endif
54 | 
55 | __global__ void initialize_boundaries(real* __restrict__ const a_new, real* __restrict__ const a,
56 |                                       const real pi, const int offset, const int nx,
57 |                                       const int my_ny, const int ny) {
58 |     for (int iy = blockIdx.x * blockDim.x + threadIdx.x; iy < my_ny; iy += blockDim.x * gridDim.x) {
59 |         const real y0 = sin(2.0 * pi * (offset + iy) / (ny - 1));
60 |         a[iy * nx + 0] = y0;
61 |         a[iy * nx + (nx - 1)] = y0;
62 |         a_new[iy * nx + 0] = y0;
63 |         a_new[iy * nx + (nx - 1)] = y0;
64 |     }
65 | }
66 | 
67 | void launch_initialize_boundaries(real* __restrict__ const a_new, real* __restrict__ const a,
68 |                                   const real pi, const int offset, const int nx, const int my_ny,
69 |                                   const int ny) {
70 |     initialize_boundaries<<<my_ny / 128 + 1, 128>>>(a_new, a, pi, offset, nx, my_ny, ny);
71 |     CUDA_RT_CALL(cudaGetLastError());
72 | }
73 | 
74 | template <int BLOCK_DIM_X, int BLOCK_DIM_Y>
75 | __global__ void jacobi_kernel(real* __restrict__ const a_new, const real* __restrict__ const a,
76 |                               real* __restrict__ const l2_norm, const int iy_start,
77 |                               const int iy_end, const int nx, const bool calculate_norm) {
78 | #ifdef HAVE_CUB
79 |     typedef cub::BlockReduce<real, BLOCK_DIM_X, cub::BLOCK_REDUCE_WARP_REDUCTIONS, BLOCK_DIM_Y>
80 |         BlockReduce;
81 |     __shared__ typename BlockReduce::TempStorage temp_storage;
82 | #endif  // HAVE_CUB
83 |     int iy = blockIdx.y * blockDim.y + threadIdx.y + iy_start;
84 |     int ix = blockIdx.x * blockDim.x + threadIdx.x + 1;
85 |     real local_l2_norm = 0.0;
86 | 
87 |     if (iy < iy_end && ix < (nx - 1)) {
88 |         const real new_val = 0.25 * (a[iy * nx + ix + 1] + a[iy * nx + ix - 1] +
89 |                                      a[(iy + 1) * nx + ix] + a[(iy - 1) * nx + ix]);
90 |         a_new[iy * nx + ix] = new_val;
91 |         if (calculate_norm) {
92 |             real residue = new_val - a[iy * nx + ix];
93 |             local_l2_norm += residue * residue;
94 |         }
95 |     }
96 |     if (calculate_norm) {
97 | #ifdef HAVE_CUB
98 |         real block_l2_norm = BlockReduce(temp_storage).Sum(local_l2_norm);
99 |         if (0 == threadIdx.y && 0 == threadIdx.x) atomicAdd(l2_norm, block_l2_norm);
100 | #else
101 |         atomicAdd(l2_norm, local_l2_norm);
102 | #endif  // HAVE_CUB
103 |     }
104 | }
105 | 
106 | void launch_jacobi_kernel(real* __restrict__ const a_new, const real* __restrict__ const a,
107 |                           real* __restrict__ const l2_norm, const int iy_start, const int iy_end,
108 |                           const int nx, const bool calculate_norm, cudaStream_t stream) {
109 |     constexpr int dim_block_x = 32;
110 |     constexpr int dim_block_y = 32;
111 | 
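    // Grid sizing: one 32x32 block per 32x32 tile of the local domain; x covers the nx
    // columns and y covers the iy_end - iy_start interior rows this rank owns, rounded up.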
dim3 dim_grid((nx + dim_block_x - 1) / dim_block_x, 112 | ((iy_end - iy_start) + dim_block_y - 1) / dim_block_y, 1); 113 | jacobi_kernel<<>>( 114 | a_new, a, l2_norm, iy_start, iy_end, nx, calculate_norm); 115 | CUDA_RT_CALL(cudaGetLastError()); 116 | } 117 | -------------------------------------------------------------------------------- /nccl_graphs/Makefile: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021,2022, NVIDIA CORPORATION. All rights reserved. 2 | NP ?= 1 3 | NVCC=nvcc 4 | MPICXX=mpicxx 5 | MPIRUN ?= mpirun 6 | CUDA_HOME ?= /usr/local/cuda 7 | NCCL_HOME ?= /usr 8 | GENCODE_SM30 := -gencode arch=compute_30,code=sm_30 9 | GENCODE_SM35 := -gencode arch=compute_35,code=sm_35 10 | GENCODE_SM37 := -gencode arch=compute_37,code=sm_37 11 | GENCODE_SM50 := -gencode arch=compute_50,code=sm_50 12 | GENCODE_SM52 := -gencode arch=compute_52,code=sm_52 13 | GENCODE_SM60 := -gencode arch=compute_60,code=sm_60 14 | GENCODE_SM70 := -gencode arch=compute_70,code=sm_70 15 | GENCODE_SM80 := -gencode arch=compute_80,code=sm_80 16 | GENCODE_SM90 := -gencode arch=compute_90,code=sm_90 -gencode arch=compute_90,code=compute_90 17 | GENCODE_FLAGS := $(GENCODE_SM70) $(GENCODE_SM80) $(GENCODE_SM90) 18 | ifdef DISABLE_CUB 19 | NVCC_FLAGS = -Xptxas --optimize-float-atomics 20 | else 21 | NVCC_FLAGS = -DHAVE_CUB 22 | endif 23 | NVCC_FLAGS += -lineinfo $(GENCODE_FLAGS) -std=c++14 24 | MPICXX_FLAGS = -DUSE_NVTX -I$(CUDA_HOME)/include -I$(NCCL_HOME)/include -std=c++14 25 | LD_FLAGS = -L$(CUDA_HOME)/lib64 -L$(NCCL_HOME)/lib -lcudart -ldl -lnccl 26 | jacobi: Makefile jacobi.cpp jacobi_kernels.o 27 | $(MPICXX) $(MPICXX_FLAGS) jacobi.cpp jacobi_kernels.o $(LD_FLAGS) -o jacobi 28 | 29 | jacobi_kernels.o: Makefile jacobi_kernels.cu 30 | $(NVCC) $(NVCC_FLAGS) jacobi_kernels.cu -c 31 | 32 | .PHONY.: clean 33 | clean: 34 | rm -f jacobi jacobi_kernels.o *.nsys-rep jacobi.*.compute-sanitizer.log 35 | 36 | sanitize: jacobi 37 | $(MPIRUN) -np $(NP) compute-sanitizer --log-file jacobi.%q{OMPI_COMM_WORLD_RANK}.compute-sanitizer.log ./jacobi -niter 10 38 | 39 | run: jacobi 40 | $(MPIRUN) -np $(NP) ./jacobi 41 | 42 | profile: jacobi 43 | $(MPIRUN) -np $(NP) nsys profile --trace=mpi,cuda,nvtx --cuda-graph-trace=node -o jacobi.%q{OMPI_COMM_WORLD_RANK} ./jacobi -niter 10 44 | -------------------------------------------------------------------------------- /nccl_graphs/jacobi_kernels.cu: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2021,2022, NVIDIA CORPORATION. All rights reserved. 2 | * 3 | * Redistribution and use in source and binary forms, with or without 4 | * modification, are permitted provided that the following conditions 5 | * are met: 6 | * * Redistributions of source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer. 8 | * * Redistributions in binary form must reproduce the above copyright 9 | * notice, this list of conditions and the following disclaimer in the 10 | * documentation and/or other materials provided with the distribution. 11 | * * Neither the name of NVIDIA CORPORATION nor the names of its 12 | * contributors may be used to endorse or promote products derived 13 | * from this software without specific prior written permission. 
14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | */ 27 | #include 28 | 29 | #ifdef HAVE_CUB 30 | #include 31 | #endif // HAVE_CUB 32 | 33 | #define CUDA_RT_CALL(call) \ 34 | { \ 35 | cudaError_t cudaStatus = call; \ 36 | if (cudaSuccess != cudaStatus) { \ 37 | fprintf(stderr, \ 38 | "ERROR: CUDA RT call \"%s\" in line %d of file %s failed " \ 39 | "with " \ 40 | "%s (%d).\n", \ 41 | #call, __LINE__, __FILE__, cudaGetErrorString(cudaStatus), cudaStatus); \ 42 | exit( cudaStatus ); \ 43 | } \ 44 | } 45 | 46 | #ifdef USE_DOUBLE 47 | typedef double real; 48 | #define MPI_REAL_TYPE MPI_DOUBLE 49 | #else 50 | typedef float real; 51 | #define MPI_REAL_TYPE MPI_FLOAT 52 | #endif 53 | 54 | __global__ void initialize_boundaries(real* __restrict__ const a_new, real* __restrict__ const a, 55 | const real pi, const int offset, const int nx, 56 | const int my_ny, const int ny) { 57 | for (int iy = blockIdx.x * blockDim.x + threadIdx.x; iy < my_ny; iy += blockDim.x * gridDim.x) { 58 | const real y0 = sin(2.0 * pi * (offset + iy) / (ny - 1)); 59 | a[iy * nx + 0] = y0; 60 | a[iy * nx + (nx - 1)] = y0; 61 | a_new[iy * nx + 0] = y0; 62 | a_new[iy * nx + (nx - 1)] = y0; 63 | } 64 | } 65 | 66 | void launch_initialize_boundaries(real* __restrict__ const a_new, real* __restrict__ const a, 67 | const real pi, const int offset, const int nx, const int my_ny, 68 | const int ny) { 69 | initialize_boundaries<<>>(a_new, a, pi, offset, nx, my_ny, ny); 70 | CUDA_RT_CALL(cudaGetLastError()); 71 | } 72 | 73 | template 74 | __global__ void jacobi_kernel(real* __restrict__ const a_new, const real* __restrict__ const a, 75 | real* __restrict__ const l2_norm, const int iy_start, 76 | const int iy_end, const int nx, const bool calculate_norm) { 77 | #ifdef HAVE_CUB 78 | typedef cub::BlockReduce 79 | BlockReduce; 80 | __shared__ typename BlockReduce::TempStorage temp_storage; 81 | #endif // HAVE_CUB 82 | int iy = blockIdx.y * blockDim.y + threadIdx.y + iy_start; 83 | int ix = blockIdx.x * blockDim.x + threadIdx.x + 1; 84 | real local_l2_norm = 0.0; 85 | 86 | if (iy < iy_end && ix < (nx - 1)) { 87 | const real new_val = 0.25 * (a[iy * nx + ix + 1] + a[iy * nx + ix - 1] + 88 | a[(iy + 1) * nx + ix] + a[(iy - 1) * nx + ix]); 89 | a_new[iy * nx + ix] = new_val; 90 | if (calculate_norm) { 91 | real residue = new_val - a[iy * nx + ix]; 92 | local_l2_norm += residue * residue; 93 | } 94 | } 95 | if (calculate_norm) { 96 | #ifdef HAVE_CUB 97 | real block_l2_norm = BlockReduce(temp_storage).Sum(local_l2_norm); 98 | if (0 == threadIdx.y && 0 == threadIdx.x) atomicAdd(l2_norm, block_l2_norm); 99 | #else 100 | atomicAdd(l2_norm, local_l2_norm); 101 | #endif // HAVE_CUB 102 | } 103 | } 104 | 105 | void launch_jacobi_kernel(real* __restrict__ const a_new, const real* 
__restrict__ const a, 106 | real* __restrict__ const l2_norm, const int iy_start, const int iy_end, 107 | const int nx, const bool calculate_norm, cudaStream_t stream) { 108 | constexpr int dim_block_x = 32; 109 | constexpr int dim_block_y = 32; 110 | dim3 dim_grid((nx + dim_block_x - 1) / dim_block_x, 111 | ((iy_end - iy_start) + dim_block_y - 1) / dim_block_y, 1); 112 | jacobi_kernel<<>>( 113 | a_new, a, l2_norm, iy_start, iy_end, nx, calculate_norm); 114 | CUDA_RT_CALL(cudaGetLastError()); 115 | } 116 | -------------------------------------------------------------------------------- /nccl_overlap/Makefile: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 2 | NP ?= 1 3 | NVCC=nvcc 4 | MPICXX=mpicxx 5 | MPIRUN ?= mpirun 6 | CUDA_HOME ?= /usr/local/cuda 7 | NCCL_HOME ?= /usr 8 | GENCODE_SM30 := -gencode arch=compute_30,code=sm_30 9 | GENCODE_SM35 := -gencode arch=compute_35,code=sm_35 10 | GENCODE_SM37 := -gencode arch=compute_37,code=sm_37 11 | GENCODE_SM50 := -gencode arch=compute_50,code=sm_50 12 | GENCODE_SM52 := -gencode arch=compute_52,code=sm_52 13 | GENCODE_SM60 := -gencode arch=compute_60,code=sm_60 14 | GENCODE_SM70 := -gencode arch=compute_70,code=sm_70 15 | GENCODE_SM80 := -gencode arch=compute_80,code=sm_80 16 | GENCODE_SM90 := -gencode arch=compute_90,code=sm_90 -gencode arch=compute_90,code=compute_90 17 | GENCODE_FLAGS := $(GENCODE_SM70) $(GENCODE_SM80) $(GENCODE_SM90) 18 | ifdef DISABLE_CUB 19 | NVCC_FLAGS = -Xptxas --optimize-float-atomics 20 | else 21 | NVCC_FLAGS = -DHAVE_CUB 22 | endif 23 | NVCC_FLAGS += -lineinfo $(GENCODE_FLAGS) -std=c++14 24 | MPICXX_FLAGS = -DUSE_NVTX -I$(CUDA_HOME)/include -I$(NCCL_HOME)/include -std=c++14 25 | LD_FLAGS = -L$(CUDA_HOME)/lib64 -L$(NCCL_HOME)/lib -lcudart -ldl -lnccl 26 | jacobi: Makefile jacobi.cpp jacobi_kernels.o 27 | $(MPICXX) $(MPICXX_FLAGS) jacobi.cpp jacobi_kernels.o $(LD_FLAGS) -o jacobi 28 | 29 | jacobi_kernels.o: Makefile jacobi_kernels.cu 30 | $(NVCC) $(NVCC_FLAGS) jacobi_kernels.cu -c 31 | 32 | .PHONY.: clean 33 | clean: 34 | rm -f jacobi jacobi_kernels.o *.nsys-rep jacobi.*.compute-sanitizer.log 35 | 36 | sanitize: jacobi 37 | $(MPIRUN) -np $(NP) compute-sanitizer --log-file jacobi.%q{OMPI_COMM_WORLD_RANK}.compute-sanitizer.log ./jacobi -niter 10 38 | 39 | run: jacobi 40 | $(MPIRUN) -np $(NP) ./jacobi 41 | 42 | profile: jacobi 43 | $(MPIRUN) -np $(NP) nsys profile --trace=mpi,cuda,nvtx -o jacobi.%q{OMPI_COMM_WORLD_RANK} ./jacobi -niter 10 44 | -------------------------------------------------------------------------------- /nccl_overlap/jacobi_kernels.cu: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 2 | * 3 | * Redistribution and use in source and binary forms, with or without 4 | * modification, are permitted provided that the following conditions 5 | * are met: 6 | * * Redistributions of source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer. 8 | * * Redistributions in binary form must reproduce the above copyright 9 | * notice, this list of conditions and the following disclaimer in the 10 | * documentation and/or other materials provided with the distribution. 
11 | * * Neither the name of NVIDIA CORPORATION nor the names of its 12 | * contributors may be used to endorse or promote products derived 13 | * from this software without specific prior written permission. 14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | */ 27 | #include 28 | #include 29 | 30 | #ifdef HAVE_CUB 31 | #include 32 | #endif // HAVE_CUB 33 | 34 | #define CUDA_RT_CALL(call) \ 35 | { \ 36 | cudaError_t cudaStatus = call; \ 37 | if (cudaSuccess != cudaStatus) { \ 38 | fprintf(stderr, \ 39 | "ERROR: CUDA RT call \"%s\" in line %d of file %s failed " \ 40 | "with " \ 41 | "%s (%d).\n", \ 42 | #call, __LINE__, __FILE__, cudaGetErrorString(cudaStatus), cudaStatus); \ 43 | exit( cudaStatus ); \ 44 | } \ 45 | } 46 | 47 | #ifdef USE_DOUBLE 48 | typedef double real; 49 | #define MPI_REAL_TYPE MPI_DOUBLE 50 | #else 51 | typedef float real; 52 | #define MPI_REAL_TYPE MPI_FLOAT 53 | #endif 54 | 55 | __global__ void initialize_boundaries(real* __restrict__ const a_new, real* __restrict__ const a, 56 | const real pi, const int offset, const int nx, 57 | const int my_ny, const int ny) { 58 | for (int iy = blockIdx.x * blockDim.x + threadIdx.x; iy < my_ny; iy += blockDim.x * gridDim.x) { 59 | const real y0 = sin(2.0 * pi * (offset + iy) / (ny - 1)); 60 | a[iy * nx + 0] = y0; 61 | a[iy * nx + (nx - 1)] = y0; 62 | a_new[iy * nx + 0] = y0; 63 | a_new[iy * nx + (nx - 1)] = y0; 64 | } 65 | } 66 | 67 | void launch_initialize_boundaries(real* __restrict__ const a_new, real* __restrict__ const a, 68 | const real pi, const int offset, const int nx, const int my_ny, 69 | const int ny) { 70 | initialize_boundaries<<>>(a_new, a, pi, offset, nx, my_ny, ny); 71 | CUDA_RT_CALL(cudaGetLastError()); 72 | } 73 | 74 | template 75 | __global__ void jacobi_kernel(real* __restrict__ const a_new, const real* __restrict__ const a, 76 | real* __restrict__ const l2_norm, const int iy_start, 77 | const int iy_end, const int nx, const bool calculate_norm) { 78 | #ifdef HAVE_CUB 79 | typedef cub::BlockReduce 80 | BlockReduce; 81 | __shared__ typename BlockReduce::TempStorage temp_storage; 82 | #endif // HAVE_CUB 83 | int iy = blockIdx.y * blockDim.y + threadIdx.y + iy_start; 84 | int ix = blockIdx.x * blockDim.x + threadIdx.x + 1; 85 | real local_l2_norm = 0.0; 86 | 87 | if (iy < iy_end && ix < (nx - 1)) { 88 | const real new_val = 0.25 * (a[iy * nx + ix + 1] + a[iy * nx + ix - 1] + 89 | a[(iy + 1) * nx + ix] + a[(iy - 1) * nx + ix]); 90 | a_new[iy * nx + ix] = new_val; 91 | if (calculate_norm) { 92 | real residue = new_val - a[iy * nx + ix]; 93 | local_l2_norm += residue * residue; 94 | } 95 | } 96 | if (calculate_norm) { 97 | #ifdef HAVE_CUB 98 | real block_l2_norm = BlockReduce(temp_storage).Sum(local_l2_norm); 99 | if (0 == threadIdx.y && 0 == 
threadIdx.x) atomicAdd(l2_norm, block_l2_norm); 100 | #else 101 | atomicAdd(l2_norm, local_l2_norm); 102 | #endif // HAVE_CUB 103 | } 104 | } 105 | 106 | void launch_jacobi_kernel(real* __restrict__ const a_new, const real* __restrict__ const a, 107 | real* __restrict__ const l2_norm, const int iy_start, const int iy_end, 108 | const int nx, const bool calculate_norm, cudaStream_t stream) { 109 | constexpr int dim_block_x = 32; 110 | constexpr int dim_block_y = 32; 111 | dim3 dim_grid((nx + dim_block_x - 1) / dim_block_x, 112 | ((iy_end - iy_start) + dim_block_y - 1) / dim_block_y, 1); 113 | jacobi_kernel<<>>( 114 | a_new, a, l2_norm, iy_start, iy_end, nx, calculate_norm); 115 | CUDA_RT_CALL(cudaGetLastError()); 116 | } 117 | -------------------------------------------------------------------------------- /nvshmem/Makefile: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 2 | NP ?= 1 3 | NVCC=nvcc 4 | MPIRUN ?= mpirun 5 | CUDA_HOME ?= /usr/local/cuda 6 | ifndef NVSHMEM_HOME 7 | $(error NVSHMEM_HOME is not set) 8 | endif 9 | GENCODE_SM30 := -gencode arch=compute_30,code=sm_30 10 | GENCODE_SM35 := -gencode arch=compute_35,code=sm_35 11 | GENCODE_SM37 := -gencode arch=compute_37,code=sm_37 12 | GENCODE_SM50 := -gencode arch=compute_50,code=sm_50 13 | GENCODE_SM52 := -gencode arch=compute_52,code=sm_52 14 | GENCODE_SM60 := -gencode arch=compute_60,code=sm_60 15 | GENCODE_SM70 := -gencode arch=compute_70,code=sm_70 16 | GENCODE_SM80 := -gencode arch=compute_80,code=sm_80 17 | GENCODE_SM90 := -gencode arch=compute_90,code=sm_90 -gencode arch=compute_90,code=compute_90 18 | ifdef USE_LTO 19 | GENCODE_SM70 += -gencode arch=compute_70,code=lto_70 20 | GENCODE_SM80 += -gencode arch=compute_80,code=lto_80 21 | GENCODE_SM90 += -gencode arch=compute_90,code=lto_90 22 | endif 23 | GENCODE_FLAGS := $(GENCODE_SM70) $(GENCODE_SM80) $(GENCODE_SM90) 24 | 25 | ifdef DISABLE_CUB 26 | NVCC_FLAGS = -Xptxas --optimize-float-atomics 27 | else 28 | NVCC_FLAGS = -DHAVE_CUB 29 | endif 30 | NVCC_FLAGS += -ccbin=mpic++ -dc -Xcompiler -fopenmp -lineinfo -DUSE_NVTX -ldl $(GENCODE_FLAGS) -std=c++14 -I$(NVSHMEM_HOME)/include 31 | NVCC_LDFLAGS = -ccbin=mpic++ -L$(NVSHMEM_HOME)/lib -lnvshmem -L$(CUDA_HOME)/lib64 -lcuda -lcudart -ldl -lnvidia-ml 32 | ifdef USE_LTO 33 | NVCC_FLAGS += -maxrregcount=32 34 | NVCC_LDFLAGS += -maxrregcount=32 -dlto 35 | endif 36 | jacobi: Makefile jacobi.cu 37 | $(NVCC) $(NVCC_FLAGS) jacobi.cu -c -o jacobi.o 38 | $(NVCC) $(GENCODE_FLAGS) jacobi.o -o jacobi $(NVCC_LDFLAGS) 39 | 40 | .PHONY.: clean 41 | clean: 42 | rm -f jacobi jacobi.o *.nsys-rep jacobi.*.compute-sanitizer.log 43 | 44 | sanitize: jacobi 45 | $(MPIRUN) -np $(NP) compute-sanitizer --log-file jacobi.%q{OMPI_COMM_WORLD_RANK}.compute-sanitizer.log ./jacobi -niter 10 46 | 47 | run: jacobi 48 | $(MPIRUN) -np $(NP) ./jacobi 49 | 50 | profile: jacobi 51 | $(MPIRUN) -np $(NP) nsys profile --trace=mpi,cuda,nvtx -o jacobi.%q{OMPI_COMM_WORLD_RANK} ./jacobi -niter 10 52 | -------------------------------------------------------------------------------- /single_gpu/Makefile: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved. 
2 | NVCC=nvcc 3 | GENCODE_SM30 := -gencode arch=compute_30,code=sm_30 4 | GENCODE_SM35 := -gencode arch=compute_35,code=sm_35 5 | GENCODE_SM37 := -gencode arch=compute_37,code=sm_37 6 | GENCODE_SM50 := -gencode arch=compute_50,code=sm_50 7 | GENCODE_SM52 := -gencode arch=compute_52,code=sm_52 8 | GENCODE_SM60 := -gencode arch=compute_60,code=sm_60 9 | GENCODE_SM70 := -gencode arch=compute_70,code=sm_70 10 | GENCODE_SM80 := -gencode arch=compute_80,code=sm_80 11 | GENCODE_SM90 := -gencode arch=compute_90,code=sm_90 -gencode arch=compute_90,code=compute_90 12 | GENCODE_FLAGS := $(GENCODE_SM70) $(GENCODE_SM80) $(GENCODE_SM90) 13 | ifdef DISABLE_CUB 14 | NVCC_FLAGS = -Xptxas --optimize-float-atomics 15 | else 16 | NVCC_FLAGS = -DHAVE_CUB 17 | endif 18 | NVCC_FLAGS += -Xcompiler -fopenmp -lineinfo -DUSE_NVTX -ldl $(GENCODE_FLAGS) -std=c++14 19 | jacobi: Makefile jacobi.cu 20 | $(NVCC) $(NVCC_FLAGS) jacobi.cu -o jacobi 21 | 22 | .PHONY.: clean 23 | clean: 24 | rm -f jacobi jacobi.nsys-rep 25 | 26 | sanitize: jacobi 27 | compute-sanitizer ./jacobi -niter 10 28 | 29 | run: jacobi 30 | ./jacobi 31 | 32 | profile: jacobi 33 | nsys profile --trace=cuda,nvtx -o jacobi ./jacobi -niter 10 34 | -------------------------------------------------------------------------------- /single_gpu/jacobi.cu: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved. 2 | * 3 | * Redistribution and use in source and binary forms, with or without 4 | * modification, are permitted provided that the following conditions 5 | * are met: 6 | * * Redistributions of source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer. 8 | * * Redistributions in binary form must reproduce the above copyright 9 | * notice, this list of conditions and the following disclaimer in the 10 | * documentation and/or other materials provided with the distribution. 11 | * * Neither the name of NVIDIA CORPORATION nor the names of its 12 | * contributors may be used to endorse or promote products derived 13 | * from this software without specific prior written permission. 14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
26 | */ 27 | #include 28 | #include 29 | #include 30 | #include 31 | #include 32 | #include 33 | #include 34 | #include 35 | #include 36 | 37 | #include <omp.h> 38 | 39 | #ifdef HAVE_CUB 40 | #include <cub/block/block_reduce.cuh> 41 | #endif // HAVE_CUB 42 | 43 | #ifdef USE_NVTX 44 | #include <nvToolsExt.h> 45 | 46 | const uint32_t colors[] = {0x0000ff00, 0x000000ff, 0x00ffff00, 0x00ff00ff, 47 | 0x0000ffff, 0x00ff0000, 0x00ffffff}; 48 | const int num_colors = sizeof(colors) / sizeof(uint32_t); 49 | 50 | #define PUSH_RANGE(name, cid) \ 51 | { \ 52 | int color_id = cid; \ 53 | color_id = color_id % num_colors; \ 54 | nvtxEventAttributes_t eventAttrib = {0}; \ 55 | eventAttrib.version = NVTX_VERSION; \ 56 | eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE; \ 57 | eventAttrib.colorType = NVTX_COLOR_ARGB; \ 58 | eventAttrib.color = colors[color_id]; \ 59 | eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII; \ 60 | eventAttrib.message.ascii = name; \ 61 | nvtxRangePushEx(&eventAttrib); \ 62 | } 63 | #define POP_RANGE nvtxRangePop(); 64 | #else 65 | #define PUSH_RANGE(name, cid) 66 | #define POP_RANGE 67 | #endif 68 | 69 | #define CUDA_RT_CALL(call) \ 70 | { \ 71 | cudaError_t cudaStatus = call; \ 72 | if (cudaSuccess != cudaStatus) { \ 73 | fprintf(stderr, \ 74 | "ERROR: CUDA RT call \"%s\" in line %d of file %s failed " \ 75 | "with " \ 76 | "%s (%d).\n", \ 77 | #call, __LINE__, __FILE__, cudaGetErrorString(cudaStatus), cudaStatus); \ 78 | exit( cudaStatus ); \ 79 | } \ 80 | } 81 | 82 | typedef float real; 83 | constexpr real tol = 1.0e-8; 84 | 85 | const real PI = 2.0 * std::asin(1.0); 86 | 87 | __global__ void initialize_boundaries(real* __restrict__ const a_new, real* __restrict__ const a, 88 | const real pi, const int nx, const int ny) { 89 | for (int iy = blockIdx.x * blockDim.x + threadIdx.x; iy < ny; iy += blockDim.x * gridDim.x) { 90 | const real y0 = sin(2.0 * pi * iy / (ny - 1)); 91 | a[iy * nx + 0] = y0; 92 | a[iy * nx + (nx - 1)] = y0; 93 | a_new[iy * nx + 0] = y0; 94 | a_new[iy * nx + (nx - 1)] = y0; 95 | } 96 | } 97 | 98 | template <int BLOCK_DIM_X, int BLOCK_DIM_Y> 99 | __global__ void jacobi_kernel(real* __restrict__ const a_new, const real* __restrict__ const a, 100 | real* __restrict__ const l2_norm, const int iy_start, 101 | const int iy_end, const int nx) { 102 | #ifdef HAVE_CUB 103 | typedef cub::BlockReduce<real, BLOCK_DIM_X, cub::BLOCK_REDUCE_WARP_REDUCTIONS, BLOCK_DIM_Y> 104 | BlockReduce; 105 | __shared__ typename BlockReduce::TempStorage temp_storage; 106 | #endif // HAVE_CUB 107 | const int iy = blockIdx.y * blockDim.y + threadIdx.y + 1; 108 | const int ix = blockIdx.x * blockDim.x + threadIdx.x; 109 | real local_l2_norm = 0.0; 110 | 111 | if (iy < iy_end) { 112 | if (ix >= 1 && ix < (nx - 1)) { 113 | const real new_val = 0.25 * (a[iy * nx + ix + 1] + a[iy * nx + ix - 1] + 114 | a[(iy + 1) * nx + ix] + a[(iy - 1) * nx + ix]); 115 | a_new[iy * nx + ix] = new_val; 116 | 117 | // apply boundary conditions 118 | if (iy_start == iy) { 119 | a_new[iy_end * nx + ix] = new_val; 120 | } 121 | 122 | if ((iy_end - 1) == iy) { 123 | a_new[(iy_start - 1) * nx + ix] = new_val; 124 | } 125 | 126 | real residue = new_val - a[iy * nx + ix]; 127 | local_l2_norm = residue * residue; 128 | } 129 | } 130 | #ifdef HAVE_CUB 131 | real block_l2_norm = BlockReduce(temp_storage).Sum(local_l2_norm); 132 | if (0 == threadIdx.y && 0 == threadIdx.x) atomicAdd(l2_norm, block_l2_norm); 133 | #else 134 | atomicAdd(l2_norm, local_l2_norm); 135 | #endif // HAVE_CUB 136 | } 137 | 138 | double noopt(const int nx, const int ny, const int iter_max, real* const a_ref_h, const int nccheck, 139 | const bool print); 140 | 141 | template <typename T> 142 | T get_argval(char**
begin, char** end, const std::string& arg, const T default_val) { 143 | T argval = default_val; 144 | char** itr = std::find(begin, end, arg); 145 | if (itr != end && ++itr != end) { 146 | std::istringstream inbuf(*itr); 147 | inbuf >> argval; 148 | } 149 | return argval; 150 | } 151 | 152 | bool get_arg(char** begin, char** end, const std::string& arg) { 153 | char** itr = std::find(begin, end, arg); 154 | if (itr != end) { 155 | return true; 156 | } 157 | return false; 158 | } 159 | 160 | struct l2_norm_buf { 161 | cudaEvent_t copy_done; 162 | real* d; 163 | real* h; 164 | }; 165 | 166 | int main(int argc, char* argv[]) { 167 | const int iter_max = get_argval(argv, argv + argc, "-niter", 1000); 168 | const int nccheck = get_argval(argv, argv + argc, "-nccheck", 1); 169 | const int nx = get_argval(argv, argv + argc, "-nx", 16384); 170 | const int ny = get_argval(argv, argv + argc, "-ny", 16384); 171 | const bool csv = get_arg(argv, argv + argc, "-csv"); 172 | 173 | if (nccheck != 1) { 174 | fprintf(stderr, "Only nccheck = 1 is supported\n"); 175 | return -1; 176 | } 177 | 178 | real* a; 179 | real* a_new; 180 | 181 | cudaStream_t compute_stream; 182 | cudaStream_t copy_l2_norm_stream; 183 | cudaStream_t reset_l2_norm_stream; 184 | 185 | cudaEvent_t compute_done; 186 | cudaEvent_t reset_l2_norm_done[2]; 187 | 188 | real l2_norms[2]; 189 | l2_norm_buf l2_norm_bufs[2]; 190 | 191 | int iy_start = 1; 192 | int iy_end = (ny - 1); 193 | 194 | CUDA_RT_CALL(cudaSetDevice(0)); 195 | CUDA_RT_CALL(cudaFree(0)); 196 | 197 | CUDA_RT_CALL(cudaMalloc(&a, nx * ny * sizeof(real))); 198 | CUDA_RT_CALL(cudaMalloc(&a_new, nx * ny * sizeof(real))); 199 | 200 | CUDA_RT_CALL(cudaMemset(a, 0, nx * ny * sizeof(real))); 201 | CUDA_RT_CALL(cudaMemset(a_new, 0, nx * ny * sizeof(real))); 202 | 203 | // Set diriclet boundary conditions on left and right boarder 204 | initialize_boundaries<<>>(a, a_new, PI, nx, ny); 205 | CUDA_RT_CALL(cudaGetLastError()); 206 | CUDA_RT_CALL(cudaDeviceSynchronize()); 207 | 208 | CUDA_RT_CALL(cudaStreamCreate(&compute_stream)); 209 | CUDA_RT_CALL(cudaStreamCreate(©_l2_norm_stream)); 210 | CUDA_RT_CALL(cudaStreamCreate(&reset_l2_norm_stream)); 211 | CUDA_RT_CALL(cudaEventCreateWithFlags(&compute_done, cudaEventDisableTiming)); 212 | CUDA_RT_CALL(cudaEventCreateWithFlags(&reset_l2_norm_done[0], cudaEventDisableTiming)); 213 | CUDA_RT_CALL(cudaEventCreateWithFlags(&reset_l2_norm_done[1], cudaEventDisableTiming)); 214 | 215 | for (int i = 0; i < 2; ++i) { 216 | CUDA_RT_CALL(cudaEventCreateWithFlags(&l2_norm_bufs[i].copy_done, cudaEventDisableTiming)); 217 | CUDA_RT_CALL(cudaMalloc(&l2_norm_bufs[i].d, sizeof(real))); 218 | CUDA_RT_CALL(cudaMemset(l2_norm_bufs[i].d, 0, sizeof(real))); 219 | CUDA_RT_CALL(cudaMallocHost(&l2_norm_bufs[i].h, sizeof(real))); 220 | (*l2_norm_bufs[i].h) = 1.0; 221 | } 222 | 223 | CUDA_RT_CALL(cudaDeviceSynchronize()); 224 | 225 | if (!csv) 226 | printf( 227 | "Jacobi relaxation: %d iterations on %d x %d mesh with norm check " 228 | "every %d iterations\n", 229 | iter_max, ny, nx, nccheck); 230 | 231 | constexpr int dim_block_x = 32; 232 | constexpr int dim_block_y = 32; 233 | dim3 dim_grid((nx + dim_block_x - 1) / dim_block_x, (ny + dim_block_y - 1) / dim_block_y, 1); 234 | 235 | int iter = 0; 236 | for (int i = 0; i < 2; ++i) { 237 | l2_norms[i] = 0.0; 238 | } 239 | 240 | double start = omp_get_wtime(); 241 | 242 | PUSH_RANGE("Jacobi solve", 0) 243 | 244 | bool l2_norm_greater_than_tol = true; 245 | while (l2_norm_greater_than_tol && iter < iter_max) { 246 | // on 
new iteration: old current vars are now previous vars, old 247 | // previous vars are no longer needed 248 | int prev = iter % 2; 249 | int curr = (iter + 1) % 2; 250 | 251 | // wait for memset from old previous iteration to complete 252 | CUDA_RT_CALL(cudaStreamWaitEvent(compute_stream, reset_l2_norm_done[curr], 0)); 253 | 254 | jacobi_kernel 255 | <<>>( 256 | a_new, a, l2_norm_bufs[curr].d, iy_start, iy_end, nx); 257 | CUDA_RT_CALL(cudaGetLastError()); 258 | CUDA_RT_CALL(cudaEventRecord(compute_done, compute_stream)); 259 | 260 | // perform L2 norm calculation 261 | if ((iter % nccheck) == 0 || (!csv && (iter % 100) == 0)) { 262 | CUDA_RT_CALL(cudaStreamWaitEvent(copy_l2_norm_stream, compute_done, 0)); 263 | CUDA_RT_CALL(cudaMemcpyAsync(l2_norm_bufs[curr].h, l2_norm_bufs[curr].d, sizeof(real), 264 | cudaMemcpyDeviceToHost, copy_l2_norm_stream)); 265 | CUDA_RT_CALL(cudaEventRecord(l2_norm_bufs[curr].copy_done, copy_l2_norm_stream)); 266 | 267 | // make sure D2H copy is complete before using the data for 268 | // calculation 269 | CUDA_RT_CALL(cudaEventSynchronize(l2_norm_bufs[prev].copy_done)); 270 | 271 | l2_norms[prev] = *(l2_norm_bufs[prev].h); 272 | l2_norms[prev] = std::sqrt(l2_norms[prev]); 273 | l2_norm_greater_than_tol = (l2_norms[prev] > tol); 274 | 275 | if (!csv && (iter % 100) == 0) { 276 | printf("%5d, %0.6f\n", iter, l2_norms[prev]); 277 | } 278 | 279 | // reset everything for next iteration 280 | l2_norms[prev] = 0.0; 281 | *(l2_norm_bufs[prev].h) = 0.0; 282 | CUDA_RT_CALL( 283 | cudaMemsetAsync(l2_norm_bufs[prev].d, 0, sizeof(real), reset_l2_norm_stream)); 284 | CUDA_RT_CALL(cudaEventRecord(reset_l2_norm_done[prev], reset_l2_norm_stream)); 285 | } 286 | 287 | std::swap(a_new, a); 288 | iter++; 289 | } 290 | CUDA_RT_CALL(cudaDeviceSynchronize()); 291 | POP_RANGE 292 | double stop = omp_get_wtime(); 293 | 294 | if (csv) { 295 | printf("single_gpu, %d, %d, %d, %d, %f\n", nx, ny, iter_max, nccheck, (stop - start)); 296 | } else { 297 | printf("%dx%d: 1 GPU: %8.4f s\n", ny, nx, (stop - start)); 298 | } 299 | 300 | for (int i = 0; i < 2; ++i) { 301 | CUDA_RT_CALL(cudaFreeHost(l2_norm_bufs[i].h)); 302 | CUDA_RT_CALL(cudaFree(l2_norm_bufs[i].d)); 303 | CUDA_RT_CALL(cudaEventDestroy(l2_norm_bufs[i].copy_done)); 304 | } 305 | 306 | CUDA_RT_CALL(cudaEventDestroy(reset_l2_norm_done[1])); 307 | CUDA_RT_CALL(cudaEventDestroy(reset_l2_norm_done[0])); 308 | CUDA_RT_CALL(cudaEventDestroy(compute_done)); 309 | 310 | CUDA_RT_CALL(cudaStreamDestroy(reset_l2_norm_stream)); 311 | CUDA_RT_CALL(cudaStreamDestroy(copy_l2_norm_stream)); 312 | CUDA_RT_CALL(cudaStreamDestroy(compute_stream)); 313 | 314 | CUDA_RT_CALL(cudaFree(a_new)); 315 | CUDA_RT_CALL(cudaFree(a)); 316 | 317 | return 0; 318 | } 319 | -------------------------------------------------------------------------------- /single_threaded_copy/Makefile: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved. 
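# Passing DISABLE_CUB=1 on the make command line (e.g. `make DISABLE_CUB=1 run`) drops the
# -DHAVE_CUB define, so the Jacobi kernel falls back to one atomicAdd per thread for the L2 norm
# and -Xptxas --optimize-float-atomics is passed instead; the default build defines HAVE_CUB and
# first reduces the norm within each block via cub::BlockReduce before a single atomicAdd per block.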
2 | NVCC=nvcc 3 | GENCODE_SM30 := -gencode arch=compute_30,code=sm_30 4 | GENCODE_SM35 := -gencode arch=compute_35,code=sm_35 5 | GENCODE_SM37 := -gencode arch=compute_37,code=sm_37 6 | GENCODE_SM50 := -gencode arch=compute_50,code=sm_50 7 | GENCODE_SM52 := -gencode arch=compute_52,code=sm_52 8 | GENCODE_SM60 := -gencode arch=compute_60,code=sm_60 9 | GENCODE_SM70 := -gencode arch=compute_70,code=sm_70 10 | GENCODE_SM80 := -gencode arch=compute_80,code=sm_80 11 | GENCODE_SM90 := -gencode arch=compute_90,code=sm_90 -gencode arch=compute_90,code=compute_90 12 | GENCODE_FLAGS := $(GENCODE_SM70) $(GENCODE_SM80) $(GENCODE_SM90) 13 | ifdef DISABLE_CUB 14 | NVCC_FLAGS = -Xptxas --optimize-float-atomics 15 | else 16 | NVCC_FLAGS = -DHAVE_CUB 17 | endif 18 | NVCC_FLAGS += -Xcompiler -fopenmp -lineinfo -DUSE_NVTX -ldl $(GENCODE_FLAGS) -std=c++14 19 | jacobi: Makefile jacobi.cu 20 | $(NVCC) $(NVCC_FLAGS) jacobi.cu -o jacobi 21 | 22 | .PHONY.: clean 23 | clean: 24 | rm -f jacobi jacobi.nsys-rep 25 | 26 | sanitize: jacobi 27 | compute-sanitizer ./jacobi -niter 10 28 | 29 | run: jacobi 30 | ./jacobi 31 | 32 | profile: jacobi 33 | nsys profile --trace=cuda,nvtx -o jacobi ./jacobi -niter 10 34 | -------------------------------------------------------------------------------- /test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright (c) 2017,2024, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Redistribution and use in source and binary forms, with or without 5 | # modification, are permitted provided that the following conditions 6 | # are met: 7 | # * Redistributions of source code must retain the above copyright 8 | # notice, this list of conditions and the following disclaimer. 9 | # * Redistributions in binary form must reproduce the above copyright 10 | # notice, this list of conditions and the following disclaimer in the 11 | # documentation and/or other materials provided with the distribution. 12 | # * Neither the name of NVIDIA CORPORATION nor the names of its 13 | # contributors may be used to endorse or promote products derived 14 | # from this software without specific prior written permission. 15 | # 16 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 17 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 19 | # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 20 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 21 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 22 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 23 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 24 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 25 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 26 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 27 | 28 | now=`date +"%Y%m%d%H%M%S"` 29 | LOG="test-${now}.log" 30 | 31 | if [ -v HPCSDK_RELEASE ]; then 32 | echo "Running with NVIDIA HPC SDK" 33 | 34 | if [ ! -v CUDA_HOME ] || [ ! -d ${CUDA_HOME} ]; then 35 | export CUDA_HOME=$(nvc++ -cuda -printcudaversion |& grep "CUDA Path" | awk -F '=' '{print $2}') 36 | echo "Setting CUDA_HOME=${CUDA_HOME}" 37 | fi 38 | 39 | if [ ! -v NCCL_HOME ] || [ ! 
-d ${NCCL_HOME} ]; then 40 | export NCCL_HOME=$(dirname `echo $LD_LIBRARY_PATH | tr ':' '\n' | grep nccl | grep -v sharp`) 41 | echo "Setting NCCL_HOME=${NCCL_HOME}" 42 | fi 43 | 44 | if [ ! -v NVSHMEM_HOME ] || [ ! -d ${NVSHMEM_HOME} ]; then 45 | export NVSHMEM_HOME=$(dirname `echo $LD_LIBRARY_PATH | tr ':' '\n' | grep nvshmem`) 46 | echo "Setting NVSHMEM_HOME=${NVSHMEM_HOME}" 47 | fi 48 | fi 49 | 50 | if [ -e ${LOG} ]; then 51 | echo "ERROR log file ${LOG} already exists." 52 | exit 1 53 | fi 54 | 55 | #DGX-1V 56 | #CUDA_VISIBLE_DEVICES_SETTING=("0" "0" "0,3" "0,3,2" "0,3,2,1" "3,2,1,5,7" "0,3,2,1,5,4" "0,4,7,6,5,1,2" "0,3,2,1,5,6,7,4" ) 57 | #DGX A100 and DGX H100 58 | CUDA_VISIBLE_DEVICES_SETTING=("0" "0" "0,1" "0,1,2" "0,1,2,3" "0,1,2,3,4" "0,1,2,3,4,5" "0,1,2,3,4,5,6" "0,1,2,3,4,5,6,7" ) 59 | 60 | errors=0 61 | 62 | for entry in `ls -1`; do 63 | if [ -f ${entry}/Makefile ] ; then 64 | if [ "run" == "$1" ] ; then 65 | NUM_GPUS=`nvidia-smi -L | wc -l` 66 | for (( NP=1; NP<=${NUM_GPUS}; NP++ )) ; do 67 | export NP 68 | export CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES_SETTING[${NP}]} 69 | CMD="make -C ${entry} $1" 70 | ${CMD} >> ${LOG} 2>&1 71 | if [ $? -ne 0 ]; then 72 | echo "ERROR with ${CMD} (NP = ${NP}, CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES}) see ${LOG} for details." 73 | errors=1 74 | break 75 | fi 76 | done 77 | else 78 | CMD="make -C ${entry} $1" 79 | ${CMD} >> ${LOG} 2>&1 80 | if [ $? -ne 0 ]; then 81 | echo "ERROR with ${CMD} see ${LOG} for details." 82 | errors=1 83 | break 84 | fi 85 | fi 86 | fi 87 | done 88 | 89 | if [ ${errors} -eq 0 ]; then 90 | echo "Passed." 91 | exit 0 92 | else 93 | exit 1 94 | fi 95 | --------------------------------------------------------------------------------
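A minimal usage sketch for the tree above, assuming a machine with NVIDIA GPUs, an MPI launcher, and CUDA_HOME/NCCL_HOME/NVSHMEM_HOME resolvable the way the Makefiles and test.sh expect (the GPU count and directory chosen below are illustrative):

    ./test.sh                        # run the default make target (build) in every directory that has a Makefile
    ./test.sh run                    # build and run each variant on 1 .. <number of visible GPUs> GPUs
    make -C nccl_overlap NP=4 run    # or drive a single variant directly with 4 ranks

The output of every make invocation is appended to test-<timestamp>.log, and test.sh exits with status 1 if any invocation failed.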