├── .clang-format ├── .gitignore ├── LICENSE.md ├── README.md ├── bench.sbatch.sh ├── bench.sh ├── mpi ├── Makefile ├── jacobi.cpp └── jacobi_kernels.cu ├── mpi_overlap ├── Makefile ├── jacobi.cpp └── jacobi_kernels.cu ├── multi_node_p2p ├── Makefile ├── jacobi.cpp └── jacobi_kernels.cu ├── multi_threaded_copy ├── Makefile └── jacobi.cu ├── multi_threaded_copy_overlap ├── Makefile └── jacobi.cu ├── multi_threaded_p2p ├── Makefile └── jacobi.cu ├── multi_threaded_p2p_opt ├── Makefile └── jacobi.cu ├── multi_threaded_um ├── Makefile └── jacobi.cu ├── nccl ├── Makefile ├── jacobi.cpp └── jacobi_kernels.cu ├── nccl_graphs ├── Makefile ├── jacobi.cpp └── jacobi_kernels.cu ├── nccl_overlap ├── Makefile ├── jacobi.cpp └── jacobi_kernels.cu ├── nvshmem ├── Makefile └── jacobi.cu ├── single_gpu ├── Makefile └── jacobi.cu ├── single_threaded_copy ├── Makefile └── jacobi.cu └── test.sh /.clang-format: -------------------------------------------------------------------------------- 1 | --- 2 | Language: Cpp 3 | # BasedOnStyle: Google 4 | AccessModifierOffset: -1 5 | AlignAfterOpenBracket: Align 6 | AlignConsecutiveAssignments: false 7 | AlignConsecutiveDeclarations: false 8 | AlignEscapedNewlines: Left 9 | AlignOperands: true 10 | AlignTrailingComments: true 11 | AllowAllParametersOfDeclarationOnNextLine: true 12 | AllowShortBlocksOnASingleLine: false 13 | AllowShortCaseLabelsOnASingleLine: false 14 | AllowShortFunctionsOnASingleLine: All 15 | AllowShortIfStatementsOnASingleLine: true 16 | AllowShortLoopsOnASingleLine: true 17 | AlwaysBreakAfterDefinitionReturnType: None 18 | AlwaysBreakAfterReturnType: None 19 | AlwaysBreakBeforeMultilineStrings: true 20 | AlwaysBreakTemplateDeclarations: Yes 21 | BinPackArguments: true 22 | BinPackParameters: true 23 | BraceWrapping: 24 | AfterClass: false 25 | AfterControlStatement: false 26 | AfterEnum: false 27 | AfterFunction: false 28 | AfterNamespace: false 29 | AfterObjCDeclaration: false 30 | AfterStruct: false 31 | AfterUnion: false 32 | AfterExternBlock: false 33 | BeforeCatch: false 34 | BeforeElse: false 35 | IndentBraces: false 36 | SplitEmptyFunction: true 37 | SplitEmptyRecord: true 38 | SplitEmptyNamespace: true 39 | BreakBeforeBinaryOperators: None 40 | BreakBeforeBraces: Attach 41 | BreakBeforeInheritanceComma: false 42 | BreakInheritanceList: BeforeColon 43 | BreakBeforeTernaryOperators: true 44 | BreakConstructorInitializersBeforeComma: false 45 | BreakConstructorInitializers: BeforeColon 46 | BreakAfterJavaFieldAnnotations: false 47 | BreakStringLiterals: true 48 | ColumnLimit: 100 49 | CommentPragmas: '^ IWYU pragma:' 50 | CompactNamespaces: false 51 | ConstructorInitializerAllOnOneLineOrOnePerLine: true 52 | ConstructorInitializerIndentWidth: 4 53 | ContinuationIndentWidth: 4 54 | Cpp11BracedListStyle: true 55 | DerivePointerAlignment: true 56 | DisableFormat: false 57 | ExperimentalAutoDetectBinPacking: false 58 | FixNamespaceComments: true 59 | ForEachMacros: 60 | - foreach 61 | - Q_FOREACH 62 | - BOOST_FOREACH 63 | IncludeBlocks: Preserve 64 | IncludeCategories: 65 | - Regex: '^' 66 | Priority: 2 67 | - Regex: '^<.*\.h>' 68 | Priority: 1 69 | - Regex: '^<.*' 70 | Priority: 2 71 | - Regex: '.*' 72 | Priority: 3 73 | IncludeIsMainRegex: '([-_](test|unittest))?$' 74 | IndentCaseLabels: true 75 | IndentPPDirectives: None 76 | IndentWidth: 4 77 | IndentWrappedFunctionNames: false 78 | JavaScriptQuotes: Leave 79 | JavaScriptWrapImports: true 80 | KeepEmptyLinesAtTheStartOfBlocks: false 81 | MacroBlockBegin: '' 82 | MacroBlockEnd: '' 83 | 
MaxEmptyLinesToKeep: 1 84 | NamespaceIndentation: None 85 | ObjCBinPackProtocolList: Never 86 | ObjCBlockIndentWidth: 4 87 | ObjCSpaceAfterProperty: false 88 | ObjCSpaceBeforeProtocolList: true 89 | PenaltyBreakAssignment: 2 90 | PenaltyBreakBeforeFirstCallParameter: 1 91 | PenaltyBreakComment: 300 92 | PenaltyBreakFirstLessLess: 120 93 | PenaltyBreakString: 1000 94 | PenaltyBreakTemplateDeclaration: 10 95 | PenaltyExcessCharacter: 1000000 96 | PenaltyReturnTypeOnItsOwnLine: 200 97 | PointerAlignment: Left 98 | RawStringFormats: 99 | - Language: Cpp 100 | Delimiters: 101 | - cc 102 | - CC 103 | - cpp 104 | - Cpp 105 | - CPP 106 | - 'c++' 107 | - 'C++' 108 | CanonicalDelimiter: '' 109 | BasedOnStyle: google 110 | - Language: TextProto 111 | Delimiters: 112 | - pb 113 | - PB 114 | - proto 115 | - PROTO 116 | EnclosingFunctions: 117 | - EqualsProto 118 | - EquivToProto 119 | - PARSE_PARTIAL_TEXT_PROTO 120 | - PARSE_TEST_PROTO 121 | - PARSE_TEXT_PROTO 122 | - ParseTextOrDie 123 | - ParseTextProtoOrDie 124 | CanonicalDelimiter: '' 125 | BasedOnStyle: google 126 | ReflowComments: true 127 | SortIncludes: true 128 | SortUsingDeclarations: true 129 | SpaceAfterCStyleCast: false 130 | SpaceAfterTemplateKeyword: true 131 | SpaceBeforeAssignmentOperators: true 132 | SpaceBeforeCpp11BracedList: false 133 | SpaceBeforeCtorInitializerColon: true 134 | SpaceBeforeInheritanceColon: true 135 | SpaceBeforeParens: ControlStatements 136 | SpaceBeforeRangeBasedForLoopColon: true 137 | SpaceInEmptyParentheses: false 138 | SpacesBeforeTrailingComments: 2 139 | SpacesInAngles: false 140 | SpacesInContainerLiterals: true 141 | SpacesInCStyleCastParentheses: false 142 | SpacesInParentheses: false 143 | SpacesInSquareBrackets: false 144 | Standard: Auto 145 | TabWidth: 8 146 | UseTab: Never 147 | ... 148 | 149 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.o 2 | jacobi 3 | test-*.log 4 | .vscode 5 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved. 2 | 3 | Redistribution and use in source and binary forms, with or without 4 | modification, are permitted provided that the following conditions 5 | are met: 6 | * Redistributions of source code must retain the above copyright 7 | notice, this list of conditions and the following disclaimer. 8 | * Redistributions in binary form must reproduce the above copyright 9 | notice, this list of conditions and the following disclaimer in the 10 | documentation and/or other materials provided with the distribution. 11 | * Neither the name of NVIDIA CORPORATION nor the names of its 12 | contributors may be used to endorse or promote products derived 13 | from this software without specific prior written permission. 14 | 15 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | 27 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Multi GPU Programming Models 2 | This project implements the well known multi GPU Jacobi solver with different multi GPU Programming Models: 3 | * `single_threaded_copy` Single Threaded using cudaMemcpy for inter GPU communication 4 | * `multi_threaded_copy` Multi Threaded with OpenMP using cudaMemcpy for inter GPU communication 5 | * `multi_threaded_copy_overlap` Multi Threaded with OpenMP using cudaMemcpy for inter GPU communication with overlapping communication 6 | * `multi_threaded_p2p` Multi Threaded with OpenMP using GPUDirect P2P mappings for inter GPU communication 7 | * `multi_threaded_p2p_opt` Multi Threaded with OpenMP using GPUDirect P2P mappings for inter GPU communication with delayed norm execution 8 | * `multi_threaded_um` Multi Threaded with OpenMP relying on transparent peer mappings with Unified Memory for inter GPU communication 9 | * `mpi` Multi Process with MPI using CUDA-aware MPI for inter GPU communication 10 | * `mpi_overlap` Multi Process with MPI using CUDA-aware MPI for inter GPU communication with overlapping communication 11 | * `nccl` Multi Process with MPI and NCCL using NCCL for inter GPU communication 12 | * `nccl_overlap` Multi Process with MPI and NCCL using NCCL for inter GPU communication with overlapping communication 13 | * `nccl_graphs` Multi Process with MPI and NCCL using NCCL for inter GPU communication with overlapping communication combined with CUDA Graphs 14 | * `nvshmem` Multi Process with MPI and NVSHMEM using NVSHMEM for inter GPU communication. 15 | * `multi_node_p2p` Multi Process Multi Node variant using the low level CUDA Driver [Virtual Memory Management](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#virtual-memory-management) and [Multicast Object Management](https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MULTICAST.html#group__CUDA__MULTICAST) APIs. This example is for developers of libraries like NCCL or NVSHMEM. It shows how higher-level programming models like NVSHMEM work internally within a (multinode) NVLINK domain. Application developers generally should use the higher-level MPI, NCCL, or NVSHMEM interfaces instead of this API. 
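The multi-GPU variants all use the same 1-D domain decomposition into horizontal strips of rows and differ mainly in how the halo rows at the strip boundaries are exchanged between GPUs. The fragment below is an illustrative sketch only, not code taken verbatim from any variant: names such as `a_new_top`, `top`, `bottom`, `top_iy`, `nccl_comm` and the streams are placeholders, and each snippet shows just the exchange of one boundary row.

```cpp
// cudaMemcpy-based variants: push the boundary row into the neighboring GPU's buffer
// with an asynchronous device-to-device copy (peer-to-peer where available).
cudaMemcpyAsync(a_new_top, a_new + iy_start * nx, nx * sizeof(real),
                cudaMemcpyDeviceToDevice, push_top_stream);

// mpi / mpi_overlap: CUDA-aware MPI_Sendrecv operating directly on device pointers.
MPI_Sendrecv(a_new + iy_start * nx, nx, MPI_REAL_TYPE, top, 0,
             a_new + iy_end * nx, nx, MPI_REAL_TYPE, bottom, 0,
             MPI_COMM_WORLD, MPI_STATUS_IGNORE);

// nccl variants: the same exchange expressed as grouped, stream-ordered send/recv calls.
ncclGroupStart();
ncclSend(a_new + iy_start * nx, nx, ncclFloat, top, nccl_comm, stream);
ncclRecv(a_new + iy_end * nx, nx, ncclFloat, bottom, nccl_comm, stream);
ncclGroupEnd();

// nvshmem: a one-sided put issued from inside the Jacobi kernel into the neighbor's
// symmetric memory, one element per thread (or per block with nvshmemx_float_put_nbi_block).
nvshmem_float_p(a_new + top_iy * nx + ix, new_val, top);
```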
16 | 17 | Each variant is a stand-alone `Makefile` project and most variants have been discussed in various GTC Talks, e.g.: 18 | * `single_threaded_copy`, `multi_threaded_copy`, `multi_threaded_copy_overlap`, `multi_threaded_p2p`, `multi_threaded_p2p_opt`, `mpi`, `mpi_overlap` and `nvshmem` on DGX-1V at GTC Europe 2017 in 23031 - Multi GPU Programming Models 19 | * `single_threaded_copy`, `multi_threaded_copy`, `multi_threaded_copy_overlap`, `multi_threaded_p2p`, `multi_threaded_p2p_opt`, `mpi`, `mpi_overlap` and `nvshmem` on DGX-2 at GTC 2019 in S9139 - Multi GPU Programming Models 20 | * `multi_threaded_copy`, `multi_threaded_copy_overlap`, `multi_threaded_p2p`, `multi_threaded_p2p_opt`, `mpi`, `mpi_overlap`, `nccl`, `nccl_overlap` and `nvshmem` on DGX A100 at GTC 2021 in [A31140 - Multi-GPU Programming Models](https://www.nvidia.com/en-us/on-demand/session/gtcfall21-a31140/) 21 | 22 | Some examples in this repository are the basis for an interactive tutorial: [FZJ-JSC/tutorial-multi-gpu](https://github.com/FZJ-JSC/tutorial-multi-gpu). 23 | 24 | # Requirements 25 | * CUDA: version 11.0 (9.2 if built with `DISABLE_CUB=1`) or later is required by all variants. 26 | * `nccl_graphs` requires NCCL 2.15.1, CUDA 11.7 and CUDA Driver 515.65.01 or newer 27 | * `multi_node_p2p` requires CUDA 12.4, a CUDA Driver 550.54.14 or newer and the NVIDIA IMEX daemon running. 28 | * OpenMP-capable compiler: Required by the Multi Threaded variants. The examples have been developed and tested with gcc. 29 | * MPI: The `mpi` and `mpi_overlap` variants require a CUDA-aware[^1] implementation. For NVSHMEM, NCCL and `multi_node_p2p`, a non-CUDA-aware MPI is sufficient. The examples have been developed and tested with OpenMPI. 30 | * NVSHMEM (version 0.4.1 or later): Required by the NVSHMEM variant. 31 | * NCCL (version 2.8 or later): Required by the NCCL variants. 32 | 33 | # Building 34 | Each variant comes with a `Makefile` and can be built by simply issuing `make`, e.g. 35 | ```sh 36 | multi-gpu-programming-models$ cd multi_threaded_copy 37 | multi_threaded_copy$ make 38 | nvcc -DHAVE_CUB -Xcompiler -fopenmp -lineinfo -DUSE_NVTX -ldl -gencode arch=compute_70,code=sm_70 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_90,code=sm_90 -gencode arch=compute_90,code=compute_90 -std=c++14 jacobi.cu -o jacobi 39 | multi_threaded_copy$ ls jacobi 40 | jacobi 41 | ``` 42 | 43 | # Run instructions 44 | All variants have the following command-line options: 45 | * `-niter`: How many iterations to carry out (default 1000) 46 | * `-nccheck`: How often to check for convergence (default 1) 47 | * `-nx`: Size of the domain in x direction (default 16384) 48 | * `-ny`: Size of the domain in y direction (default 16384) 49 | * `-csv`: Print performance results in CSV format 50 | * `-use_hp_streams`: In `mpi_overlap` use high-priority streams to hide kernel launch latencies of boundary kernels. 51 | 52 | The `nvshmem` variant additionally provides 53 | * `-use_block_comm`: Use the block-cooperative `nvshmemx_float_put_nbi_block` instead of `nvshmem_float_p` for communication.
54 | * `-norm_overlap`: Enable delayed norm execution as also implemented in `multi_threaded_p2p_opt` 55 | * `-neighborhood_sync`: Use custom neighbor only sync instead of `nvshmemx_barrier_all_on_stream` 56 | 57 | The `multi_node_p2p` variant additionally provides 58 | * `-use_mc_red`: Use a device side barrier and allreduce leveraging Multicast Objects instead of MPI primitives 59 | 60 | The `nccl` variants additionally provide 61 | * `-user_buffer_reg`: Avoid extra internal copies in NCCL communication with [User Buffer Registration](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/usage/bufferreg.html#user-buffer-registration). Required NCCL APIs are available with NCCL 2.19.1 or later. NCCL 2.23.4 added support for the used communication pattern. 62 | 63 | The provided script `bench.sh` contains some examples executing all the benchmarks presented in the GTC Talks referenced above. 64 | 65 | # Developers guide 66 | The code applies the style guide implemented in [`.clang-format`](.clang-format) file. [`clang-format`](https://clang.llvm.org/docs/ClangFormat.html) version 7 or later should be used to format the code prior to submitting it. E.g. with 67 | ```sh 68 | multi-gpu-programming-models$ cd multi_threaded_copy 69 | multi_threaded_copy$ clang-format -style=file -i jacobi.cu 70 | ``` 71 | 72 | [^1]: A check for CUDA-aware support is done at compile and run time (see [the OpenMPI FAQ](https://www.open-mpi.org/faq/?category=runcuda#mpi-cuda-aware-support) for details). If your CUDA-aware MPI implementation does not support this check, which requires `MPIX_CUDA_AWARE_SUPPORT` and `MPIX_Query_cuda_support()` to be defined in `mpi-ext.h`, it can be skipped by setting `SKIP_CUDA_AWARENESS_CHECK=1`. 73 | -------------------------------------------------------------------------------- /bench.sbatch.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #SBATCH -p batch 4 | #SBATCH -N 1 5 | #SBATCH -n 8 6 | #SBATCH -t 02:00:00 7 | 8 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 9 | # 10 | # Redistribution and use in source and binary forms, with or without 11 | # modification, are permitted provided that the following conditions 12 | # are met: 13 | # * Redistributions of source code must retain the above copyright 14 | # notice, this list of conditions and the following disclaimer. 15 | # * Redistributions in binary form must reproduce the above copyright 16 | # notice, this list of conditions and the following disclaimer in the 17 | # documentation and/or other materials provided with the distribution. 18 | # * Neither the name of NVIDIA CORPORATION nor the names of its 19 | # contributors may be used to endorse or promote products derived 20 | # from this software without specific prior written permission. 21 | # 22 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 23 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 24 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 25 | # PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR 26 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 27 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 28 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 29 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 30 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 31 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 32 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 33 | 34 | : "${ENROOT_IMG_PATH:=.}" 35 | : "${LUSTRE:=.}" 36 | 37 | IMG=nvcr.io/nvidia/nvhpc:24.1-devel-cuda12.3-ubuntu22.04 38 | SQUASHFS_IMG=$ENROOT_IMG_PATH/`echo "$IMG" | md5sum | cut -f1 -d " "` 39 | CONTAINER_NAME=HPCSDK-CONTAINER 40 | 41 | CONTAINER_MNTS=$LUSTRE/workspace/multi-gpu-programming-models:/mnt 42 | 43 | start=`date` 44 | 45 | if [[ -f "$SQUASHFS_IMG" ]]; then 46 | echo "Using: $SQUASHFS_IMG" 47 | else 48 | echo "Fetching $IMG to $SQUASHFS_IMG" 49 | srun -n 1 -N 1 --ntasks-per-node=1 enroot import -o $SQUASHFS_IMG docker://$IMG 50 | echo "$IMG" > "${SQUASHFS_IMG}.url" 51 | fi 52 | 53 | CONTAINER_IMG=$SQUASHFS_IMG 54 | 55 | if [[ ! -f "$CONTAINER_IMG" ]]; then 56 | echo "Falling back to $IMG" 57 | CONTAINER_IMG=$IMG 58 | fi 59 | 60 | # Pulling container on all nodes 61 | srun -N ${SLURM_JOB_NUM_NODES} \ 62 | -n ${SLURM_JOB_NUM_NODES} \ 63 | --ntasks-per-node=1 \ 64 | --container-image=$CONTAINER_IMG \ 65 | --container-name=$CONTAINER_NAME \ 66 | true 67 | 68 | export SRUN_ARGS="--cpu-bind=none --mpi=none --no-container-remap-root --container-mounts=$CONTAINER_MNTS --container-workdir=/mnt --container-name=$CONTAINER_NAME" 69 | 70 | # HCOLL is not used silence HCOLL warnings when running on a node without a IB HCA 71 | export OMPI_MCA_coll_hcoll_enable=0 72 | 73 | export MPIRUN_ARGS="--oversubscribe" 74 | 75 | #rebuild executables 76 | srun $SRUN_ARGS -n 1 /bin/bash -c "./test.sh clean; sleep 1; ./test.sh" 77 | 78 | srun -n 1 /bin/bash -c "sudo nvidia-smi -lgc 1980,1980" 79 | 80 | srun $SRUN_ARGS -n 1 ./bench.sh 81 | 82 | srun $SRUN_ARGS -n 1 /bin/bash -c "nvidia-smi; modinfo gdrdrv; env; nvcc --version; mpicxx --version" 83 | 84 | -------------------------------------------------------------------------------- /bench.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright (c) 2017-2019 NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Redistribution and use in source and binary forms, with or without 5 | # modification, are permitted provided that the following conditions 6 | # are met: 7 | # * Redistributions of source code must retain the above copyright 8 | # notice, this list of conditions and the following disclaimer. 9 | # * Redistributions in binary form must reproduce the above copyright 10 | # notice, this list of conditions and the following disclaimer in the 11 | # documentation and/or other materials provided with the distribution. 12 | # * Neither the name of NVIDIA CORPORATION nor the names of its 13 | # contributors may be used to endorse or promote products derived 14 | # from this software without specific prior written permission. 15 | # 16 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 17 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 19 | # PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR 20 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 21 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 22 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 23 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 24 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 25 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 26 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 27 | 28 | NREP=5 29 | NXNY="20480" 30 | 31 | #DGX-1V 32 | #CPUID=0-19 33 | #FIRST_CORE=0 34 | #MAX_NUM_GPUS=8 35 | #CUDA_VISIBLE_DEVICES_SETTING=("0" "0" "0,3" "0,3,2" "0,3,2,1" "3,2,1,5,7" "0,3,2,1,5,4" "0,4,7,6,5,1,2" "0,3,2,1,5,6,7,4" ) 36 | #MPI_CPU_BINDING_OPT=("--bind-to" "core" "--map-by" "core") 37 | 38 | #DGX-2 39 | #CPUID=0-23 40 | #FIRST_CORE=0 41 | #MAX_NUM_GPUS=16 42 | #CUDA_VISIBLE_DEVICES_SETTING=("0" "0" "0,1" "0,1,2" "0,1,2,3" "0,1,2,3,4" "0,1,2,3,4,5" "0,1,2,3,4,5,6" "0,1,2,3,4,5,6,7" "0,1,2,3,4,5,6,7,8" "0,1,2,3,4,5,6,7,8,9" "0,1,2,3,4,5,6,7,8,9,10" "0,1,2,3,4,5,6,7,8,9,10,11" "0,1,2,3,4,5,6,7,8,9,10,11,12" "0,1,2,3,4,5,6,7,8,9,10,11,12,13" "0,1,2,3,4,5,6,7,8,9,10,11,12,13,14" "0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15" ) 43 | #MPI_CPU_BINDING_OPT=("--bind-to" "core" "--map-by" "core") 44 | 45 | #DGX-A100 46 | #CPUID=48-63 47 | #FIRST_CORE=48 48 | #MAX_NUM_GPUS=8 49 | #CUDA_VISIBLE_DEVICES_SETTING=("0" "0" "0,1" "0,1,2" "0,1,2,3" "0,1,2,3,4" "0,1,2,3,4,5" "0,1,2,3,4,5,6" "0,1,2,3,4,5,6,7" ) 50 | #MPI_CPU_BINDING_OPT=("--bind-to" "cpu-list:ordered" "--cpu-list" "48,49,50,51,52,53,54,55") 51 | 52 | #DGX-H100 53 | CPUID=0-55 54 | FIRST_CORE=0 55 | MAX_NUM_GPUS=8 56 | CUDA_VISIBLE_DEVICES_SETTING=("0" "0" "0,1" "0,1,2" "0,1,2,3" "0,1,2,3,4" "0,1,2,3,4,5" "0,1,2,3,4,5,6" "0,1,2,3,4,5,6,7" ) 57 | MPI_CPU_BINDING_OPT=("--bind-to" "core" "--map-by" "core") 58 | 59 | IFS=$'\n' 60 | function find_best () { 61 | declare -a RESULTS 62 | for ((i=0; i<$NREP; i++)); do 63 | RESULTS+=($("$@")) 64 | done 65 | printf '%s\n' "${RESULTS[@]}" | sort -k8 -b -t',' | head -1 66 | unset RESULTS 67 | } 68 | 69 | #Single GPU 70 | if true; then 71 | echo "type, nx, ny, iter_max, nccheck, runtime" 72 | export CUDA_VISIBLE_DEVICES="0" 73 | for (( nx=1024; nx <= 20*1024; nx+=1024 )); do 74 | find_best taskset -c ${CPUID} ./single_gpu/jacobi -csv -nx $nx -ny $nx 75 | done 76 | fi 77 | 78 | if false; then 79 | echo "type, nx, ny, iter_max, nccheck, runtime" 80 | export CUDA_VISIBLE_DEVICES="0" 81 | find_best taskset -c ${CPUID} ./single_gpu/jacobi -csv -nx ${NXNY} -ny ${NXNY} 82 | fi 83 | 84 | echo "type, nx, ny, iter_max, nccheck, num_devices, p2p, runtime, runtime_serial" 85 | 86 | #Single threaded copy - no P2P 87 | if false; then 88 | 89 | for (( NUM_GPUS=1; NUM_GPUS <= ${MAX_NUM_GPUS}; NUM_GPUS+=1 )); do 90 | export CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES_SETTING[${NUM_GPUS}]} 91 | find_best taskset -c ${CPUID} ./single_threaded_copy/jacobi -csv -nx ${NXNY} -ny ${NXNY} -nop2p 92 | done 93 | 94 | fi 95 | 96 | # Single threaded copy - P2P 97 | if false; then 98 | 99 | for (( NUM_GPUS=1; NUM_GPUS <= ${MAX_NUM_GPUS}; NUM_GPUS+=1 )); do 100 | export CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES_SETTING[${NUM_GPUS}]} 101 | find_best taskset -c ${CPUID} ./single_threaded_copy/jacobi -csv -nx ${NXNY} -ny ${NXNY} 102 | done 103 | 104 | fi 105 | 106 | #multi threaded copy without thread pinning 107 | if false; then 108 | 109 | export OMP_PROC_BIND=FALSE 110 
| unset OMP_PLACES 111 | 112 | for (( NUM_GPUS=1; NUM_GPUS <= ${MAX_NUM_GPUS}; NUM_GPUS+=1 )); do 113 | export CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES_SETTING[${NUM_GPUS}]} 114 | find_best ./multi_threaded_copy/jacobi -csv -nx ${NXNY} -ny ${NXNY} 115 | done 116 | 117 | fi 118 | 119 | export OMP_PROC_BIND=TRUE 120 | 121 | #multi threaded copy 122 | if false; then 123 | 124 | NEXT_CORE=${FIRST_CORE} 125 | OMP_PLACES="{$((NEXT_CORE))}" 126 | NEXT_CORE=$((NEXT_CORE+1)) 127 | for (( NUM_GPUS=1; NUM_GPUS <= ${MAX_NUM_GPUS}; NUM_GPUS+=1 )); do 128 | if (( NUM_GPUS > 1 )); then 129 | OMP_PLACES="${OMP_PLACES},{$((NEXT_CORE))}" 130 | NEXT_CORE=$((NEXT_CORE+1)) 131 | fi 132 | export CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES_SETTING[${NUM_GPUS}]} 133 | export OMP_PLACES 134 | find_best ./multi_threaded_copy/jacobi -csv -nx ${NXNY} -ny ${NXNY} 135 | done 136 | 137 | fi 138 | 139 | #multi threaded copy overlap 140 | if false; then 141 | 142 | NEXT_CORE=${FIRST_CORE} 143 | OMP_PLACES="{$((NEXT_CORE))}" 144 | NEXT_CORE=$((NEXT_CORE+1)) 145 | for (( NUM_GPUS=1; NUM_GPUS <= ${MAX_NUM_GPUS}; NUM_GPUS+=1 )); do 146 | if (( NUM_GPUS > 1 )); then 147 | OMP_PLACES="${OMP_PLACES},{$((NEXT_CORE))}" 148 | NEXT_CORE=$((NEXT_CORE+1)) 149 | fi 150 | export CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES_SETTING[${NUM_GPUS}]} 151 | export OMP_PLACES 152 | find_best ./multi_threaded_copy_overlap/jacobi -csv -nx ${NXNY} -ny ${NXNY} 153 | done 154 | 155 | fi 156 | 157 | #multi threaded p2p 158 | if false; then 159 | 160 | NEXT_CORE=${FIRST_CORE} 161 | OMP_PLACES="{$((NEXT_CORE))}" 162 | NEXT_CORE=$((NEXT_CORE+1)) 163 | for (( NUM_GPUS=1; NUM_GPUS <= ${MAX_NUM_GPUS}; NUM_GPUS+=1 )); do 164 | if (( NUM_GPUS > 1 )); then 165 | OMP_PLACES="${OMP_PLACES},{$((NEXT_CORE))}" 166 | NEXT_CORE=$((NEXT_CORE+1)) 167 | fi 168 | export CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES_SETTING[${NUM_GPUS}]} 169 | export OMP_PLACES 170 | find_best ./multi_threaded_p2p/jacobi -csv -nx ${NXNY} -ny ${NXNY} 171 | done 172 | 173 | fi 174 | 175 | #multi threaded p2p with delayed check 176 | if false; then 177 | 178 | NEXT_CORE=${FIRST_CORE} 179 | OMP_PLACES="{$((NEXT_CORE))}" 180 | NEXT_CORE=$((NEXT_CORE+1)) 181 | for (( NUM_GPUS=1; NUM_GPUS <= ${MAX_NUM_GPUS}; NUM_GPUS+=1 )); do 182 | if (( NUM_GPUS > 1 )); then 183 | OMP_PLACES="${OMP_PLACES},{$((NEXT_CORE))}" 184 | NEXT_CORE=$((NEXT_CORE+1)) 185 | fi 186 | export CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES_SETTING[${NUM_GPUS}]} 187 | export OMP_PLACES 188 | find_best ./multi_threaded_p2p_opt/jacobi -csv -nx ${NXNY} -ny ${NXNY} 189 | done 190 | 191 | fi 192 | 193 | if true; then 194 | 195 | for (( NUM_GPUS=1; NUM_GPUS <= ${MAX_NUM_GPUS}; NUM_GPUS+=1 )); do 196 | export CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES_SETTING[${NUM_GPUS}]} 197 | find_best mpirun ${MPIRUN_ARGS} -np ${NUM_GPUS} -x CUDA_VISIBLE_DEVICES "${MPI_CPU_BINDING_OPT[@]}" ./mpi/jacobi -csv -nx ${NXNY} -ny ${NXNY} 198 | done 199 | 200 | fi 201 | 202 | if true; then 203 | 204 | for (( NUM_GPUS=1; NUM_GPUS <= ${MAX_NUM_GPUS}; NUM_GPUS+=1 )); do 205 | export CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES_SETTING[${NUM_GPUS}]} 206 | find_best mpirun ${MPIRUN_ARGS} -np ${NUM_GPUS} -x CUDA_VISIBLE_DEVICES "${MPI_CPU_BINDING_OPT[@]}" ./mpi_overlap/jacobi -csv -nx ${NXNY} -ny ${NXNY} 207 | done 208 | 209 | fi 210 | 211 | if true; then 212 | 213 | for (( NUM_GPUS=1; NUM_GPUS <= ${MAX_NUM_GPUS}; NUM_GPUS+=1 )); do 214 | export CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES_SETTING[${NUM_GPUS}]} 215 | find_best mpirun ${MPIRUN_ARGS} -np 
${NUM_GPUS} -x CUDA_VISIBLE_DEVICES "${MPI_CPU_BINDING_OPT[@]}" ./nccl/jacobi -csv -nx ${NXNY} -ny ${NXNY} 216 | done 217 | 218 | fi 219 | 220 | if true; then 221 | 222 | for (( NUM_GPUS=1; NUM_GPUS <= ${MAX_NUM_GPUS}; NUM_GPUS+=1 )); do 223 | export CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES_SETTING[${NUM_GPUS}]} 224 | find_best mpirun ${MPIRUN_ARGS} -np ${NUM_GPUS} -x CUDA_VISIBLE_DEVICES "${MPI_CPU_BINDING_OPT[@]}" ./nccl_overlap/jacobi -csv -nx ${NXNY} -ny ${NXNY} 225 | done 226 | 227 | fi 228 | 229 | if true; then 230 | 231 | for (( NUM_GPUS=1; NUM_GPUS <= ${MAX_NUM_GPUS}; NUM_GPUS+=1 )); do 232 | export CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES_SETTING[${NUM_GPUS}]} 233 | find_best mpirun ${MPIRUN_ARGS} -np ${NUM_GPUS} -x CUDA_VISIBLE_DEVICES "${MPI_CPU_BINDING_OPT[@]}" ./nccl_graphs/jacobi -csv -nx ${NXNY} -ny ${NXNY} 234 | done 235 | 236 | fi 237 | 238 | if true; then 239 | 240 | export NVSHMEM_SYMMETRIC_SIZE=3690987520 241 | for (( NUM_GPUS=1; NUM_GPUS <= ${MAX_NUM_GPUS}; NUM_GPUS+=1 )); do 242 | export CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES_SETTING[${NUM_GPUS}]} 243 | find_best mpirun ${MPIRUN_ARGS} -np ${NUM_GPUS} -x CUDA_VISIBLE_DEVICES -x NVSHMEM_SYMMETRIC_SIZE "${MPI_CPU_BINDING_OPT[@]}" ./nvshmem/jacobi -csv -nx ${NXNY} -ny ${NXNY} 244 | done 245 | 246 | export NVSHMEM_SYMMETRIC_SIZE=3690987520 247 | for (( NUM_GPUS=1; NUM_GPUS <= ${MAX_NUM_GPUS}; NUM_GPUS+=1 )); do 248 | export CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES_SETTING[${NUM_GPUS}]} 249 | find_best mpirun ${MPIRUN_ARGS} -np ${NUM_GPUS} -x CUDA_VISIBLE_DEVICES -x NVSHMEM_SYMMETRIC_SIZE "${MPI_CPU_BINDING_OPT[@]}" ./nvshmem/jacobi -csv -neighborhood_sync -nx ${NXNY} -ny ${NXNY} 250 | done 251 | 252 | export NVSHMEM_SYMMETRIC_SIZE=3690987520 253 | for (( NUM_GPUS=1; NUM_GPUS <= ${MAX_NUM_GPUS}; NUM_GPUS+=1 )); do 254 | export CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES_SETTING[${NUM_GPUS}]} 255 | find_best mpirun ${MPIRUN_ARGS} -np ${NUM_GPUS} -x CUDA_VISIBLE_DEVICES -x NVSHMEM_SYMMETRIC_SIZE "${MPI_CPU_BINDING_OPT[@]}" ./nvshmem/jacobi -csv -neighborhood_sync -norm_overlap -nx ${NXNY} -ny ${NXNY} 256 | done 257 | 258 | fi 259 | -------------------------------------------------------------------------------- /mpi/Makefile: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved. 
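# Typical usage: `make` builds the `jacobi` binary, `make run NP=<ranks>` launches it through
# $(MPIRUN), and DISABLE_CUB=1 or SKIP_CUDA_AWARENESS_CHECK=1 can be passed on the make
# command line to toggle the corresponding conditionals below.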
2 | NP ?= 1 3 | NVCC=nvcc 4 | MPICXX=mpicxx 5 | MPIRUN ?= mpirun 6 | CUDA_HOME ?= /usr/local/cuda 7 | GENCODE_SM30 := -gencode arch=compute_30,code=sm_30 8 | GENCODE_SM35 := -gencode arch=compute_35,code=sm_35 9 | GENCODE_SM37 := -gencode arch=compute_37,code=sm_37 10 | GENCODE_SM50 := -gencode arch=compute_50,code=sm_50 11 | GENCODE_SM52 := -gencode arch=compute_52,code=sm_52 12 | GENCODE_SM60 := -gencode arch=compute_60,code=sm_60 13 | GENCODE_SM70 := -gencode arch=compute_70,code=sm_70 14 | GENCODE_SM80 := -gencode arch=compute_80,code=sm_80 15 | GENCODE_SM90 := -gencode arch=compute_90,code=sm_90 -gencode arch=compute_90,code=compute_90 16 | GENCODE_FLAGS := $(GENCODE_SM70) $(GENCODE_SM80) $(GENCODE_SM90) 17 | ifdef DISABLE_CUB 18 | NVCC_FLAGS = -Xptxas --optimize-float-atomics 19 | else 20 | NVCC_FLAGS = -DHAVE_CUB 21 | endif 22 | ifdef SKIP_CUDA_AWARENESS_CHECK 23 | MPICXX_FLAGS = -DSKIP_CUDA_AWARENESS_CHECK 24 | endif 25 | NVCC_FLAGS += -lineinfo $(GENCODE_FLAGS) -std=c++14 26 | MPICXX_FLAGS += -DUSE_NVTX -I$(CUDA_HOME)/include -std=c++14 27 | LD_FLAGS = -L$(CUDA_HOME)/lib64 -lcudart -ldl 28 | jacobi: Makefile jacobi.cpp jacobi_kernels.o 29 | $(MPICXX) $(MPICXX_FLAGS) jacobi.cpp jacobi_kernels.o $(LD_FLAGS) -o jacobi 30 | 31 | jacobi_kernels.o: Makefile jacobi_kernels.cu 32 | $(NVCC) $(NVCC_FLAGS) jacobi_kernels.cu -c 33 | 34 | .PHONY.: clean 35 | clean: 36 | rm -f jacobi jacobi_kernels.o *.nsys-rep jacobi.*.compute-sanitizer.log 37 | 38 | sanitize: jacobi 39 | $(MPIRUN) -np $(NP) compute-sanitizer --log-file jacobi.%q{OMPI_COMM_WORLD_RANK}.compute-sanitizer.log ./jacobi -niter 10 40 | 41 | run: jacobi 42 | $(MPIRUN) -np $(NP) ./jacobi 43 | 44 | profile: jacobi 45 | $(MPIRUN) -np $(NP) nsys profile --trace=mpi,cuda,nvtx -o jacobi.%q{OMPI_COMM_WORLD_RANK} ./jacobi -niter 10 46 | -------------------------------------------------------------------------------- /mpi/jacobi.cpp: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved. 2 | * 3 | * Redistribution and use in source and binary forms, with or without 4 | * modification, are permitted provided that the following conditions 5 | * are met: 6 | * * Redistributions of source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer. 8 | * * Redistributions in binary form must reproduce the above copyright 9 | * notice, this list of conditions and the following disclaimer in the 10 | * documentation and/or other materials provided with the distribution. 11 | * * Neither the name of NVIDIA CORPORATION nor the names of its 12 | * contributors may be used to endorse or promote products derived 13 | * from this software without specific prior written permission. 14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | */ 27 | #include 28 | #include 29 | #include 30 | #include 31 | #include 32 | #include 33 | 34 | #include 35 | 36 | #ifndef SKIP_CUDA_AWARENESS_CHECK 37 | #include 38 | #if !defined(MPIX_CUDA_AWARE_SUPPORT) || !MPIX_CUDA_AWARE_SUPPORT 39 | #error "The used MPI Implementation does not have CUDA-aware support or CUDA-aware \ 40 | support can't be determined. Define SKIP_CUDA_AWARENESS_CHECK to skip this check." 41 | #endif 42 | #endif 43 | 44 | #define MPI_CALL(call) \ 45 | { \ 46 | int mpi_status = call; \ 47 | if (MPI_SUCCESS != mpi_status) { \ 48 | char mpi_error_string[MPI_MAX_ERROR_STRING]; \ 49 | int mpi_error_string_length = 0; \ 50 | MPI_Error_string(mpi_status, mpi_error_string, &mpi_error_string_length); \ 51 | if (NULL != mpi_error_string) \ 52 | fprintf(stderr, \ 53 | "ERROR: MPI call \"%s\" in line %d of file %s failed " \ 54 | "with %s " \ 55 | "(%d).\n", \ 56 | #call, __LINE__, __FILE__, mpi_error_string, mpi_status); \ 57 | else \ 58 | fprintf(stderr, \ 59 | "ERROR: MPI call \"%s\" in line %d of file %s failed " \ 60 | "with %d.\n", \ 61 | #call, __LINE__, __FILE__, mpi_status); \ 62 | exit( mpi_status ); \ 63 | } \ 64 | } 65 | 66 | #include 67 | 68 | #ifdef USE_NVTX 69 | #include 70 | 71 | const uint32_t colors[] = {0x0000ff00, 0x000000ff, 0x00ffff00, 0x00ff00ff, 72 | 0x0000ffff, 0x00ff0000, 0x00ffffff}; 73 | const int num_colors = sizeof(colors) / sizeof(uint32_t); 74 | 75 | #define PUSH_RANGE(name, cid) \ 76 | { \ 77 | int color_id = cid; \ 78 | color_id = color_id % num_colors; \ 79 | nvtxEventAttributes_t eventAttrib = {0}; \ 80 | eventAttrib.version = NVTX_VERSION; \ 81 | eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE; \ 82 | eventAttrib.colorType = NVTX_COLOR_ARGB; \ 83 | eventAttrib.color = colors[color_id]; \ 84 | eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII; \ 85 | eventAttrib.message.ascii = name; \ 86 | nvtxRangePushEx(&eventAttrib); \ 87 | } 88 | #define POP_RANGE nvtxRangePop(); 89 | #else 90 | #define PUSH_RANGE(name, cid) 91 | #define POP_RANGE 92 | #endif 93 | 94 | #define CUDA_RT_CALL(call) \ 95 | { \ 96 | cudaError_t cudaStatus = call; \ 97 | if (cudaSuccess != cudaStatus) { \ 98 | fprintf(stderr, \ 99 | "ERROR: CUDA RT call \"%s\" in line %d of file %s failed " \ 100 | "with " \ 101 | "%s (%d).\n", \ 102 | #call, __LINE__, __FILE__, cudaGetErrorString(cudaStatus), cudaStatus); \ 103 | exit( cudaStatus ); \ 104 | } \ 105 | } 106 | 107 | #ifdef USE_DOUBLE 108 | typedef double real; 109 | #define MPI_REAL_TYPE MPI_DOUBLE 110 | #else 111 | typedef float real; 112 | #define MPI_REAL_TYPE MPI_FLOAT 113 | #endif 114 | 115 | constexpr real tol = 1.0e-8; 116 | 117 | const real PI = 2.0 * std::asin(1.0); 118 | 119 | void launch_initialize_boundaries(real* __restrict__ const a_new, real* __restrict__ const a, 120 | const real pi, const int offset, const int nx, const int my_ny, 121 | const int ny); 122 | 123 | void launch_jacobi_kernel(real* __restrict__ const a_new, const real* __restrict__ const a, 
124 | real* __restrict__ const l2_norm, const int iy_start, const int iy_end, 125 | const int nx, const bool calculate_norm, cudaStream_t stream); 126 | 127 | double single_gpu(const int nx, const int ny, const int iter_max, real* const a_ref_h, 128 | const int nccheck, const bool print); 129 | 130 | template 131 | T get_argval(char** begin, char** end, const std::string& arg, const T default_val) { 132 | T argval = default_val; 133 | char** itr = std::find(begin, end, arg); 134 | if (itr != end && ++itr != end) { 135 | std::istringstream inbuf(*itr); 136 | inbuf >> argval; 137 | } 138 | return argval; 139 | } 140 | 141 | bool get_arg(char** begin, char** end, const std::string& arg) { 142 | char** itr = std::find(begin, end, arg); 143 | if (itr != end) { 144 | return true; 145 | } 146 | return false; 147 | } 148 | 149 | int main(int argc, char* argv[]) { 150 | MPI_CALL(MPI_Init(&argc, &argv)); 151 | #if !defined(SKIP_CUDA_AWARENESS_CHECK) && defined(MPIX_CUDA_AWARE_SUPPORT) 152 | if (1 != MPIX_Query_cuda_support()) { 153 | fprintf(stderr, "The used MPI Implementation does not have CUDA-aware support enabled!\n"); 154 | MPI_CALL(MPI_Finalize()); 155 | return -1; 156 | } 157 | #endif 158 | int rank; 159 | MPI_CALL(MPI_Comm_rank(MPI_COMM_WORLD, &rank)); 160 | int size; 161 | MPI_CALL(MPI_Comm_size(MPI_COMM_WORLD, &size)); 162 | int num_devices = 0; 163 | CUDA_RT_CALL(cudaGetDeviceCount(&num_devices)); 164 | 165 | const int iter_max = get_argval(argv, argv + argc, "-niter", 1000); 166 | const int nccheck = get_argval(argv, argv + argc, "-nccheck", 1); 167 | const int nx = get_argval(argv, argv + argc, "-nx", 16384); 168 | const int ny = get_argval(argv, argv + argc, "-ny", 16384); 169 | const bool csv = get_arg(argv, argv + argc, "-csv"); 170 | 171 | int local_rank = -1; 172 | { 173 | MPI_Comm local_comm; 174 | MPI_CALL(MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, rank, MPI_INFO_NULL, 175 | &local_comm)); 176 | 177 | MPI_CALL(MPI_Comm_rank(local_comm, &local_rank)); 178 | 179 | MPI_CALL(MPI_Comm_free(&local_comm)); 180 | } 181 | 182 | CUDA_RT_CALL(cudaSetDevice(local_rank%num_devices)); 183 | CUDA_RT_CALL(cudaFree(0)); 184 | 185 | real* a_ref_h; 186 | CUDA_RT_CALL(cudaMallocHost(&a_ref_h, nx * ny * sizeof(real))); 187 | real* a_h; 188 | CUDA_RT_CALL(cudaMallocHost(&a_h, nx * ny * sizeof(real))); 189 | double runtime_serial = single_gpu(nx, ny, iter_max, a_ref_h, nccheck, !csv && (0 == rank)); 190 | 191 | // ny - 2 rows are distributed amongst `size` ranks in such a way 192 | // that each rank gets either (ny - 2) / size or (ny - 2) / size + 1 rows. 
193 | // This optimizes load balancing when (ny - 2) % size != 0 194 | int chunk_size; 195 | int chunk_size_low = (ny - 2) / size; 196 | int chunk_size_high = chunk_size_low + 1; 197 | // To calculate the number of ranks that need to compute an extra row, 198 | // the following formula is derived from this equation: 199 | // num_ranks_low * chunk_size_low + (size - num_ranks_low) * (chunk_size_low + 1) = ny - 2 200 | int num_ranks_low = size * chunk_size_low + size - 201 | (ny - 2); // Number of ranks with chunk_size = chunk_size_low 202 | if (rank < num_ranks_low) 203 | chunk_size = chunk_size_low; 204 | else 205 | chunk_size = chunk_size_high; 206 | 207 | real* a; 208 | CUDA_RT_CALL(cudaMalloc(&a, nx * (chunk_size + 2) * sizeof(real))); 209 | real* a_new; 210 | CUDA_RT_CALL(cudaMalloc(&a_new, nx * (chunk_size + 2) * sizeof(real))); 211 | 212 | CUDA_RT_CALL(cudaMemset(a, 0, nx * (chunk_size + 2) * sizeof(real))); 213 | CUDA_RT_CALL(cudaMemset(a_new, 0, nx * (chunk_size + 2) * sizeof(real))); 214 | 215 | // Calculate local domain boundaries 216 | int iy_start_global; // My start index in the global array 217 | if (rank < num_ranks_low) { 218 | iy_start_global = rank * chunk_size_low + 1; 219 | } else { 220 | iy_start_global = 221 | num_ranks_low * chunk_size_low + (rank - num_ranks_low) * chunk_size_high + 1; 222 | } 223 | int iy_end_global = iy_start_global + chunk_size - 1; // My last index in the global array 224 | 225 | int iy_start = 1; 226 | int iy_end = iy_start + chunk_size; 227 | 228 | // Set Dirichlet boundary conditions on left and right border 229 | launch_initialize_boundaries(a, a_new, PI, iy_start_global - 1, nx, (chunk_size + 2), ny); 230 | CUDA_RT_CALL(cudaDeviceSynchronize()); 231 | 232 | cudaStream_t compute_stream; 233 | CUDA_RT_CALL(cudaStreamCreate(&compute_stream)); 234 | cudaEvent_t compute_done; 235 | CUDA_RT_CALL(cudaEventCreateWithFlags(&compute_done, cudaEventDisableTiming)); 236 | 237 | real* l2_norm_d; 238 | CUDA_RT_CALL(cudaMalloc(&l2_norm_d, sizeof(real))); 239 | real* l2_norm_h; 240 | CUDA_RT_CALL(cudaMallocHost(&l2_norm_h, sizeof(real))); 241 | 242 | PUSH_RANGE("MPI_Warmup", 5) 243 | for (int i = 0; i < 10; ++i) { 244 | const int top = rank > 0 ?
rank - 1 : (size - 1); 245 | const int bottom = (rank + 1) % size; 246 | MPI_CALL(MPI_Sendrecv(a_new + iy_start * nx, nx, MPI_REAL_TYPE, top, 0, 247 | a_new + (iy_end * nx), nx, MPI_REAL_TYPE, bottom, 0, MPI_COMM_WORLD, 248 | MPI_STATUS_IGNORE)); 249 | MPI_CALL(MPI_Sendrecv(a_new + (iy_end - 1) * nx, nx, MPI_REAL_TYPE, bottom, 0, a_new, nx, 250 | MPI_REAL_TYPE, top, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE)); 251 | std::swap(a_new, a); 252 | } 253 | POP_RANGE 254 | 255 | CUDA_RT_CALL(cudaDeviceSynchronize()); 256 | 257 | if (!csv && 0 == rank) { 258 | printf( 259 | "Jacobi relaxation: %d iterations on %d x %d mesh with norm check " 260 | "every %d iterations\n", 261 | iter_max, ny, nx, nccheck); 262 | } 263 | 264 | int iter = 0; 265 | real l2_norm = 1.0; 266 | bool calculate_norm = true; // boolean to store whether l2 norm will be calculated in 267 | // an iteration or not 268 | 269 | MPI_CALL(MPI_Barrier(MPI_COMM_WORLD)); 270 | double start = MPI_Wtime(); 271 | PUSH_RANGE("Jacobi solve", 0) 272 | while (l2_norm > tol && iter < iter_max) { 273 | CUDA_RT_CALL(cudaMemsetAsync(l2_norm_d, 0, sizeof(real), compute_stream)); 274 | 275 | calculate_norm = (iter % nccheck) == 0 || (!csv && (iter % 100) == 0); 276 | 277 | launch_jacobi_kernel(a_new, a, l2_norm_d, iy_start, iy_end, nx, calculate_norm, 278 | compute_stream); 279 | CUDA_RT_CALL(cudaEventRecord(compute_done, compute_stream)); 280 | 281 | if (calculate_norm) { 282 | CUDA_RT_CALL(cudaMemcpyAsync(l2_norm_h, l2_norm_d, sizeof(real), cudaMemcpyDeviceToHost, 283 | compute_stream)); 284 | } 285 | 286 | const int top = rank > 0 ? rank - 1 : (size - 1); 287 | const int bottom = (rank + 1) % size; 288 | 289 | // Apply periodic boundary conditions 290 | CUDA_RT_CALL(cudaEventSynchronize(compute_done)); 291 | PUSH_RANGE("MPI", 5) 292 | MPI_CALL(MPI_Sendrecv(a_new + iy_start * nx, nx, MPI_REAL_TYPE, top, 0, 293 | a_new + (iy_end * nx), nx, MPI_REAL_TYPE, bottom, 0, MPI_COMM_WORLD, 294 | MPI_STATUS_IGNORE)); 295 | MPI_CALL(MPI_Sendrecv(a_new + (iy_end - 1) * nx, nx, MPI_REAL_TYPE, bottom, 0, a_new, nx, 296 | MPI_REAL_TYPE, top, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE)); 297 | POP_RANGE 298 | 299 | if (calculate_norm) { 300 | CUDA_RT_CALL(cudaStreamSynchronize(compute_stream)); 301 | MPI_CALL(MPI_Allreduce(l2_norm_h, &l2_norm, 1, MPI_REAL_TYPE, MPI_SUM, MPI_COMM_WORLD)); 302 | l2_norm = std::sqrt(l2_norm); 303 | 304 | if (!csv && 0 == rank && (iter % 100) == 0) { 305 | printf("%5d, %0.6f\n", iter, l2_norm); 306 | } 307 | } 308 | 309 | std::swap(a_new, a); 310 | iter++; 311 | } 312 | double stop = MPI_Wtime(); 313 | POP_RANGE 314 | 315 | CUDA_RT_CALL(cudaMemcpy(a_h + iy_start_global * nx, a + nx, 316 | std::min((ny - iy_start_global) * nx, chunk_size * nx) * sizeof(real), 317 | cudaMemcpyDeviceToHost)); 318 | 319 | int result_correct = 1; 320 | for (int iy = iy_start_global; result_correct && (iy < iy_end_global); ++iy) { 321 | for (int ix = 1; result_correct && (ix < (nx - 1)); ++ix) { 322 | if (std::fabs(a_ref_h[iy * nx + ix] - a_h[iy * nx + ix]) > tol) { 323 | fprintf(stderr, 324 | "ERROR on rank %d: a[%d * %d + %d] = %f does not match %f " 325 | "(reference)\n", 326 | rank, iy, nx, ix, a_h[iy * nx + ix], a_ref_h[iy * nx + ix]); 327 | result_correct = 0; 328 | } 329 | } 330 | } 331 | 332 | int global_result_correct = 1; 333 | MPI_CALL(MPI_Allreduce(&result_correct, &global_result_correct, 1, MPI_INT, MPI_MIN, 334 | MPI_COMM_WORLD)); 335 | result_correct = global_result_correct; 336 | 337 | if (rank == 0 && result_correct) { 338 | if (csv) { 339 | 
printf("mpi, %d, %d, %d, %d, %d, 1, %f, %f\n", nx, ny, iter_max, nccheck, size, 340 | (stop - start), runtime_serial); 341 | } else { 342 | printf("Num GPUs: %d.\n", size); 343 | printf( 344 | "%dx%d: 1 GPU: %8.4f s, %d GPUs: %8.4f s, speedup: %8.2f, " 345 | "efficiency: %8.2f \n", 346 | ny, nx, runtime_serial, size, (stop - start), runtime_serial / (stop - start), 347 | runtime_serial / (size * (stop - start)) * 100); 348 | } 349 | } 350 | CUDA_RT_CALL(cudaEventDestroy(compute_done)); 351 | CUDA_RT_CALL(cudaStreamDestroy(compute_stream)); 352 | 353 | CUDA_RT_CALL(cudaFreeHost(l2_norm_h)); 354 | CUDA_RT_CALL(cudaFree(l2_norm_d)); 355 | 356 | CUDA_RT_CALL(cudaFree(a_new)); 357 | CUDA_RT_CALL(cudaFree(a)); 358 | 359 | CUDA_RT_CALL(cudaFreeHost(a_h)); 360 | CUDA_RT_CALL(cudaFreeHost(a_ref_h)); 361 | 362 | MPI_CALL(MPI_Finalize()); 363 | return (result_correct == 1) ? 0 : 1; 364 | } 365 | 366 | double single_gpu(const int nx, const int ny, const int iter_max, real* const a_ref_h, 367 | const int nccheck, const bool print) { 368 | real* a; 369 | real* a_new; 370 | 371 | cudaStream_t compute_stream; 372 | cudaStream_t push_top_stream; 373 | cudaStream_t push_bottom_stream; 374 | cudaEvent_t compute_done; 375 | cudaEvent_t push_top_done; 376 | cudaEvent_t push_bottom_done; 377 | 378 | real* l2_norm_d; 379 | real* l2_norm_h; 380 | 381 | int iy_start = 1; 382 | int iy_end = (ny - 1); 383 | 384 | CUDA_RT_CALL(cudaMalloc(&a, nx * ny * sizeof(real))); 385 | CUDA_RT_CALL(cudaMalloc(&a_new, nx * ny * sizeof(real))); 386 | 387 | CUDA_RT_CALL(cudaMemset(a, 0, nx * ny * sizeof(real))); 388 | CUDA_RT_CALL(cudaMemset(a_new, 0, nx * ny * sizeof(real))); 389 | 390 | // Set diriclet boundary conditions on left and right boarder 391 | launch_initialize_boundaries(a, a_new, PI, 0, nx, ny, ny); 392 | CUDA_RT_CALL(cudaDeviceSynchronize()); 393 | 394 | CUDA_RT_CALL(cudaStreamCreate(&compute_stream)); 395 | CUDA_RT_CALL(cudaStreamCreate(&push_top_stream)); 396 | CUDA_RT_CALL(cudaStreamCreate(&push_bottom_stream)); 397 | CUDA_RT_CALL(cudaEventCreateWithFlags(&compute_done, cudaEventDisableTiming)); 398 | CUDA_RT_CALL(cudaEventCreateWithFlags(&push_top_done, cudaEventDisableTiming)); 399 | CUDA_RT_CALL(cudaEventCreateWithFlags(&push_bottom_done, cudaEventDisableTiming)); 400 | 401 | CUDA_RT_CALL(cudaMalloc(&l2_norm_d, sizeof(real))); 402 | CUDA_RT_CALL(cudaMallocHost(&l2_norm_h, sizeof(real))); 403 | 404 | CUDA_RT_CALL(cudaDeviceSynchronize()); 405 | 406 | if (print) 407 | printf( 408 | "Single GPU jacobi relaxation: %d iterations on %d x %d mesh with " 409 | "norm " 410 | "check every %d iterations\n", 411 | iter_max, ny, nx, nccheck); 412 | 413 | int iter = 0; 414 | real l2_norm = 1.0; 415 | bool calculate_norm = true; 416 | 417 | double start = MPI_Wtime(); 418 | PUSH_RANGE("Jacobi solve", 0) 419 | while (l2_norm > tol && iter < iter_max) { 420 | CUDA_RT_CALL(cudaMemsetAsync(l2_norm_d, 0, sizeof(real), compute_stream)); 421 | 422 | CUDA_RT_CALL(cudaStreamWaitEvent(compute_stream, push_top_done, 0)); 423 | CUDA_RT_CALL(cudaStreamWaitEvent(compute_stream, push_bottom_done, 0)); 424 | 425 | calculate_norm = (iter % nccheck) == 0 || (iter % 100) == 0; 426 | launch_jacobi_kernel(a_new, a, l2_norm_d, iy_start, iy_end, nx, calculate_norm, 427 | compute_stream); 428 | CUDA_RT_CALL(cudaEventRecord(compute_done, compute_stream)); 429 | 430 | if (calculate_norm) { 431 | CUDA_RT_CALL(cudaMemcpyAsync(l2_norm_h, l2_norm_d, sizeof(real), cudaMemcpyDeviceToHost, 432 | compute_stream)); 433 | } 434 | 435 | // Apply periodic boundary 
conditions 436 | 437 | CUDA_RT_CALL(cudaStreamWaitEvent(push_top_stream, compute_done, 0)); 438 | CUDA_RT_CALL(cudaMemcpyAsync(a_new, a_new + (iy_end - 1) * nx, nx * sizeof(real), 439 | cudaMemcpyDeviceToDevice, push_top_stream)); 440 | CUDA_RT_CALL(cudaEventRecord(push_top_done, push_top_stream)); 441 | 442 | CUDA_RT_CALL(cudaStreamWaitEvent(push_bottom_stream, compute_done, 0)); 443 | CUDA_RT_CALL(cudaMemcpyAsync(a_new + iy_end * nx, a_new + iy_start * nx, nx * sizeof(real), 444 | cudaMemcpyDeviceToDevice, compute_stream)); 445 | CUDA_RT_CALL(cudaEventRecord(push_bottom_done, push_bottom_stream)); 446 | 447 | if (calculate_norm) { 448 | CUDA_RT_CALL(cudaStreamSynchronize(compute_stream)); 449 | l2_norm = *l2_norm_h; 450 | l2_norm = std::sqrt(l2_norm); 451 | if (print && (iter % 100) == 0) printf("%5d, %0.6f\n", iter, l2_norm); 452 | } 453 | 454 | std::swap(a_new, a); 455 | iter++; 456 | } 457 | POP_RANGE 458 | double stop = MPI_Wtime(); 459 | 460 | CUDA_RT_CALL(cudaMemcpy(a_ref_h, a, nx * ny * sizeof(real), cudaMemcpyDeviceToHost)); 461 | 462 | CUDA_RT_CALL(cudaEventDestroy(push_bottom_done)); 463 | CUDA_RT_CALL(cudaEventDestroy(push_top_done)); 464 | CUDA_RT_CALL(cudaEventDestroy(compute_done)); 465 | CUDA_RT_CALL(cudaStreamDestroy(push_bottom_stream)); 466 | CUDA_RT_CALL(cudaStreamDestroy(push_top_stream)); 467 | CUDA_RT_CALL(cudaStreamDestroy(compute_stream)); 468 | 469 | CUDA_RT_CALL(cudaFreeHost(l2_norm_h)); 470 | CUDA_RT_CALL(cudaFree(l2_norm_d)); 471 | 472 | CUDA_RT_CALL(cudaFree(a_new)); 473 | CUDA_RT_CALL(cudaFree(a)); 474 | return (stop - start); 475 | } 476 | -------------------------------------------------------------------------------- /mpi/jacobi_kernels.cu: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved. 2 | * 3 | * Redistribution and use in source and binary forms, with or without 4 | * modification, are permitted provided that the following conditions 5 | * are met: 6 | * * Redistributions of source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer. 8 | * * Redistributions in binary form must reproduce the above copyright 9 | * notice, this list of conditions and the following disclaimer in the 10 | * documentation and/or other materials provided with the distribution. 11 | * * Neither the name of NVIDIA CORPORATION nor the names of its 12 | * contributors may be used to endorse or promote products derived 13 | * from this software without specific prior written permission. 14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
26 | */ 27 | #include <cstdio> 28 | #include <cstdlib> 29 | 30 | #ifdef HAVE_CUB 31 | #include <cub/block/block_reduce.cuh> 32 | #endif // HAVE_CUB 33 | 34 | #define CUDA_RT_CALL(call) \ 35 | { \ 36 | cudaError_t cudaStatus = call; \ 37 | if (cudaSuccess != cudaStatus) { \ 38 | fprintf(stderr, \ 39 | "ERROR: CUDA RT call \"%s\" in line %d of file %s failed " \ 40 | "with " \ 41 | "%s (%d).\n", \ 42 | #call, __LINE__, __FILE__, cudaGetErrorString(cudaStatus), cudaStatus); \ 43 | exit( cudaStatus ); \ 44 | } \ 45 | } 46 | 47 | #ifdef USE_DOUBLE 48 | typedef double real; 49 | #define MPI_REAL_TYPE MPI_DOUBLE 50 | #else 51 | typedef float real; 52 | #define MPI_REAL_TYPE MPI_FLOAT 53 | #endif 54 | 55 | __global__ void initialize_boundaries(real* __restrict__ const a_new, real* __restrict__ const a, 56 | const real pi, const int offset, const int nx, 57 | const int my_ny, const int ny) { 58 | for (int iy = blockIdx.x * blockDim.x + threadIdx.x; iy < my_ny; iy += blockDim.x * gridDim.x) { 59 | const real y0 = sin(2.0 * pi * (offset + iy) / (ny - 1)); 60 | a[iy * nx + 0] = y0; 61 | a[iy * nx + (nx - 1)] = y0; 62 | a_new[iy * nx + 0] = y0; 63 | a_new[iy * nx + (nx - 1)] = y0; 64 | } 65 | } 66 | 67 | void launch_initialize_boundaries(real* __restrict__ const a_new, real* __restrict__ const a, 68 | const real pi, const int offset, const int nx, const int my_ny, 69 | const int ny) { 70 | initialize_boundaries<<<my_ny / 128 + 1, 128>>>(a_new, a, pi, offset, nx, my_ny, ny); 71 | CUDA_RT_CALL(cudaGetLastError()); 72 | } 73 | 74 | template <int BLOCK_DIM_X, int BLOCK_DIM_Y> 75 | __global__ void jacobi_kernel(real* __restrict__ const a_new, const real* __restrict__ const a, 76 | real* __restrict__ const l2_norm, const int iy_start, 77 | const int iy_end, const int nx, const bool calculate_norm) { 78 | #ifdef HAVE_CUB 79 | typedef cub::BlockReduce<real, BLOCK_DIM_X, cub::BLOCK_REDUCE_WARP_REDUCTIONS, BLOCK_DIM_Y> 80 | BlockReduce; 81 | __shared__ typename BlockReduce::TempStorage temp_storage; 82 | #endif // HAVE_CUB 83 | int iy = blockIdx.y * blockDim.y + threadIdx.y + iy_start; 84 | int ix = blockIdx.x * blockDim.x + threadIdx.x + 1; 85 | real local_l2_norm = 0.0; 86 | 87 | if (iy < iy_end && ix < (nx - 1)) { 88 | const real new_val = 0.25 * (a[iy * nx + ix + 1] + a[iy * nx + ix - 1] + 89 | a[(iy + 1) * nx + ix] + a[(iy - 1) * nx + ix]); 90 | a_new[iy * nx + ix] = new_val; 91 | if (calculate_norm) { 92 | real residue = new_val - a[iy * nx + ix]; 93 | local_l2_norm += residue * residue; 94 | } 95 | } 96 | if (calculate_norm) { 97 | #ifdef HAVE_CUB 98 | real block_l2_norm = BlockReduce(temp_storage).Sum(local_l2_norm); 99 | if (0 == threadIdx.y && 0 == threadIdx.x) atomicAdd(l2_norm, block_l2_norm); 100 | #else 101 | atomicAdd(l2_norm, local_l2_norm); 102 | #endif // HAVE_CUB 103 | } 104 | } 105 | 106 | void launch_jacobi_kernel(real* __restrict__ const a_new, const real* __restrict__ const a, 107 | real* __restrict__ const l2_norm, const int iy_start, const int iy_end, 108 | const int nx, const bool calculate_norm, cudaStream_t stream) { 109 | constexpr int dim_block_x = 32; 110 | constexpr int dim_block_y = 32; 111 | dim3 dim_grid((nx + dim_block_x - 1) / dim_block_x, 112 | ((iy_end - iy_start) + dim_block_y - 1) / dim_block_y, 1); 113 | jacobi_kernel<dim_block_x, dim_block_y><<<dim_grid, {dim_block_x, dim_block_y, 1}, 0, stream>>>( 114 | a_new, a, l2_norm, iy_start, iy_end, nx, calculate_norm); 115 | CUDA_RT_CALL(cudaGetLastError()); 116 | } 117 | -------------------------------------------------------------------------------- /mpi_overlap/Makefile: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
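# Typical usage: `make` builds the `jacobi` binary, `make run NP=<ranks>` launches it through
# $(MPIRUN), and DISABLE_CUB=1 or SKIP_CUDA_AWARENESS_CHECK=1 can be passed on the make
# command line to toggle the corresponding conditionals below.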
2 | NP ?= 1 3 | NVCC=nvcc 4 | MPICXX=mpicxx 5 | MPIRUN ?= mpirun 6 | CUDA_HOME ?= /usr/local/cuda 7 | GENCODE_SM30 := -gencode arch=compute_30,code=sm_30 8 | GENCODE_SM35 := -gencode arch=compute_35,code=sm_35 9 | GENCODE_SM37 := -gencode arch=compute_37,code=sm_37 10 | GENCODE_SM50 := -gencode arch=compute_50,code=sm_50 11 | GENCODE_SM52 := -gencode arch=compute_52,code=sm_52 12 | GENCODE_SM60 := -gencode arch=compute_60,code=sm_60 13 | GENCODE_SM70 := -gencode arch=compute_70,code=sm_70 14 | GENCODE_SM80 := -gencode arch=compute_80,code=sm_80 15 | GENCODE_SM90 := -gencode arch=compute_90,code=sm_90 -gencode arch=compute_90,code=compute_90 16 | GENCODE_FLAGS := $(GENCODE_SM70) $(GENCODE_SM80) $(GENCODE_SM90) 17 | ifdef DISABLE_CUB 18 | NVCC_FLAGS = -Xptxas --optimize-float-atomics 19 | else 20 | NVCC_FLAGS = -DHAVE_CUB 21 | endif 22 | ifdef SKIP_CUDA_AWARENESS_CHECK 23 | MPICXX_FLAGS = -DSKIP_CUDA_AWARENESS_CHECK 24 | endif 25 | NVCC_FLAGS += -lineinfo $(GENCODE_FLAGS) -std=c++14 26 | MPICXX_FLAGS += -DUSE_NVTX -I$(CUDA_HOME)/include -std=c++14 27 | LD_FLAGS = -L$(CUDA_HOME)/lib64 -lcudart -ldl 28 | jacobi: Makefile jacobi.cpp jacobi_kernels.o 29 | $(MPICXX) $(MPICXX_FLAGS) jacobi.cpp jacobi_kernels.o $(LD_FLAGS) -o jacobi 30 | 31 | jacobi_kernels.o: Makefile jacobi_kernels.cu 32 | $(NVCC) $(NVCC_FLAGS) jacobi_kernels.cu -c 33 | 34 | .PHONY.: clean 35 | clean: 36 | rm -f jacobi jacobi_kernels.o *.nsys-rep jacobi.*.compute-sanitizer.log 37 | 38 | sanitize: jacobi 39 | $(MPIRUN) -np $(NP) compute-sanitizer --log-file jacobi.%q{OMPI_COMM_WORLD_RANK}.compute-sanitizer.log ./jacobi -niter 10 40 | 41 | run: jacobi 42 | $(MPIRUN) -np $(NP) ./jacobi 43 | 44 | profile: jacobi 45 | $(MPIRUN) -np $(NP) nsys profile --trace=mpi,cuda,nvtx -o jacobi.%q{OMPI_COMM_WORLD_RANK} ./jacobi -niter 10 46 | -------------------------------------------------------------------------------- /mpi_overlap/jacobi.cpp: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved. 2 | * 3 | * Redistribution and use in source and binary forms, with or without 4 | * modification, are permitted provided that the following conditions 5 | * are met: 6 | * * Redistributions of source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer. 8 | * * Redistributions in binary form must reproduce the above copyright 9 | * notice, this list of conditions and the following disclaimer in the 10 | * documentation and/or other materials provided with the distribution. 11 | * * Neither the name of NVIDIA CORPORATION nor the names of its 12 | * contributors may be used to endorse or promote products derived 13 | * from this software without specific prior written permission. 14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | */ 27 | #include 28 | #include 29 | #include 30 | #include 31 | #include 32 | #include 33 | 34 | #include 35 | 36 | #ifndef SKIP_CUDA_AWARENESS_CHECK 37 | #include 38 | #if !defined(MPIX_CUDA_AWARE_SUPPORT) || !MPIX_CUDA_AWARE_SUPPORT 39 | #error "The used MPI Implementation does not have CUDA-aware support or CUDA-aware \ 40 | support can't be determined. Define SKIP_CUDA_AWARENESS_CHECK to skip this check." 41 | #endif 42 | #endif 43 | 44 | #define MPI_CALL(call) \ 45 | { \ 46 | int mpi_status = call; \ 47 | if (MPI_SUCCESS != mpi_status) { \ 48 | char mpi_error_string[MPI_MAX_ERROR_STRING]; \ 49 | int mpi_error_string_length = 0; \ 50 | MPI_Error_string(mpi_status, mpi_error_string, &mpi_error_string_length); \ 51 | if (NULL != mpi_error_string) \ 52 | fprintf(stderr, \ 53 | "ERROR: MPI call \"%s\" in line %d of file %s failed " \ 54 | "with %s " \ 55 | "(%d).\n", \ 56 | #call, __LINE__, __FILE__, mpi_error_string, mpi_status); \ 57 | else \ 58 | fprintf(stderr, \ 59 | "ERROR: MPI call \"%s\" in line %d of file %s failed " \ 60 | "with %d.\n", \ 61 | #call, __LINE__, __FILE__, mpi_status); \ 62 | exit( mpi_status ); \ 63 | } \ 64 | } 65 | 66 | #include 67 | 68 | #ifdef USE_NVTX 69 | #include 70 | 71 | const uint32_t colors[] = {0x0000ff00, 0x000000ff, 0x00ffff00, 0x00ff00ff, 72 | 0x0000ffff, 0x00ff0000, 0x00ffffff}; 73 | const int num_colors = sizeof(colors) / sizeof(uint32_t); 74 | 75 | #define PUSH_RANGE(name, cid) \ 76 | { \ 77 | int color_id = cid; \ 78 | color_id = color_id % num_colors; \ 79 | nvtxEventAttributes_t eventAttrib = {0}; \ 80 | eventAttrib.version = NVTX_VERSION; \ 81 | eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE; \ 82 | eventAttrib.colorType = NVTX_COLOR_ARGB; \ 83 | eventAttrib.color = colors[color_id]; \ 84 | eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII; \ 85 | eventAttrib.message.ascii = name; \ 86 | nvtxRangePushEx(&eventAttrib); \ 87 | } 88 | #define POP_RANGE nvtxRangePop(); 89 | #else 90 | #define PUSH_RANGE(name, cid) 91 | #define POP_RANGE 92 | #endif 93 | 94 | #define CUDA_RT_CALL(call) \ 95 | { \ 96 | cudaError_t cudaStatus = call; \ 97 | if (cudaSuccess != cudaStatus) { \ 98 | fprintf(stderr, \ 99 | "ERROR: CUDA RT call \"%s\" in line %d of file %s failed " \ 100 | "with " \ 101 | "%s (%d).\n", \ 102 | #call, __LINE__, __FILE__, cudaGetErrorString(cudaStatus), cudaStatus); \ 103 | exit( cudaStatus ); \ 104 | } \ 105 | } 106 | 107 | #ifdef USE_DOUBLE 108 | typedef double real; 109 | #define MPI_REAL_TYPE MPI_DOUBLE 110 | #else 111 | typedef float real; 112 | #define MPI_REAL_TYPE MPI_FLOAT 113 | #endif 114 | 115 | constexpr real tol = 1.0e-8; 116 | 117 | const real PI = 2.0 * std::asin(1.0); 118 | 119 | void launch_initialize_boundaries(real* __restrict__ const a_new, real* __restrict__ const a, 120 | const real pi, const int offset, const int nx, const int my_ny, 121 | const int ny); 122 | 123 | void launch_jacobi_kernel(real* __restrict__ const a_new, const real* __restrict__ const a, 
124 | real* __restrict__ const l2_norm, const int iy_start, const int iy_end, 125 | const int nx, const bool calculate_norm, cudaStream_t stream); 126 | 127 | double single_gpu(const int nx, const int ny, const int iter_max, real* const a_ref_h, 128 | const int nccheck, const bool print); 129 | 130 | template 131 | T get_argval(char** begin, char** end, const std::string& arg, const T default_val) { 132 | T argval = default_val; 133 | char** itr = std::find(begin, end, arg); 134 | if (itr != end && ++itr != end) { 135 | std::istringstream inbuf(*itr); 136 | inbuf >> argval; 137 | } 138 | return argval; 139 | } 140 | 141 | bool get_arg(char** begin, char** end, const std::string& arg) { 142 | char** itr = std::find(begin, end, arg); 143 | if (itr != end) { 144 | return true; 145 | } 146 | return false; 147 | } 148 | 149 | int main(int argc, char* argv[]) { 150 | MPI_CALL(MPI_Init(&argc, &argv)); 151 | #if !defined(SKIP_CUDA_AWARENESS_CHECK) && defined(MPIX_CUDA_AWARE_SUPPORT) 152 | if (1 != MPIX_Query_cuda_support()) { 153 | fprintf(stderr, "The used MPI Implementation does not have CUDA-aware support enabled!\n"); 154 | MPI_CALL(MPI_Finalize()); 155 | return -1; 156 | } 157 | #endif 158 | int rank; 159 | MPI_CALL(MPI_Comm_rank(MPI_COMM_WORLD, &rank)); 160 | int size; 161 | MPI_CALL(MPI_Comm_size(MPI_COMM_WORLD, &size)); 162 | int num_devices = 0; 163 | CUDA_RT_CALL(cudaGetDeviceCount(&num_devices)); 164 | 165 | const int iter_max = get_argval(argv, argv + argc, "-niter", 1000); 166 | const int nccheck = get_argval(argv, argv + argc, "-nccheck", 1); 167 | const int nx = get_argval(argv, argv + argc, "-nx", 16384); 168 | const int ny = get_argval(argv, argv + argc, "-ny", 16384); 169 | const bool csv = get_arg(argv, argv + argc, "-csv"); 170 | const bool use_hp_streams = get_arg(argv, argv + argc, "-use_hp_streams"); 171 | 172 | if ( nccheck > 1 && !use_hp_streams && 0 == rank ) 173 | { 174 | fprintf(stderr, 175 | "WARN: When not calculating the norm in every iteration kernels might be executed in " 176 | "an order that breaks communication computation overlap. Also enable -use_hp_streams " 177 | "to avoid this issue.\n"); 178 | } 179 | 180 | int local_rank = -1; 181 | { 182 | MPI_Comm local_comm; 183 | MPI_CALL(MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, rank, MPI_INFO_NULL, 184 | &local_comm)); 185 | 186 | MPI_CALL(MPI_Comm_rank(local_comm, &local_rank)); 187 | 188 | MPI_CALL(MPI_Comm_free(&local_comm)); 189 | } 190 | 191 | CUDA_RT_CALL(cudaSetDevice(local_rank%num_devices)); 192 | CUDA_RT_CALL(cudaFree(0)); 193 | 194 | real* a_ref_h; 195 | CUDA_RT_CALL(cudaMallocHost(&a_ref_h, nx * ny * sizeof(real))); 196 | real* a_h; 197 | CUDA_RT_CALL(cudaMallocHost(&a_h, nx * ny * sizeof(real))); 198 | double runtime_serial = single_gpu(nx, ny, iter_max, a_ref_h, nccheck, !csv && (0 == rank)); 199 | 200 | // ny - 2 rows are distributed amongst `size` ranks in such a way 201 | // that each rank gets either (ny - 2) / size or (ny - 2) / size + 1 rows. 
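    // (Worked example added for illustration; it is not in the original source. With the
    //  default ny = 16384 and size = 4 ranks there are ny - 2 = 16382 interior rows, so
    //  chunk_size_low = 4095 and num_ranks_low = 4 * 4095 + 4 - 16382 = 2: ranks 0 and 1
    //  compute 4095 rows, ranks 2 and 3 compute 4096 rows, and 2 * 4095 + 2 * 4096 = 16382.)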
202 | // This optimizes load balancing when (ny - 2) % size != 0 203 | int chunk_size; 204 | int chunk_size_low = (ny - 2) / size; 205 | int chunk_size_high = chunk_size_low + 1; 206 | // To calculate the number of ranks that need to compute an extra row, 207 | // the following formula is derived from this equation: 208 | // num_ranks_low * chunk_size_low + (size - num_ranks_low) * (chunk_size_low + 1) = ny - 2 209 | int num_ranks_low = size * chunk_size_low + size - 210 | (ny - 2); // Number of ranks with chunk_size = chunk_size_low 211 | if (rank < num_ranks_low) 212 | chunk_size = chunk_size_low; 213 | else 214 | chunk_size = chunk_size_high; 215 | 216 | real* a; 217 | CUDA_RT_CALL(cudaMalloc(&a, nx * (chunk_size + 2) * sizeof(real))); 218 | real* a_new; 219 | CUDA_RT_CALL(cudaMalloc(&a_new, nx * (chunk_size + 2) * sizeof(real))); 220 | 221 | CUDA_RT_CALL(cudaMemset(a, 0, nx * (chunk_size + 2) * sizeof(real))); 222 | CUDA_RT_CALL(cudaMemset(a_new, 0, nx * (chunk_size + 2) * sizeof(real))); 223 | 224 | // Calculate local domain boundaries 225 | int iy_start_global; // My start index in the global array 226 | if (rank < num_ranks_low) { 227 | iy_start_global = rank * chunk_size_low + 1; 228 | } else { 229 | iy_start_global = 230 | num_ranks_low * chunk_size_low + (rank - num_ranks_low) * chunk_size_high + 1; 231 | } 232 | int iy_end_global = iy_start_global + chunk_size - 1; // My last index in the global array 233 | 234 | int iy_start = 1; 235 | int iy_end = iy_start + chunk_size; 236 | 237 | // Set diriclet boundary conditions on left and right boarder 238 | launch_initialize_boundaries(a, a_new, PI, iy_start_global - 1, nx, (chunk_size + 2), ny); 239 | CUDA_RT_CALL(cudaDeviceSynchronize()); 240 | 241 | int leastPriority = 0; 242 | int greatestPriority = leastPriority; 243 | CUDA_RT_CALL(cudaDeviceGetStreamPriorityRange(&leastPriority, &greatestPriority)); 244 | cudaStream_t compute_stream; 245 | cudaStream_t push_top_stream; 246 | cudaStream_t push_bottom_stream; 247 | if (use_hp_streams) { 248 | CUDA_RT_CALL(cudaStreamCreateWithPriority(&compute_stream, cudaStreamDefault, leastPriority)); 249 | CUDA_RT_CALL( 250 | cudaStreamCreateWithPriority(&push_top_stream, cudaStreamDefault, greatestPriority)); 251 | CUDA_RT_CALL( 252 | cudaStreamCreateWithPriority(&push_bottom_stream, cudaStreamDefault, greatestPriority)); 253 | } else { 254 | CUDA_RT_CALL(cudaStreamCreate(&compute_stream)); 255 | CUDA_RT_CALL(cudaStreamCreate(&push_top_stream)); 256 | CUDA_RT_CALL(cudaStreamCreate(&push_bottom_stream)); 257 | } 258 | 259 | cudaEvent_t push_top_done; 260 | CUDA_RT_CALL(cudaEventCreateWithFlags(&push_top_done, cudaEventDisableTiming)); 261 | cudaEvent_t push_bottom_done; 262 | CUDA_RT_CALL(cudaEventCreateWithFlags(&push_bottom_done, cudaEventDisableTiming)); 263 | cudaEvent_t reset_l2norm_done; 264 | CUDA_RT_CALL(cudaEventCreateWithFlags(&reset_l2norm_done, cudaEventDisableTiming)); 265 | 266 | real* l2_norm_d; 267 | CUDA_RT_CALL(cudaMalloc(&l2_norm_d, sizeof(real))); 268 | real* l2_norm_h; 269 | CUDA_RT_CALL(cudaMallocHost(&l2_norm_h, sizeof(real))); 270 | 271 | PUSH_RANGE("MPI_Warmup", 5) 272 | for (int i = 0; i < 10; ++i) { 273 | const int top = rank > 0 ? 
rank - 1 : (size - 1); 274 | const int bottom = (rank + 1) % size; 275 | MPI_CALL(MPI_Sendrecv(a_new + iy_start * nx, nx, MPI_REAL_TYPE, top, 0, 276 | a_new + (iy_end * nx), nx, MPI_REAL_TYPE, bottom, 0, MPI_COMM_WORLD, 277 | MPI_STATUS_IGNORE)); 278 | MPI_CALL(MPI_Sendrecv(a_new + (iy_end - 1) * nx, nx, MPI_REAL_TYPE, bottom, 0, a_new, nx, 279 | MPI_REAL_TYPE, top, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE)); 280 | std::swap(a_new, a); 281 | } 282 | POP_RANGE 283 | 284 | CUDA_RT_CALL(cudaDeviceSynchronize()); 285 | 286 | if (!csv && 0 == rank) { 287 | printf( 288 | "Jacobi relaxation: %d iterations on %d x %d mesh with norm check " 289 | "every %d iterations\n", 290 | iter_max, ny, nx, nccheck); 291 | } 292 | 293 | int iter = 0; 294 | bool calculate_norm = true; 295 | real l2_norm = 1.0; 296 | 297 | MPI_CALL(MPI_Barrier(MPI_COMM_WORLD)); 298 | double start = MPI_Wtime(); 299 | PUSH_RANGE("Jacobi solve", 0) 300 | while (l2_norm > tol && iter < iter_max) { 301 | CUDA_RT_CALL(cudaMemsetAsync(l2_norm_d, 0, sizeof(real), compute_stream)); 302 | CUDA_RT_CALL(cudaEventRecord(reset_l2norm_done, compute_stream)); 303 | 304 | if (use_hp_streams) { 305 | launch_jacobi_kernel(a_new, a, l2_norm_d, (iy_start + 1), (iy_end - 1), nx, 306 | calculate_norm, compute_stream); 307 | } 308 | 309 | CUDA_RT_CALL(cudaStreamWaitEvent(push_top_stream, reset_l2norm_done, 0)); 310 | calculate_norm = (iter % nccheck) == 0 || (!csv && (iter % 100) == 0); 311 | launch_jacobi_kernel(a_new, a, l2_norm_d, iy_start, (iy_start + 1), nx, calculate_norm, 312 | push_top_stream); 313 | CUDA_RT_CALL(cudaEventRecord(push_top_done, push_top_stream)); 314 | 315 | CUDA_RT_CALL(cudaStreamWaitEvent(push_bottom_stream, reset_l2norm_done, 0)); 316 | launch_jacobi_kernel(a_new, a, l2_norm_d, (iy_end - 1), iy_end, nx, calculate_norm, 317 | push_bottom_stream); 318 | CUDA_RT_CALL(cudaEventRecord(push_bottom_done, push_bottom_stream)); 319 | 320 | if (!use_hp_streams) { 321 | launch_jacobi_kernel(a_new, a, l2_norm_d, (iy_start + 1), (iy_end - 1), nx, 322 | calculate_norm, compute_stream); 323 | } 324 | 325 | if (calculate_norm) { 326 | CUDA_RT_CALL(cudaStreamWaitEvent(compute_stream, push_top_done, 0)); 327 | CUDA_RT_CALL(cudaStreamWaitEvent(compute_stream, push_bottom_done, 0)); 328 | CUDA_RT_CALL(cudaMemcpyAsync(l2_norm_h, l2_norm_d, sizeof(real), cudaMemcpyDeviceToHost, 329 | compute_stream)); 330 | } 331 | 332 | const int top = rank > 0 ? 
rank - 1 : (size - 1); 333 | const int bottom = (rank + 1) % size; 334 | 335 | // Apply periodic boundary conditions 336 | CUDA_RT_CALL(cudaStreamSynchronize(push_top_stream)); 337 | PUSH_RANGE("MPI", 5) 338 | MPI_CALL(MPI_Sendrecv(a_new + iy_start * nx, nx, MPI_REAL_TYPE, top, 0, 339 | a_new + (iy_end * nx), nx, MPI_REAL_TYPE, bottom, 0, MPI_COMM_WORLD, 340 | MPI_STATUS_IGNORE)); 341 | CUDA_RT_CALL(cudaStreamSynchronize(push_bottom_stream)); 342 | MPI_CALL(MPI_Sendrecv(a_new + (iy_end - 1) * nx, nx, MPI_REAL_TYPE, bottom, 0, a_new, nx, 343 | MPI_REAL_TYPE, top, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE)); 344 | POP_RANGE 345 | 346 | if (calculate_norm) { 347 | CUDA_RT_CALL(cudaStreamSynchronize(compute_stream)); 348 | MPI_CALL(MPI_Allreduce(l2_norm_h, &l2_norm, 1, MPI_REAL_TYPE, MPI_SUM, MPI_COMM_WORLD)); 349 | l2_norm = std::sqrt(l2_norm); 350 | 351 | if (!csv && 0 == rank && (iter % 100) == 0) { 352 | printf("%5d, %0.6f\n", iter, l2_norm); 353 | } 354 | } 355 | 356 | std::swap(a_new, a); 357 | iter++; 358 | } 359 | double stop = MPI_Wtime(); 360 | POP_RANGE 361 | 362 | CUDA_RT_CALL(cudaMemcpy(a_h + iy_start_global * nx, a + nx, 363 | std::min((ny - iy_start_global) * nx, chunk_size * nx) * sizeof(real), 364 | cudaMemcpyDeviceToHost)); 365 | 366 | int result_correct = 1; 367 | for (int iy = iy_start_global; result_correct && (iy < iy_end_global); ++iy) { 368 | for (int ix = 1; result_correct && (ix < (nx - 1)); ++ix) { 369 | if (std::fabs(a_ref_h[iy * nx + ix] - a_h[iy * nx + ix]) > tol) { 370 | fprintf(stderr, 371 | "ERROR on rank %d: a[%d * %d + %d] = %f does not match %f " 372 | "(reference)\n", 373 | rank, iy, nx, ix, a_h[iy * nx + ix], a_ref_h[iy * nx + ix]); 374 | result_correct = 0; 375 | } 376 | } 377 | } 378 | 379 | int global_result_correct = 1; 380 | MPI_CALL(MPI_Allreduce(&result_correct, &global_result_correct, 1, MPI_INT, MPI_MIN, 381 | MPI_COMM_WORLD)); 382 | result_correct = global_result_correct; 383 | 384 | if (rank == 0 && result_correct) { 385 | if (csv) { 386 | printf("mpi_overlap, %d, %d, %d, %d, %d, 1, %f, %f\n", nx, ny, iter_max, nccheck, size, 387 | (stop - start), runtime_serial); 388 | } else { 389 | printf("Num GPUs: %d.\n", size); 390 | printf( 391 | "%dx%d: 1 GPU: %8.4f s, %d GPUs: %8.4f s, speedup: %8.2f, " 392 | "efficiency: %8.2f \n", 393 | ny, nx, runtime_serial, size, (stop - start), runtime_serial / (stop - start), 394 | runtime_serial / (size * (stop - start)) * 100); 395 | } 396 | } 397 | CUDA_RT_CALL(cudaEventDestroy(reset_l2norm_done)); 398 | CUDA_RT_CALL(cudaEventDestroy(push_bottom_done)); 399 | CUDA_RT_CALL(cudaEventDestroy(push_top_done)); 400 | CUDA_RT_CALL(cudaStreamDestroy(push_bottom_stream)); 401 | CUDA_RT_CALL(cudaStreamDestroy(push_top_stream)); 402 | CUDA_RT_CALL(cudaStreamDestroy(compute_stream)); 403 | 404 | CUDA_RT_CALL(cudaFreeHost(l2_norm_h)); 405 | CUDA_RT_CALL(cudaFree(l2_norm_d)); 406 | 407 | CUDA_RT_CALL(cudaFree(a_new)); 408 | CUDA_RT_CALL(cudaFree(a)); 409 | 410 | CUDA_RT_CALL(cudaFreeHost(a_h)); 411 | CUDA_RT_CALL(cudaFreeHost(a_ref_h)); 412 | 413 | MPI_CALL(MPI_Finalize()); 414 | return (result_correct == 1) ? 
0 : 1; 415 | } 416 | 417 | double single_gpu(const int nx, const int ny, const int iter_max, real* const a_ref_h, 418 | const int nccheck, const bool print) { 419 | real* a; 420 | real* a_new; 421 | 422 | cudaStream_t compute_stream; 423 | cudaStream_t push_top_stream; 424 | cudaStream_t push_bottom_stream; 425 | cudaEvent_t compute_done; 426 | cudaEvent_t push_top_done; 427 | cudaEvent_t push_bottom_done; 428 | 429 | real* l2_norm_d; 430 | real* l2_norm_h; 431 | 432 | int iy_start = 1; 433 | int iy_end = (ny - 1); 434 | 435 | CUDA_RT_CALL(cudaMalloc(&a, nx * ny * sizeof(real))); 436 | CUDA_RT_CALL(cudaMalloc(&a_new, nx * ny * sizeof(real))); 437 | 438 | CUDA_RT_CALL(cudaMemset(a, 0, nx * ny * sizeof(real))); 439 | CUDA_RT_CALL(cudaMemset(a_new, 0, nx * ny * sizeof(real))); 440 | 441 | // Set diriclet boundary conditions on left and right boarder 442 | launch_initialize_boundaries(a, a_new, PI, 0, nx, ny, ny); 443 | CUDA_RT_CALL(cudaDeviceSynchronize()); 444 | 445 | CUDA_RT_CALL(cudaStreamCreate(&compute_stream)); 446 | CUDA_RT_CALL(cudaStreamCreate(&push_top_stream)); 447 | CUDA_RT_CALL(cudaStreamCreate(&push_bottom_stream)); 448 | CUDA_RT_CALL(cudaEventCreateWithFlags(&compute_done, cudaEventDisableTiming)); 449 | CUDA_RT_CALL(cudaEventCreateWithFlags(&push_top_done, cudaEventDisableTiming)); 450 | CUDA_RT_CALL(cudaEventCreateWithFlags(&push_bottom_done, cudaEventDisableTiming)); 451 | 452 | CUDA_RT_CALL(cudaMalloc(&l2_norm_d, sizeof(real))); 453 | CUDA_RT_CALL(cudaMallocHost(&l2_norm_h, sizeof(real))); 454 | 455 | CUDA_RT_CALL(cudaDeviceSynchronize()); 456 | 457 | if (print) 458 | printf( 459 | "Single GPU jacobi relaxation: %d iterations on %d x %d mesh with " 460 | "norm " 461 | "check every %d iterations\n", 462 | iter_max, ny, nx, nccheck); 463 | 464 | int iter = 0; 465 | bool calculate_norm = true; 466 | real l2_norm = 1.0; 467 | 468 | double start = MPI_Wtime(); 469 | PUSH_RANGE("Jacobi solve", 0) 470 | while (l2_norm > tol && iter < iter_max) { 471 | CUDA_RT_CALL(cudaMemsetAsync(l2_norm_d, 0, sizeof(real), compute_stream)); 472 | 473 | CUDA_RT_CALL(cudaStreamWaitEvent(compute_stream, push_top_done, 0)); 474 | CUDA_RT_CALL(cudaStreamWaitEvent(compute_stream, push_bottom_done, 0)); 475 | 476 | calculate_norm = (iter % nccheck) == 0 || (iter % 100) == 0; 477 | launch_jacobi_kernel(a_new, a, l2_norm_d, iy_start, iy_end, nx, calculate_norm, 478 | compute_stream); 479 | CUDA_RT_CALL(cudaEventRecord(compute_done, compute_stream)); 480 | 481 | if (calculate_norm) { 482 | CUDA_RT_CALL(cudaMemcpyAsync(l2_norm_h, l2_norm_d, sizeof(real), cudaMemcpyDeviceToHost, 483 | compute_stream)); 484 | } 485 | 486 | // Apply periodic boundary conditions 487 | 488 | CUDA_RT_CALL(cudaStreamWaitEvent(push_top_stream, compute_done, 0)); 489 | CUDA_RT_CALL(cudaMemcpyAsync(a_new, a_new + (iy_end - 1) * nx, nx * sizeof(real), 490 | cudaMemcpyDeviceToDevice, push_top_stream)); 491 | CUDA_RT_CALL(cudaEventRecord(push_top_done, push_top_stream)); 492 | 493 | CUDA_RT_CALL(cudaStreamWaitEvent(push_bottom_stream, compute_done, 0)); 494 | CUDA_RT_CALL(cudaMemcpyAsync(a_new + iy_end * nx, a_new + iy_start * nx, nx * sizeof(real), 495 | cudaMemcpyDeviceToDevice, compute_stream)); 496 | CUDA_RT_CALL(cudaEventRecord(push_bottom_done, push_bottom_stream)); 497 | 498 | if (calculate_norm) { 499 | CUDA_RT_CALL(cudaStreamSynchronize(compute_stream)); 500 | l2_norm = *l2_norm_h; 501 | l2_norm = std::sqrt(l2_norm); 502 | if (print && (iter % 100) == 0) printf("%5d, %0.6f\n", iter, l2_norm); 503 | } 504 | 505 | 
std::swap(a_new, a); 506 | iter++; 507 | } 508 | POP_RANGE 509 | double stop = MPI_Wtime(); 510 | 511 | CUDA_RT_CALL(cudaMemcpy(a_ref_h, a, nx * ny * sizeof(real), cudaMemcpyDeviceToHost)); 512 | 513 | CUDA_RT_CALL(cudaEventDestroy(push_bottom_done)); 514 | CUDA_RT_CALL(cudaEventDestroy(push_top_done)); 515 | CUDA_RT_CALL(cudaEventDestroy(compute_done)); 516 | CUDA_RT_CALL(cudaStreamDestroy(push_bottom_stream)); 517 | CUDA_RT_CALL(cudaStreamDestroy(push_top_stream)); 518 | CUDA_RT_CALL(cudaStreamDestroy(compute_stream)); 519 | 520 | CUDA_RT_CALL(cudaFreeHost(l2_norm_h)); 521 | CUDA_RT_CALL(cudaFree(l2_norm_d)); 522 | 523 | CUDA_RT_CALL(cudaFree(a_new)); 524 | CUDA_RT_CALL(cudaFree(a)); 525 | return (stop - start); 526 | } 527 | -------------------------------------------------------------------------------- /mpi_overlap/jacobi_kernels.cu: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved. 2 | * 3 | * Redistribution and use in source and binary forms, with or without 4 | * modification, are permitted provided that the following conditions 5 | * are met: 6 | * * Redistributions of source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer. 8 | * * Redistributions in binary form must reproduce the above copyright 9 | * notice, this list of conditions and the following disclaimer in the 10 | * documentation and/or other materials provided with the distribution. 11 | * * Neither the name of NVIDIA CORPORATION nor the names of its 12 | * contributors may be used to endorse or promote products derived 13 | * from this software without specific prior written permission. 14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
26 | */ 27 | #include 28 | #include 29 | 30 | #ifdef HAVE_CUB 31 | #include 32 | #endif // HAVE_CUB 33 | 34 | #define CUDA_RT_CALL(call) \ 35 | { \ 36 | cudaError_t cudaStatus = call; \ 37 | if (cudaSuccess != cudaStatus) { \ 38 | fprintf(stderr, \ 39 | "ERROR: CUDA RT call \"%s\" in line %d of file %s failed " \ 40 | "with " \ 41 | "%s (%d).\n", \ 42 | #call, __LINE__, __FILE__, cudaGetErrorString(cudaStatus), cudaStatus); \ 43 | exit( cudaStatus ); \ 44 | } \ 45 | } 46 | 47 | #ifdef USE_DOUBLE 48 | typedef double real; 49 | #define MPI_REAL_TYPE MPI_DOUBLE 50 | #else 51 | typedef float real; 52 | #define MPI_REAL_TYPE MPI_FLOAT 53 | #endif 54 | 55 | __global__ void initialize_boundaries(real* __restrict__ const a_new, real* __restrict__ const a, 56 | const real pi, const int offset, const int nx, 57 | const int my_ny, const int ny) { 58 | for (int iy = blockIdx.x * blockDim.x + threadIdx.x; iy < my_ny; iy += blockDim.x * gridDim.x) { 59 | const real y0 = sin(2.0 * pi * (offset + iy) / (ny - 1)); 60 | a[iy * nx + 0] = y0; 61 | a[iy * nx + (nx - 1)] = y0; 62 | a_new[iy * nx + 0] = y0; 63 | a_new[iy * nx + (nx - 1)] = y0; 64 | } 65 | } 66 | 67 | void launch_initialize_boundaries(real* __restrict__ const a_new, real* __restrict__ const a, 68 | const real pi, const int offset, const int nx, const int my_ny, 69 | const int ny) { 70 | initialize_boundaries<<>>(a_new, a, pi, offset, nx, my_ny, ny); 71 | CUDA_RT_CALL(cudaGetLastError()); 72 | } 73 | 74 | template 75 | __global__ void jacobi_kernel(real* __restrict__ const a_new, const real* __restrict__ const a, 76 | real* __restrict__ const l2_norm, const int iy_start, 77 | const int iy_end, const int nx, const bool calculate_norm) { 78 | #ifdef HAVE_CUB 79 | typedef cub::BlockReduce 80 | BlockReduce; 81 | __shared__ typename BlockReduce::TempStorage temp_storage; 82 | #endif // HAVE_CUB 83 | int iy = blockIdx.y * blockDim.y + threadIdx.y + iy_start; 84 | int ix = blockIdx.x * blockDim.x + threadIdx.x + 1; 85 | real local_l2_norm = 0.0; 86 | 87 | if (iy < iy_end && ix < (nx - 1)) { 88 | const real new_val = 0.25 * (a[iy * nx + ix + 1] + a[iy * nx + ix - 1] + 89 | a[(iy + 1) * nx + ix] + a[(iy - 1) * nx + ix]); 90 | a_new[iy * nx + ix] = new_val; 91 | if (calculate_norm) { 92 | real residue = new_val - a[iy * nx + ix]; 93 | local_l2_norm += residue * residue; 94 | } 95 | } 96 | if (calculate_norm) { 97 | #ifdef HAVE_CUB 98 | real block_l2_norm = BlockReduce(temp_storage).Sum(local_l2_norm); 99 | if (0 == threadIdx.y && 0 == threadIdx.x) atomicAdd(l2_norm, block_l2_norm); 100 | #else 101 | atomicAdd(l2_norm, local_l2_norm); 102 | #endif // HAVE_CUB 103 | } 104 | } 105 | 106 | void launch_jacobi_kernel(real* __restrict__ const a_new, const real* __restrict__ const a, 107 | real* __restrict__ const l2_norm, const int iy_start, const int iy_end, 108 | const int nx, const bool calculate_norm, cudaStream_t stream) { 109 | constexpr int dim_block_x = 32; 110 | constexpr int dim_block_y = 32; 111 | dim3 dim_grid((nx + dim_block_x - 1) / dim_block_x, 112 | ((iy_end - iy_start) + dim_block_y - 1) / dim_block_y, 1); 113 | jacobi_kernel<<>>( 114 | a_new, a, l2_norm, iy_start, iy_end, nx, calculate_norm); 115 | CUDA_RT_CALL(cudaGetLastError()); 116 | } 117 | -------------------------------------------------------------------------------- /multi_node_p2p/Makefile: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved. 
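# Illustrative launch sketch, added as commentary and not part of the original Makefile.
# The option names below mirror those parsed by the jacobi sources shown in this
# repository; the rank count and values are arbitrary examples:
#   mpirun -np 8 ./jacobi -nx 16384 -ny 16384 -niter 1000 -nccheck 100 -csv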
2 | NP ?= 1 3 | NVCC=nvcc 4 | MPICXX=mpicxx 5 | MPIRUN ?= mpirun 6 | CUDA_HOME ?= /usr/local/cuda 7 | GENCODE_SM30 := -gencode arch=compute_30,code=sm_30 8 | GENCODE_SM35 := -gencode arch=compute_35,code=sm_35 9 | GENCODE_SM37 := -gencode arch=compute_37,code=sm_37 10 | GENCODE_SM50 := -gencode arch=compute_50,code=sm_50 11 | GENCODE_SM52 := -gencode arch=compute_52,code=sm_52 12 | GENCODE_SM60 := -gencode arch=compute_60,code=sm_60 13 | GENCODE_SM70 := -gencode arch=compute_70,code=sm_70 14 | GENCODE_SM80 := -gencode arch=compute_80,code=sm_80 15 | GENCODE_SM90 := -gencode arch=compute_90,code=sm_90 -gencode arch=compute_90,code=compute_90 16 | GENCODE_FLAGS := $(GENCODE_SM70) $(GENCODE_SM80) $(GENCODE_SM90) 17 | ifdef DISABLE_CUB 18 | NVCC_FLAGS = -Xptxas --optimize-float-atomics 19 | else 20 | NVCC_FLAGS = -DHAVE_CUB 21 | endif 22 | ifdef SKIP_CUDA_AWARENESS_CHECK 23 | MPICXX_FLAGS = -DSKIP_CUDA_AWARENESS_CHECK 24 | endif 25 | NVCC_FLAGS += -lineinfo $(GENCODE_FLAGS) -std=c++14 26 | MPICXX_FLAGS += -DUSE_NVTX -I$(CUDA_HOME)/include -std=c++14 27 | LD_FLAGS = -L$(CUDA_HOME)/lib64 -lcudart -lcuda -ldl 28 | jacobi: Makefile jacobi.cpp jacobi_kernels.o 29 | $(MPICXX) $(MPICXX_FLAGS) jacobi.cpp jacobi_kernels.o $(LD_FLAGS) -o jacobi 30 | 31 | jacobi_kernels.o: Makefile jacobi_kernels.cu 32 | $(NVCC) $(NVCC_FLAGS) jacobi_kernels.cu -c 33 | 34 | .PHONY.: clean 35 | clean: 36 | rm -f jacobi jacobi_kernels.o *.nsys-rep jacobi.*.compute-sanitizer.log 37 | 38 | sanitize: jacobi 39 | $(MPIRUN) -np $(NP) compute-sanitizer --log-file jacobi.%q{OMPI_COMM_WORLD_RANK}.compute-sanitizer.log ./jacobi -niter 10 40 | 41 | run: jacobi 42 | $(MPIRUN) -np $(NP) ./jacobi 43 | 44 | profile: jacobi 45 | $(MPIRUN) -np $(NP) nsys profile --trace=mpi,cuda,nvtx -o jacobi.%q{OMPI_COMM_WORLD_RANK} ./jacobi -niter 10 46 | -------------------------------------------------------------------------------- /multi_node_p2p/jacobi_kernels.cu: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2017-2018, 2024, NVIDIA CORPORATION. All rights reserved. 2 | * 3 | * Redistribution and use in source and binary forms, with or without 4 | * modification, are permitted provided that the following conditions 5 | * are met: 6 | * * Redistributions of source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer. 8 | * * Redistributions in binary form must reproduce the above copyright 9 | * notice, this list of conditions and the following disclaimer in the 10 | * documentation and/or other materials provided with the distribution. 11 | * * Neither the name of NVIDIA CORPORATION nor the names of its 12 | * contributors may be used to endorse or promote products derived 13 | * from this software without specific prior written permission. 14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | */ 27 | #include 28 | #include 29 | #include 30 | #include 31 | #include 32 | 33 | #ifdef HAVE_CUB 34 | #include 35 | #endif // HAVE_CUB 36 | 37 | #define CUDA_RT_CALL(call) \ 38 | { \ 39 | cudaError_t cudaStatus = call; \ 40 | if (cudaSuccess != cudaStatus) { \ 41 | fprintf(stderr, \ 42 | "ERROR: CUDA RT call \"%s\" in line %d of file %s failed " \ 43 | "with " \ 44 | "%s (%d).\n", \ 45 | #call, __LINE__, __FILE__, cudaGetErrorString(cudaStatus), cudaStatus); \ 46 | exit(cudaStatus); \ 47 | } \ 48 | } 49 | 50 | #ifdef USE_DOUBLE 51 | typedef double real; 52 | #define MPI_REAL_TYPE MPI_DOUBLE 53 | #else 54 | typedef float real; 55 | #define MPI_REAL_TYPE MPI_FLOAT 56 | #endif 57 | 58 | struct real_int_pair { 59 | real value; 60 | unsigned int arrival_counter; 61 | }; 62 | 63 | __global__ void initialize_boundaries(real* __restrict__ const a_new, real* __restrict__ const a, 64 | const real pi, const int offset, const int nx, 65 | const int my_ny, const int ny) { 66 | for (int iy = blockIdx.x * blockDim.x + threadIdx.x; iy < my_ny; iy += blockDim.x * gridDim.x) { 67 | const real y0 = sin(2.0 * pi * (offset + iy) / (ny - 1)); 68 | a[iy * nx + 0] = y0; 69 | a[iy * nx + (nx - 1)] = y0; 70 | a_new[iy * nx + 0] = y0; 71 | a_new[iy * nx + (nx - 1)] = y0; 72 | } 73 | } 74 | 75 | void launch_initialize_boundaries(real* __restrict__ const a_new, real* __restrict__ const a, 76 | const real pi, const int offset, const int nx, const int my_ny, 77 | const int ny) { 78 | initialize_boundaries<<>>(a_new, a, pi, offset, nx, my_ny, ny); 79 | CUDA_RT_CALL(cudaGetLastError()); 80 | } 81 | 82 | template 83 | __global__ void jacobi_kernel(real* __restrict__ const a_new, const real* __restrict__ const a, 84 | real* __restrict__ const l2_norm, const int iy_start, 85 | const int iy_end, const int nx, const bool calculate_norm) { 86 | #ifdef HAVE_CUB 87 | typedef cub::BlockReduce 88 | BlockReduce; 89 | __shared__ typename BlockReduce::TempStorage temp_storage; 90 | #endif // HAVE_CUB 91 | int iy = blockIdx.y * blockDim.y + threadIdx.y + iy_start; 92 | int ix = blockIdx.x * blockDim.x + threadIdx.x + 1; 93 | real local_l2_norm = 0.0; 94 | 95 | if (iy < iy_end && ix < (nx - 1)) { 96 | const real new_val = 0.25 * (a[iy * nx + ix + 1] + a[iy * nx + ix - 1] + 97 | a[(iy + 1) * nx + ix] + a[(iy - 1) * nx + ix]); 98 | a_new[iy * nx + ix] = new_val; 99 | if (calculate_norm) { 100 | real residue = new_val - a[iy * nx + ix]; 101 | local_l2_norm += residue * residue; 102 | } 103 | } 104 | if (calculate_norm) { 105 | #ifdef HAVE_CUB 106 | real block_l2_norm = BlockReduce(temp_storage).Sum(local_l2_norm); 107 | if (0 == threadIdx.y && 0 == threadIdx.x) atomicAdd(l2_norm, block_l2_norm); 108 | #else 109 | atomicAdd(l2_norm, local_l2_norm); 110 | #endif // HAVE_CUB 111 | } 112 | } 113 | 114 | void launch_jacobi_kernel(real* __restrict__ const a_new, const real* __restrict__ const a, 115 | real* __restrict__ const l2_norm, const int iy_start, const int iy_end, 116 | const 
int nx, const bool calculate_norm, cudaStream_t stream) { 117 | constexpr int dim_block_x = 32; 118 | constexpr int dim_block_y = 32; 119 | dim3 dim_grid((nx + dim_block_x - 1) / dim_block_x, 120 | ((iy_end - iy_start) + dim_block_y - 1) / dim_block_y, 1); 121 | jacobi_kernel<<>>( 122 | a_new, a, l2_norm, iy_start, iy_end, nx, calculate_norm); 123 | CUDA_RT_CALL(cudaGetLastError()); 124 | } 125 | 126 | template 127 | __global__ void jacobi_p2p_kernel(real* __restrict__ const a_new, const real* __restrict__ const a, 128 | real* __restrict__ const l2_norm, const int iy_start, 129 | const int iy_end, const int nx, 130 | real* __restrict__ const a_new_top, const int top_iy, 131 | real* __restrict__ const a_new_bottom, const int bottom_iy, 132 | const bool calculate_norm) { 133 | #ifdef HAVE_CUB 134 | typedef cub::BlockReduce 135 | BlockReduce; 136 | __shared__ typename BlockReduce::TempStorage temp_storage; 137 | #endif // HAVE_CUB 138 | int iy = blockIdx.y * blockDim.y + threadIdx.y + iy_start; 139 | int ix = blockIdx.x * blockDim.x + threadIdx.x + 1; 140 | real local_l2_norm = 0.0; 141 | 142 | if (iy < iy_end && ix < (nx - 1)) { 143 | const real new_val = 0.25 * (a[iy * nx + ix + 1] + a[iy * nx + ix - 1] + 144 | a[(iy + 1) * nx + ix] + a[(iy - 1) * nx + ix]); 145 | a_new[iy * nx + ix] = new_val; 146 | 147 | if (iy_start == iy) { 148 | a_new_top[top_iy * nx + ix] = new_val; 149 | } 150 | 151 | if ((iy_end - 1) == iy) { 152 | a_new_bottom[bottom_iy * nx + ix] = new_val; 153 | } 154 | 155 | if (calculate_norm) { 156 | real residue = new_val - a[iy * nx + ix]; 157 | local_l2_norm += residue * residue; 158 | } 159 | } 160 | if (calculate_norm) { 161 | #ifdef HAVE_CUB 162 | real block_l2_norm = BlockReduce(temp_storage).Sum(local_l2_norm); 163 | if (0 == threadIdx.y && 0 == threadIdx.x) atomicAdd(l2_norm, block_l2_norm); 164 | #else 165 | atomicAdd(l2_norm, local_l2_norm); 166 | #endif // HAVE_CUB 167 | } 168 | } 169 | 170 | void launch_jacobi_p2p_kernel(real* __restrict__ const a_new, const real* __restrict__ const a, 171 | real* __restrict__ const l2_norm, const int iy_start, 172 | const int iy_end, const int nx, real* __restrict__ const a_new_top, 173 | const int top_iy, real* __restrict__ const a_new_bottom, 174 | const int bottom_iy, const bool calculate_norm, cudaStream_t stream) { 175 | constexpr int dim_block_x = 32; 176 | constexpr int dim_block_y = 32; 177 | dim3 dim_grid((nx + dim_block_x - 1) / dim_block_x, 178 | ((iy_end - iy_start) + dim_block_y - 1) / dim_block_y, 1); 179 | jacobi_p2p_kernel 180 | <<>>( 181 | a_new, a, l2_norm, iy_start, iy_end, nx, a_new_top, top_iy, a_new_bottom, bottom_iy,calculate_norm); 182 | CUDA_RT_CALL(cudaGetLastError()); 183 | } 184 | 185 | __global__ void all_reduce_norm_barrier_kernel(real* const l2_norm, 186 | real_int_pair* partial_l2_norm_uc, 187 | real_int_pair* partial_l2_norm_mc, 188 | const unsigned int expected_count) { 189 | assert(1 == blockDim.x * blockDim.y * blockDim.z * gridDim.x * gridDim.y * gridDim.z); 190 | real l2_norm_sum = 0.0; 191 | #if __CUDA_ARCH__ >= 900 192 | // atomic reduction to all replicas 193 | // this can be conceptually thought of as __threadfence_system(); atomicAdd_system(arrival_counter_mc, 1); 194 | // See https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-multimem-ld-reduce-multimem-st-multimem-red 195 | // for multimem PTX doc 196 | asm volatile ("multimem.red.release.sys.global.add.u32 [%0], %1;" ::"l"(&(partial_l2_norm_mc->arrival_counter)), "n"(1) : 
"memory"); 197 | 198 | // Need a fence between MC and UC access to the same memory: 199 | // - fence.proxy instructions establish an ordering between memory accesses that may happen through different proxies 200 | // - Value .alias of the .proxykind qualifier refers to memory accesses performed using virtually aliased addresses to the same memory location. 201 | // from https://docs.nvidia.com/cuda/parallel-thread-execution/#parallel-synchronization-and-communication-instructions-membar 202 | asm volatile ("fence.proxy.alias;" ::: "memory"); 203 | 204 | // spin wait with acquire ordering on UC mapping till all peers have arrived in this iteration 205 | // Note: all ranks reach an MPI_Barrier after this kernel, such that it is not possible for the barrier to be unblocked by an 206 | // arrival of a rank for the next iteration if some other rank is slow. 207 | cuda::atomic_ref ac(partial_l2_norm_uc->arrival_counter); 208 | while (expected_count > ac.load(cuda::memory_order_acquire)); 209 | 210 | // Atomic load reduction from all replicas. It does not provide ordering so it can be relaxed. 211 | #ifdef USE_DOUBLE 212 | asm volatile ("multimem.ld_reduce.relaxed.sys.global.add.f64 %0, [%1];" : "=d"(l2_norm_sum) : "l"(&(partial_l2_norm_mc->value)) : "memory"); 213 | #else 214 | asm volatile ("multimem.ld_reduce.relaxed.sys.global.add.f32 %0, [%1];" : "=f"(l2_norm_sum) : "l"(&(partial_l2_norm_mc->value)) : "memory"); 215 | #endif 216 | #endif 217 | *l2_norm = std::sqrt(l2_norm_sum); 218 | } 219 | 220 | void launch_all_reduce_norm_barrier_kernel(real* __restrict__ const l2_norm, 221 | real_int_pair* __restrict__ partial_l2_norm_uc, 222 | real_int_pair* __restrict__ partial_l2_norm_mc, 223 | const int num_gpus, const int iter, 224 | cudaStream_t stream) { 225 | // calculating expected count as unsigned for well defined overflow to correctly handle large 226 | // iteration counts with many GPUs 227 | unsigned int expected_count = num_gpus; 228 | // iter starts at 0 so need to scale with iter+1 229 | expected_count *= (iter + 1); 230 | all_reduce_norm_barrier_kernel<<<1, 1, 0, stream>>>(l2_norm, partial_l2_norm_uc, 231 | partial_l2_norm_mc, expected_count); 232 | CUDA_RT_CALL(cudaGetLastError()); 233 | } 234 | -------------------------------------------------------------------------------- /multi_threaded_copy/Makefile: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved. 
2 | NVCC=nvcc 3 | GENCODE_SM30 := -gencode arch=compute_30,code=sm_30 4 | GENCODE_SM35 := -gencode arch=compute_35,code=sm_35 5 | GENCODE_SM37 := -gencode arch=compute_37,code=sm_37 6 | GENCODE_SM50 := -gencode arch=compute_50,code=sm_50 7 | GENCODE_SM52 := -gencode arch=compute_52,code=sm_52 8 | GENCODE_SM60 := -gencode arch=compute_60,code=sm_60 9 | GENCODE_SM70 := -gencode arch=compute_70,code=sm_70 10 | GENCODE_SM80 := -gencode arch=compute_80,code=sm_80 11 | GENCODE_SM90 := -gencode arch=compute_90,code=sm_90 -gencode arch=compute_90,code=compute_90 12 | GENCODE_FLAGS := $(GENCODE_SM70) $(GENCODE_SM80) $(GENCODE_SM90) 13 | ifdef DISABLE_CUB 14 | NVCC_FLAGS = -Xptxas --optimize-float-atomics 15 | else 16 | NVCC_FLAGS = -DHAVE_CUB 17 | endif 18 | NVCC_FLAGS += -Xcompiler -fopenmp -lineinfo -DUSE_NVTX -ldl $(GENCODE_FLAGS) -std=c++14 19 | jacobi: Makefile jacobi.cu 20 | $(NVCC) $(NVCC_FLAGS) jacobi.cu -o jacobi 21 | 22 | .PHONY.: clean 23 | clean: 24 | rm -f jacobi jacobi.nsys-rep 25 | 26 | sanitize: jacobi 27 | compute-sanitizer ./jacobi -niter 10 28 | 29 | run: jacobi 30 | ./jacobi 31 | 32 | profile: jacobi 33 | nsys profile --trace=cuda,nvtx -o jacobi ./jacobi -niter 10 34 | -------------------------------------------------------------------------------- /multi_threaded_copy/jacobi.cu: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved. 2 | * 3 | * Redistribution and use in source and binary forms, with or without 4 | * modification, are permitted provided that the following conditions 5 | * are met: 6 | * * Redistributions of source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer. 8 | * * Redistributions in binary form must reproduce the above copyright 9 | * notice, this list of conditions and the following disclaimer in the 10 | * documentation and/or other materials provided with the distribution. 11 | * * Neither the name of NVIDIA CORPORATION nor the names of its 12 | * contributors may be used to endorse or promote products derived 13 | * from this software without specific prior written permission. 14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
26 | */ 27 | #include 28 | #include 29 | #include 30 | #include 31 | #include 32 | #include 33 | 34 | #include 35 | 36 | #ifdef HAVE_CUB 37 | #include 38 | #endif // HAVE_CUB 39 | 40 | #ifdef USE_NVTX 41 | #include 42 | 43 | const uint32_t colors[] = {0x0000ff00, 0x000000ff, 0x00ffff00, 0x00ff00ff, 44 | 0x0000ffff, 0x00ff0000, 0x00ffffff}; 45 | const int num_colors = sizeof(colors) / sizeof(uint32_t); 46 | 47 | #define PUSH_RANGE(name, cid) \ 48 | { \ 49 | int color_id = cid; \ 50 | color_id = color_id % num_colors; \ 51 | nvtxEventAttributes_t eventAttrib = {0}; \ 52 | eventAttrib.version = NVTX_VERSION; \ 53 | eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE; \ 54 | eventAttrib.colorType = NVTX_COLOR_ARGB; \ 55 | eventAttrib.color = colors[color_id]; \ 56 | eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII; \ 57 | eventAttrib.message.ascii = name; \ 58 | nvtxRangePushEx(&eventAttrib); \ 59 | } 60 | #define POP_RANGE nvtxRangePop(); 61 | #else 62 | #define PUSH_RANGE(name, cid) 63 | #define POP_RANGE 64 | #endif 65 | 66 | #define CUDA_RT_CALL(call) \ 67 | { \ 68 | cudaError_t cudaStatus = call; \ 69 | if (cudaSuccess != cudaStatus) { \ 70 | fprintf(stderr, \ 71 | "ERROR: CUDA RT call \"%s\" in line %d of file %s failed " \ 72 | "with " \ 73 | "%s (%d).\n", \ 74 | #call, __LINE__, __FILE__, cudaGetErrorString(cudaStatus), cudaStatus); \ 75 | exit( cudaStatus ); \ 76 | } \ 77 | } 78 | 79 | constexpr int MAX_NUM_DEVICES = 32; 80 | 81 | typedef float real; 82 | constexpr real tol = 1.0e-8; 83 | 84 | const real PI = 2.0 * std::asin(1.0); 85 | 86 | __global__ void initialize_boundaries(real* __restrict__ const a_new, real* __restrict__ const a, 87 | const real pi, const int offset, const int nx, 88 | const int my_ny, const int ny) { 89 | for (int iy = blockIdx.x * blockDim.x + threadIdx.x; iy < my_ny; iy += blockDim.x * gridDim.x) { 90 | const real y0 = sin(2.0 * pi * (offset + iy) / (ny - 1)); 91 | a[iy * nx + 0] = y0; 92 | a[iy * nx + (nx - 1)] = y0; 93 | a_new[iy * nx + 0] = y0; 94 | a_new[iy * nx + (nx - 1)] = y0; 95 | } 96 | } 97 | 98 | template 99 | __global__ void jacobi_kernel(real* __restrict__ const a_new, const real* __restrict__ const a, 100 | real* __restrict__ const l2_norm, const int iy_start, 101 | const int iy_end, const int nx, const bool calculate_norm) { 102 | #ifdef HAVE_CUB 103 | typedef cub::BlockReduce 104 | BlockReduce; 105 | __shared__ typename BlockReduce::TempStorage temp_storage; 106 | #endif // HAVE_CUB 107 | int iy = blockIdx.y * blockDim.y + threadIdx.y + iy_start; 108 | int ix = blockIdx.x * blockDim.x + threadIdx.x + 1; 109 | real local_l2_norm = 0.0; 110 | 111 | if (iy < iy_end && ix < (nx - 1)) { 112 | const real new_val = 0.25 * (a[iy * nx + ix + 1] + a[iy * nx + ix - 1] + 113 | a[(iy + 1) * nx + ix] + a[(iy - 1) * nx + ix]); 114 | a_new[iy * nx + ix] = new_val; 115 | if (calculate_norm) { 116 | real residue = new_val - a[iy * nx + ix]; 117 | local_l2_norm += residue * residue; 118 | } 119 | } 120 | if (calculate_norm) { 121 | #ifdef HAVE_CUB 122 | real block_l2_norm = BlockReduce(temp_storage).Sum(local_l2_norm); 123 | if (0 == threadIdx.y && 0 == threadIdx.x) atomicAdd(l2_norm, block_l2_norm); 124 | #else 125 | atomicAdd(l2_norm, local_l2_norm); 126 | #endif // HAVE_CUB 127 | } 128 | } 129 | 130 | double single_gpu(const int nx, const int ny, const int iter_max, real* const a_ref_h, 131 | const int nccheck, const bool print); 132 | 133 | template 134 | T get_argval(char** begin, char** end, const std::string& arg, const T default_val) { 135 | T argval = 
default_val; 136 | char** itr = std::find(begin, end, arg); 137 | if (itr != end && ++itr != end) { 138 | std::istringstream inbuf(*itr); 139 | inbuf >> argval; 140 | } 141 | return argval; 142 | } 143 | 144 | bool get_arg(char** begin, char** end, const std::string& arg) { 145 | char** itr = std::find(begin, end, arg); 146 | if (itr != end) { 147 | return true; 148 | } 149 | return false; 150 | } 151 | 152 | int main(int argc, char* argv[]) { 153 | const int iter_max = get_argval(argv, argv + argc, "-niter", 1000); 154 | const int nccheck = get_argval(argv, argv + argc, "-nccheck", 1); 155 | const int nx = get_argval(argv, argv + argc, "-nx", 16384); 156 | const int ny = get_argval(argv, argv + argc, "-ny", 16384); 157 | const bool csv = get_arg(argv, argv + argc, "-csv"); 158 | 159 | real* a_new[MAX_NUM_DEVICES]; 160 | 161 | real* a_ref_h; 162 | real* a_h; 163 | double runtime_serial = 0.0; 164 | 165 | int iy_end[MAX_NUM_DEVICES]; 166 | 167 | cudaEvent_t push_top_done[2][MAX_NUM_DEVICES]; 168 | cudaEvent_t push_bottom_done[2][MAX_NUM_DEVICES]; 169 | 170 | bool result_correct = true; 171 | int num_devices = 0; 172 | CUDA_RT_CALL(cudaGetDeviceCount(&num_devices)); 173 | real l2_norm = 1.0; 174 | #pragma omp parallel num_threads(num_devices) shared(l2_norm) 175 | { 176 | real* a; 177 | 178 | cudaStream_t compute_stream; 179 | cudaStream_t push_top_stream; 180 | cudaStream_t push_bottom_stream; 181 | cudaEvent_t compute_done; 182 | 183 | real* l2_norm_d; 184 | real* l2_norm_h; 185 | 186 | int dev_id = omp_get_thread_num(); 187 | 188 | CUDA_RT_CALL(cudaSetDevice(dev_id)); 189 | CUDA_RT_CALL(cudaFree(0)); 190 | 191 | if (0 == dev_id) { 192 | CUDA_RT_CALL(cudaMallocHost(&a_ref_h, nx * ny * sizeof(real))); 193 | CUDA_RT_CALL(cudaMallocHost(&a_h, nx * ny * sizeof(real))); 194 | runtime_serial = single_gpu(nx, ny, iter_max, a_ref_h, nccheck, !csv); 195 | } 196 | #pragma omp barrier 197 | // ny - 2 rows are distributed amongst `size` ranks in such a way 198 | // that each rank gets either (ny - 2) / size or (ny - 2) / size + 1 rows. 
199 | // This optimizes load balancing when (ny - 2) % size != 0 200 | int chunk_size; 201 | int chunk_size_low = (ny - 2) / num_devices; 202 | int chunk_size_high = chunk_size_low + 1; 203 | // To calculate the number of ranks that need to compute an extra row, 204 | // the following formula is derived from this equation: 205 | // num_ranks_low * chunk_size_low + (size - num_ranks_low) * (chunk_size_low + 1) = ny - 2 206 | int num_ranks_low = num_devices * chunk_size_low + num_devices - 207 | (ny - 2); // Number of ranks with chunk_size = chunk_size_low 208 | if (dev_id < num_ranks_low) 209 | chunk_size = chunk_size_low; 210 | else 211 | chunk_size = chunk_size_high; 212 | 213 | CUDA_RT_CALL(cudaMalloc(&a, nx * (chunk_size + 2) * sizeof(real))); 214 | CUDA_RT_CALL(cudaMalloc(a_new + dev_id, nx * (chunk_size + 2) * sizeof(real))); 215 | 216 | CUDA_RT_CALL(cudaMemset(a, 0, nx * (chunk_size + 2) * sizeof(real))); 217 | CUDA_RT_CALL(cudaMemset(a_new[dev_id], 0, nx * (chunk_size + 2) * sizeof(real))); 218 | 219 | // Calculate local domain boundaries 220 | int iy_start_global; // My start index in the global array 221 | if (dev_id < num_ranks_low) { 222 | iy_start_global = dev_id * chunk_size_low + 1; 223 | } else { 224 | iy_start_global = 225 | num_ranks_low * chunk_size_low + (dev_id - num_ranks_low) * chunk_size_high + 1; 226 | } 227 | 228 | int iy_start = 1; 229 | iy_end[dev_id] = iy_start + chunk_size; 230 | 231 | // Set diriclet boundary conditions on left and right boarder 232 | initialize_boundaries<<<(ny / num_devices) / 128 + 1, 128>>>( 233 | a, a_new[dev_id], PI, iy_start_global - 1, nx, (chunk_size + 2), ny); 234 | CUDA_RT_CALL(cudaGetLastError()); 235 | CUDA_RT_CALL(cudaDeviceSynchronize()); 236 | 237 | CUDA_RT_CALL(cudaStreamCreate(&compute_stream)); 238 | CUDA_RT_CALL(cudaStreamCreate(&push_top_stream)); 239 | CUDA_RT_CALL(cudaStreamCreate(&push_bottom_stream)); 240 | CUDA_RT_CALL(cudaEventCreateWithFlags(&compute_done, cudaEventDisableTiming)); 241 | CUDA_RT_CALL(cudaEventCreateWithFlags(push_top_done[0] + dev_id, cudaEventDisableTiming)); 242 | CUDA_RT_CALL( 243 | cudaEventCreateWithFlags(push_bottom_done[0] + dev_id, cudaEventDisableTiming)); 244 | CUDA_RT_CALL(cudaEventCreateWithFlags(push_top_done[1] + dev_id, cudaEventDisableTiming)); 245 | CUDA_RT_CALL( 246 | cudaEventCreateWithFlags(push_bottom_done[1] + dev_id, cudaEventDisableTiming)); 247 | 248 | CUDA_RT_CALL(cudaMalloc(&l2_norm_d, sizeof(real))); 249 | CUDA_RT_CALL(cudaMallocHost(&l2_norm_h, sizeof(real))); 250 | 251 | const int top = dev_id > 0 ? 
dev_id - 1 : (num_devices - 1); 252 | int canAccessPeer = 0; 253 | CUDA_RT_CALL(cudaDeviceCanAccessPeer(&canAccessPeer, dev_id, top)); 254 | if (canAccessPeer) { 255 | CUDA_RT_CALL(cudaDeviceEnablePeerAccess(top, 0)); 256 | } 257 | const int bottom = (dev_id + 1) % num_devices; 258 | if (top != bottom) { 259 | canAccessPeer = 0; 260 | CUDA_RT_CALL(cudaDeviceCanAccessPeer(&canAccessPeer, dev_id, bottom)); 261 | if (canAccessPeer) { 262 | CUDA_RT_CALL(cudaDeviceEnablePeerAccess(bottom, 0)); 263 | } 264 | } 265 | 266 | for (int i = 0; i < 4; ++i) { 267 | CUDA_RT_CALL(cudaMemcpyAsync(a_new[top] + (iy_end[top] * nx), 268 | a_new[dev_id] + iy_start * nx, nx * sizeof(real), 269 | cudaMemcpyDeviceToDevice, push_top_stream)); 270 | CUDA_RT_CALL(cudaMemcpyAsync(a_new[bottom], a_new[dev_id] + (iy_end[dev_id] - 1) * nx, 271 | nx * sizeof(real), cudaMemcpyDeviceToDevice, 272 | push_bottom_stream)); 273 | CUDA_RT_CALL(cudaStreamSynchronize(push_top_stream)); 274 | CUDA_RT_CALL(cudaStreamSynchronize(push_bottom_stream)); 275 | std::swap(a_new[dev_id], a); 276 | } 277 | 278 | CUDA_RT_CALL(cudaDeviceSynchronize()); 279 | 280 | #pragma omp master 281 | { 282 | if (!csv) 283 | printf( 284 | "Jacobi relaxation: %d iterations on %d x %d mesh with " 285 | "norm check " 286 | "every %d iterations\n", 287 | iter_max, ny, nx, nccheck); 288 | } 289 | 290 | constexpr int dim_block_x = 32; 291 | constexpr int dim_block_y = 32; 292 | dim3 dim_grid((nx + dim_block_x - 1) / dim_block_x, 293 | (ny + (num_devices * dim_block_y) - 1) / (num_devices * dim_block_y), 1); 294 | 295 | int iter = 0; 296 | bool calculate_norm = true; 297 | #pragma omp master 298 | { l2_norm = 1.0; } 299 | 300 | CUDA_RT_CALL(cudaDeviceSynchronize()); 301 | #pragma omp barrier 302 | double start = omp_get_wtime(); 303 | PUSH_RANGE("Jacobi solve", 0) 304 | while (l2_norm > tol && iter < iter_max) { 305 | CUDA_RT_CALL(cudaMemsetAsync(l2_norm_d, 0, sizeof(real), compute_stream)); 306 | 307 | int top = dev_id > 0 ? 
dev_id - 1 : (num_devices - 1); 308 | int bottom = (dev_id + 1) % num_devices; 309 | calculate_norm = (iter % nccheck) == 0 || (!csv && (iter % 100) == 0); 310 | 311 | CUDA_RT_CALL(cudaStreamWaitEvent(compute_stream, push_top_done[(iter % 2)][bottom], 0)); 312 | CUDA_RT_CALL(cudaStreamWaitEvent(compute_stream, push_bottom_done[(iter % 2)][top], 0)); 313 | 314 | jacobi_kernel 315 | <<>>( 316 | a_new[dev_id], a, l2_norm_d, iy_start, iy_end[dev_id], nx, calculate_norm); 317 | CUDA_RT_CALL(cudaGetLastError()); 318 | CUDA_RT_CALL(cudaEventRecord(compute_done, compute_stream)); 319 | 320 | if (calculate_norm) { 321 | CUDA_RT_CALL(cudaMemcpyAsync(l2_norm_h, l2_norm_d, sizeof(real), 322 | cudaMemcpyDeviceToHost, compute_stream)); 323 | } 324 | 325 | // Apply periodic boundary conditions need to wait for other threads due to 326 | // std::swap(a_new[dev_id],a); 327 | #pragma omp barrier 328 | CUDA_RT_CALL(cudaStreamWaitEvent(push_top_stream, compute_done, 0)); 329 | CUDA_RT_CALL(cudaMemcpyAsync(a_new[top] + (iy_end[top] * nx), 330 | a_new[dev_id] + iy_start * nx, nx * sizeof(real), 331 | cudaMemcpyDeviceToDevice, push_top_stream)); 332 | CUDA_RT_CALL(cudaEventRecord(push_top_done[((iter + 1) % 2)][dev_id], push_top_stream)); 333 | 334 | CUDA_RT_CALL(cudaStreamWaitEvent(push_bottom_stream, compute_done, 0)); 335 | CUDA_RT_CALL(cudaMemcpyAsync(a_new[bottom], a_new[dev_id] + (iy_end[dev_id] - 1) * nx, 336 | nx * sizeof(real), cudaMemcpyDeviceToDevice, 337 | push_bottom_stream)); 338 | CUDA_RT_CALL( 339 | cudaEventRecord(push_bottom_done[((iter + 1) % 2)][dev_id], push_bottom_stream)); 340 | // Need to wait for other threads as they are reading push_top_done and 341 | // push_bottom_done 342 | #pragma omp barrier 343 | 344 | if (calculate_norm) { 345 | #pragma omp single 346 | { l2_norm = 0.0; } 347 | #pragma omp barrier 348 | CUDA_RT_CALL(cudaStreamSynchronize(compute_stream)); 349 | #pragma omp atomic 350 | l2_norm += *(l2_norm_h); 351 | #pragma omp barrier 352 | #pragma omp single 353 | { l2_norm = std::sqrt(l2_norm); } 354 | #pragma omp barrier 355 | 356 | if (!csv && (iter % 100) == 0) { 357 | #pragma omp master 358 | printf("%5d, %0.6f\n", iter, l2_norm); 359 | } 360 | } 361 | 362 | std::swap(a_new[dev_id], a); 363 | iter++; 364 | } 365 | CUDA_RT_CALL(cudaDeviceSynchronize()); 366 | #pragma omp barrier 367 | double stop = omp_get_wtime(); 368 | POP_RANGE 369 | 370 | CUDA_RT_CALL( 371 | cudaMemcpy(a_h + iy_start_global * nx, a + nx, 372 | std::min((ny - iy_start_global) * nx, chunk_size * nx) * sizeof(real), 373 | cudaMemcpyDeviceToHost)); 374 | #pragma omp barrier 375 | 376 | #pragma omp master 377 | { 378 | result_correct = true; 379 | for (int iy = 1; result_correct && (iy < (ny - 1)); ++iy) { 380 | for (int ix = 1; result_correct && (ix < (nx - 1)); ++ix) { 381 | if (std::fabs(a_ref_h[iy * nx + ix] - a_h[iy * nx + ix]) > tol) { 382 | fprintf(stderr, 383 | "ERROR: a[%d * %d + %d] = %f does not match %f " 384 | "(reference)\n", 385 | iy, nx, ix, a_h[iy * nx + ix], a_ref_h[iy * nx + ix]); 386 | result_correct = false; 387 | } 388 | } 389 | } 390 | if (result_correct) { 391 | if (csv) { 392 | printf("multi_threaded_copy, %d, %d, %d, %d, %d, 1, %f, %f\n", nx, ny, iter_max, 393 | nccheck, num_devices, (stop - start), runtime_serial); 394 | } else { 395 | printf("Num GPUs: %d.\n", num_devices); 396 | printf( 397 | "%dx%d: 1 GPU: %8.4f s, %d GPUs: %8.4f s, speedup: " 398 | "%8.2f, " 399 | "efficiency: %8.2f \n", 400 | ny, nx, runtime_serial, num_devices, (stop - start), 401 | runtime_serial / (stop - 
start), 402 | runtime_serial / (num_devices * (stop - start)) * 100); 403 | } 404 | } 405 | } 406 | 407 | CUDA_RT_CALL(cudaEventDestroy(push_bottom_done[1][dev_id])); 408 | CUDA_RT_CALL(cudaEventDestroy(push_top_done[1][dev_id])); 409 | CUDA_RT_CALL(cudaEventDestroy(push_bottom_done[0][dev_id])); 410 | CUDA_RT_CALL(cudaEventDestroy(push_top_done[0][dev_id])); 411 | CUDA_RT_CALL(cudaEventDestroy(compute_done)); 412 | CUDA_RT_CALL(cudaStreamDestroy(push_bottom_stream)); 413 | CUDA_RT_CALL(cudaStreamDestroy(push_top_stream)); 414 | CUDA_RT_CALL(cudaStreamDestroy(compute_stream)); 415 | 416 | CUDA_RT_CALL(cudaFreeHost(l2_norm_h)); 417 | CUDA_RT_CALL(cudaFree(l2_norm_d)); 418 | 419 | CUDA_RT_CALL(cudaFree(a_new[dev_id])); 420 | CUDA_RT_CALL(cudaFree(a)); 421 | 422 | if (0 == dev_id) { 423 | CUDA_RT_CALL(cudaFreeHost(a_h)); 424 | CUDA_RT_CALL(cudaFreeHost(a_ref_h)); 425 | } 426 | } 427 | 428 | return result_correct ? 0 : 1; 429 | } 430 | 431 | double single_gpu(const int nx, const int ny, const int iter_max, real* const a_ref_h, 432 | const int nccheck, const bool print) { 433 | real* a; 434 | real* a_new; 435 | 436 | cudaStream_t compute_stream; 437 | cudaStream_t push_top_stream; 438 | cudaStream_t push_bottom_stream; 439 | cudaEvent_t compute_done; 440 | cudaEvent_t push_top_done; 441 | cudaEvent_t push_bottom_done; 442 | 443 | real* l2_norm_d; 444 | real* l2_norm_h; 445 | 446 | int iy_start = 1; 447 | int iy_end = (ny - 1); 448 | 449 | CUDA_RT_CALL(cudaMalloc(&a, nx * ny * sizeof(real))); 450 | CUDA_RT_CALL(cudaMalloc(&a_new, nx * ny * sizeof(real))); 451 | 452 | CUDA_RT_CALL(cudaMemset(a, 0, nx * ny * sizeof(real))); 453 | CUDA_RT_CALL(cudaMemset(a_new, 0, nx * ny * sizeof(real))); 454 | 455 | // Set diriclet boundary conditions on left and right boarder 456 | initialize_boundaries<<>>(a, a_new, PI, 0, nx, ny, ny); 457 | CUDA_RT_CALL(cudaGetLastError()); 458 | CUDA_RT_CALL(cudaDeviceSynchronize()); 459 | 460 | CUDA_RT_CALL(cudaStreamCreate(&compute_stream)); 461 | CUDA_RT_CALL(cudaStreamCreate(&push_top_stream)); 462 | CUDA_RT_CALL(cudaStreamCreate(&push_bottom_stream)); 463 | CUDA_RT_CALL(cudaEventCreateWithFlags(&compute_done, cudaEventDisableTiming)); 464 | CUDA_RT_CALL(cudaEventCreateWithFlags(&push_top_done, cudaEventDisableTiming)); 465 | CUDA_RT_CALL(cudaEventCreateWithFlags(&push_bottom_done, cudaEventDisableTiming)); 466 | 467 | CUDA_RT_CALL(cudaMalloc(&l2_norm_d, sizeof(real))); 468 | CUDA_RT_CALL(cudaMallocHost(&l2_norm_h, sizeof(real))); 469 | 470 | CUDA_RT_CALL(cudaDeviceSynchronize()); 471 | 472 | if (print) 473 | printf( 474 | "Single GPU jacobi relaxation: %d iterations on %d x %d mesh with " 475 | "norm " 476 | "check every %d iterations\n", 477 | iter_max, ny, nx, nccheck); 478 | 479 | constexpr int dim_block_x = 32; 480 | constexpr int dim_block_y = 32; 481 | dim3 dim_grid((nx + dim_block_x - 1) / dim_block_x, (ny + dim_block_y - 1) / dim_block_y, 1); 482 | 483 | int iter = 0; 484 | bool calculate_norm = true; 485 | real l2_norm = 1.0; 486 | 487 | double start = omp_get_wtime(); 488 | PUSH_RANGE("Jacobi solve", 0) 489 | while (l2_norm > tol && iter < iter_max) { 490 | CUDA_RT_CALL(cudaMemsetAsync(l2_norm_d, 0, sizeof(real), compute_stream)); 491 | 492 | CUDA_RT_CALL(cudaStreamWaitEvent(compute_stream, push_top_done, 0)); 493 | CUDA_RT_CALL(cudaStreamWaitEvent(compute_stream, push_bottom_done, 0)); 494 | 495 | calculate_norm = (iter % nccheck) == 0 || (print && ((iter % 100) == 0)); 496 | jacobi_kernel 497 | <<>>( 498 | a_new, a, l2_norm_d, iy_start, iy_end, nx, 
calculate_norm); 499 | CUDA_RT_CALL(cudaGetLastError()); 500 | CUDA_RT_CALL(cudaEventRecord(compute_done, compute_stream)); 501 | 502 | if (calculate_norm) { 503 | CUDA_RT_CALL(cudaMemcpyAsync(l2_norm_h, l2_norm_d, sizeof(real), cudaMemcpyDeviceToHost, 504 | compute_stream)); 505 | } 506 | 507 | // Apply periodic boundary conditions 508 | 509 | CUDA_RT_CALL(cudaStreamWaitEvent(push_top_stream, compute_done, 0)); 510 | CUDA_RT_CALL(cudaMemcpyAsync(a_new, a_new + (iy_end - 1) * nx, nx * sizeof(real), 511 | cudaMemcpyDeviceToDevice, push_top_stream)); 512 | CUDA_RT_CALL(cudaEventRecord(push_top_done, push_top_stream)); 513 | 514 | CUDA_RT_CALL(cudaStreamWaitEvent(push_bottom_stream, compute_done, 0)); 515 | CUDA_RT_CALL(cudaMemcpyAsync(a_new + iy_end * nx, a_new + iy_start * nx, nx * sizeof(real), 516 | cudaMemcpyDeviceToDevice, compute_stream)); 517 | CUDA_RT_CALL(cudaEventRecord(push_bottom_done, push_bottom_stream)); 518 | 519 | if (calculate_norm) { 520 | CUDA_RT_CALL(cudaStreamSynchronize(compute_stream)); 521 | l2_norm = *l2_norm_h; 522 | l2_norm = std::sqrt(l2_norm); 523 | if (print && (iter % 100) == 0) printf("%5d, %0.6f\n", iter, l2_norm); 524 | } 525 | 526 | std::swap(a_new, a); 527 | iter++; 528 | } 529 | POP_RANGE 530 | double stop = omp_get_wtime(); 531 | 532 | CUDA_RT_CALL(cudaMemcpy(a_ref_h, a, nx * ny * sizeof(real), cudaMemcpyDeviceToHost)); 533 | 534 | CUDA_RT_CALL(cudaEventDestroy(push_bottom_done)); 535 | CUDA_RT_CALL(cudaEventDestroy(push_top_done)); 536 | CUDA_RT_CALL(cudaEventDestroy(compute_done)); 537 | CUDA_RT_CALL(cudaStreamDestroy(push_bottom_stream)); 538 | CUDA_RT_CALL(cudaStreamDestroy(push_top_stream)); 539 | CUDA_RT_CALL(cudaStreamDestroy(compute_stream)); 540 | 541 | CUDA_RT_CALL(cudaFreeHost(l2_norm_h)); 542 | CUDA_RT_CALL(cudaFree(l2_norm_d)); 543 | 544 | CUDA_RT_CALL(cudaFree(a_new)); 545 | CUDA_RT_CALL(cudaFree(a)); 546 | return (stop - start); 547 | } 548 | -------------------------------------------------------------------------------- /multi_threaded_copy_overlap/Makefile: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved. 
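# Build notes: GENCODE_FLAGS enables only SM 7.0/8.0/9.0 by default; the older
# GENCODE_SM30-SM60 definitions below are not used unless added to GENCODE_FLAGS.
# Building with `make DISABLE_CUB=1` omits -DHAVE_CUB, so jacobi.cu falls back to
# plain atomicAdd (with -Xptxas --optimize-float-atomics) instead of a cub::BlockReduce
# for the L2-norm reduction.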
2 | NVCC=nvcc 3 | GENCODE_SM30 := -gencode arch=compute_30,code=sm_30 4 | GENCODE_SM35 := -gencode arch=compute_35,code=sm_35 5 | GENCODE_SM37 := -gencode arch=compute_37,code=sm_37 6 | GENCODE_SM50 := -gencode arch=compute_50,code=sm_50 7 | GENCODE_SM52 := -gencode arch=compute_52,code=sm_52 8 | GENCODE_SM60 := -gencode arch=compute_60,code=sm_60 9 | GENCODE_SM70 := -gencode arch=compute_70,code=sm_70 10 | GENCODE_SM80 := -gencode arch=compute_80,code=sm_80 11 | GENCODE_SM90 := -gencode arch=compute_90,code=sm_90 -gencode arch=compute_90,code=compute_90 12 | GENCODE_FLAGS := $(GENCODE_SM70) $(GENCODE_SM80) $(GENCODE_SM90) 13 | ifdef DISABLE_CUB 14 | NVCC_FLAGS = -Xptxas --optimize-float-atomics 15 | else 16 | NVCC_FLAGS = -DHAVE_CUB 17 | endif 18 | NVCC_FLAGS += -Xcompiler -fopenmp -lineinfo -DUSE_NVTX -ldl $(GENCODE_FLAGS) -std=c++14 19 | jacobi: Makefile jacobi.cu 20 | $(NVCC) $(NVCC_FLAGS) jacobi.cu -o jacobi 21 | 22 | .PHONY.: clean 23 | clean: 24 | rm -f jacobi jacobi.nsys-rep 25 | 26 | sanitize: jacobi 27 | compute-sanitizer ./jacobi -niter 10 28 | 29 | run: jacobi 30 | ./jacobi 31 | 32 | profile: jacobi 33 | nsys profile --trace=cuda,nvtx -o jacobi ./jacobi -niter 10 34 | -------------------------------------------------------------------------------- /multi_threaded_p2p/Makefile: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved. 2 | NVCC=nvcc 3 | GENCODE_SM30 := -gencode arch=compute_30,code=sm_30 4 | GENCODE_SM35 := -gencode arch=compute_35,code=sm_35 5 | GENCODE_SM37 := -gencode arch=compute_37,code=sm_37 6 | GENCODE_SM50 := -gencode arch=compute_50,code=sm_50 7 | GENCODE_SM52 := -gencode arch=compute_52,code=sm_52 8 | GENCODE_SM60 := -gencode arch=compute_60,code=sm_60 9 | GENCODE_SM70 := -gencode arch=compute_70,code=sm_70 10 | GENCODE_SM80 := -gencode arch=compute_80,code=sm_80 11 | GENCODE_SM90 := -gencode arch=compute_90,code=sm_90 -gencode arch=compute_90,code=compute_90 12 | GENCODE_FLAGS := $(GENCODE_SM70) $(GENCODE_SM80) $(GENCODE_SM90) 13 | ifdef DISABLE_CUB 14 | NVCC_FLAGS = -Xptxas --optimize-float-atomics 15 | else 16 | NVCC_FLAGS = -DHAVE_CUB 17 | endif 18 | NVCC_FLAGS += -Xcompiler -fopenmp -lineinfo -DUSE_NVTX -ldl $(GENCODE_FLAGS) -std=c++14 19 | jacobi: Makefile jacobi.cu 20 | $(NVCC) $(NVCC_FLAGS) jacobi.cu -o jacobi 21 | 22 | .PHONY.: clean 23 | clean: 24 | rm -f jacobi jacobi.nsys-rep 25 | 26 | sanitize: jacobi 27 | compute-sanitizer ./jacobi -niter 10 28 | 29 | run: jacobi 30 | ./jacobi 31 | 32 | profile: jacobi 33 | nsys profile --trace=cuda,nvtx -o jacobi ./jacobi -niter 10 34 | -------------------------------------------------------------------------------- /multi_threaded_p2p/jacobi.cu: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved. 2 | * 3 | * Redistribution and use in source and binary forms, with or without 4 | * modification, are permitted provided that the following conditions 5 | * are met: 6 | * * Redistributions of source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer. 8 | * * Redistributions in binary form must reproduce the above copyright 9 | * notice, this list of conditions and the following disclaimer in the 10 | * documentation and/or other materials provided with the distribution. 
11 | * * Neither the name of NVIDIA CORPORATION nor the names of its 12 | * contributors may be used to endorse or promote products derived 13 | * from this software without specific prior written permission. 14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | */ 27 | #include 28 | #include 29 | #include 30 | #include 31 | #include 32 | #include 33 | 34 | #include 35 | 36 | #ifdef HAVE_CUB 37 | #include 38 | #endif // HAVE_CUB 39 | 40 | #ifdef USE_NVTX 41 | #include 42 | 43 | const uint32_t colors[] = {0x0000ff00, 0x000000ff, 0x00ffff00, 0x00ff00ff, 44 | 0x0000ffff, 0x00ff0000, 0x00ffffff}; 45 | const int num_colors = sizeof(colors) / sizeof(uint32_t); 46 | 47 | #define PUSH_RANGE(name, cid) \ 48 | { \ 49 | int color_id = cid; \ 50 | color_id = color_id % num_colors; \ 51 | nvtxEventAttributes_t eventAttrib = {0}; \ 52 | eventAttrib.version = NVTX_VERSION; \ 53 | eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE; \ 54 | eventAttrib.colorType = NVTX_COLOR_ARGB; \ 55 | eventAttrib.color = colors[color_id]; \ 56 | eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII; \ 57 | eventAttrib.message.ascii = name; \ 58 | nvtxRangePushEx(&eventAttrib); \ 59 | } 60 | #define POP_RANGE nvtxRangePop(); 61 | #else 62 | #define PUSH_RANGE(name, cid) 63 | #define POP_RANGE 64 | #endif 65 | 66 | #define CUDA_RT_CALL(call) \ 67 | { \ 68 | cudaError_t cudaStatus = call; \ 69 | if (cudaSuccess != cudaStatus) { \ 70 | fprintf(stderr, \ 71 | "ERROR: CUDA RT call \"%s\" in line %d of file %s failed " \ 72 | "with " \ 73 | "%s (%d).\n", \ 74 | #call, __LINE__, __FILE__, cudaGetErrorString(cudaStatus), cudaStatus); \ 75 | exit( cudaStatus ); \ 76 | } \ 77 | } 78 | 79 | constexpr int MAX_NUM_DEVICES = 32; 80 | 81 | typedef float real; 82 | constexpr real tol = 1.0e-8; 83 | 84 | const real PI = 2.0 * std::asin(1.0); 85 | 86 | __global__ void initialize_boundaries(real* __restrict__ const a_new, real* __restrict__ const a, 87 | const real pi, const int offset, const int nx, 88 | const int my_ny, const int ny) { 89 | for (int iy = blockIdx.x * blockDim.x + threadIdx.x; iy < my_ny; iy += blockDim.x * gridDim.x) { 90 | const real y0 = sin(2.0 * pi * (offset + iy) / (ny - 1)); 91 | a[iy * nx + 0] = y0; 92 | a[iy * nx + (nx - 1)] = y0; 93 | a_new[iy * nx + 0] = y0; 94 | a_new[iy * nx + (nx - 1)] = y0; 95 | } 96 | } 97 | 98 | template 99 | __global__ void jacobi_kernel(real* __restrict__ const a_new, const real* __restrict__ const a, 100 | real* __restrict__ const l2_norm, const int iy_start, 101 | const int iy_end, const int nx, real* __restrict__ const a_new_top, 102 | const int top_iy, real* __restrict__ const a_new_bottom, 103 | const int bottom_iy, const bool calculate_norm) { 104 | #ifdef HAVE_CUB 105 | typedef cub::BlockReduce 106 | BlockReduce; 107 | 
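    // Shared scratch for the per-block CUB reduction of each thread's local L2-norm
    // contribution; only one atomicAdd per block reaches global memory further down.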
__shared__ typename BlockReduce::TempStorage temp_storage; 108 | #endif // HAVE_CUB 109 | int iy = blockIdx.y * blockDim.y + threadIdx.y + iy_start; 110 | int ix = blockIdx.x * blockDim.x + threadIdx.x + 1; 111 | real local_l2_norm = 0.0; 112 | 113 | if (iy < iy_end && ix < (nx - 1)) { 114 | const real new_val = 0.25 * (a[iy * nx + ix + 1] + a[iy * nx + ix - 1] + 115 | a[(iy + 1) * nx + ix] + a[(iy - 1) * nx + ix]); 116 | a_new[iy * nx + ix] = new_val; 117 | 118 | if (iy_start == iy) { 119 | a_new_top[top_iy * nx + ix] = new_val; 120 | } 121 | 122 | if ((iy_end - 1) == iy) { 123 | a_new_bottom[bottom_iy * nx + ix] = new_val; 124 | } 125 | 126 | if (calculate_norm) { 127 | real residue = new_val - a[iy * nx + ix]; 128 | local_l2_norm += residue * residue; 129 | } 130 | } 131 | if (calculate_norm) { 132 | #ifdef HAVE_CUB 133 | real block_l2_norm = BlockReduce(temp_storage).Sum(local_l2_norm); 134 | if (0 == threadIdx.y && 0 == threadIdx.x) atomicAdd(l2_norm, block_l2_norm); 135 | #else 136 | atomicAdd(l2_norm, local_l2_norm); 137 | #endif // HAVE_CUB 138 | } 139 | } 140 | 141 | double single_gpu(const int nx, const int ny, const int iter_max, real* const a_ref_h, 142 | const int nccheck, const bool print); 143 | 144 | template 145 | T get_argval(char** begin, char** end, const std::string& arg, const T default_val) { 146 | T argval = default_val; 147 | char** itr = std::find(begin, end, arg); 148 | if (itr != end && ++itr != end) { 149 | std::istringstream inbuf(*itr); 150 | inbuf >> argval; 151 | } 152 | return argval; 153 | } 154 | 155 | bool get_arg(char** begin, char** end, const std::string& arg) { 156 | char** itr = std::find(begin, end, arg); 157 | if (itr != end) { 158 | return true; 159 | } 160 | return false; 161 | } 162 | 163 | int main(int argc, char* argv[]) { 164 | const int iter_max = get_argval(argv, argv + argc, "-niter", 1000); 165 | const int nccheck = get_argval(argv, argv + argc, "-nccheck", 1); 166 | const int nx = get_argval(argv, argv + argc, "-nx", 16384); 167 | const int ny = get_argval(argv, argv + argc, "-ny", 16384); 168 | const bool csv = get_arg(argv, argv + argc, "-csv"); 169 | 170 | real* a_new[MAX_NUM_DEVICES]; 171 | 172 | real* a_ref_h; 173 | real* a_h; 174 | double runtime_serial = 0.0; 175 | 176 | int iy_end[MAX_NUM_DEVICES]; 177 | 178 | cudaEvent_t compute_done[2][MAX_NUM_DEVICES]; 179 | 180 | bool result_correct = true; 181 | bool p2p_works = true; 182 | int num_devices = 0; 183 | CUDA_RT_CALL(cudaGetDeviceCount(&num_devices)); 184 | real l2_norm = 1.0; 185 | #pragma omp parallel num_threads(num_devices) shared(l2_norm) 186 | { 187 | real* a; 188 | 189 | cudaStream_t compute_stream; 190 | cudaStream_t push_top_stream; 191 | cudaStream_t push_bottom_stream; 192 | cudaEvent_t push_top_done; 193 | cudaEvent_t push_bottom_done; 194 | 195 | real* l2_norm_d; 196 | real* l2_norm_h; 197 | 198 | int dev_id = omp_get_thread_num(); 199 | 200 | CUDA_RT_CALL(cudaSetDevice(dev_id)); 201 | CUDA_RT_CALL(cudaFree(0)); 202 | 203 | if (0 == dev_id) { 204 | CUDA_RT_CALL(cudaMallocHost(&a_ref_h, nx * ny * sizeof(real))); 205 | CUDA_RT_CALL(cudaMallocHost(&a_h, nx * ny * sizeof(real))); 206 | runtime_serial = single_gpu(nx, ny, iter_max, a_ref_h, nccheck, !csv); 207 | } 208 | #pragma omp barrier 209 | // ny - 2 rows are distributed amongst `size` ranks in such a way 210 | // that each rank gets either (ny - 2) / size or (ny - 2) / size + 1 rows. 
211 | // This optimizes load balancing when (ny - 2) % size != 0 212 | int chunk_size; 213 | int chunk_size_low = (ny - 2) / num_devices; 214 | int chunk_size_high = chunk_size_low + 1; 215 | // To calculate the number of ranks that need to compute an extra row, 216 | // the following formula is derived from this equation: 217 | // num_ranks_low * chunk_size_low + (size - num_ranks_low) * (chunk_size_low + 1) = ny - 2 218 | int num_ranks_low = num_devices * chunk_size_low + num_devices - 219 | (ny - 2); // Number of ranks with chunk_size = chunk_size_low 220 | if (dev_id < num_ranks_low) 221 | chunk_size = chunk_size_low; 222 | else 223 | chunk_size = chunk_size_high; 224 | 225 | const int top = dev_id > 0 ? dev_id - 1 : (num_devices - 1); 226 | const int bottom = (dev_id + 1) % num_devices; 227 | if (top != dev_id) { 228 | int canAccessPeer = 0; 229 | CUDA_RT_CALL(cudaDeviceCanAccessPeer(&canAccessPeer, dev_id, top)); 230 | if (canAccessPeer) { 231 | CUDA_RT_CALL(cudaDeviceEnablePeerAccess(top, 0)); 232 | } else { 233 | std::cerr << "P2P access required from " << dev_id << " to " << top << std::endl; 234 | #pragma omp critical 235 | { 236 | if (p2p_works) p2p_works = false; 237 | } 238 | } 239 | if (top != bottom) { 240 | canAccessPeer = 0; 241 | CUDA_RT_CALL(cudaDeviceCanAccessPeer(&canAccessPeer, dev_id, bottom)); 242 | if (canAccessPeer) { 243 | CUDA_RT_CALL(cudaDeviceEnablePeerAccess(bottom, 0)); 244 | } else { 245 | std::cerr << "P2P access required from " << dev_id << " to " << bottom 246 | << std::endl; 247 | #pragma omp critical 248 | { 249 | if (p2p_works) p2p_works = false; 250 | } 251 | } 252 | } 253 | } 254 | 255 | #pragma omp barrier 256 | 257 | if (p2p_works) { 258 | CUDA_RT_CALL(cudaMalloc(&a, nx * (chunk_size + 2) * sizeof(real))); 259 | CUDA_RT_CALL(cudaMalloc(a_new + dev_id, nx * (chunk_size + 2) * sizeof(real))); 260 | 261 | CUDA_RT_CALL(cudaMemset(a, 0, nx * (chunk_size + 2) * sizeof(real))); 262 | CUDA_RT_CALL(cudaMemset(a_new[dev_id], 0, nx * (chunk_size + 2) * sizeof(real))); 263 | 264 | // Calculate local domain boundaries 265 | int iy_start_global; // My start index in the global array 266 | if (dev_id < num_ranks_low) { 267 | iy_start_global = dev_id * chunk_size_low + 1; 268 | } else { 269 | iy_start_global = 270 | num_ranks_low * chunk_size_low + (dev_id - num_ranks_low) * chunk_size_high + 1; 271 | } 272 | int iy_end_global = 273 | iy_start_global + chunk_size - 1; // My last index in the global array 274 | 275 | int iy_start = 1; 276 | iy_end[dev_id] = (iy_end_global - iy_start_global + 1) + iy_start; 277 | 278 | // Set diriclet boundary conditions on left and right boarder 279 | initialize_boundaries<<<(ny / num_devices) / 128 + 1, 128>>>( 280 | a, a_new[dev_id], PI, iy_start_global - 1, nx, (chunk_size + 2), ny); 281 | CUDA_RT_CALL(cudaGetLastError()); 282 | CUDA_RT_CALL(cudaDeviceSynchronize()); 283 | 284 | CUDA_RT_CALL(cudaStreamCreate(&compute_stream)); 285 | CUDA_RT_CALL(cudaStreamCreate(&push_top_stream)); 286 | CUDA_RT_CALL(cudaStreamCreate(&push_bottom_stream)); 287 | CUDA_RT_CALL( 288 | cudaEventCreateWithFlags(compute_done[0] + dev_id, cudaEventDisableTiming)); 289 | CUDA_RT_CALL( 290 | cudaEventCreateWithFlags(compute_done[1] + dev_id, cudaEventDisableTiming)); 291 | CUDA_RT_CALL(cudaEventCreateWithFlags(&push_top_done, cudaEventDisableTiming)); 292 | CUDA_RT_CALL(cudaEventCreateWithFlags(&push_bottom_done, cudaEventDisableTiming)); 293 | 294 | CUDA_RT_CALL(cudaMalloc(&l2_norm_d, sizeof(real))); 295 | CUDA_RT_CALL(cudaMallocHost(&l2_norm_h, 
sizeof(real))); 296 | 297 | CUDA_RT_CALL(cudaDeviceSynchronize()); 298 | 299 | #pragma omp master 300 | { 301 | if (!csv) 302 | printf( 303 | "Jacobi relaxation: %d iterations on %d x %d mesh with " 304 | "norm " 305 | "check every %d iterations\n", 306 | iter_max, ny, nx, nccheck); 307 | } 308 | 309 | constexpr int dim_block_x = 32; 310 | constexpr int dim_block_y = 32; 311 | dim3 dim_grid((nx + dim_block_x - 1) / dim_block_x, 312 | (ny + (num_devices * dim_block_y) - 1) / (num_devices * dim_block_y), 1); 313 | 314 | int iter = 0; 315 | bool calculate_norm = true; 316 | #pragma omp master 317 | { l2_norm = 1.0; } 318 | 319 | CUDA_RT_CALL(cudaDeviceSynchronize()); 320 | #pragma omp barrier 321 | double start = omp_get_wtime(); 322 | PUSH_RANGE("Jacobi solve", 0) 323 | while (l2_norm > tol && iter < iter_max) { 324 | CUDA_RT_CALL(cudaMemsetAsync(l2_norm_d, 0, sizeof(real), compute_stream)); 325 | 326 | // need to wait for other threads due to sharing of a_new and compute_done 327 | // between threads 328 | #pragma omp barrier 329 | CUDA_RT_CALL(cudaStreamWaitEvent(compute_stream, compute_done[iter % 2][top], 0)); 330 | CUDA_RT_CALL( 331 | cudaStreamWaitEvent(compute_stream, compute_done[iter % 2][bottom], 0)); 332 | 333 | calculate_norm = (iter % nccheck) == 0 || (!csv && (iter % 100) == 0); 334 | jacobi_kernel 335 | <<>>( 336 | a_new[dev_id], a, l2_norm_d, iy_start, iy_end[dev_id], nx, a_new[top], 337 | iy_end[top], a_new[bottom], 0, calculate_norm); 338 | CUDA_RT_CALL(cudaGetLastError()); 339 | CUDA_RT_CALL(cudaEventRecord(compute_done[(iter + 1) % 2][dev_id], compute_stream)); 340 | 341 | if (calculate_norm) { 342 | CUDA_RT_CALL(cudaMemcpyAsync(l2_norm_h, l2_norm_d, sizeof(real), 343 | cudaMemcpyDeviceToHost, compute_stream)); 344 | #pragma omp barrier 345 | #pragma omp single 346 | { l2_norm = 0.0; } 347 | #pragma omp barrier 348 | CUDA_RT_CALL(cudaStreamSynchronize(compute_stream)); 349 | #pragma omp atomic 350 | l2_norm += *(l2_norm_h); 351 | #pragma omp barrier 352 | #pragma omp single 353 | { l2_norm = std::sqrt(l2_norm); } 354 | #pragma omp barrier 355 | if (!csv && (iter % 100) == 0) { 356 | #pragma omp master 357 | printf("%5d, %0.6f\n", iter, l2_norm); 358 | } 359 | } 360 | 361 | #pragma omp barrier 362 | std::swap(a_new[dev_id], a); 363 | iter++; 364 | } 365 | CUDA_RT_CALL(cudaDeviceSynchronize()); 366 | #pragma omp barrier 367 | double stop = omp_get_wtime(); 368 | POP_RANGE 369 | 370 | CUDA_RT_CALL( 371 | cudaMemcpy(a_h + iy_start_global * nx, a + nx, 372 | std::min((ny - iy_start_global) * nx, chunk_size * nx) * sizeof(real), 373 | cudaMemcpyDeviceToHost)); 374 | #pragma omp barrier 375 | 376 | #pragma omp master 377 | { 378 | result_correct = true; 379 | for (int iy = 1; result_correct && (iy < (ny - 1)); ++iy) { 380 | for (int ix = 1; result_correct && (ix < (nx - 1)); ++ix) { 381 | if (std::fabs(a_ref_h[iy * nx + ix] - a_h[iy * nx + ix]) > tol) { 382 | fprintf(stderr, 383 | "ERROR: a[%d * %d + %d] = %f does not " 384 | "match %f (reference)\n", 385 | iy, nx, ix, a_h[iy * nx + ix], a_ref_h[iy * nx + ix]); 386 | result_correct = false; 387 | } 388 | } 389 | } 390 | if (result_correct) { 391 | if (csv) { 392 | printf( 393 | "multi_threaded_p2p, %d, %d, %d, %d, %d, 1, %f, " 394 | "%f\n", 395 | nx, ny, iter_max, nccheck, num_devices, (stop - start), runtime_serial); 396 | } else { 397 | printf("Num GPUs: %d.\n", num_devices); 398 | printf( 399 | "%dx%d: 1 GPU: %8.4f s, %d GPUs: %8.4f s, speedup: " 400 | "%8.2f, " 401 | "efficiency: %8.2f \n", 402 | ny, nx, runtime_serial, 
num_devices, (stop - start), 403 | runtime_serial / (stop - start), 404 | runtime_serial / (num_devices * (stop - start)) * 100); 405 | } 406 | } 407 | } 408 | 409 | CUDA_RT_CALL(cudaEventDestroy(push_bottom_done)); 410 | CUDA_RT_CALL(cudaEventDestroy(push_top_done)); 411 | CUDA_RT_CALL(cudaEventDestroy(compute_done[1][dev_id])); 412 | CUDA_RT_CALL(cudaEventDestroy(compute_done[0][dev_id])); 413 | CUDA_RT_CALL(cudaStreamDestroy(push_bottom_stream)); 414 | CUDA_RT_CALL(cudaStreamDestroy(push_top_stream)); 415 | CUDA_RT_CALL(cudaStreamDestroy(compute_stream)); 416 | 417 | CUDA_RT_CALL(cudaFreeHost(l2_norm_h)); 418 | CUDA_RT_CALL(cudaFree(l2_norm_d)); 419 | 420 | CUDA_RT_CALL(cudaFree(a_new[dev_id])); 421 | CUDA_RT_CALL(cudaFree(a)); 422 | if (0 == dev_id) { 423 | CUDA_RT_CALL(cudaFreeHost(a_h)); 424 | CUDA_RT_CALL(cudaFreeHost(a_ref_h)); 425 | } 426 | } 427 | } 428 | 429 | return result_correct ? 0 : 1; 430 | } 431 | 432 | double single_gpu(const int nx, const int ny, const int iter_max, real* const a_ref_h, 433 | const int nccheck, const bool print) { 434 | real* a; 435 | real* a_new; 436 | 437 | cudaStream_t compute_stream; 438 | cudaStream_t push_top_stream; 439 | cudaStream_t push_bottom_stream; 440 | cudaEvent_t compute_done; 441 | cudaEvent_t push_top_done; 442 | cudaEvent_t push_bottom_done; 443 | 444 | real* l2_norm_d; 445 | real* l2_norm_h; 446 | 447 | int iy_start = 1; 448 | int iy_end = (ny - 1); 449 | 450 | CUDA_RT_CALL(cudaMalloc(&a, nx * ny * sizeof(real))); 451 | CUDA_RT_CALL(cudaMalloc(&a_new, nx * ny * sizeof(real))); 452 | 453 | CUDA_RT_CALL(cudaMemset(a, 0, nx * ny * sizeof(real))); 454 | CUDA_RT_CALL(cudaMemset(a_new, 0, nx * ny * sizeof(real))); 455 | 456 | // Set diriclet boundary conditions on left and right boarder 457 | initialize_boundaries<<>>(a, a_new, PI, 0, nx, ny, ny); 458 | CUDA_RT_CALL(cudaGetLastError()); 459 | CUDA_RT_CALL(cudaDeviceSynchronize()); 460 | 461 | CUDA_RT_CALL(cudaStreamCreate(&compute_stream)); 462 | CUDA_RT_CALL(cudaStreamCreate(&push_top_stream)); 463 | CUDA_RT_CALL(cudaStreamCreate(&push_bottom_stream)); 464 | CUDA_RT_CALL(cudaEventCreateWithFlags(&compute_done, cudaEventDisableTiming)); 465 | CUDA_RT_CALL(cudaEventCreateWithFlags(&push_top_done, cudaEventDisableTiming)); 466 | CUDA_RT_CALL(cudaEventCreateWithFlags(&push_bottom_done, cudaEventDisableTiming)); 467 | 468 | CUDA_RT_CALL(cudaMalloc(&l2_norm_d, sizeof(real))); 469 | CUDA_RT_CALL(cudaMallocHost(&l2_norm_h, sizeof(real))); 470 | 471 | CUDA_RT_CALL(cudaDeviceSynchronize()); 472 | 473 | if (print) 474 | printf( 475 | "Single GPU jacobi relaxation: %d iterations on %d x %d mesh with " 476 | "norm " 477 | "check every %d iterations\n", 478 | iter_max, ny, nx, nccheck); 479 | 480 | constexpr int dim_block_x = 32; 481 | constexpr int dim_block_y = 32; 482 | dim3 dim_grid((nx + dim_block_x - 1) / dim_block_x, (ny + dim_block_y - 1) / dim_block_y, 1); 483 | 484 | int iter = 0; 485 | bool calculate_norm = true; 486 | real l2_norm = 1.0; 487 | 488 | CUDA_RT_CALL(cudaDeviceSynchronize()); 489 | double start = omp_get_wtime(); 490 | PUSH_RANGE("Jacobi solve", 0) 491 | while (l2_norm > tol && iter < iter_max) { 492 | CUDA_RT_CALL(cudaMemsetAsync(l2_norm_d, 0, sizeof(real), compute_stream)); 493 | 494 | CUDA_RT_CALL(cudaStreamWaitEvent(compute_stream, push_top_done, 0)); 495 | CUDA_RT_CALL(cudaStreamWaitEvent(compute_stream, push_bottom_done, 0)); 496 | 497 | calculate_norm = (iter % nccheck) == 0 || (print && ((iter % 100) == 0)); 498 | jacobi_kernel 499 | <<>>( 500 | a_new, a, l2_norm_d, 
iy_start, iy_end, nx, a_new, iy_start, a_new, (iy_end - 1), 501 | calculate_norm); 502 | CUDA_RT_CALL(cudaGetLastError()); 503 | CUDA_RT_CALL(cudaEventRecord(compute_done, compute_stream)); 504 | 505 | if (calculate_norm) { 506 | CUDA_RT_CALL(cudaMemcpyAsync(l2_norm_h, l2_norm_d, sizeof(real), cudaMemcpyDeviceToHost, 507 | compute_stream)); 508 | } 509 | 510 | // Apply periodic boundary conditions 511 | 512 | CUDA_RT_CALL(cudaStreamWaitEvent(push_top_stream, compute_done, 0)); 513 | CUDA_RT_CALL(cudaMemcpyAsync(a_new, a_new + (iy_end - 1) * nx, nx * sizeof(real), 514 | cudaMemcpyDeviceToDevice, push_top_stream)); 515 | CUDA_RT_CALL(cudaEventRecord(push_top_done, push_top_stream)); 516 | 517 | CUDA_RT_CALL(cudaStreamWaitEvent(push_bottom_stream, compute_done, 0)); 518 | CUDA_RT_CALL(cudaMemcpyAsync(a_new + iy_end * nx, a_new + iy_start * nx, nx * sizeof(real), 519 | cudaMemcpyDeviceToDevice, compute_stream)); 520 | CUDA_RT_CALL(cudaEventRecord(push_bottom_done, push_bottom_stream)); 521 | 522 | if (calculate_norm) { 523 | CUDA_RT_CALL(cudaStreamSynchronize(compute_stream)); 524 | l2_norm = *l2_norm_h; 525 | l2_norm = std::sqrt(l2_norm); 526 | if (print && (iter % 100) == 0) printf("%5d, %0.6f\n", iter, l2_norm); 527 | } 528 | 529 | std::swap(a_new, a); 530 | iter++; 531 | } 532 | CUDA_RT_CALL(cudaDeviceSynchronize()); 533 | POP_RANGE 534 | double stop = omp_get_wtime(); 535 | 536 | CUDA_RT_CALL(cudaMemcpy(a_ref_h, a, nx * ny * sizeof(real), cudaMemcpyDeviceToHost)); 537 | 538 | CUDA_RT_CALL(cudaEventDestroy(push_bottom_done)); 539 | CUDA_RT_CALL(cudaEventDestroy(push_top_done)); 540 | CUDA_RT_CALL(cudaEventDestroy(compute_done)); 541 | CUDA_RT_CALL(cudaStreamDestroy(push_bottom_stream)); 542 | CUDA_RT_CALL(cudaStreamDestroy(push_top_stream)); 543 | CUDA_RT_CALL(cudaStreamDestroy(compute_stream)); 544 | 545 | CUDA_RT_CALL(cudaFreeHost(l2_norm_h)); 546 | CUDA_RT_CALL(cudaFree(l2_norm_d)); 547 | 548 | CUDA_RT_CALL(cudaFree(a_new)); 549 | CUDA_RT_CALL(cudaFree(a)); 550 | return (stop - start); 551 | } 552 | -------------------------------------------------------------------------------- /multi_threaded_p2p_opt/Makefile: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved. 
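# Typical usage (assuming nvcc and an OpenMP-capable host compiler are on PATH):
#   make            # builds ./jacobi
#   make run        # runs it; like the other multi_threaded_* samples it uses one
#                   # OpenMP thread per visible GPU
#   make profile    # short 10-iteration run under Nsight Systems (nsys)
#   make sanitize   # short run under compute-sanitizer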
2 | NVCC=nvcc 3 | GENCODE_SM30 := -gencode arch=compute_30,code=sm_30 4 | GENCODE_SM35 := -gencode arch=compute_35,code=sm_35 5 | GENCODE_SM37 := -gencode arch=compute_37,code=sm_37 6 | GENCODE_SM50 := -gencode arch=compute_50,code=sm_50 7 | GENCODE_SM52 := -gencode arch=compute_52,code=sm_52 8 | GENCODE_SM60 := -gencode arch=compute_60,code=sm_60 9 | GENCODE_SM70 := -gencode arch=compute_70,code=sm_70 10 | GENCODE_SM80 := -gencode arch=compute_80,code=sm_80 11 | GENCODE_SM90 := -gencode arch=compute_90,code=sm_90 -gencode arch=compute_90,code=compute_90 12 | GENCODE_FLAGS := $(GENCODE_SM70) $(GENCODE_SM80) $(GENCODE_SM90) 13 | ifdef DISABLE_CUB 14 | NVCC_FLAGS = -Xptxas --optimize-float-atomics 15 | else 16 | NVCC_FLAGS = -DHAVE_CUB 17 | endif 18 | NVCC_FLAGS += -Xcompiler -fopenmp -lineinfo -DUSE_NVTX -ldl $(GENCODE_FLAGS) -std=c++14 19 | jacobi: Makefile jacobi.cu 20 | $(NVCC) $(NVCC_FLAGS) jacobi.cu -o jacobi 21 | 22 | .PHONY.: clean 23 | clean: 24 | rm -f jacobi jacobi.nsys-rep 25 | 26 | sanitize: jacobi 27 | compute-sanitizer ./jacobi -niter 10 28 | 29 | run: jacobi 30 | ./jacobi 31 | 32 | profile: jacobi 33 | nsys profile --trace=cuda,nvtx -o jacobi ./jacobi -niter 10 34 | -------------------------------------------------------------------------------- /multi_threaded_um/Makefile: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved. 2 | NVCC=nvcc 3 | GENCODE_SM30 := -gencode arch=compute_30,code=sm_30 4 | GENCODE_SM35 := -gencode arch=compute_35,code=sm_35 5 | GENCODE_SM37 := -gencode arch=compute_37,code=sm_37 6 | GENCODE_SM50 := -gencode arch=compute_50,code=sm_50 7 | GENCODE_SM52 := -gencode arch=compute_52,code=sm_52 8 | GENCODE_SM60 := -gencode arch=compute_60,code=sm_60 9 | GENCODE_SM70 := -gencode arch=compute_70,code=sm_70 10 | GENCODE_SM80 := -gencode arch=compute_80,code=sm_80 11 | GENCODE_SM90 := -gencode arch=compute_90,code=sm_90 -gencode arch=compute_90,code=compute_90 12 | GENCODE_FLAGS := $(GENCODE_SM70) $(GENCODE_SM80) $(GENCODE_SM90) 13 | ifdef DISABLE_CUB 14 | NVCC_FLAGS = -Xptxas --optimize-float-atomics 15 | else 16 | NVCC_FLAGS = -DHAVE_CUB 17 | endif 18 | NVCC_FLAGS += -Xcompiler -fopenmp -lineinfo -DUSE_NVTX -ldl $(GENCODE_FLAGS) -std=c++14 19 | jacobi: Makefile jacobi.cu 20 | $(NVCC) $(NVCC_FLAGS) jacobi.cu -o jacobi 21 | 22 | .PHONY.: clean 23 | clean: 24 | rm -f jacobi jacobi.nsys-rep 25 | 26 | sanitize: jacobi 27 | compute-sanitizer ./jacobi -niter 10 28 | 29 | run: jacobi 30 | ./jacobi 31 | 32 | profile: jacobi 33 | nsys profile --trace=cuda,nvtx -o jacobi ./jacobi -niter 10 34 | -------------------------------------------------------------------------------- /multi_threaded_um/jacobi.cu: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved. 2 | * 3 | * Redistribution and use in source and binary forms, with or without 4 | * modification, are permitted provided that the following conditions 5 | * are met: 6 | * * Redistributions of source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer. 8 | * * Redistributions in binary form must reproduce the above copyright 9 | * notice, this list of conditions and the following disclaimer in the 10 | * documentation and/or other materials provided with the distribution. 
11 | * * Neither the name of NVIDIA CORPORATION nor the names of its
12 | * contributors may be used to endorse or promote products derived
13 | * from this software without specific prior written permission.
14 | *
15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
16 | * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
18 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
19 | * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20 | * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21 | * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22 | * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
23 | * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25 | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 | */
27 | #include <algorithm>  // std::find, std::min, std::max
28 | #include <cmath>      // std::asin, std::fabs, std::sqrt
29 | #include <cstdio>     // printf, fprintf
30 | #include <cstdlib>    // exit
31 | #include <iostream>
32 | #include <sstream>    // std::istringstream
33 | 
34 | #include <omp.h>      // omp_get_thread_num, omp_get_wtime
35 | 
36 | #ifdef HAVE_CUB
37 | #include <cub/block/block_reduce.cuh>
38 | #endif  // HAVE_CUB
39 | 
40 | #ifdef USE_NVTX
41 | #include <nvtx3/nvToolsExt.h>  // NVTX v3; older CUDA toolkits ship this as <nvToolsExt.h>
42 | 
43 | const uint32_t colors[] = {0x0000ff00, 0x000000ff, 0x00ffff00, 0x00ff00ff,
44 |                            0x0000ffff, 0x00ff0000, 0x00ffffff};
45 | const int num_colors = sizeof(colors) / sizeof(uint32_t);
46 | 
47 | #define PUSH_RANGE(name, cid) \
48 |     { \
49 |         int color_id = cid; \
50 |         color_id = color_id % num_colors; \
51 |         nvtxEventAttributes_t eventAttrib = {0}; \
52 |         eventAttrib.version = NVTX_VERSION; \
53 |         eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE; \
54 |         eventAttrib.colorType = NVTX_COLOR_ARGB; \
55 |         eventAttrib.color = colors[color_id]; \
56 |         eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII; \
57 |         eventAttrib.message.ascii = name; \
58 |         nvtxRangePushEx(&eventAttrib); \
59 |     }
60 | #define POP_RANGE nvtxRangePop();
61 | #else
62 | #define PUSH_RANGE(name, cid)
63 | #define POP_RANGE
64 | #endif
65 | 
66 | #define CUDA_RT_CALL(call) \
67 |     { \
68 |         cudaError_t cudaStatus = call; \
69 |         if (cudaSuccess != cudaStatus) { \
70 |             fprintf(stderr, \
71 |                     "ERROR: CUDA RT call \"%s\" in line %d of file %s failed " \
72 |                     "with " \
73 |                     "%s (%d).\n", \
74 |                     #call, __LINE__, __FILE__, cudaGetErrorString(cudaStatus), cudaStatus); \
75 |             exit( cudaStatus ); \
76 |         } \
77 |     }
78 | 
79 | constexpr int MAX_NUM_DEVICES = 32;
80 | 
81 | typedef float real;
82 | constexpr real tol = 1.0e-8;
83 | 
84 | const real PI = 2.0 * std::asin(1.0);
85 | 
86 | __global__ void initialize_boundaries(real* __restrict__ const a_new, real* __restrict__ const a,
87 |                                       const real pi, const int nx, const int ny) {
88 |     for (int iy = blockIdx.x * blockDim.x + threadIdx.x; iy < ny; iy += blockDim.x * gridDim.x) {
89 |         const real y0 = sin(2.0 * pi * iy / (ny - 1));
90 |         a[iy * nx + 0] = y0;
91 |         a[iy * nx + (nx - 1)] = y0;
92 |         a_new[iy * nx + 0] = y0;
93 |         a_new[iy * nx + (nx - 1)] = y0;
94 |     }
95 | }
96 | 
97 | template <int BLOCK_DIM_X, int BLOCK_DIM_Y>
98 | __global__ void jacobi_kernel(real* __restrict__ const a_new, const real* __restrict__ const a,
99 |                               real* __restrict__ const l2_norm, const int iy_start,
100 |                               const int iy_end, const int nx, const int ny,
101 |                               const bool calculate_norm) {
102 | #ifdef HAVE_CUB
103 |     typedef cub::BlockReduce<real, BLOCK_DIM_X, cub::BLOCK_REDUCE_WARP_REDUCTIONS, BLOCK_DIM_Y>
104 |         BlockReduce;
105 |     __shared__ typename BlockReduce::TempStorage temp_storage;
106 | #endif  // HAVE_CUB
107 |     int iy = blockIdx.y * blockDim.y + threadIdx.y + iy_start;
108 |     int ix =
blockIdx.x * blockDim.x + threadIdx.x + 1; 109 | real local_l2_norm = 0.0; 110 | 111 | if (iy < iy_end && ix < (nx - 1)) { 112 | const real new_val = 0.25 * (a[iy * nx + ix + 1] + a[iy * nx + ix - 1] + 113 | a[(iy + 1) * nx + ix] + a[(iy - 1) * nx + ix]); 114 | a_new[iy * nx + ix] = new_val; 115 | if (1 == iy) { 116 | a_new[(ny - 1) * nx + ix] = new_val; 117 | } 118 | if ((ny - 2) == iy) { 119 | a_new[0 * nx + ix] = new_val; 120 | } 121 | 122 | if (calculate_norm) { 123 | real residue = new_val - a[iy * nx + ix]; 124 | local_l2_norm += residue * residue; 125 | } 126 | } 127 | if (calculate_norm) { 128 | #ifdef HAVE_CUB 129 | real block_l2_norm = BlockReduce(temp_storage).Sum(local_l2_norm); 130 | if (0 == threadIdx.y && 0 == threadIdx.x) atomicAdd(l2_norm, block_l2_norm); 131 | #else 132 | atomicAdd(l2_norm, local_l2_norm); 133 | #endif // HAVE_CUB 134 | } 135 | } 136 | 137 | double single_gpu(const int nx, const int ny, const int iter_max, real* const a_ref, 138 | const int nccheck, const bool print); 139 | 140 | template 141 | T get_argval(char** begin, char** end, const std::string& arg, const T default_val) { 142 | T argval = default_val; 143 | char** itr = std::find(begin, end, arg); 144 | if (itr != end && ++itr != end) { 145 | std::istringstream inbuf(*itr); 146 | inbuf >> argval; 147 | } 148 | return argval; 149 | } 150 | 151 | bool get_arg(char** begin, char** end, const std::string& arg) { 152 | char** itr = std::find(begin, end, arg); 153 | if (itr != end) { 154 | return true; 155 | } 156 | return false; 157 | } 158 | 159 | int main(int argc, char* argv[]) { 160 | const int iter_max = get_argval(argv, argv + argc, "-niter", 1000); 161 | const int nccheck = get_argval(argv, argv + argc, "-nccheck", 1); 162 | const int nx = get_argval(argv, argv + argc, "-nx", 16384); 163 | const int ny = get_argval(argv, argv + argc, "-ny", 16384); 164 | const bool csv = get_arg(argv, argv + argc, "-csv"); 165 | 166 | real* a; 167 | real* a_new; 168 | 169 | real* a_ref; 170 | double runtime_serial = 0.0; 171 | 172 | CUDA_RT_CALL(cudaSetDevice(0)); 173 | CUDA_RT_CALL(cudaFree(0)); 174 | 175 | CUDA_RT_CALL(cudaMallocManaged(&a_ref, nx * ny * sizeof(real))); 176 | runtime_serial = single_gpu(nx, ny, iter_max, a_ref, nccheck, !csv); 177 | 178 | CUDA_RT_CALL(cudaMallocManaged(&a, nx * ny * sizeof(real))); 179 | CUDA_RT_CALL(cudaMallocManaged(&a_new, nx * ny * sizeof(real))); 180 | 181 | CUDA_RT_CALL(cudaMemset(a, 0, nx * ny * sizeof(real))); 182 | CUDA_RT_CALL(cudaMemset(a_new, 0, nx * ny * sizeof(real))); 183 | 184 | // Set diriclet boundary conditions on left and right boarder 185 | initialize_boundaries<<>>(a, a_new, PI, nx, ny); 186 | CUDA_RT_CALL(cudaGetLastError()); 187 | CUDA_RT_CALL(cudaDeviceSynchronize()); 188 | 189 | if (!csv) 190 | printf( 191 | "Jacobi relaxation: %d iterations on %d x %d mesh with norm check " 192 | "every %d iterations\n", 193 | iter_max, ny, nx, nccheck); 194 | 195 | real l2_norm = 1.0; 196 | 197 | cudaEvent_t compute_done[2][MAX_NUM_DEVICES]; 198 | 199 | bool result_correct = true; 200 | int num_devices = 0; 201 | CUDA_RT_CALL(cudaGetDeviceCount(&num_devices)); 202 | #pragma omp parallel num_threads(num_devices) firstprivate(a, a_new) 203 | { 204 | int dev_id = omp_get_thread_num(); 205 | 206 | CUDA_RT_CALL(cudaSetDevice(dev_id)); 207 | CUDA_RT_CALL(cudaFree(0)); 208 | 209 | // ny - 2 rows are distributed amongst `size` ranks in such a way 210 | // that each rank gets either (ny - 2) / size or (ny - 2) / size + 1 rows. 
211 | // This optimizes load balancing when (ny - 2) % size != 0 212 | int chunk_size; 213 | int chunk_size_low = ny / num_devices; 214 | int chunk_size_high = chunk_size_low + 1; 215 | // To calculate the number of ranks that need to compute an extra row, 216 | // the following formula is derived from this equation: 217 | // num_ranks_low * chunk_size_low + (size - num_ranks_low) * (chunk_size_low + 1) = ny - 2 218 | int num_ranks_low = num_devices * chunk_size_low + num_devices - 219 | ny; // Number of ranks with chunk_size = chunk_size_low 220 | if (dev_id < num_ranks_low) 221 | chunk_size = chunk_size_low; 222 | else 223 | chunk_size = chunk_size_high; 224 | 225 | // Calculate local domain boundaries 226 | int iy_start; 227 | if (dev_id < num_ranks_low) { 228 | iy_start = dev_id * chunk_size_low; 229 | } else { 230 | iy_start = num_ranks_low * chunk_size_low + (dev_id - num_ranks_low) * chunk_size_high; 231 | } 232 | int iy_end = iy_start + chunk_size; 233 | // Do not process boundaries 234 | iy_start = std::max(iy_start, 1); 235 | iy_end = std::min(iy_end, ny - 1); 236 | 237 | const int top = dev_id > 0 ? dev_id - 1 : (num_devices - 1); 238 | int canAccessPeer = 0; 239 | CUDA_RT_CALL(cudaDeviceCanAccessPeer(&canAccessPeer, dev_id, top)); 240 | if (canAccessPeer) { 241 | CUDA_RT_CALL(cudaDeviceEnablePeerAccess(top, 0)); 242 | } 243 | const int bottom = (dev_id + 1) % num_devices; 244 | canAccessPeer = 0; 245 | CUDA_RT_CALL(cudaDeviceCanAccessPeer(&canAccessPeer, dev_id, bottom)); 246 | if (top != bottom && canAccessPeer) { 247 | CUDA_RT_CALL(cudaDeviceEnablePeerAccess(bottom, 0)); 248 | } 249 | 250 | #ifdef UM_HINTS 251 | CUDA_RT_CALL(cudaMemAdvise(a + iy_start * nx, (iy_end - iy_start) * nx * sizeof(real), 252 | cudaMemAdviseSetPreferredLocation, dev_id)); 253 | CUDA_RT_CALL(cudaMemAdvise(a + (iy_start - 1) * nx, nx * sizeof(real), 254 | cudaMemAdviseSetAccessedBy, dev_id)); 255 | CUDA_RT_CALL( 256 | cudaMemAdvise(a + iy_end * nx, nx * sizeof(real), cudaMemAdviseSetAccessedBy, dev_id)); 257 | CUDA_RT_CALL(cudaMemAdvise(a_new + iy_start * nx, (iy_end - iy_start) * nx * sizeof(real), 258 | cudaMemAdviseSetPreferredLocation, dev_id)); 259 | CUDA_RT_CALL(cudaMemAdvise(a_new + (iy_start - 1) * nx, nx * sizeof(real), 260 | cudaMemAdviseSetAccessedBy, dev_id)); 261 | CUDA_RT_CALL(cudaMemAdvise(a_new + iy_end * nx, nx * sizeof(real), 262 | cudaMemAdviseSetAccessedBy, dev_id)); 263 | #endif // UM_HINTS 264 | 265 | constexpr int dim_block_x = 32; 266 | constexpr int dim_block_y = 32; 267 | dim3 dim_grid((nx + dim_block_x - 1) / dim_block_x, 268 | (ny + (num_devices * dim_block_y) - 1) / (num_devices * dim_block_y), 1); 269 | 270 | real* l2_norm_d; 271 | real* l2_norm_h; 272 | 273 | CUDA_RT_CALL(cudaMalloc(&l2_norm_d, sizeof(real))); 274 | CUDA_RT_CALL(cudaMallocHost(&l2_norm_h, sizeof(real))); 275 | 276 | CUDA_RT_CALL(cudaEventCreateWithFlags(compute_done[0] + dev_id, cudaEventDisableTiming)); 277 | CUDA_RT_CALL(cudaEventCreateWithFlags(compute_done[1] + dev_id, cudaEventDisableTiming)); 278 | 279 | CUDA_RT_CALL(cudaDeviceSynchronize()); 280 | 281 | int iter = 0; 282 | bool calculate_norm = true; 283 | #pragma omp master 284 | { l2_norm = 1.0; } 285 | 286 | #pragma omp barrier 287 | double start = omp_get_wtime(); 288 | PUSH_RANGE("Jacobi solve", 0) 289 | while (l2_norm > tol && iter < iter_max) { 290 | CUDA_RT_CALL(cudaMemsetAsync(l2_norm_d, 0, sizeof(real), 0)); 291 | 292 | // need to wait for other threads due to sharing of a, a_new and compute_done 293 | // between threads 294 | #pragma omp 
barrier 295 | CUDA_RT_CALL(cudaStreamWaitEvent(0, compute_done[iter % 2][top], 0)); 296 | CUDA_RT_CALL(cudaStreamWaitEvent(0, compute_done[iter % 2][bottom], 0)); 297 | 298 | calculate_norm = (iter % nccheck) == 0 || (!csv && (iter % 100) == 0); 299 | jacobi_kernel<<>>( 300 | a_new, a, l2_norm_d, iy_start, iy_end, nx, ny, calculate_norm); 301 | CUDA_RT_CALL(cudaGetLastError()); 302 | CUDA_RT_CALL(cudaEventRecord(compute_done[(iter + 1) % 2][dev_id], 0)); 303 | #pragma omp barrier 304 | 305 | if (calculate_norm) { 306 | CUDA_RT_CALL( 307 | cudaMemcpyAsync(l2_norm_h, l2_norm_d, sizeof(real), cudaMemcpyDeviceToHost, 0)); 308 | #pragma omp barrier 309 | #pragma omp single 310 | { l2_norm = 0.0; } 311 | #pragma omp barrier 312 | CUDA_RT_CALL(cudaStreamSynchronize(0)); 313 | #pragma omp atomic 314 | l2_norm += *(l2_norm_h); 315 | #pragma omp barrier 316 | #pragma omp single 317 | { l2_norm = std::sqrt(l2_norm); } 318 | #pragma omp barrier 319 | if (!csv && (iter % 100) == 0) { 320 | #pragma omp master 321 | printf("%5d, %0.6f\n", iter, l2_norm); 322 | } 323 | } 324 | 325 | std::swap(a_new, a); 326 | iter++; 327 | } 328 | CUDA_RT_CALL(cudaDeviceSynchronize()); 329 | #pragma omp barrier 330 | POP_RANGE 331 | double stop = omp_get_wtime(); 332 | 333 | #pragma omp barrier 334 | 335 | #pragma omp master 336 | { 337 | result_correct = true; 338 | for (int iy = 1; result_correct && (iy < (ny - 1)); ++iy) { 339 | for (int ix = 1; result_correct && (ix < (nx - 1)); ++ix) { 340 | if (std::fabs(a_ref[iy * nx + ix] - a[iy * nx + ix]) > tol) { 341 | fprintf(stderr, 342 | "ERROR: a[%d * %d + %d] = %f does not match %f " 343 | "(reference)\n", 344 | iy, nx, ix, a[iy * nx + ix], a_ref[iy * nx + ix]); 345 | result_correct = false; 346 | } 347 | } 348 | } 349 | if (result_correct) { 350 | if (csv) { 351 | printf("multi_threaded_um, %d, %d, %d, %d, %d, 1, %f, %f\n", nx, ny, iter_max, 352 | nccheck, num_devices, (stop - start), runtime_serial); 353 | } else { 354 | printf("Num GPUs: %d.\n", num_devices); 355 | printf( 356 | "%dx%d: 1 GPU: %8.4f s, %d GPUs: %8.4f s, speedup: " 357 | "%8.2f, " 358 | "efficiency: %8.2f \n", 359 | ny, nx, runtime_serial, num_devices, (stop - start), 360 | runtime_serial / (stop - start), 361 | runtime_serial / (num_devices * (stop - start)) * 100); 362 | } 363 | } 364 | } 365 | 366 | CUDA_RT_CALL(cudaEventDestroy(compute_done[1][dev_id])); 367 | CUDA_RT_CALL(cudaEventDestroy(compute_done[0][dev_id])); 368 | 369 | CUDA_RT_CALL(cudaFreeHost(l2_norm_h)); 370 | CUDA_RT_CALL(cudaFree(l2_norm_d)); 371 | CUDA_RT_CALL(cudaDeviceSynchronize()); 372 | } 373 | 374 | CUDA_RT_CALL(cudaFree(a_new)); 375 | CUDA_RT_CALL(cudaFree(a)); 376 | 377 | CUDA_RT_CALL(cudaFree(a_ref)); 378 | 379 | return result_correct ? 
0 : 1; 380 | } 381 | 382 | template 383 | __global__ void jacobi_kernel(real* __restrict__ const a_new, const real* __restrict__ const a, 384 | real* __restrict__ const l2_norm, const int iy_start, 385 | const int iy_end, const int nx, const bool calculate_norm) { 386 | #ifdef HAVE_CUB 387 | typedef cub::BlockReduce 388 | BlockReduce; 389 | __shared__ typename BlockReduce::TempStorage temp_storage; 390 | #endif // HAVE_CUB 391 | int iy = blockIdx.y * blockDim.y + threadIdx.y + iy_start; 392 | int ix = blockIdx.x * blockDim.x + threadIdx.x + 1; 393 | real local_l2_norm = 0.0; 394 | 395 | if (iy < iy_end && ix < (nx - 1)) { 396 | const real new_val = 0.25 * (a[iy * nx + ix + 1] + a[iy * nx + ix - 1] + 397 | a[(iy + 1) * nx + ix] + a[(iy - 1) * nx + ix]); 398 | a_new[iy * nx + ix] = new_val; 399 | 400 | if (calculate_norm) { 401 | real residue = new_val - a[iy * nx + ix]; 402 | local_l2_norm += residue * residue; 403 | } 404 | } 405 | if (calculate_norm) { 406 | #ifdef HAVE_CUB 407 | real block_l2_norm = BlockReduce(temp_storage).Sum(local_l2_norm); 408 | if (0 == threadIdx.y && 0 == threadIdx.x) atomicAdd(l2_norm, block_l2_norm); 409 | #else 410 | atomicAdd(l2_norm, local_l2_norm); 411 | #endif // HAVE_CUB 412 | } 413 | } 414 | 415 | double single_gpu(const int nx, const int ny, const int iter_max, real* const a_ref, 416 | const int nccheck, const bool print) { 417 | real* a; 418 | real* a_new; 419 | 420 | cudaStream_t compute_stream; 421 | cudaStream_t push_top_stream; 422 | cudaStream_t push_bottom_stream; 423 | cudaEvent_t compute_done; 424 | cudaEvent_t push_top_done; 425 | cudaEvent_t push_bottom_done; 426 | 427 | real* l2_norm_d; 428 | real* l2_norm_h; 429 | 430 | int iy_start = 1; 431 | int iy_end = (ny - 1); 432 | 433 | CUDA_RT_CALL(cudaMalloc(&a, nx * ny * sizeof(real))); 434 | CUDA_RT_CALL(cudaMalloc(&a_new, nx * ny * sizeof(real))); 435 | 436 | CUDA_RT_CALL(cudaMemset(a, 0, nx * ny * sizeof(real))); 437 | CUDA_RT_CALL(cudaMemset(a_new, 0, nx * ny * sizeof(real))); 438 | 439 | // Set diriclet boundary conditions on left and right boarder 440 | initialize_boundaries<<>>(a, a_new, PI, nx, ny); 441 | CUDA_RT_CALL(cudaGetLastError()); 442 | CUDA_RT_CALL(cudaDeviceSynchronize()); 443 | 444 | CUDA_RT_CALL(cudaStreamCreate(&compute_stream)); 445 | CUDA_RT_CALL(cudaStreamCreate(&push_top_stream)); 446 | CUDA_RT_CALL(cudaStreamCreate(&push_bottom_stream)); 447 | CUDA_RT_CALL(cudaEventCreateWithFlags(&compute_done, cudaEventDisableTiming)); 448 | CUDA_RT_CALL(cudaEventCreateWithFlags(&push_top_done, cudaEventDisableTiming)); 449 | CUDA_RT_CALL(cudaEventCreateWithFlags(&push_bottom_done, cudaEventDisableTiming)); 450 | 451 | CUDA_RT_CALL(cudaMalloc(&l2_norm_d, sizeof(real))); 452 | CUDA_RT_CALL(cudaMallocHost(&l2_norm_h, sizeof(real))); 453 | 454 | CUDA_RT_CALL(cudaDeviceSynchronize()); 455 | 456 | if (print) 457 | printf( 458 | "Single GPU jacobi relaxation: %d iterations on %d x %d mesh with " 459 | "norm " 460 | "check every %d iterations\n", 461 | iter_max, ny, nx, nccheck); 462 | 463 | constexpr int dim_block_x = 32; 464 | constexpr int dim_block_y = 32; 465 | dim3 dim_grid((nx + dim_block_x - 1) / dim_block_x, (ny + dim_block_y - 1) / dim_block_y, 1); 466 | 467 | int iter = 0; 468 | bool calculate_norm = true; 469 | real l2_norm = 1.0; 470 | 471 | double start = omp_get_wtime(); 472 | PUSH_RANGE("Jacobi solve", 0) 473 | while (l2_norm > tol && iter < iter_max) { 474 | CUDA_RT_CALL(cudaMemsetAsync(l2_norm_d, 0, sizeof(real), compute_stream)); 475 | 476 | 
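        // Wait for the previous iteration's periodic-boundary copies (push_top_done /
        // push_bottom_done, recorded on the copy streams) before the stencil reads the
        // halo rows of what became `a` after the last std::swap(a_new, a).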
CUDA_RT_CALL(cudaStreamWaitEvent(compute_stream, push_top_done, 0)); 477 | CUDA_RT_CALL(cudaStreamWaitEvent(compute_stream, push_bottom_done, 0)); 478 | 479 | calculate_norm = (iter % nccheck) == 0 || (print && ((iter % 100) == 0)); 480 | 481 | jacobi_kernel 482 | <<>>( 483 | a_new, a, l2_norm_d, iy_start, iy_end, nx, calculate_norm); 484 | CUDA_RT_CALL(cudaGetLastError()); 485 | CUDA_RT_CALL(cudaEventRecord(compute_done, compute_stream)); 486 | 487 | if (calculate_norm) { 488 | CUDA_RT_CALL(cudaMemcpyAsync(l2_norm_h, l2_norm_d, sizeof(real), cudaMemcpyDeviceToHost, 489 | compute_stream)); 490 | } 491 | 492 | // Apply periodic boundary conditions 493 | 494 | CUDA_RT_CALL(cudaStreamWaitEvent(push_top_stream, compute_done, 0)); 495 | CUDA_RT_CALL(cudaMemcpyAsync(a_new, a_new + (iy_end - 1) * nx, nx * sizeof(real), 496 | cudaMemcpyDeviceToDevice, push_top_stream)); 497 | CUDA_RT_CALL(cudaEventRecord(push_top_done, push_top_stream)); 498 | 499 | CUDA_RT_CALL(cudaStreamWaitEvent(push_bottom_stream, compute_done, 0)); 500 | CUDA_RT_CALL(cudaMemcpyAsync(a_new + iy_end * nx, a_new + iy_start * nx, nx * sizeof(real), 501 | cudaMemcpyDeviceToDevice, compute_stream)); 502 | CUDA_RT_CALL(cudaEventRecord(push_bottom_done, push_bottom_stream)); 503 | 504 | if (calculate_norm) { 505 | CUDA_RT_CALL(cudaStreamSynchronize(compute_stream)); 506 | l2_norm = *l2_norm_h; 507 | l2_norm = std::sqrt(l2_norm); 508 | if (print && (iter % 100) == 0) printf("%5d, %0.6f\n", iter, l2_norm); 509 | } 510 | 511 | std::swap(a_new, a); 512 | iter++; 513 | } 514 | POP_RANGE 515 | double stop = omp_get_wtime(); 516 | 517 | CUDA_RT_CALL(cudaMemcpy(a_ref, a, nx * ny * sizeof(real), cudaMemcpyDeviceToHost)); 518 | 519 | CUDA_RT_CALL(cudaEventDestroy(push_bottom_done)); 520 | CUDA_RT_CALL(cudaEventDestroy(push_top_done)); 521 | CUDA_RT_CALL(cudaEventDestroy(compute_done)); 522 | CUDA_RT_CALL(cudaStreamDestroy(push_bottom_stream)); 523 | CUDA_RT_CALL(cudaStreamDestroy(push_top_stream)); 524 | CUDA_RT_CALL(cudaStreamDestroy(compute_stream)); 525 | 526 | CUDA_RT_CALL(cudaFreeHost(l2_norm_h)); 527 | CUDA_RT_CALL(cudaFree(l2_norm_d)); 528 | 529 | CUDA_RT_CALL(cudaFree(a_new)); 530 | CUDA_RT_CALL(cudaFree(a)); 531 | return (stop - start); 532 | } 533 | -------------------------------------------------------------------------------- /nccl/Makefile: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 
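# NP sets how many MPI ranks the run/sanitize/profile targets launch with $(MPIRUN)
# (typically one rank per GPU); override CUDA_HOME and NCCL_HOME if CUDA or NCCL live
# in non-default locations, e.g. `make run NP=4 NCCL_HOME=/opt/nccl` (example path only).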
2 | NP ?= 1 3 | NVCC=nvcc 4 | MPICXX=mpicxx 5 | MPIRUN ?= mpirun 6 | CUDA_HOME ?= /usr/local/cuda 7 | NCCL_HOME ?= /usr 8 | GENCODE_SM30 := -gencode arch=compute_30,code=sm_30 9 | GENCODE_SM35 := -gencode arch=compute_35,code=sm_35 10 | GENCODE_SM37 := -gencode arch=compute_37,code=sm_37 11 | GENCODE_SM50 := -gencode arch=compute_50,code=sm_50 12 | GENCODE_SM52 := -gencode arch=compute_52,code=sm_52 13 | GENCODE_SM60 := -gencode arch=compute_60,code=sm_60 14 | GENCODE_SM70 := -gencode arch=compute_70,code=sm_70 15 | GENCODE_SM80 := -gencode arch=compute_80,code=sm_80 16 | GENCODE_SM90 := -gencode arch=compute_90,code=sm_90 -gencode arch=compute_90,code=compute_90 17 | GENCODE_FLAGS := $(GENCODE_SM70) $(GENCODE_SM80) $(GENCODE_SM90) 18 | ifdef DISABLE_CUB 19 | NVCC_FLAGS = -Xptxas --optimize-float-atomics 20 | else 21 | NVCC_FLAGS = -DHAVE_CUB 22 | endif 23 | NVCC_FLAGS += -lineinfo $(GENCODE_FLAGS) -std=c++14 24 | MPICXX_FLAGS = -DUSE_NVTX -I$(CUDA_HOME)/include -I$(NCCL_HOME)/include -std=c++14 25 | LD_FLAGS = -L$(CUDA_HOME)/lib64 -L$(NCCL_HOME)/lib -lcudart -ldl -lnccl 26 | jacobi: Makefile jacobi.cpp jacobi_kernels.o 27 | $(MPICXX) $(MPICXX_FLAGS) jacobi.cpp jacobi_kernels.o $(LD_FLAGS) -o jacobi 28 | 29 | jacobi_kernels.o: Makefile jacobi_kernels.cu 30 | $(NVCC) $(NVCC_FLAGS) jacobi_kernels.cu -c 31 | 32 | .PHONY.: clean 33 | clean: 34 | rm -f jacobi jacobi_kernels.o *.nsys-rep jacobi.*.compute-sanitizer.log 35 | 36 | sanitize: jacobi 37 | $(MPIRUN) -np $(NP) compute-sanitizer --log-file jacobi.%q{OMPI_COMM_WORLD_RANK}.compute-sanitizer.log ./jacobi -niter 10 38 | 39 | run: jacobi 40 | $(MPIRUN) -np $(NP) ./jacobi 41 | 42 | profile: jacobi 43 | $(MPIRUN) -np $(NP) nsys profile --trace=mpi,cuda,nvtx -o jacobi.%q{OMPI_COMM_WORLD_RANK} ./jacobi -niter 10 44 | -------------------------------------------------------------------------------- /nccl/jacobi_kernels.cu: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 2 | * 3 | * Redistribution and use in source and binary forms, with or without 4 | * modification, are permitted provided that the following conditions 5 | * are met: 6 | * * Redistributions of source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer. 8 | * * Redistributions in binary form must reproduce the above copyright 9 | * notice, this list of conditions and the following disclaimer in the 10 | * documentation and/or other materials provided with the distribution. 11 | * * Neither the name of NVIDIA CORPORATION nor the names of its 12 | * contributors may be used to endorse or promote products derived 13 | * from this software without specific prior written permission. 14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR
19 | * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20 | * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21 | * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22 | * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
23 | * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25 | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 | */
27 | #include <cstdio>   // fprintf, stderr (used by CUDA_RT_CALL)
28 | #include <cstdlib>  // exit (used by CUDA_RT_CALL)
29 | 
30 | #ifdef HAVE_CUB
31 | #include <cub/block/block_reduce.cuh>
32 | #endif  // HAVE_CUB
33 | 
34 | #define CUDA_RT_CALL(call) \
35 |     { \
36 |         cudaError_t cudaStatus = call; \
37 |         if (cudaSuccess != cudaStatus) { \
38 |             fprintf(stderr, \
39 |                     "ERROR: CUDA RT call \"%s\" in line %d of file %s failed " \
40 |                     "with " \
41 |                     "%s (%d).\n", \
42 |                     #call, __LINE__, __FILE__, cudaGetErrorString(cudaStatus), cudaStatus); \
43 |             exit( cudaStatus ); \
44 |         } \
45 |     }
46 | 
47 | #ifdef USE_DOUBLE
48 | typedef double real;
49 | #define MPI_REAL_TYPE MPI_DOUBLE
50 | #else
51 | typedef float real;
52 | #define MPI_REAL_TYPE MPI_FLOAT
53 | #endif
54 | 
55 | __global__ void initialize_boundaries(real* __restrict__ const a_new, real* __restrict__ const a,
56 |                                       const real pi, const int offset, const int nx,
57 |                                       const int my_ny, const int ny) {
58 |     for (int iy = blockIdx.x * blockDim.x + threadIdx.x; iy < my_ny; iy += blockDim.x * gridDim.x) {
59 |         const real y0 = sin(2.0 * pi * (offset + iy) / (ny - 1));
60 |         a[iy * nx + 0] = y0;
61 |         a[iy * nx + (nx - 1)] = y0;
62 |         a_new[iy * nx + 0] = y0;
63 |         a_new[iy * nx + (nx - 1)] = y0;
64 |     }
65 | }
66 | 
67 | void launch_initialize_boundaries(real* __restrict__ const a_new, real* __restrict__ const a,
68 |                                   const real pi, const int offset, const int nx, const int my_ny,
69 |                                   const int ny) {
70 |     initialize_boundaries<<<my_ny / 128 + 1, 128>>>(a_new, a, pi, offset, nx, my_ny, ny);
71 |     CUDA_RT_CALL(cudaGetLastError());
72 | }
73 | 
74 | template <int BLOCK_DIM_X, int BLOCK_DIM_Y>
75 | __global__ void jacobi_kernel(real* __restrict__ const a_new, const real* __restrict__ const a,
76 |                               real* __restrict__ const l2_norm, const int iy_start,
77 |                               const int iy_end, const int nx, const bool calculate_norm) {
78 | #ifdef HAVE_CUB
79 |     typedef cub::BlockReduce<real, BLOCK_DIM_X, cub::BLOCK_REDUCE_WARP_REDUCTIONS, BLOCK_DIM_Y>
80 |         BlockReduce;
81 |     __shared__ typename BlockReduce::TempStorage temp_storage;
82 | #endif  // HAVE_CUB
83 |     int iy = blockIdx.y * blockDim.y + threadIdx.y + iy_start;
84 |     int ix = blockIdx.x * blockDim.x + threadIdx.x + 1;
85 |     real local_l2_norm = 0.0;
86 | 
87 |     if (iy < iy_end && ix < (nx - 1)) {
88 |         const real new_val = 0.25 * (a[iy * nx + ix + 1] + a[iy * nx + ix - 1] +
89 |                                      a[(iy + 1) * nx + ix] + a[(iy - 1) * nx + ix]);
90 |         a_new[iy * nx + ix] = new_val;
91 |         if (calculate_norm) {
92 |             real residue = new_val - a[iy * nx + ix];
93 |             local_l2_norm += residue * residue;
94 |         }
95 |     }
96 |     if (calculate_norm) {
97 | #ifdef HAVE_CUB
98 |         real block_l2_norm = BlockReduce(temp_storage).Sum(local_l2_norm);
99 |         if (0 == threadIdx.y && 0 == threadIdx.x) atomicAdd(l2_norm, block_l2_norm);
100 | #else
101 |         atomicAdd(l2_norm, local_l2_norm);
102 | #endif  // HAVE_CUB
103 |     }
104 | }
105 | 
106 | void launch_jacobi_kernel(real* __restrict__ const a_new, const real* __restrict__ const a,
107 |                           real* __restrict__ const l2_norm, const int iy_start, const int iy_end,
108 |                           const int nx, const bool calculate_norm, cudaStream_t stream) {
109 |     constexpr int dim_block_x = 32;
110 |     constexpr int dim_block_y = 32;
111 | 
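    // Grid sizing: one 32x32 block per 32x32 tile of the local domain; x covers the nx
    // columns and y covers the iy_end - iy_start interior rows this rank owns, rounded up.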
dim3 dim_grid((nx + dim_block_x - 1) / dim_block_x, 112 | ((iy_end - iy_start) + dim_block_y - 1) / dim_block_y, 1); 113 | jacobi_kernel<<>>( 114 | a_new, a, l2_norm, iy_start, iy_end, nx, calculate_norm); 115 | CUDA_RT_CALL(cudaGetLastError()); 116 | } 117 | -------------------------------------------------------------------------------- /nccl_graphs/Makefile: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021,2022, NVIDIA CORPORATION. All rights reserved. 2 | NP ?= 1 3 | NVCC=nvcc 4 | MPICXX=mpicxx 5 | MPIRUN ?= mpirun 6 | CUDA_HOME ?= /usr/local/cuda 7 | NCCL_HOME ?= /usr 8 | GENCODE_SM30 := -gencode arch=compute_30,code=sm_30 9 | GENCODE_SM35 := -gencode arch=compute_35,code=sm_35 10 | GENCODE_SM37 := -gencode arch=compute_37,code=sm_37 11 | GENCODE_SM50 := -gencode arch=compute_50,code=sm_50 12 | GENCODE_SM52 := -gencode arch=compute_52,code=sm_52 13 | GENCODE_SM60 := -gencode arch=compute_60,code=sm_60 14 | GENCODE_SM70 := -gencode arch=compute_70,code=sm_70 15 | GENCODE_SM80 := -gencode arch=compute_80,code=sm_80 16 | GENCODE_SM90 := -gencode arch=compute_90,code=sm_90 -gencode arch=compute_90,code=compute_90 17 | GENCODE_FLAGS := $(GENCODE_SM70) $(GENCODE_SM80) $(GENCODE_SM90) 18 | ifdef DISABLE_CUB 19 | NVCC_FLAGS = -Xptxas --optimize-float-atomics 20 | else 21 | NVCC_FLAGS = -DHAVE_CUB 22 | endif 23 | NVCC_FLAGS += -lineinfo $(GENCODE_FLAGS) -std=c++14 24 | MPICXX_FLAGS = -DUSE_NVTX -I$(CUDA_HOME)/include -I$(NCCL_HOME)/include -std=c++14 25 | LD_FLAGS = -L$(CUDA_HOME)/lib64 -L$(NCCL_HOME)/lib -lcudart -ldl -lnccl 26 | jacobi: Makefile jacobi.cpp jacobi_kernels.o 27 | $(MPICXX) $(MPICXX_FLAGS) jacobi.cpp jacobi_kernels.o $(LD_FLAGS) -o jacobi 28 | 29 | jacobi_kernels.o: Makefile jacobi_kernels.cu 30 | $(NVCC) $(NVCC_FLAGS) jacobi_kernels.cu -c 31 | 32 | .PHONY.: clean 33 | clean: 34 | rm -f jacobi jacobi_kernels.o *.nsys-rep jacobi.*.compute-sanitizer.log 35 | 36 | sanitize: jacobi 37 | $(MPIRUN) -np $(NP) compute-sanitizer --log-file jacobi.%q{OMPI_COMM_WORLD_RANK}.compute-sanitizer.log ./jacobi -niter 10 38 | 39 | run: jacobi 40 | $(MPIRUN) -np $(NP) ./jacobi 41 | 42 | profile: jacobi 43 | $(MPIRUN) -np $(NP) nsys profile --trace=mpi,cuda,nvtx --cuda-graph-trace=node -o jacobi.%q{OMPI_COMM_WORLD_RANK} ./jacobi -niter 10 44 | -------------------------------------------------------------------------------- /nccl_graphs/jacobi_kernels.cu: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2021,2022, NVIDIA CORPORATION. All rights reserved. 2 | * 3 | * Redistribution and use in source and binary forms, with or without 4 | * modification, are permitted provided that the following conditions 5 | * are met: 6 | * * Redistributions of source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer. 8 | * * Redistributions in binary form must reproduce the above copyright 9 | * notice, this list of conditions and the following disclaimer in the 10 | * documentation and/or other materials provided with the distribution. 11 | * * Neither the name of NVIDIA CORPORATION nor the names of its 12 | * contributors may be used to endorse or promote products derived 13 | * from this software without specific prior written permission. 
14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | */ 27 | #include 28 | 29 | #ifdef HAVE_CUB 30 | #include 31 | #endif // HAVE_CUB 32 | 33 | #define CUDA_RT_CALL(call) \ 34 | { \ 35 | cudaError_t cudaStatus = call; \ 36 | if (cudaSuccess != cudaStatus) { \ 37 | fprintf(stderr, \ 38 | "ERROR: CUDA RT call \"%s\" in line %d of file %s failed " \ 39 | "with " \ 40 | "%s (%d).\n", \ 41 | #call, __LINE__, __FILE__, cudaGetErrorString(cudaStatus), cudaStatus); \ 42 | exit( cudaStatus ); \ 43 | } \ 44 | } 45 | 46 | #ifdef USE_DOUBLE 47 | typedef double real; 48 | #define MPI_REAL_TYPE MPI_DOUBLE 49 | #else 50 | typedef float real; 51 | #define MPI_REAL_TYPE MPI_FLOAT 52 | #endif 53 | 54 | __global__ void initialize_boundaries(real* __restrict__ const a_new, real* __restrict__ const a, 55 | const real pi, const int offset, const int nx, 56 | const int my_ny, const int ny) { 57 | for (int iy = blockIdx.x * blockDim.x + threadIdx.x; iy < my_ny; iy += blockDim.x * gridDim.x) { 58 | const real y0 = sin(2.0 * pi * (offset + iy) / (ny - 1)); 59 | a[iy * nx + 0] = y0; 60 | a[iy * nx + (nx - 1)] = y0; 61 | a_new[iy * nx + 0] = y0; 62 | a_new[iy * nx + (nx - 1)] = y0; 63 | } 64 | } 65 | 66 | void launch_initialize_boundaries(real* __restrict__ const a_new, real* __restrict__ const a, 67 | const real pi, const int offset, const int nx, const int my_ny, 68 | const int ny) { 69 | initialize_boundaries<<>>(a_new, a, pi, offset, nx, my_ny, ny); 70 | CUDA_RT_CALL(cudaGetLastError()); 71 | } 72 | 73 | template 74 | __global__ void jacobi_kernel(real* __restrict__ const a_new, const real* __restrict__ const a, 75 | real* __restrict__ const l2_norm, const int iy_start, 76 | const int iy_end, const int nx, const bool calculate_norm) { 77 | #ifdef HAVE_CUB 78 | typedef cub::BlockReduce 79 | BlockReduce; 80 | __shared__ typename BlockReduce::TempStorage temp_storage; 81 | #endif // HAVE_CUB 82 | int iy = blockIdx.y * blockDim.y + threadIdx.y + iy_start; 83 | int ix = blockIdx.x * blockDim.x + threadIdx.x + 1; 84 | real local_l2_norm = 0.0; 85 | 86 | if (iy < iy_end && ix < (nx - 1)) { 87 | const real new_val = 0.25 * (a[iy * nx + ix + 1] + a[iy * nx + ix - 1] + 88 | a[(iy + 1) * nx + ix] + a[(iy - 1) * nx + ix]); 89 | a_new[iy * nx + ix] = new_val; 90 | if (calculate_norm) { 91 | real residue = new_val - a[iy * nx + ix]; 92 | local_l2_norm += residue * residue; 93 | } 94 | } 95 | if (calculate_norm) { 96 | #ifdef HAVE_CUB 97 | real block_l2_norm = BlockReduce(temp_storage).Sum(local_l2_norm); 98 | if (0 == threadIdx.y && 0 == threadIdx.x) atomicAdd(l2_norm, block_l2_norm); 99 | #else 100 | atomicAdd(l2_norm, local_l2_norm); 101 | #endif // HAVE_CUB 102 | } 103 | } 104 | 105 | void launch_jacobi_kernel(real* __restrict__ const a_new, const real* 
__restrict__ const a, 106 | real* __restrict__ const l2_norm, const int iy_start, const int iy_end, 107 | const int nx, const bool calculate_norm, cudaStream_t stream) { 108 | constexpr int dim_block_x = 32; 109 | constexpr int dim_block_y = 32; 110 | dim3 dim_grid((nx + dim_block_x - 1) / dim_block_x, 111 | ((iy_end - iy_start) + dim_block_y - 1) / dim_block_y, 1); 112 | jacobi_kernel<<>>( 113 | a_new, a, l2_norm, iy_start, iy_end, nx, calculate_norm); 114 | CUDA_RT_CALL(cudaGetLastError()); 115 | } 116 | -------------------------------------------------------------------------------- /nccl_overlap/Makefile: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 2 | NP ?= 1 3 | NVCC=nvcc 4 | MPICXX=mpicxx 5 | MPIRUN ?= mpirun 6 | CUDA_HOME ?= /usr/local/cuda 7 | NCCL_HOME ?= /usr 8 | GENCODE_SM30 := -gencode arch=compute_30,code=sm_30 9 | GENCODE_SM35 := -gencode arch=compute_35,code=sm_35 10 | GENCODE_SM37 := -gencode arch=compute_37,code=sm_37 11 | GENCODE_SM50 := -gencode arch=compute_50,code=sm_50 12 | GENCODE_SM52 := -gencode arch=compute_52,code=sm_52 13 | GENCODE_SM60 := -gencode arch=compute_60,code=sm_60 14 | GENCODE_SM70 := -gencode arch=compute_70,code=sm_70 15 | GENCODE_SM80 := -gencode arch=compute_80,code=sm_80 16 | GENCODE_SM90 := -gencode arch=compute_90,code=sm_90 -gencode arch=compute_90,code=compute_90 17 | GENCODE_FLAGS := $(GENCODE_SM70) $(GENCODE_SM80) $(GENCODE_SM90) 18 | ifdef DISABLE_CUB 19 | NVCC_FLAGS = -Xptxas --optimize-float-atomics 20 | else 21 | NVCC_FLAGS = -DHAVE_CUB 22 | endif 23 | NVCC_FLAGS += -lineinfo $(GENCODE_FLAGS) -std=c++14 24 | MPICXX_FLAGS = -DUSE_NVTX -I$(CUDA_HOME)/include -I$(NCCL_HOME)/include -std=c++14 25 | LD_FLAGS = -L$(CUDA_HOME)/lib64 -L$(NCCL_HOME)/lib -lcudart -ldl -lnccl 26 | jacobi: Makefile jacobi.cpp jacobi_kernels.o 27 | $(MPICXX) $(MPICXX_FLAGS) jacobi.cpp jacobi_kernels.o $(LD_FLAGS) -o jacobi 28 | 29 | jacobi_kernels.o: Makefile jacobi_kernels.cu 30 | $(NVCC) $(NVCC_FLAGS) jacobi_kernels.cu -c 31 | 32 | .PHONY.: clean 33 | clean: 34 | rm -f jacobi jacobi_kernels.o *.nsys-rep jacobi.*.compute-sanitizer.log 35 | 36 | sanitize: jacobi 37 | $(MPIRUN) -np $(NP) compute-sanitizer --log-file jacobi.%q{OMPI_COMM_WORLD_RANK}.compute-sanitizer.log ./jacobi -niter 10 38 | 39 | run: jacobi 40 | $(MPIRUN) -np $(NP) ./jacobi 41 | 42 | profile: jacobi 43 | $(MPIRUN) -np $(NP) nsys profile --trace=mpi,cuda,nvtx -o jacobi.%q{OMPI_COMM_WORLD_RANK} ./jacobi -niter 10 44 | -------------------------------------------------------------------------------- /nccl_overlap/jacobi_kernels.cu: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 2 | * 3 | * Redistribution and use in source and binary forms, with or without 4 | * modification, are permitted provided that the following conditions 5 | * are met: 6 | * * Redistributions of source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer. 8 | * * Redistributions in binary form must reproduce the above copyright 9 | * notice, this list of conditions and the following disclaimer in the 10 | * documentation and/or other materials provided with the distribution. 
11 | * * Neither the name of NVIDIA CORPORATION nor the names of its 12 | * contributors may be used to endorse or promote products derived 13 | * from this software without specific prior written permission. 14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | */ 27 | #include 28 | #include 29 | 30 | #ifdef HAVE_CUB 31 | #include 32 | #endif // HAVE_CUB 33 | 34 | #define CUDA_RT_CALL(call) \ 35 | { \ 36 | cudaError_t cudaStatus = call; \ 37 | if (cudaSuccess != cudaStatus) { \ 38 | fprintf(stderr, \ 39 | "ERROR: CUDA RT call \"%s\" in line %d of file %s failed " \ 40 | "with " \ 41 | "%s (%d).\n", \ 42 | #call, __LINE__, __FILE__, cudaGetErrorString(cudaStatus), cudaStatus); \ 43 | exit( cudaStatus ); \ 44 | } \ 45 | } 46 | 47 | #ifdef USE_DOUBLE 48 | typedef double real; 49 | #define MPI_REAL_TYPE MPI_DOUBLE 50 | #else 51 | typedef float real; 52 | #define MPI_REAL_TYPE MPI_FLOAT 53 | #endif 54 | 55 | __global__ void initialize_boundaries(real* __restrict__ const a_new, real* __restrict__ const a, 56 | const real pi, const int offset, const int nx, 57 | const int my_ny, const int ny) { 58 | for (int iy = blockIdx.x * blockDim.x + threadIdx.x; iy < my_ny; iy += blockDim.x * gridDim.x) { 59 | const real y0 = sin(2.0 * pi * (offset + iy) / (ny - 1)); 60 | a[iy * nx + 0] = y0; 61 | a[iy * nx + (nx - 1)] = y0; 62 | a_new[iy * nx + 0] = y0; 63 | a_new[iy * nx + (nx - 1)] = y0; 64 | } 65 | } 66 | 67 | void launch_initialize_boundaries(real* __restrict__ const a_new, real* __restrict__ const a, 68 | const real pi, const int offset, const int nx, const int my_ny, 69 | const int ny) { 70 | initialize_boundaries<<>>(a_new, a, pi, offset, nx, my_ny, ny); 71 | CUDA_RT_CALL(cudaGetLastError()); 72 | } 73 | 74 | template 75 | __global__ void jacobi_kernel(real* __restrict__ const a_new, const real* __restrict__ const a, 76 | real* __restrict__ const l2_norm, const int iy_start, 77 | const int iy_end, const int nx, const bool calculate_norm) { 78 | #ifdef HAVE_CUB 79 | typedef cub::BlockReduce 80 | BlockReduce; 81 | __shared__ typename BlockReduce::TempStorage temp_storage; 82 | #endif // HAVE_CUB 83 | int iy = blockIdx.y * blockDim.y + threadIdx.y + iy_start; 84 | int ix = blockIdx.x * blockDim.x + threadIdx.x + 1; 85 | real local_l2_norm = 0.0; 86 | 87 | if (iy < iy_end && ix < (nx - 1)) { 88 | const real new_val = 0.25 * (a[iy * nx + ix + 1] + a[iy * nx + ix - 1] + 89 | a[(iy + 1) * nx + ix] + a[(iy - 1) * nx + ix]); 90 | a_new[iy * nx + ix] = new_val; 91 | if (calculate_norm) { 92 | real residue = new_val - a[iy * nx + ix]; 93 | local_l2_norm += residue * residue; 94 | } 95 | } 96 | if (calculate_norm) { 97 | #ifdef HAVE_CUB 98 | real block_l2_norm = BlockReduce(temp_storage).Sum(local_l2_norm); 99 | if (0 == threadIdx.y && 0 == 
threadIdx.x) atomicAdd(l2_norm, block_l2_norm); 100 | #else 101 | atomicAdd(l2_norm, local_l2_norm); 102 | #endif // HAVE_CUB 103 | } 104 | } 105 | 106 | void launch_jacobi_kernel(real* __restrict__ const a_new, const real* __restrict__ const a, 107 | real* __restrict__ const l2_norm, const int iy_start, const int iy_end, 108 | const int nx, const bool calculate_norm, cudaStream_t stream) { 109 | constexpr int dim_block_x = 32; 110 | constexpr int dim_block_y = 32; 111 | dim3 dim_grid((nx + dim_block_x - 1) / dim_block_x, 112 | ((iy_end - iy_start) + dim_block_y - 1) / dim_block_y, 1); 113 | jacobi_kernel<<>>( 114 | a_new, a, l2_norm, iy_start, iy_end, nx, calculate_norm); 115 | CUDA_RT_CALL(cudaGetLastError()); 116 | } 117 | -------------------------------------------------------------------------------- /nvshmem/Makefile: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 2 | NP ?= 1 3 | NVCC=nvcc 4 | MPIRUN ?= mpirun 5 | CUDA_HOME ?= /usr/local/cuda 6 | ifndef NVSHMEM_HOME 7 | $(error NVSHMEM_HOME is not set) 8 | endif 9 | GENCODE_SM30 := -gencode arch=compute_30,code=sm_30 10 | GENCODE_SM35 := -gencode arch=compute_35,code=sm_35 11 | GENCODE_SM37 := -gencode arch=compute_37,code=sm_37 12 | GENCODE_SM50 := -gencode arch=compute_50,code=sm_50 13 | GENCODE_SM52 := -gencode arch=compute_52,code=sm_52 14 | GENCODE_SM60 := -gencode arch=compute_60,code=sm_60 15 | GENCODE_SM70 := -gencode arch=compute_70,code=sm_70 16 | GENCODE_SM80 := -gencode arch=compute_80,code=sm_80 17 | GENCODE_SM90 := -gencode arch=compute_90,code=sm_90 -gencode arch=compute_90,code=compute_90 18 | ifdef USE_LTO 19 | GENCODE_SM70 += -gencode arch=compute_70,code=lto_70 20 | GENCODE_SM80 += -gencode arch=compute_80,code=lto_80 21 | GENCODE_SM90 += -gencode arch=compute_90,code=lto_90 22 | endif 23 | GENCODE_FLAGS := $(GENCODE_SM70) $(GENCODE_SM80) $(GENCODE_SM90) 24 | 25 | ifdef DISABLE_CUB 26 | NVCC_FLAGS = -Xptxas --optimize-float-atomics 27 | else 28 | NVCC_FLAGS = -DHAVE_CUB 29 | endif 30 | NVCC_FLAGS += -ccbin=mpic++ -dc -Xcompiler -fopenmp -lineinfo -DUSE_NVTX -ldl $(GENCODE_FLAGS) -std=c++14 -I$(NVSHMEM_HOME)/include 31 | NVCC_LDFLAGS = -ccbin=mpic++ -L$(NVSHMEM_HOME)/lib -lnvshmem -L$(CUDA_HOME)/lib64 -lcuda -lcudart -ldl -lnvidia-ml 32 | ifdef USE_LTO 33 | NVCC_FLAGS += -maxrregcount=32 34 | NVCC_LDFLAGS += -maxrregcount=32 -dlto 35 | endif 36 | jacobi: Makefile jacobi.cu 37 | $(NVCC) $(NVCC_FLAGS) jacobi.cu -c -o jacobi.o 38 | $(NVCC) $(GENCODE_FLAGS) jacobi.o -o jacobi $(NVCC_LDFLAGS) 39 | 40 | .PHONY.: clean 41 | clean: 42 | rm -f jacobi jacobi.o *.nsys-rep jacobi.*.compute-sanitizer.log 43 | 44 | sanitize: jacobi 45 | $(MPIRUN) -np $(NP) compute-sanitizer --log-file jacobi.%q{OMPI_COMM_WORLD_RANK}.compute-sanitizer.log ./jacobi -niter 10 46 | 47 | run: jacobi 48 | $(MPIRUN) -np $(NP) ./jacobi 49 | 50 | profile: jacobi 51 | $(MPIRUN) -np $(NP) nsys profile --trace=mpi,cuda,nvtx -o jacobi.%q{OMPI_COMM_WORLD_RANK} ./jacobi -niter 10 52 | -------------------------------------------------------------------------------- /single_gpu/Makefile: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved. 
2 | NVCC=nvcc 3 | GENCODE_SM30 := -gencode arch=compute_30,code=sm_30 4 | GENCODE_SM35 := -gencode arch=compute_35,code=sm_35 5 | GENCODE_SM37 := -gencode arch=compute_37,code=sm_37 6 | GENCODE_SM50 := -gencode arch=compute_50,code=sm_50 7 | GENCODE_SM52 := -gencode arch=compute_52,code=sm_52 8 | GENCODE_SM60 := -gencode arch=compute_60,code=sm_60 9 | GENCODE_SM70 := -gencode arch=compute_70,code=sm_70 10 | GENCODE_SM80 := -gencode arch=compute_80,code=sm_80 11 | GENCODE_SM90 := -gencode arch=compute_90,code=sm_90 -gencode arch=compute_90,code=compute_90 12 | GENCODE_FLAGS := $(GENCODE_SM70) $(GENCODE_SM80) $(GENCODE_SM90) 13 | ifdef DISABLE_CUB 14 | NVCC_FLAGS = -Xptxas --optimize-float-atomics 15 | else 16 | NVCC_FLAGS = -DHAVE_CUB 17 | endif 18 | NVCC_FLAGS += -Xcompiler -fopenmp -lineinfo -DUSE_NVTX -ldl $(GENCODE_FLAGS) -std=c++14 19 | jacobi: Makefile jacobi.cu 20 | $(NVCC) $(NVCC_FLAGS) jacobi.cu -o jacobi 21 | 22 | .PHONY.: clean 23 | clean: 24 | rm -f jacobi jacobi.nsys-rep 25 | 26 | sanitize: jacobi 27 | compute-sanitizer ./jacobi -niter 10 28 | 29 | run: jacobi 30 | ./jacobi 31 | 32 | profile: jacobi 33 | nsys profile --trace=cuda,nvtx -o jacobi ./jacobi -niter 10 34 | -------------------------------------------------------------------------------- /single_gpu/jacobi.cu: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved. 2 | * 3 | * Redistribution and use in source and binary forms, with or without 4 | * modification, are permitted provided that the following conditions 5 | * are met: 6 | * * Redistributions of source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer. 8 | * * Redistributions in binary form must reproduce the above copyright 9 | * notice, this list of conditions and the following disclaimer in the 10 | * documentation and/or other materials provided with the distribution. 11 | * * Neither the name of NVIDIA CORPORATION nor the names of its 12 | * contributors may be used to endorse or promote products derived 13 | * from this software without specific prior written permission. 14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
26 | */ 27 | #include 28 | #include 29 | #include 30 | #include 31 | #include 32 | #include 33 | #include 34 | #include 35 | #include 36 | 37 | #include <omp.h> 38 | 39 | #ifdef HAVE_CUB 40 | #include <cub/block/block_reduce.cuh> 41 | #endif // HAVE_CUB 42 | 43 | #ifdef USE_NVTX 44 | #include <nvToolsExt.h> 45 | 46 | const uint32_t colors[] = {0x0000ff00, 0x000000ff, 0x00ffff00, 0x00ff00ff, 47 | 0x0000ffff, 0x00ff0000, 0x00ffffff}; 48 | const int num_colors = sizeof(colors) / sizeof(uint32_t); 49 | 50 | #define PUSH_RANGE(name, cid) \ 51 | { \ 52 | int color_id = cid; \ 53 | color_id = color_id % num_colors; \ 54 | nvtxEventAttributes_t eventAttrib = {0}; \ 55 | eventAttrib.version = NVTX_VERSION; \ 56 | eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE; \ 57 | eventAttrib.colorType = NVTX_COLOR_ARGB; \ 58 | eventAttrib.color = colors[color_id]; \ 59 | eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII; \ 60 | eventAttrib.message.ascii = name; \ 61 | nvtxRangePushEx(&eventAttrib); \ 62 | } 63 | #define POP_RANGE nvtxRangePop(); 64 | #else 65 | #define PUSH_RANGE(name, cid) 66 | #define POP_RANGE 67 | #endif 68 | 69 | #define CUDA_RT_CALL(call) \ 70 | { \ 71 | cudaError_t cudaStatus = call; \ 72 | if (cudaSuccess != cudaStatus) { \ 73 | fprintf(stderr, \ 74 | "ERROR: CUDA RT call \"%s\" in line %d of file %s failed " \ 75 | "with " \ 76 | "%s (%d).\n", \ 77 | #call, __LINE__, __FILE__, cudaGetErrorString(cudaStatus), cudaStatus); \ 78 | exit( cudaStatus ); \ 79 | } \ 80 | } 81 | 82 | typedef float real; 83 | constexpr real tol = 1.0e-8; 84 | 85 | const real PI = 2.0 * std::asin(1.0); 86 | 87 | __global__ void initialize_boundaries(real* __restrict__ const a_new, real* __restrict__ const a, 88 | const real pi, const int nx, const int ny) { 89 | for (int iy = blockIdx.x * blockDim.x + threadIdx.x; iy < ny; iy += blockDim.x * gridDim.x) { 90 | const real y0 = sin(2.0 * pi * iy / (ny - 1)); 91 | a[iy * nx + 0] = y0; 92 | a[iy * nx + (nx - 1)] = y0; 93 | a_new[iy * nx + 0] = y0; 94 | a_new[iy * nx + (nx - 1)] = y0; 95 | } 96 | } 97 | 98 | template <int BLOCK_DIM_X, int BLOCK_DIM_Y> 99 | __global__ void jacobi_kernel(real* __restrict__ const a_new, const real* __restrict__ const a, 100 | real* __restrict__ const l2_norm, const int iy_start, 101 | const int iy_end, const int nx) { 102 | #ifdef HAVE_CUB 103 | typedef cub::BlockReduce<real, BLOCK_DIM_X, cub::BLOCK_REDUCE_WARP_REDUCTIONS, BLOCK_DIM_Y> 104 | BlockReduce; 105 | __shared__ typename BlockReduce::TempStorage temp_storage; 106 | #endif // HAVE_CUB 107 | const int iy = blockIdx.y * blockDim.y + threadIdx.y + 1; 108 | const int ix = blockIdx.x * blockDim.x + threadIdx.x; 109 | real local_l2_norm = 0.0; 110 | 111 | if (iy < iy_end) { 112 | if (ix >= 1 && ix < (nx - 1)) { 113 | const real new_val = 0.25 * (a[iy * nx + ix + 1] + a[iy * nx + ix - 1] + 114 | a[(iy + 1) * nx + ix] + a[(iy - 1) * nx + ix]); 115 | a_new[iy * nx + ix] = new_val; 116 | 117 | // apply boundary conditions 118 | if (iy_start == iy) { 119 | a_new[iy_end * nx + ix] = new_val; 120 | } 121 | 122 | if ((iy_end - 1) == iy) { 123 | a_new[(iy_start - 1) * nx + ix] = new_val; 124 | } 125 | 126 | real residue = new_val - a[iy * nx + ix]; 127 | local_l2_norm = residue * residue; 128 | } 129 | } 130 | #ifdef HAVE_CUB 131 | real block_l2_norm = BlockReduce(temp_storage).Sum(local_l2_norm); 132 | if (0 == threadIdx.y && 0 == threadIdx.x) atomicAdd(l2_norm, block_l2_norm); 133 | #else 134 | atomicAdd(l2_norm, local_l2_norm); 135 | #endif // HAVE_CUB 136 | } 137 | 138 | double noopt(const int nx, const int ny, const int iter_max, real* const a_ref_h, const int nccheck, 139 | const bool print); 140 | 141 | template <typename T> 142 | T get_argval(char**
begin, char** end, const std::string& arg, const T default_val) { 143 | T argval = default_val; 144 | char** itr = std::find(begin, end, arg); 145 | if (itr != end && ++itr != end) { 146 | std::istringstream inbuf(*itr); 147 | inbuf >> argval; 148 | } 149 | return argval; 150 | } 151 | 152 | bool get_arg(char** begin, char** end, const std::string& arg) { 153 | char** itr = std::find(begin, end, arg); 154 | if (itr != end) { 155 | return true; 156 | } 157 | return false; 158 | } 159 | 160 | struct l2_norm_buf { 161 | cudaEvent_t copy_done; 162 | real* d; 163 | real* h; 164 | }; 165 | 166 | int main(int argc, char* argv[]) { 167 | const int iter_max = get_argval(argv, argv + argc, "-niter", 1000); 168 | const int nccheck = get_argval(argv, argv + argc, "-nccheck", 1); 169 | const int nx = get_argval(argv, argv + argc, "-nx", 16384); 170 | const int ny = get_argval(argv, argv + argc, "-ny", 16384); 171 | const bool csv = get_arg(argv, argv + argc, "-csv"); 172 | 173 | if (nccheck != 1) { 174 | fprintf(stderr, "Only nccheck = 1 is supported\n"); 175 | return -1; 176 | } 177 | 178 | real* a; 179 | real* a_new; 180 | 181 | cudaStream_t compute_stream; 182 | cudaStream_t copy_l2_norm_stream; 183 | cudaStream_t reset_l2_norm_stream; 184 | 185 | cudaEvent_t compute_done; 186 | cudaEvent_t reset_l2_norm_done[2]; 187 | 188 | real l2_norms[2]; 189 | l2_norm_buf l2_norm_bufs[2]; 190 | 191 | int iy_start = 1; 192 | int iy_end = (ny - 1); 193 | 194 | CUDA_RT_CALL(cudaSetDevice(0)); 195 | CUDA_RT_CALL(cudaFree(0)); 196 | 197 | CUDA_RT_CALL(cudaMalloc(&a, nx * ny * sizeof(real))); 198 | CUDA_RT_CALL(cudaMalloc(&a_new, nx * ny * sizeof(real))); 199 | 200 | CUDA_RT_CALL(cudaMemset(a, 0, nx * ny * sizeof(real))); 201 | CUDA_RT_CALL(cudaMemset(a_new, 0, nx * ny * sizeof(real))); 202 | 203 | // Set diriclet boundary conditions on left and right boarder 204 | initialize_boundaries<<>>(a, a_new, PI, nx, ny); 205 | CUDA_RT_CALL(cudaGetLastError()); 206 | CUDA_RT_CALL(cudaDeviceSynchronize()); 207 | 208 | CUDA_RT_CALL(cudaStreamCreate(&compute_stream)); 209 | CUDA_RT_CALL(cudaStreamCreate(©_l2_norm_stream)); 210 | CUDA_RT_CALL(cudaStreamCreate(&reset_l2_norm_stream)); 211 | CUDA_RT_CALL(cudaEventCreateWithFlags(&compute_done, cudaEventDisableTiming)); 212 | CUDA_RT_CALL(cudaEventCreateWithFlags(&reset_l2_norm_done[0], cudaEventDisableTiming)); 213 | CUDA_RT_CALL(cudaEventCreateWithFlags(&reset_l2_norm_done[1], cudaEventDisableTiming)); 214 | 215 | for (int i = 0; i < 2; ++i) { 216 | CUDA_RT_CALL(cudaEventCreateWithFlags(&l2_norm_bufs[i].copy_done, cudaEventDisableTiming)); 217 | CUDA_RT_CALL(cudaMalloc(&l2_norm_bufs[i].d, sizeof(real))); 218 | CUDA_RT_CALL(cudaMemset(l2_norm_bufs[i].d, 0, sizeof(real))); 219 | CUDA_RT_CALL(cudaMallocHost(&l2_norm_bufs[i].h, sizeof(real))); 220 | (*l2_norm_bufs[i].h) = 1.0; 221 | } 222 | 223 | CUDA_RT_CALL(cudaDeviceSynchronize()); 224 | 225 | if (!csv) 226 | printf( 227 | "Jacobi relaxation: %d iterations on %d x %d mesh with norm check " 228 | "every %d iterations\n", 229 | iter_max, ny, nx, nccheck); 230 | 231 | constexpr int dim_block_x = 32; 232 | constexpr int dim_block_y = 32; 233 | dim3 dim_grid((nx + dim_block_x - 1) / dim_block_x, (ny + dim_block_y - 1) / dim_block_y, 1); 234 | 235 | int iter = 0; 236 | for (int i = 0; i < 2; ++i) { 237 | l2_norms[i] = 0.0; 238 | } 239 | 240 | double start = omp_get_wtime(); 241 | 242 | PUSH_RANGE("Jacobi solve", 0) 243 | 244 | bool l2_norm_greater_than_tol = true; 245 | while (l2_norm_greater_than_tol && iter < iter_max) { 246 | // on 
new iteration: old current vars are now previous vars, old 247 | // previous vars are no longer needed 248 | int prev = iter % 2; 249 | int curr = (iter + 1) % 2; 250 | 251 | // wait for memset from old previous iteration to complete 252 | CUDA_RT_CALL(cudaStreamWaitEvent(compute_stream, reset_l2_norm_done[curr], 0)); 253 | 254 | jacobi_kernel 255 | <<>>( 256 | a_new, a, l2_norm_bufs[curr].d, iy_start, iy_end, nx); 257 | CUDA_RT_CALL(cudaGetLastError()); 258 | CUDA_RT_CALL(cudaEventRecord(compute_done, compute_stream)); 259 | 260 | // perform L2 norm calculation 261 | if ((iter % nccheck) == 0 || (!csv && (iter % 100) == 0)) { 262 | CUDA_RT_CALL(cudaStreamWaitEvent(copy_l2_norm_stream, compute_done, 0)); 263 | CUDA_RT_CALL(cudaMemcpyAsync(l2_norm_bufs[curr].h, l2_norm_bufs[curr].d, sizeof(real), 264 | cudaMemcpyDeviceToHost, copy_l2_norm_stream)); 265 | CUDA_RT_CALL(cudaEventRecord(l2_norm_bufs[curr].copy_done, copy_l2_norm_stream)); 266 | 267 | // make sure D2H copy is complete before using the data for 268 | // calculation 269 | CUDA_RT_CALL(cudaEventSynchronize(l2_norm_bufs[prev].copy_done)); 270 | 271 | l2_norms[prev] = *(l2_norm_bufs[prev].h); 272 | l2_norms[prev] = std::sqrt(l2_norms[prev]); 273 | l2_norm_greater_than_tol = (l2_norms[prev] > tol); 274 | 275 | if (!csv && (iter % 100) == 0) { 276 | printf("%5d, %0.6f\n", iter, l2_norms[prev]); 277 | } 278 | 279 | // reset everything for next iteration 280 | l2_norms[prev] = 0.0; 281 | *(l2_norm_bufs[prev].h) = 0.0; 282 | CUDA_RT_CALL( 283 | cudaMemsetAsync(l2_norm_bufs[prev].d, 0, sizeof(real), reset_l2_norm_stream)); 284 | CUDA_RT_CALL(cudaEventRecord(reset_l2_norm_done[prev], reset_l2_norm_stream)); 285 | } 286 | 287 | std::swap(a_new, a); 288 | iter++; 289 | } 290 | CUDA_RT_CALL(cudaDeviceSynchronize()); 291 | POP_RANGE 292 | double stop = omp_get_wtime(); 293 | 294 | if (csv) { 295 | printf("single_gpu, %d, %d, %d, %d, %f\n", nx, ny, iter_max, nccheck, (stop - start)); 296 | } else { 297 | printf("%dx%d: 1 GPU: %8.4f s\n", ny, nx, (stop - start)); 298 | } 299 | 300 | for (int i = 0; i < 2; ++i) { 301 | CUDA_RT_CALL(cudaFreeHost(l2_norm_bufs[i].h)); 302 | CUDA_RT_CALL(cudaFree(l2_norm_bufs[i].d)); 303 | CUDA_RT_CALL(cudaEventDestroy(l2_norm_bufs[i].copy_done)); 304 | } 305 | 306 | CUDA_RT_CALL(cudaEventDestroy(reset_l2_norm_done[1])); 307 | CUDA_RT_CALL(cudaEventDestroy(reset_l2_norm_done[0])); 308 | CUDA_RT_CALL(cudaEventDestroy(compute_done)); 309 | 310 | CUDA_RT_CALL(cudaStreamDestroy(reset_l2_norm_stream)); 311 | CUDA_RT_CALL(cudaStreamDestroy(copy_l2_norm_stream)); 312 | CUDA_RT_CALL(cudaStreamDestroy(compute_stream)); 313 | 314 | CUDA_RT_CALL(cudaFree(a_new)); 315 | CUDA_RT_CALL(cudaFree(a)); 316 | 317 | return 0; 318 | } 319 | -------------------------------------------------------------------------------- /single_threaded_copy/Makefile: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved. 
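# Passing DISABLE_CUB=1 on the make command line (e.g. `make DISABLE_CUB=1 run`) drops the
# -DHAVE_CUB define, so the Jacobi kernel falls back to one atomicAdd per thread for the L2 norm
# and -Xptxas --optimize-float-atomics is passed instead; the default build defines HAVE_CUB and
# first reduces the norm within each block via cub::BlockReduce before a single atomicAdd per block.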
2 | NVCC=nvcc 3 | GENCODE_SM30 := -gencode arch=compute_30,code=sm_30 4 | GENCODE_SM35 := -gencode arch=compute_35,code=sm_35 5 | GENCODE_SM37 := -gencode arch=compute_37,code=sm_37 6 | GENCODE_SM50 := -gencode arch=compute_50,code=sm_50 7 | GENCODE_SM52 := -gencode arch=compute_52,code=sm_52 8 | GENCODE_SM60 := -gencode arch=compute_60,code=sm_60 9 | GENCODE_SM70 := -gencode arch=compute_70,code=sm_70 10 | GENCODE_SM80 := -gencode arch=compute_80,code=sm_80 11 | GENCODE_SM90 := -gencode arch=compute_90,code=sm_90 -gencode arch=compute_90,code=compute_90 12 | GENCODE_FLAGS := $(GENCODE_SM70) $(GENCODE_SM80) $(GENCODE_SM90) 13 | ifdef DISABLE_CUB 14 | NVCC_FLAGS = -Xptxas --optimize-float-atomics 15 | else 16 | NVCC_FLAGS = -DHAVE_CUB 17 | endif 18 | NVCC_FLAGS += -Xcompiler -fopenmp -lineinfo -DUSE_NVTX -ldl $(GENCODE_FLAGS) -std=c++14 19 | jacobi: Makefile jacobi.cu 20 | $(NVCC) $(NVCC_FLAGS) jacobi.cu -o jacobi 21 | 22 | .PHONY.: clean 23 | clean: 24 | rm -f jacobi jacobi.nsys-rep 25 | 26 | sanitize: jacobi 27 | compute-sanitizer ./jacobi -niter 10 28 | 29 | run: jacobi 30 | ./jacobi 31 | 32 | profile: jacobi 33 | nsys profile --trace=cuda,nvtx -o jacobi ./jacobi -niter 10 34 | -------------------------------------------------------------------------------- /test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright (c) 2017,2024, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Redistribution and use in source and binary forms, with or without 5 | # modification, are permitted provided that the following conditions 6 | # are met: 7 | # * Redistributions of source code must retain the above copyright 8 | # notice, this list of conditions and the following disclaimer. 9 | # * Redistributions in binary form must reproduce the above copyright 10 | # notice, this list of conditions and the following disclaimer in the 11 | # documentation and/or other materials provided with the distribution. 12 | # * Neither the name of NVIDIA CORPORATION nor the names of its 13 | # contributors may be used to endorse or promote products derived 14 | # from this software without specific prior written permission. 15 | # 16 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 17 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 19 | # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 20 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 21 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 22 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 23 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 24 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 25 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 26 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 27 | 28 | now=`date +"%Y%m%d%H%M%S"` 29 | LOG="test-${now}.log" 30 | 31 | if [ -v HPCSDK_RELEASE ]; then 32 | echo "Running with NVIDIA HPC SDK" 33 | 34 | if [ ! -v CUDA_HOME ] || [ ! -d ${CUDA_HOME} ]; then 35 | export CUDA_HOME=$(nvc++ -cuda -printcudaversion |& grep "CUDA Path" | awk -F '=' '{print $2}') 36 | echo "Setting CUDA_HOME=${CUDA_HOME}" 37 | fi 38 | 39 | if [ ! -v NCCL_HOME ] || [ ! 
-d ${NCCL_HOME} ]; then 40 | export NCCL_HOME=$(dirname `echo $LD_LIBRARY_PATH | tr ':' '\n' | grep nccl | grep -v sharp`) 41 | echo "Setting NCCL_HOME=${NCCL_HOME}" 42 | fi 43 | 44 | if [ ! -v NVSHMEM_HOME ] || [ ! -d ${NVSHMEM_HOME} ]; then 45 | export NVSHMEM_HOME=$(dirname `echo $LD_LIBRARY_PATH | tr ':' '\n' | grep nvshmem`) 46 | echo "Setting NVSHMEM_HOME=${NVSHMEM_HOME}" 47 | fi 48 | fi 49 | 50 | if [ -e ${LOG} ]; then 51 | echo "ERROR log file ${LOG} already exists." 52 | exit 1 53 | fi 54 | 55 | #DGX-1V 56 | #CUDA_VISIBLE_DEVICES_SETTING=("0" "0" "0,3" "0,3,2" "0,3,2,1" "3,2,1,5,7" "0,3,2,1,5,4" "0,4,7,6,5,1,2" "0,3,2,1,5,6,7,4" ) 57 | #DGX A100 and DGX H100 58 | CUDA_VISIBLE_DEVICES_SETTING=("0" "0" "0,1" "0,1,2" "0,1,2,3" "0,1,2,3,4" "0,1,2,3,4,5" "0,1,2,3,4,5,6" "0,1,2,3,4,5,6,7" ) 59 | 60 | errors=0 61 | 62 | for entry in `ls -1`; do 63 | if [ -f ${entry}/Makefile ] ; then 64 | if [ "run" == "$1" ] ; then 65 | NUM_GPUS=`nvidia-smi -L | wc -l` 66 | for (( NP=1; NP<=${NUM_GPUS}; NP++ )) ; do 67 | export NP 68 | export CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES_SETTING[${NP}]} 69 | CMD="make -C ${entry} $1" 70 | ${CMD} >> ${LOG} 2>&1 71 | if [ $? -ne 0 ]; then 72 | echo "ERROR with ${CMD} (NP = ${NP}, CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES}) see ${LOG} for details." 73 | errors=1 74 | break 75 | fi 76 | done 77 | else 78 | CMD="make -C ${entry} $1" 79 | ${CMD} >> ${LOG} 2>&1 80 | if [ $? -ne 0 ]; then 81 | echo "ERROR with ${CMD} see ${LOG} for details." 82 | errors=1 83 | break 84 | fi 85 | fi 86 | fi 87 | done 88 | 89 | if [ ${errors} -eq 0 ]; then 90 | echo "Passed." 91 | exit 0 92 | else 93 | exit 1 94 | fi 95 | --------------------------------------------------------------------------------
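A minimal usage sketch for the tree above, assuming a machine with NVIDIA GPUs, an MPI launcher, and CUDA_HOME/NCCL_HOME/NVSHMEM_HOME resolvable the way the Makefiles and test.sh expect (the GPU count and directory chosen below are illustrative):

    ./test.sh                        # run the default make target (build) in every directory that has a Makefile
    ./test.sh run                    # build and run each variant on 1 .. <number of visible GPUs> GPUs
    make -C nccl_overlap NP=4 run    # or drive a single variant directly with 4 ranks

The output of every make invocation is appended to test-<timestamp>.log, and test.sh exits with status 1 if any invocation failed.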