├── docs ├── favicon.ico ├── requirements.txt ├── api.rst ├── Makefile ├── make.bat ├── start.rst └── conf.py ├── .gitmodules ├── include └── aluminum │ ├── traits │ ├── CMakeLists.txt │ └── traits.hpp │ ├── cuda │ ├── CMakeLists.txt │ ├── helper_kernels.hpp │ ├── events.hpp │ ├── sync_memory.hpp │ ├── gpu_wait.hpp │ ├── cuda_mempool.hpp │ ├── gpu_status_flag.hpp │ └── streams.hpp │ ├── utils │ ├── CMakeLists.txt │ ├── utils.hpp │ └── locked_resource_pool.hpp │ ├── mpi_cuda │ ├── CMakeLists.txt │ ├── rma_null.hpp │ ├── communicator.hpp │ └── rma_self.hpp │ ├── ht │ ├── CMakeLists.txt │ ├── barrier.hpp │ ├── communicator.hpp │ ├── alltoall.hpp │ ├── bcast.hpp │ ├── reduce_scatter.hpp │ ├── allreduce.hpp │ ├── reduce.hpp │ ├── reduce_scatterv.hpp │ └── allgather.hpp │ ├── mpi │ ├── CMakeLists.txt │ ├── barrier.hpp │ ├── base_state.hpp │ ├── bcast.hpp │ ├── alltoall.hpp │ ├── allgather.hpp │ ├── allreduce.hpp │ ├── reduce_scatter.hpp │ ├── gather.hpp │ ├── communicator.hpp │ ├── scatter.hpp │ ├── reduce.hpp │ └── reduce_scatterv.hpp │ ├── CMakeLists.txt │ ├── internal.hpp │ ├── datatypes.hpp │ ├── profiling.hpp │ ├── base.hpp │ └── trace.hpp ├── CONTRIBUTORS ├── util ├── CMakeLists.txt └── al_info.cpp ├── src ├── mpi_cuda │ ├── CMakeLists.txt │ └── communicator.cpp ├── cuda │ ├── CMakeLists.txt │ ├── helper_kernels.cu │ ├── gpu_wait.cpp │ ├── cuda.cpp │ └── gpu_status_flag.cpp ├── mempool.cpp ├── mpi_cuda_impl.cpp ├── ht_impl.cpp ├── profiling.cpp └── trace.cpp ├── .readthedocs.yaml ├── .gitignore ├── examples ├── CMakeLists.txt └── README.md ├── cmake ├── FindNVTX.cmake ├── FindCUB.cmake ├── FindHWLOC.cmake ├── FindNCCL.cmake ├── FindRoctracer.cmake ├── AluminumConfig.cmake.in └── tuning_params.hpp.in ├── LICENSE ├── CITATION.cff ├── al.svg ├── test ├── test_utils_mpi.hpp ├── test_stream_mem_ops.cpp ├── CMakeLists.txt ├── test_utils_mpi_cuda.hpp ├── test_utils_nccl.hpp └── test_utils_ht.hpp ├── benchmark ├── wait.hpp ├── benchmark_utils_mpi.hpp ├── CMakeLists.txt ├── 
benchmark_utils_nccl.hpp ├── benchmark_utils_ht.hpp ├── wait.cu ├── benchmark_events.cpp └── benchmark_waits.cpp └── README.md /docs/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/llnl/Aluminum/HEAD/docs/favicon.ico -------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- 1 | sphinx==6.1.3 2 | sphinx-rtd-theme==1.2.0 3 | breathe==4.35.0 4 | -------------------------------------------------------------------------------- /docs/api.rst: -------------------------------------------------------------------------------- 1 | Aluminum API Documentation 2 | ========================== 3 | 4 | .. doxygenfile:: Al.hpp 5 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "third_party/cxxopts"] 2 | path = third_party/cxxopts 3 | url = https://github.com/jarro2783/cxxopts.git 4 | -------------------------------------------------------------------------------- /include/aluminum/traits/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set_source_path(THIS_DIR_HEADERS 2 | ht_traits.hpp 3 | mpi_traits.hpp 4 | nccl_traits.hpp 5 | traits_base.hpp 6 | traits.hpp 7 | ) 8 | 9 | set(THIS_DIR_HEADERS_TO_INSTALL ${THIS_DIR_HEADERS}) 10 | 11 | set(HEADERS ${HEADERS} ${THIS_DIR_HEADERS} PARENT_SCOPE) 12 | -------------------------------------------------------------------------------- /include/aluminum/cuda/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set_source_path(THIS_DIR_HEADERS 2 | cuda.hpp 3 | cuda_mempool.hpp 4 | events.hpp 5 | gpu_status_flag.hpp 6 | gpu_wait.hpp 7 | helper_kernels.hpp 8 | streams.hpp 9 | sync_memory.hpp 10 | ) 11 | 
12 | set(HEADERS "${HEADERS}" "${THIS_DIR_HEADERS}" PARENT_SCOPE) 13 | -------------------------------------------------------------------------------- /include/aluminum/utils/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set_source_path(THIS_DIR_HEADERS 2 | caching_allocator.hpp 3 | locked_resource_pool.hpp 4 | meta.hpp 5 | mpsc_queue.hpp 6 | spsc_queue.hpp 7 | utils.hpp 8 | ) 9 | 10 | set(THIS_DIR_HEADERS_TO_INSTALL ${THIS_DIR_HEADERS}) 11 | 12 | # Propagate the files up the tree 13 | set(HEADERS ${HEADERS} ${THIS_DIR_HEADERS} PARENT_SCOPE) 14 | -------------------------------------------------------------------------------- /CONTRIBUTORS: -------------------------------------------------------------------------------- 1 | LLNL Team 2 | Nikoli Dryden 3 | Naoya Maruyama 4 | Tim Moon 5 | Tom Benson 6 | Andy Yoo 7 | Brian Van Essen 8 | Corey McNeish 9 | 10 | UIUC Team 11 | Nikoli Dryden 12 | Marc Snir 13 | -------------------------------------------------------------------------------- /util/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # This is a simple target. Just needs Al_config.hpp in its include path. 
2 | add_executable(al_info al_info.cpp) 3 | target_link_libraries(al_info PRIVATE Al) 4 | install( 5 | TARGETS al_info 6 | RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} 7 | ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} 8 | LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} 9 | INCLUDES DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) 10 | -------------------------------------------------------------------------------- /include/aluminum/mpi_cuda/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set_source_path(THIS_DIR_HEADERS 2 | communicator.hpp 3 | util.hpp 4 | ) 5 | 6 | set_source_path(THIS_DIR_CUDA_RMA_HEADERS 7 | rma.hpp 8 | rma_ipc.hpp 9 | rma_null.hpp 10 | rma_self.hpp 11 | ) 12 | 13 | if (AL_HAS_MPI_CUDA_RMA) 14 | list(APPEND THIS_DIR_HEADERS "${THIS_DIR_CUDA_RMA_HEADERS}") 15 | endif () 16 | 17 | # Propagate the files up the tree 18 | set(HEADERS ${HEADERS} ${THIS_DIR_HEADERS} PARENT_SCOPE) 19 | -------------------------------------------------------------------------------- /src/mpi_cuda/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set_source_path(THIS_DIR_CXX_SOURCES 2 | communicator.cpp 3 | ) 4 | 5 | set_source_path(THIS_DIR_CUDA_RMA_CXX_SOURCES 6 | rma.cpp 7 | ) 8 | 9 | if (AL_HAS_MPI_CUDA_RMA) 10 | list(APPEND THIS_DIR_CXX_SOURCES "${THIS_DIR_CUDA_RMA_CXX_SOURCES}") 11 | endif () 12 | 13 | # Propagate the files up the tree 14 | set(CXX_SOURCES ${CXX_SOURCES} ${THIS_DIR_CXX_SOURCES} PARENT_SCOPE) 15 | set(CUDA_SOURCES ${CUDA_SOURCES} ${THIS_DIR_CUDA_SOURCES} PARENT_SCOPE) 16 | -------------------------------------------------------------------------------- /src/cuda/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set_source_path(THIS_DIR_CUDA_CXX_SOURCES 2 | cuda.cpp 3 | gpu_status_flag.cpp 4 | gpu_wait.cpp 5 | streams.cpp 6 | ) 7 | set_source_path(THIS_DIR_CUDA_SOURCES 8 | helper_kernels.cu 9 | ) 10 | 11 | if 
(AL_HAS_CUDA) 12 | list(APPEND THIS_DIR_CXX_SOURCES "${THIS_DIR_CUDA_CXX_SOURCES}") 13 | endif () 14 | 15 | set(CXX_SOURCES "${CXX_SOURCES}" "${THIS_DIR_CXX_SOURCES}" PARENT_SCOPE) 16 | set(CUDA_SOURCES "${CUDA_SOURCES}" "${THIS_DIR_CUDA_SOURCES}" PARENT_SCOPE) 17 | -------------------------------------------------------------------------------- /include/aluminum/ht/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set_source_path(THIS_DIR_HEADERS 2 | allgather.hpp 3 | allgatherv.hpp 4 | allreduce.hpp 5 | alltoall.hpp 6 | alltoallv.hpp 7 | barrier.hpp 8 | base_state.hpp 9 | bcast.hpp 10 | communicator.hpp 11 | gather.hpp 12 | gatherv.hpp 13 | multisendrecv.hpp 14 | reduce.hpp 15 | reduce_scatter.hpp 16 | reduce_scatterv.hpp 17 | scatter.hpp 18 | scatterv.hpp 19 | pt2pt.hpp 20 | ) 21 | 22 | # Propagate the files up the tree 23 | set(HEADERS ${HEADERS} ${THIS_DIR_HEADERS} PARENT_SCOPE) 24 | -------------------------------------------------------------------------------- /.readthedocs.yaml: -------------------------------------------------------------------------------- 1 | # .readthedocs.yaml 2 | # Read the Docs configuration file 3 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details 4 | 5 | # Required 6 | version: 2 7 | 8 | # Set the version of Python and other tools you might need 9 | build: 10 | os: ubuntu-22.04 11 | tools: 12 | python: "3.11" 13 | 14 | # Build documentation in the docs/ directory with Sphinx 15 | sphinx: 16 | configuration: docs/conf.py 17 | 18 | # Optionally declare the Python requirements required to build your docs 19 | python: 20 | install: 21 | - requirements: docs/requirements.txt 22 | -------------------------------------------------------------------------------- /include/aluminum/mpi/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set_source_path(THIS_DIR_HEADERS 2 | allgather.hpp 3 | allgatherv.hpp 4 | 
allreduce.hpp 5 | alltoall.hpp 6 | alltoallv.hpp 7 | base_state.hpp 8 | barrier.hpp 9 | bcast.hpp 10 | communicator.hpp 11 | gather.hpp 12 | gatherv.hpp 13 | multisendrecv.hpp 14 | reduce.hpp 15 | reduce_scatter.hpp 16 | reduce_scatterv.hpp 17 | scatter.hpp 18 | scatterv.hpp 19 | pt2pt.hpp 20 | utils.hpp 21 | ) 22 | 23 | set(THIS_DIR_HEADERS_TO_INSTALL ${THIS_DIR_HEADERS}) 24 | 25 | # Propagate the files up the tree 26 | set(HEADERS ${HEADERS} ${THIS_DIR_HEADERS} PARENT_SCOPE) 27 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Prerequisites 2 | *.d 3 | 4 | # Compiled Object files 5 | *.slo 6 | *.lo 7 | *.o 8 | *.obj 9 | 10 | # Precompiled Headers 11 | *.gch 12 | *.pch 13 | 14 | # Compiled Dynamic libraries 15 | *.so 16 | *.dylib 17 | *.dll 18 | 19 | # Fortran module files 20 | *.mod 21 | *.smod 22 | 23 | # Compiled Static libraries 24 | *.lai 25 | *.la 26 | *.a 27 | *.lib 28 | 29 | # Executables 30 | *.exe 31 | *.out 32 | *.app 33 | 34 | # Emacs stuff 35 | *~ 36 | \#*\# 37 | 38 | # Python stuff 39 | __pycache__/ 40 | *.py[cod] 41 | *$py.class 42 | 43 | # Dev stuff 44 | build*/ 45 | compile_commands.json 46 | .ccls-root 47 | .ccls-cache 48 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = . 9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 
12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /examples/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.21) 2 | 3 | project(AlExamples 4 | VERSION 0.0.1 5 | DESCRIPTION "Examples of how to use Aluminum" 6 | LANGUAGES CXX 7 | ) 8 | 9 | if (NOT CMAKE_BUILD_TYPE) 10 | set(CMAKE_BUILD_TYPE Release CACHE STRING "Configuration type" FORCE) 11 | endif () 12 | 13 | find_package(Aluminum 1.0.0 REQUIRED) 14 | 15 | set(SOURCES 16 | hello_world.cpp 17 | allreduce.cpp 18 | pingpong.cpp 19 | ) 20 | 21 | foreach (src ${SOURCES}) 22 | string(REPLACE ".cpp" "" _tmp_exe_name "${src}") 23 | get_filename_component(_exe_name "${_tmp_exe_name}" NAME) 24 | add_executable(${_exe_name} ${src}) 25 | target_link_libraries(${_exe_name} PUBLIC ${Aluminum_LIBRARIES}) 26 | endforeach () 27 | -------------------------------------------------------------------------------- /examples/README.md: -------------------------------------------------------------------------------- 1 | ![Al](../al.svg) Aluminum Examples 2 | ================================== 3 | 4 | These are simple examples of how to use Aluminum. 5 | 6 | Current examples: 7 | * `hello_world`: Initialize Aluminum and have each process print its rank. 8 | * `allreduce`: Allreduce a buffer of data and print it afterward. 9 | * `pingpong`: Ranks alternate sending and receiving data. 10 | 11 | ## Building 12 | 13 | You can build the examples as follows. 14 | This assumes Aluminum has already been installed. 
15 | 16 | ``` 17 | mkdir build 18 | cd build 19 | cmake .. 20 | make 21 | ``` 22 | 23 | If CMake cannot find the Aluminum library automatically, pass `-D Aluminum_DIR=/path/to/Aluminum`. 24 | 25 | You can edit the example codes to change certain parameters, like the Aluminum backend used. 26 | -------------------------------------------------------------------------------- /cmake/FindNVTX.cmake: -------------------------------------------------------------------------------- 1 | # Sets the following variables 2 | # 3 | # NVTX_FOUND 4 | # NVTX_LIBRARY 5 | # 6 | # Defines the following imported target: 7 | # 8 | # cuda::nvtx 9 | # 10 | 11 | find_library(NVTX_LIBRARY nvToolsExt 12 | HINTS ${NVTX_DIR} $ENV{NVTX_DIR} 13 | ${CUDAToolkit_LIBRARY_DIR} 14 | PATH_SUFFIXES lib64 15 | DOC "The nvtx library." 16 | NO_DEFAULT_PATH) 17 | find_library(NVTX_LIBRARY nvToolsExt) 18 | 19 | include(FindPackageHandleStandardArgs) 20 | find_package_handle_standard_args(NVTX 21 | DEFAULT_MSG NVTX_LIBRARY) 22 | 23 | if (NOT TARGET cuda::nvtx) 24 | 25 | add_library(cuda::nvtx INTERFACE IMPORTED) 26 | 27 | set_property(TARGET cuda::nvtx PROPERTY 28 | INTERFACE_INCLUDE_DIRECTORIES "${CUDA_INCLUDE_DIRS}") 29 | 30 | set_property(TARGET cuda::nvtx PROPERTY 31 | INTERFACE_LINK_LIBRARIES "${NVTX_LIBRARY}") 32 | 33 | endif (NOT TARGET cuda::nvtx) 34 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=. 11 | set BUILDDIR=_build 12 | 13 | %SPHINXBUILD% >NUL 2>NUL 14 | if errorlevel 9009 ( 15 | echo. 16 | echo.The 'sphinx-build' command was not found. 
Make sure you have Sphinx 17 | echo.installed, then set the SPHINXBUILD environment variable to point 18 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 19 | echo.may add the Sphinx directory to PATH. 20 | echo. 21 | echo.If you don't have Sphinx installed, grab it from 22 | echo.https://www.sphinx-doc.org/ 23 | exit /b 1 24 | ) 25 | 26 | if "%1" == "" goto help 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2018, Lawrence Livermore National Security, LLC. Produced at the 2 | Lawrence Livermore National Laboratory in collaboration with University of 3 | Illinois Urbana-Champaign. 4 | 5 | Written by the LBANN Research Team (N. Dryden, N. Maruyama, et al.) listed in 6 | the CONTRIBUTORS file. 7 | 8 | LLNL-CODE-756777. 9 | All rights reserved. 10 | 11 | This file is part of Aluminum GPU-aware Communication Library. For details, see 12 | http://software.llnl.gov/Aluminum or https://github.com/LLNL/Aluminum. 13 | 14 | Licensed under the Apache License, Version 2.0 (the "License"); you 15 | may not use this file except in compliance with the License. You may 16 | obtain a copy of the License at: 17 | 18 | http://www.apache.org/licenses/LICENSE-2.0 19 | 20 | Unless required by applicable law or agreed to in writing, software 21 | distributed under the License is distributed on an "AS IS" BASIS, 22 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 23 | implied. See the License for the specific language governing 24 | permissions and limitations under the license. 
25 | 26 | -------------------------------------------------------------------------------- /CITATION.cff: -------------------------------------------------------------------------------- 1 | cff-version: 1.2.0 2 | title: "Aluminum Communication Library" 3 | message: "If you use Aluminum, please cite it as" 4 | authors: 5 | - family-names: Dryden 6 | given-names: Nikoli 7 | - family-names: Maruyama 8 | given-names: Naoya 9 | - family-names: Moon 10 | given-names: Tim 11 | - family-names: Benson 12 | given-names: Tom 13 | - family-names: Yoo 14 | given-names: Andy 15 | - family-names: Van Essen 16 | given-names: Brian 17 | - family-names: McNeish 18 | given-names: Corey 19 | - family-names: Snir 20 | given-names: Marc 21 | preferred-citation: 22 | title: "Aluminum: An Asynchronous, GPU-Aware Communication Library Optimized for Large-Scale Training of Deep Neural Networks on HPC Systems" 23 | year: "2018" 24 | type: conference-paper 25 | collection-title: "Proceedings of the Workshop on Machine Learning in HPC Environments" 26 | authors: 27 | - family-names: Dryden 28 | given-names: Nikoli 29 | - family-names: Maruyama 30 | given-names: Naoya 31 | - family-names: Moon 32 | given-names: Tim 33 | - family-names: Benson 34 | given-names: Tom 35 | - family-names: Yoo 36 | given-names: Andy 37 | - family-names: Snir 38 | given-names: Marc 39 | - family-names: Van Essen 40 | given-names: Brian 41 | -------------------------------------------------------------------------------- /al.svg: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /test/test_utils_mpi.hpp: -------------------------------------------------------------------------------- 1 | //////////////////////////////////////////////////////////////////////////////// 2 | // Copyright (c) 2018, Lawrence Livermore National Security, LLC. 
Produced at the 3 | // Lawrence Livermore National Laboratory in collaboration with University of 4 | // Illinois Urbana-Champaign. 5 | // 6 | // Written by the LBANN Research Team (N. Dryden, N. Maruyama, et al.) listed in 7 | // the CONTRIBUTORS file. 8 | // 9 | // LLNL-CODE-756777. 10 | // All rights reserved. 11 | // 12 | // This file is part of Aluminum GPU-aware Communication Library. For details, see 13 | // http://software.llnl.gov/Aluminum or https://github.com/LLNL/Aluminum. 14 | // 15 | // Licensed under the Apache License, Version 2.0 (the "License"); you 16 | // may not use this file except in compliance with the License. You may 17 | // obtain a copy of the License at: 18 | // 19 | // http://www.apache.org/licenses/LICENSE-2.0 20 | // 21 | // Unless required by applicable law or agreed to in writing, software 22 | // distributed under the License is distributed on an "AS IS" BASIS, 23 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 24 | // implied. See the License for the specific language governing 25 | // permissions and limitations under the license. 26 | //////////////////////////////////////////////////////////////////////////////// 27 | 28 | #pragma once 29 | -------------------------------------------------------------------------------- /docs/start.rst: -------------------------------------------------------------------------------- 1 | Getting Started with Aluminum 2 | ============================= 3 | 4 | Once you have :doc:`built <build>` Aluminum, you probably want to use it. 5 | Aluminum is in many respects quite similar to MPI, so if you have experience with that, things should be familiar. 6 | 7 | This is a simple "Hello, world" program that shows key aspects of Aluminum: 8 | 9 | .. code-block:: c++ 10 | 11 | #include <iostream> 12 | #include <Al.hpp> 13 | 14 | int main(int argc, char** argv) { 15 | // Initialize Aluminum. 16 | // Must be called before any other Aluminum calls. 
17 | Al::Initialize(argc, argv); 18 | 19 | // Create a communicator with all processes. 20 | typename Al::MPIBackend::comm_type comm; 21 | 22 | // Each process prints its rank and the communicator size. 23 | std::cout << "Hello, world, from rank " 24 | << comm.rank() << " of " 25 | << comm.size() << std::endl; 26 | 27 | // Do a simple (in-place) allreduce. 28 | int rank = comm.rank(); 29 | Al::Allreduce<Al::MPIBackend>(&rank, 1, Al::ReductionOperator::sum, comm); 30 | std::cout << "The sum of ranks is " 31 | << rank << std::endl; 32 | 33 | // Clean up Aluminum. 34 | Al::Finalize(); 35 | 36 | return 0; 37 | } 38 | 39 | For additional examples and more detail (including accelerator backends), see the `Aluminum examples <https://github.com/LLNL/Aluminum/tree/HEAD/examples>`_. 40 | -------------------------------------------------------------------------------- /src/mempool.cpp: -------------------------------------------------------------------------------- 1 | //////////////////////////////////////////////////////////////////////////////// 2 | // Copyright (c) 2018, Lawrence Livermore National Security, LLC. Produced at the 3 | // Lawrence Livermore National Laboratory in collaboration with University of 4 | // Illinois Urbana-Champaign. 5 | // 6 | // Written by the LBANN Research Team (N. Dryden, N. Maruyama, et al.) listed in 7 | // the CONTRIBUTORS file. 8 | // 9 | // LLNL-CODE-756777. 10 | // All rights reserved. 11 | // 12 | // This file is part of Aluminum GPU-aware Communication Library. For details, see 13 | // http://software.llnl.gov/Aluminum or https://github.com/LLNL/Aluminum. 14 | // 15 | // Licensed under the Apache License, Version 2.0 (the "License"); you 16 | // may not use this file except in compliance with the License. 
You may 17 | // obtain a copy of the License at: 18 | // 19 | // http://www.apache.org/licenses/LICENSE-2.0 20 | // 21 | // Unless required by applicable law or agreed to in writing, software 22 | // distributed under the License is distributed on an "AS IS" BASIS, 23 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 24 | // implied. See the License for the specific language governing 25 | // permissions and limitations under the license. 26 | //////////////////////////////////////////////////////////////////////////////// 27 | 28 | #include "aluminum/mempool.hpp" 29 | 30 | namespace Al { 31 | namespace internal { 32 | 33 | /** Definition of the namespace-scope Al::internal::mempool object. NOTE(review): the MemoryPool type is presumably declared in aluminum/mempool.hpp, which is not shown here -- confirm. */ MemoryPool mempool; 34 | 35 | } // namespace internal 36 | } // namespace Al 37 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 2 | # 3 | # For the full list of built-in configuration values, see the documentation: 4 | # https://www.sphinx-doc.org/en/master/usage/configuration.html 5 | 6 | import subprocess 7 | import os 8 | import os.path 9 | 10 | if not os.path.isdir('_static'): 11 | os.makedirs('_static') 12 | 13 | # Generate Doxygen docs. 
14 | subprocess.run(['doxygen', 'Doxyfile.in']) 15 | 16 | # -- Project information ----------------------------------------------------- 17 | # https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information 18 | 19 | project = 'Aluminum' 20 | copyright = '2018, Lawrence Livermore National Security' 21 | author = 'Lawrence Livermore National Laboratory' 22 | 23 | # -- General configuration --------------------------------------------------- 24 | # https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration 25 | 26 | extensions = ['breathe'] 27 | 28 | templates_path = ['_templates'] 29 | exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] 30 | 31 | rst_prolog = """ 32 | .. |AlLogo| image:: ../al.svg 33 | """ 34 | 35 | # -- Options for HTML output ------------------------------------------------- 36 | # https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output 37 | 38 | html_theme = 'sphinx_rtd_theme' 39 | html_static_path = ['_static'] 40 | html_favicon = 'favicon.ico' 41 | 42 | # Breathe configuration 43 | 44 | breathe_projects = {'Aluminum': '_doxyout/xml/'} 45 | breathe_default_project = 'Aluminum' 46 | -------------------------------------------------------------------------------- /cmake/FindCUB.cmake: -------------------------------------------------------------------------------- 1 | #[=============[.rst 2 | FindCUB 3 | ========== 4 | 5 | Finds the CUB library. 6 | 7 | The following variables will be defined:: 8 | 9 | CUB_FOUND - True if the system has the CUB library. 10 | CUB_INCLUDE_DIRS - The include directory needed for CUB. 11 | 12 | The following cache variable will be set and marked as "advanced":: 13 | 14 | CUB_INCLUDE_PATH - The include directory needed for CUB. 15 | 16 | In addition, the :prop_tgt:`IMPORTED` target ``cuda::CUB`` will 17 | be created. 
18 | 19 | #]=============] 20 | 21 | 22 | find_path(CUB_INCLUDE_PATH cub/cub.cuh 23 | HINTS ${CUB_DIR} $ENV{CUB_DIR} 24 | ${CUDAToolkit_INCLUDE_DIRS} 25 | PATH_SUFFIXES include 26 | NO_DEFAULT_PATH 27 | DOC "The CUB header directory." 28 | ) 29 | find_path(CUB_INCLUDE_PATH cub/cub.cuh) 30 | 31 | set(CUB_INCLUDE_DIRS "${CUB_INCLUDE_PATH}") 32 | 33 | # Standard handling of the package arguments 34 | include(FindPackageHandleStandardArgs) 35 | find_package_handle_standard_args(CUB 36 | DEFAULT_MSG CUB_INCLUDE_PATH) 37 | 38 | # Setup the imported target 39 | if (NOT TARGET cuda::CUB) 40 | add_library(cuda::CUB INTERFACE IMPORTED) 41 | endif (NOT TARGET cuda::CUB) 42 | 43 | # Set the include directories for the target 44 | if (CUB_INCLUDE_PATH) 45 | set_property(TARGET cuda::CUB 46 | PROPERTY INTERFACE_INCLUDE_DIRECTORIES ${CUB_INCLUDE_PATH}) 47 | endif () 48 | 49 | # 50 | # Cleanup 51 | # 52 | 53 | # Set the include directories 54 | mark_as_advanced(FORCE CUB_INCLUDE_PATH) 55 | 56 | # Set the libraries 57 | set(CUB_LIBRARIES cuda::CUB) 58 | -------------------------------------------------------------------------------- /include/aluminum/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set_source_path(THIS_DIR_HEADERS 2 | base.hpp 3 | datatypes.hpp 4 | internal.hpp 5 | mempool.hpp 6 | mpi_comm_and_stream_wrapper.hpp 7 | mpi_impl.hpp 8 | profiling.hpp 9 | progress.hpp 10 | state.hpp 11 | trace.hpp 12 | ) 13 | set_source_path(THIS_DIR_CUDA_HEADERS 14 | cuda.hpp 15 | helper_kernels.hpp 16 | ) 17 | set_source_path(THIS_DIR_MPI_CUDA_HEADERS 18 | mpi_cuda_impl.hpp 19 | ) 20 | set_source_path(THIS_DIR_HOST_TRANSFER_HEADERS 21 | ht_impl.hpp 22 | ) 23 | set_source_path(THIS_DIR_NCCL_HEADERS 24 | nccl_impl.hpp 25 | ) 26 | 27 | add_subdirectory(utils) 28 | 29 | add_subdirectory(mpi) 30 | 31 | if (AL_HAS_CUDA) 32 | add_subdirectory(cuda) 33 | endif () 34 | 35 | if (AL_HAS_MPI_CUDA) 36 | list(APPEND THIS_DIR_HEADERS 
"${THIS_DIR_MPI_CUDA_HEADERS}") 37 | 38 | add_subdirectory(mpi_cuda) 39 | endif () 40 | 41 | if (AL_HAS_HOST_TRANSFER) 42 | list(APPEND THIS_DIR_HEADERS "${THIS_DIR_HOST_TRANSFER_HEADERS}") 43 | add_subdirectory(ht) 44 | endif () 45 | 46 | if (AL_HAS_NCCL) 47 | list(APPEND THIS_DIR_HEADERS "${THIS_DIR_NCCL_HEADERS}") 48 | endif () 49 | 50 | add_subdirectory(traits) 51 | 52 | set(ALUMINUM_HEADERS "${HEADERS}" "${THIS_DIR_HEADERS}" PARENT_SCOPE) 53 | 54 | # Note (trb 07/27/2020): The way generated files work in CMake is a 55 | # bit tricky -- they only work properly if they're used in the same 56 | # scope that they are created. So I'm moving all the install logic to 57 | # "/src/CMakeLists.txt", including the header installation. This 58 | # doesn't really matter for CUDA, but it's a problem for HIP/ROCm. 59 | -------------------------------------------------------------------------------- /benchmark/wait.hpp: -------------------------------------------------------------------------------- 1 | //////////////////////////////////////////////////////////////////////////////// 2 | // Copyright (c) 2018, Lawrence Livermore National Security, LLC. Produced at the 3 | // Lawrence Livermore National Laboratory in collaboration with University of 4 | // Illinois Urbana-Champaign. 5 | // 6 | // Written by the LBANN Research Team (N. Dryden, N. Maruyama, et al.) listed in 7 | // the CONTRIBUTORS file. 8 | // 9 | // LLNL-CODE-756777. 10 | // All rights reserved. 11 | // 12 | // This file is part of Aluminum GPU-aware Communication Library. For details, see 13 | // http://software.llnl.gov/Aluminum or https://github.com/LLNL/Aluminum. 14 | // 15 | // Licensed under the Apache License, Version 2.0 (the "Licensee"); you 16 | // may not use this file except in compliance with the License. 
You may 17 | // obtain a copy of the License at: 18 | // 19 | // http://www.apache.org/licenses/LICENSE-2.0 20 | // 21 | // Unless required by applicable law or agreed to in writing, software 22 | // distributed under the License is distributed on an "AS IS" BASIS, 23 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 24 | // implied. See the License for the specific language governing 25 | // permissions and limitations under the license. 26 | //////////////////////////////////////////////////////////////////////////////// 27 | 28 | #pragma once 29 | 30 | #include <Al.hpp> 31 | 32 | #if defined AL_HAS_ROCM 33 | #include <hip/hip_runtime.h> 34 | #elif defined AL_HAS_CUDA 35 | #include <cuda_runtime.h> 36 | #endif 37 | 38 | /** Cause the stream to wait for length seconds. @param length Duration of the wait, in seconds. @param stream GPU stream that is made to wait. */ 39 | void gpu_wait(double length, AlGpuStream_t stream); 40 | -------------------------------------------------------------------------------- /src/mpi_cuda_impl.cpp: -------------------------------------------------------------------------------- 1 | //////////////////////////////////////////////////////////////////////////////// 2 | // Copyright (c) 2018, Lawrence Livermore National Security, LLC. Produced at the 3 | // Lawrence Livermore National Laboratory in collaboration with University of 4 | // Illinois Urbana-Champaign. 5 | // 6 | // Written by the LBANN Research Team (N. Dryden, N. Maruyama, et al.) listed in 7 | // the CONTRIBUTORS file. 8 | // 9 | // LLNL-CODE-756777. 10 | // All rights reserved. 11 | // 12 | // This file is part of Aluminum GPU-aware Communication Library. For details, see 13 | // http://software.llnl.gov/Aluminum or https://github.com/LLNL/Aluminum. 14 | // 15 | // Licensed under the Apache License, Version 2.0 (the "License"); you 16 | // may not use this file except in compliance with the License. 
You may 17 | // obtain a copy of the License at: 18 | // 19 | // http://www.apache.org/licenses/LICENSE-2.0 20 | // 21 | // Unless required by applicable law or agreed to in writing, software 22 | // distributed under the License is distributed on an "AS IS" BASIS, 23 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 24 | // implied. See the License for the specific language governing 25 | // permissions and limitations under the license. 26 | //////////////////////////////////////////////////////////////////////////////// 27 | 28 | #include "aluminum/mpi_cuda_impl.hpp" 29 | 30 | namespace Al { 31 | 32 | namespace internal { 33 | namespace mpi_cuda { 34 | 35 | /** init for the mpi_cuda namespace. The body is empty, so this is a no-op; the two unnamed reference parameters (int&, char**&) are ignored. */ void init(int&, char**&) { 36 | 37 | } 38 | 39 | /** finalize for the mpi_cuda namespace. The body is empty, so this is a no-op. */ void finalize() { 40 | 41 | } 42 | 43 | } // namespace mpi_cuda 44 | } // namespace internal 45 | } // namespace Al 46 | -------------------------------------------------------------------------------- /include/aluminum/traits/traits.hpp: -------------------------------------------------------------------------------- 1 | //////////////////////////////////////////////////////////////////////////////// 2 | // Copyright (c) 2018, Lawrence Livermore National Security, LLC. Produced at the 3 | // Lawrence Livermore National Laboratory in collaboration with University of 4 | // Illinois Urbana-Champaign. 5 | // 6 | // Written by the LBANN Research Team (N. Dryden, N. Maruyama, et al.) listed in 7 | // the CONTRIBUTORS file. 8 | // 9 | // LLNL-CODE-756777. 10 | // All rights reserved. 11 | // 12 | // This file is part of Aluminum GPU-aware Communication Library. For details, see 13 | // http://software.llnl.gov/Aluminum or https://github.com/LLNL/Aluminum. 14 | // 15 | // Licensed under the Apache License, Version 2.0 (the "License"); you 16 | // may not use this file except in compliance with the License. 
You may 17 | // obtain a copy of the License at: 18 | // 19 | // http://www.apache.org/licenses/LICENSE-2.0 20 | // 21 | // Unless required by applicable law or agreed to in writing, software 22 | // distributed under the License is distributed on an "AS IS" BASIS, 23 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 24 | // implied. See the License for the specific language governing 25 | // permissions and limitations under the license. 26 | //////////////////////////////////////////////////////////////////////////////// 27 | 28 | /** 29 | * @file 30 | * Compile-time traits describing Aluminum communication options. 31 | */ 32 | 33 | #pragma once 34 | 35 | #include 36 | 37 | #include "aluminum/traits/mpi_traits.hpp" 38 | 39 | #ifdef AL_HAS_NCCL 40 | #include "aluminum/traits/nccl_traits.hpp" 41 | #endif 42 | #ifdef AL_HAS_HOST_TRANSFER 43 | #include "aluminum/traits/ht_traits.hpp" 44 | #endif 45 | -------------------------------------------------------------------------------- /test/test_stream_mem_ops.cpp: -------------------------------------------------------------------------------- 1 | //////////////////////////////////////////////////////////////////////////////// 2 | // Copyright (c) 2018, Lawrence Livermore National Security, LLC. Produced at the 3 | // Lawrence Livermore National Laboratory in collaboration with University of 4 | // Illinois Urbana-Champaign. 5 | // 6 | // Written by the LBANN Research Team (N. Dryden, N. Maruyama, et al.) listed in 7 | // the CONTRIBUTORS file. 8 | // 9 | // LLNL-CODE-756777. 10 | // All rights reserved. 11 | // 12 | // This file is part of Aluminum GPU-aware Communication Library. For details, see 13 | // http://software.llnl.gov/Aluminum or https://github.com/LLNL/Aluminum. 14 | // 15 | // Licensed under the Apache License, Version 2.0 (the "Licensee"); you 16 | // may not use this file except in compliance with the License. 
You may 17 | // obtain a copy of the License at: 18 | // 19 | // http://www.apache.org/licenses/LICENSE-2.0 20 | // 21 | // Unless required by applicable law or agreed to in writing, software 22 | // distributed under the License is distributed on an "AS IS" BASIS, 23 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 24 | // implied. See the License for the specific language governing 25 | // permissions and limitations under the license. 26 | //////////////////////////////////////////////////////////////////////////////// 27 | 28 | #include 29 | #include "Al.hpp" 30 | 31 | int main(int argc, char** argv) { 32 | AL_CHECK_CUDA(AlGpuSetDevice(0)); 33 | Al::Initialize(argc, argv); 34 | if (Al::internal::cuda::stream_memory_operations_supported()) { 35 | std::cout << "Supported!" << std::endl; 36 | } else { 37 | std::cout << "Not supported! :(" << std::endl; 38 | } 39 | Al::Finalize(); 40 | return 0; 41 | } 42 | -------------------------------------------------------------------------------- /benchmark/benchmark_utils_mpi.hpp: -------------------------------------------------------------------------------- 1 | //////////////////////////////////////////////////////////////////////////////// 2 | // Copyright (c) 2018, Lawrence Livermore National Security, LLC. Produced at the 3 | // Lawrence Livermore National Laboratory in collaboration with University of 4 | // Illinois Urbana-Champaign. 5 | // 6 | // Written by the LBANN Research Team (N. Dryden, N. Maruyama, et al.) listed in 7 | // the CONTRIBUTORS file. 8 | // 9 | // LLNL-CODE-756777. 10 | // All rights reserved. 11 | // 12 | // This file is part of Aluminum GPU-aware Communication Library. For details, see 13 | // http://software.llnl.gov/Aluminum or https://github.com/LLNL/Aluminum. 14 | // 15 | // Licensed under the Apache License, Version 2.0 (the "Licensee"); you 16 | // may not use this file except in compliance with the License. 
You may 17 | // obtain a copy of the License at: 18 | // 19 | // http://www.apache.org/licenses/LICENSE-2.0 20 | // 21 | // Unless required by applicable law or agreed to in writing, software 22 | // distributed under the License is distributed on an "AS IS" BASIS, 23 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 24 | // implied. See the License for the specific language governing 25 | // permissions and limitations under the license. 26 | //////////////////////////////////////////////////////////////////////////////// 27 | 28 | #pragma once 29 | 30 | #include "Al.hpp" 31 | #include "benchmark_utils.hpp" 32 | 33 | 34 | template <> 35 | struct Timer { 36 | void start_timer(typename Al::MPIBackend::comm_type&) { 37 | start_time = Al::get_time(); 38 | } 39 | 40 | double end_timer(typename Al::MPIBackend::comm_type&) { 41 | return Al::get_time() - start_time; 42 | } 43 | 44 | double start_time = 0.0; 45 | }; 46 | -------------------------------------------------------------------------------- /cmake/FindHWLOC.cmake: -------------------------------------------------------------------------------- 1 | # Output variables 2 | # 3 | # HWLOC_FOUND 4 | # HWLOC_LIBRARIES 5 | # HWLOC_INCLUDE_PATH 6 | # 7 | # Also creates an imported target HWLOC::hwloc 8 | 9 | if (MPI_FOUND) 10 | list(APPEND _TMP_MPI_LIBS "${MPI_C_LIBRARIES}" "${MPI_CXX_LIBRARIES}") 11 | foreach (lib IN LISTS _TMP_MPI_LIBS) 12 | get_filename_component(_TMP_MPI_LIB_DIR "${lib}" DIRECTORY) 13 | list(APPEND _TMP_MPI_LIBRARY_DIRS ${_TMP_MPI_LIB_DIR}) 14 | endforeach () 15 | 16 | if (_TMP_MPI_LIBRARY_DIRS) 17 | list(REMOVE_DUPLICATES _TMP_MPI_LIBRARY_DIRS) 18 | endif () 19 | endif (MPI_FOUND) 20 | 21 | # Find the library 22 | find_library(HWLOC_LIBRARY hwloc 23 | HINTS ${HWLOC_DIR} $ENV{HWLOC_DIR} ${_TMP_MPI_LIBRARY_DIRS} 24 | PATH_SUFFIXES lib64 lib 25 | NO_DEFAULT_PATH) 26 | find_library(HWLOC_LIBRARY hwloc) 27 | 28 | # Find the header 29 | find_path(HWLOC_INCLUDE_PATH hwloc.h 30 | HINTS 
${HWLOC_DIR} $ENV{HWLOC_DIR} 31 | ${MPI_C_INCLUDE_PATH} ${MPI_CXX_INCLUDE_PATH} 32 | PATH_SUFFIXES include 33 | NO_DEFAULT_PATH) 34 | find_path(HWLOC_INCLUDE_PATH hwloc.h) 35 | 36 | # Handle the find_package arguments 37 | include(FindPackageHandleStandardArgs) 38 | find_package_handle_standard_args( 39 | HWLOC DEFAULT_MSG HWLOC_LIBRARY HWLOC_INCLUDE_PATH) 40 | 41 | # Build the imported target 42 | if (NOT TARGET HWLOC::hwloc) 43 | add_library(HWLOC::hwloc INTERFACE IMPORTED) 44 | endif() 45 | 46 | set_property(TARGET HWLOC::hwloc 47 | PROPERTY INTERFACE_LINK_LIBRARIES ${HWLOC_LIBRARY}) 48 | 49 | if (NOT "/usr/include" STREQUAL "${HWLOC_INCLUDE_PATH}") 50 | set_property(TARGET HWLOC::hwloc 51 | PROPERTY INTERFACE_INCLUDE_DIRECTORIES 52 | ${HWLOC_INCLUDE_PATH}) 53 | endif () 54 | 55 | # Set the last of the output variables 56 | set(HWLOC_LIBRARIES HWLOC::hwloc) 57 | 58 | # Cleanup 59 | mark_as_advanced(FORCE HWLOC_INCLUDE_PATH) 60 | mark_as_advanced(FORCE HWLOC_LIBRARY) 61 | -------------------------------------------------------------------------------- /src/mpi_cuda/communicator.cpp: -------------------------------------------------------------------------------- 1 | //////////////////////////////////////////////////////////////////////////////// 2 | // Copyright (c) 2018, Lawrence Livermore National Security, LLC. Produced at the 3 | // Lawrence Livermore National Laboratory in collaboration with University of 4 | // Illinois Urbana-Champaign. 5 | // 6 | // Written by the LBANN Research Team (N. Dryden, N. Maruyama, et al.) listed in 7 | // the CONTRIBUTORS file. 8 | // 9 | // LLNL-CODE-756777. 10 | // All rights reserved. 11 | // 12 | // This file is part of Aluminum GPU-aware Communication Library. For details, see 13 | // http://software.llnl.gov/Aluminum or https://github.com/LLNL/Aluminum. 14 | // 15 | // Licensed under the Apache License, Version 2.0 (the "Licensee"); you 16 | // may not use this file except in compliance with the License. 
You may 17 | // obtain a copy of the License at: 18 | // 19 | // http://www.apache.org/licenses/LICENSE-2.0 20 | // 21 | // Unless required by applicable law or agreed to in writing, software 22 | // distributed under the License is distributed on an "AS IS" BASIS, 23 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 24 | // implied. See the License for the specific language governing 25 | // permissions and limitations under the license. 26 | //////////////////////////////////////////////////////////////////////////////// 27 | 28 | #include "Al.hpp" 29 | #include "aluminum/mpi_cuda/communicator.hpp" 30 | #ifdef AL_HAS_MPI_CUDA_RMA 31 | #include "aluminum/mpi_cuda/rma.hpp" 32 | #endif 33 | 34 | namespace Al { 35 | namespace internal { 36 | namespace mpi_cuda { 37 | 38 | #ifdef AL_HAS_MPI_CUDA_RMA 39 | RMA &MPICUDACommunicator::get_rma() { 40 | if (!m_rma) 41 | m_rma = std::make_shared(*this); 42 | return *m_rma; 43 | } 44 | #endif 45 | 46 | MPICUDACommunicator::~MPICUDACommunicator() {} 47 | 48 | } // namespace mpi_cuda 49 | } // namespace internal 50 | } // namespace Al 51 | -------------------------------------------------------------------------------- /benchmark/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set_source_path(AL_BENCHMARK_HEADERS 2 | benchmark_utils.hpp 3 | benchmark_utils_mpi.hpp 4 | benchmark_utils_nccl.hpp 5 | benchmark_utils_ht.hpp 6 | wait.hpp) 7 | 8 | set_source_path(AL_BENCHMARK_SOURCES 9 | benchmark_ops.cpp 10 | bandwidth.cpp) 11 | 12 | if (AL_HAS_CUDA OR AL_HAS_ROCM) 13 | set_source_path(AL_GPU_BENCHMARK_SOURCES 14 | benchmark_waits.cpp 15 | benchmark_events.cpp) 16 | 17 | set_source_path(AL_GPU_BENCHMARK_HELPER_SOURCES 18 | wait.cu 19 | wait.hpp) 20 | endif () 21 | 22 | foreach(src IN LISTS AL_BENCHMARK_SOURCES AL_GPU_BENCHMARK_SOURCES) 23 | string(REPLACE ".cpp" "" _tmp_benchmark_exe_name "${src}") 24 | get_filename_component(_benchmark_exe_name 25 | 
"${_tmp_benchmark_exe_name}" NAME) 26 | add_executable(${_benchmark_exe_name} ${src} ${AL_BENCHMARK_HEADERS}) 27 | 28 | # Get the test headers 29 | target_link_libraries(${_benchmark_exe_name} 30 | PRIVATE Al aluminum_test_headers) 31 | target_include_directories( 32 | ${_benchmark_exe_name} SYSTEM PRIVATE 33 | ${PROJECT_SOURCE_DIR}/third_party/cxxopts/include) 34 | 35 | # FIXME: Hopefully this can be removed in a future version of ROCm. 36 | if (AL_HAS_ROCM AND AL_BUILD_TYPE_UPPER MATCHES "DEBUG") 37 | target_compile_options(${_benchmark_exe_name} PRIVATE "-O0") 38 | endif () 39 | endforeach() 40 | 41 | # Handle the GPU-specific ones 42 | foreach (src ${AL_GPU_BENCHMARK_SOURCES}) 43 | string(REPLACE ".cpp" "" _tmp_benchmark_exe_name "${src}") 44 | get_filename_component(_benchmark_exe_name 45 | "${_tmp_benchmark_exe_name}" NAME) 46 | target_sources(${_benchmark_exe_name} 47 | PUBLIC "${AL_GPU_BENCHMARK_HELPER_SOURCES}") 48 | endforeach () 49 | 50 | # Get languages right 51 | if (AL_HAS_ROCM) 52 | list(FILTER AL_GPU_BENCHMARK_HELPER_SOURCES INCLUDE REGEX "cu$") 53 | set_source_files_properties(${AL_GPU_BENCHMARK_HELPER_SOURCES} 54 | PROPERTIES LANGUAGE HIP) 55 | endif () 56 | -------------------------------------------------------------------------------- /include/aluminum/internal.hpp: -------------------------------------------------------------------------------- 1 | //////////////////////////////////////////////////////////////////////////////// 2 | // Copyright (c) 2018, Lawrence Livermore National Security, LLC. Produced at the 3 | // Lawrence Livermore National Laboratory in collaboration with University of 4 | // Illinois Urbana-Champaign. 5 | // 6 | // Written by the LBANN Research Team (N. Dryden, N. Maruyama, et al.) listed in 7 | // the CONTRIBUTORS file. 8 | // 9 | // LLNL-CODE-756777. 10 | // All rights reserved. 11 | // 12 | // This file is part of Aluminum GPU-aware Communication Library. 
For details, see 13 | // http://software.llnl.gov/Aluminum or https://github.com/LLNL/Aluminum. 14 | // 15 | // Licensed under the Apache License, Version 2.0 (the "Licensee"); you 16 | // may not use this file except in compliance with the License. You may 17 | // obtain a copy of the License at: 18 | // 19 | // http://www.apache.org/licenses/LICENSE-2.0 20 | // 21 | // Unless required by applicable law or agreed to in writing, software 22 | // distributed under the License is distributed on an "AS IS" BASIS, 23 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 24 | // implied. See the License for the specific language governing 25 | // permissions and limitations under the license. 26 | //////////////////////////////////////////////////////////////////////////////// 27 | 28 | #pragma once 29 | 30 | #include "aluminum/progress.hpp" 31 | 32 | namespace Al { 33 | 34 | /** 35 | * Internal implementations. 36 | * Generic code for all collective implementations is in here. 37 | * Implementation-specific code is in separate namespaces inside internal. 38 | */ 39 | namespace internal { 40 | 41 | // Would be nice to replace this with a C++14 variable template... 42 | /** Indicator that an in-place allreduce is requested. */ 43 | template 44 | inline T* IN_PLACE() { return (T*) (-1); } 45 | 46 | } // namespace internal 47 | } // namespace Al 48 | -------------------------------------------------------------------------------- /src/ht_impl.cpp: -------------------------------------------------------------------------------- 1 | //////////////////////////////////////////////////////////////////////////////// 2 | // Copyright (c) 2018, Lawrence Livermore National Security, LLC. Produced at the 3 | // Lawrence Livermore National Laboratory in collaboration with University of 4 | // Illinois Urbana-Champaign. 5 | // 6 | // Written by the LBANN Research Team (N. Dryden, N. Maruyama, et al.) listed in 7 | // the CONTRIBUTORS file. 
8 | // 9 | // LLNL-CODE-756777. 10 | // All rights reserved. 11 | // 12 | // This file is part of Aluminum GPU-aware Communication Library. For details, see 13 | // http://software.llnl.gov/Aluminum or https://github.com/LLNL/Aluminum. 14 | // 15 | // Licensed under the Apache License, Version 2.0 (the "Licensee"); you 16 | // may not use this file except in compliance with the License. You may 17 | // obtain a copy of the License at: 18 | // 19 | // http://www.apache.org/licenses/LICENSE-2.0 20 | // 21 | // Unless required by applicable law or agreed to in writing, software 22 | // distributed under the License is distributed on an "AS IS" BASIS, 23 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 24 | // implied. See the License for the specific language governing 25 | // permissions and limitations under the license. 26 | //////////////////////////////////////////////////////////////////////////////// 27 | 28 | #include "aluminum/ht_impl.hpp" 29 | 30 | namespace Al { 31 | 32 | // Initialize this. 33 | AlGpuEvent_t HostTransferBackend::sync_event = (AlGpuEvent_t) 0; 34 | 35 | namespace internal { 36 | namespace ht { 37 | 38 | void init(int&, char**&) { 39 | AL_CHECK_CUDA(AlGpuEventCreateWithFlags(&HostTransferBackend::sync_event, 40 | AlGpuNoTimingEventFlags)); 41 | } 42 | 43 | void finalize() { 44 | AL_CHECK_CUDA(AlGpuEventDestroy(HostTransferBackend::sync_event)); 45 | } 46 | 47 | } // namespace ht 48 | } // namespace internal 49 | } // namespace Al 50 | -------------------------------------------------------------------------------- /include/aluminum/cuda/helper_kernels.hpp: -------------------------------------------------------------------------------- 1 | //////////////////////////////////////////////////////////////////////////////// 2 | // Copyright (c) 2018, Lawrence Livermore National Security, LLC. Produced at the 3 | // Lawrence Livermore National Laboratory in collaboration with University of 4 | // Illinois Urbana-Champaign. 
5 | // 6 | // Written by the LBANN Research Team (N. Dryden, N. Maruyama, et al.) listed in 7 | // the CONTRIBUTORS file. 8 | // 9 | // LLNL-CODE-756777. 10 | // All rights reserved. 11 | // 12 | // This file is part of Aluminum GPU-aware Communication Library. For details, see 13 | // http://software.llnl.gov/Aluminum or https://github.com/LLNL/Aluminum. 14 | // 15 | // Licensed under the Apache License, Version 2.0 (the "Licensee"); you 16 | // may not use this file except in compliance with the License. You may 17 | // obtain a copy of the License at: 18 | // 19 | // http://www.apache.org/licenses/LICENSE-2.0 20 | // 21 | // Unless required by applicable law or agreed to in writing, software 22 | // distributed under the License is distributed on an "AS IS" BASIS, 23 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 24 | // implied. See the License for the specific language governing 25 | // permissions and limitations under the license. 26 | //////////////////////////////////////////////////////////////////////////////// 27 | 28 | #pragma once 29 | 30 | #include "aluminum/base.hpp" 31 | #include "aluminum/cuda/cuda.hpp" 32 | 33 | namespace Al { 34 | namespace internal { 35 | namespace cuda { 36 | 37 | void launch_wait_kernel(AlGpuStream_t stream, int32_t wait_value, 38 | volatile int32_t* wait_mem); 39 | 40 | #if defined AL_HAS_ROCM 41 | void launch_wait_kernel(hipStream_t stream, 42 | int32_t wait_value, 43 | hipDeviceptr_t wait_mem); 44 | #elif defined AL_HAS_CUDA 45 | void launch_wait_kernel(cudaStream_t stream, int32_t wait_value, 46 | CUdeviceptr wait_mem); 47 | #endif 48 | 49 | } // namespace cuda 50 | } // namespace internal 51 | } // namespace Al 52 | -------------------------------------------------------------------------------- /include/aluminum/datatypes.hpp: -------------------------------------------------------------------------------- 1 | //////////////////////////////////////////////////////////////////////////////// 2 | // 
Copyright (c) 2018, Lawrence Livermore National Security, LLC. Produced at the 3 | // Lawrence Livermore National Laboratory in collaboration with University of 4 | // Illinois Urbana-Champaign. 5 | // 6 | // Written by the LBANN Research Team (N. Dryden, N. Maruyama, et al.) listed in 7 | // the CONTRIBUTORS file. 8 | // 9 | // LLNL-CODE-756777. 10 | // All rights reserved. 11 | // 12 | // This file is part of Aluminum GPU-aware Communication Library. For details, see 13 | // http://software.llnl.gov/Aluminum or https://github.com/LLNL/Aluminum. 14 | // 15 | // Licensed under the Apache License, Version 2.0 (the "Licensee"); you 16 | // may not use this file except in compliance with the License. You may 17 | // obtain a copy of the License at: 18 | // 19 | // http://www.apache.org/licenses/LICENSE-2.0 20 | // 21 | // Unless required by applicable law or agreed to in writing, software 22 | // distributed under the License is distributed on an "AS IS" BASIS, 23 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 24 | // implied. See the License for the specific language governing 25 | // permissions and limitations under the license. 26 | //////////////////////////////////////////////////////////////////////////////// 27 | 28 | #pragma once 29 | 30 | // This file identifies support for different specialized datatypes and 31 | // provides some basic things for them as needed. 32 | 33 | #include 34 | 35 | // IEEE 16 bit floating point (i.e., fp16 or half). 36 | 37 | #if defined AL_HAS_ROCM 38 | #include 39 | #define AL_HAS_HALF 1 40 | #elif defined AL_HAS_CUDA 41 | #include 42 | #define AL_HAS_HALF 1 43 | #endif 44 | 45 | // Brain floating point 16 (bfloat16). 

#if defined AL_HAS_ROCM
// NOTE(review): the header name is missing in this copy of the file;
// hip/hip_bf16.h is presumably the header providing __hip_bfloat16 -- confirm.
#include <hip/hip_bf16.h>
#define AL_HAS_BFLOAT 1
using al_bfloat16 = __hip_bfloat16;

#elif defined AL_HAS_CUDA
// NOTE(review): the header name is missing in this copy of the file;
// cuda_bf16.h is presumably the header providing __nv_bfloat16 -- confirm.
#include <cuda_bf16.h>
#define AL_HAS_BFLOAT 1
using al_bfloat16 = __nv_bfloat16;
#endif

// --------------------------------------------------------------------------------
// /include/aluminum/utils/utils.hpp:
// --------------------------------------------------------------------------------
////////////////////////////////////////////////////////////////////////////////
// Copyright (c) 2018, Lawrence Livermore National Security, LLC.  Produced at the
// Lawrence Livermore National Laboratory in collaboration with University of
// Illinois Urbana-Champaign.
//
// Written by the LBANN Research Team (N. Dryden, N. Maruyama, et al.) listed in
// the CONTRIBUTORS file.
//
// LLNL-CODE-756777.
// All rights reserved.
//
// This file is part of Aluminum GPU-aware Communication Library. For details, see
// http://software.llnl.gov/Aluminum or https://github.com/LLNL/Aluminum.
//
// Licensed under the Apache License, Version 2.0 (the "Licensee"); you
// may not use this file except in compliance with the License.  You may
// obtain a copy of the License at:
//
//    http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the license.
////////////////////////////////////////////////////////////////////////////////

#pragma once

#include <chrono>
#include <vector>

namespace Al {

/**
 * Return time, in seconds (with decimal), since a fixed epoch.
 *
 * The value is read from std::chrono::steady_clock, so it is meant for
 * measuring intervals (differences between two calls), not for obtaining
 * calendar time.
 */
inline double get_time() {
  using namespace std::chrono;
  // duration<double> has a period of one second, so count() is in seconds.
  return duration_cast<duration<double>>(
    steady_clock::now().time_since_epoch()).count();
}

/**
 * Compute an exclusive prefix sum.
 *
 * This is mostly meant to help with vector collectives.
 *
 * @tparam T Element type; must be constructible from 0 and support +.
 * @param v Input values.
 * @return A vector r of the same length as v where r[0] is 0 and
 *         r[i] = v[0] + ... + v[i-1]. An empty input yields an empty
 *         result.
 */
template <typename T>
inline std::vector<T> excl_prefix_sum(const std::vector<T>& v) {
  auto r = std::vector<T>(v.size(), T{0});
  // Each entry extends the previous running total by the previous input.
  for (size_t i = 1; i < v.size(); ++i) {
    r[i] = v[i-1] + r[i-1];
  }
  return r;
}

} // namespace Al

// --------------------------------------------------------------------------------
// /include/aluminum/ht/barrier.hpp:
// --------------------------------------------------------------------------------
////////////////////////////////////////////////////////////////////////////////
// Copyright (c) 2018, Lawrence Livermore National Security, LLC.  Produced at the
// Lawrence Livermore National Laboratory in collaboration with University of
// Illinois Urbana-Champaign.
//
// Written by the LBANN Research Team (N. Dryden, N. Maruyama, et al.) listed in
// the CONTRIBUTORS file.
//
// LLNL-CODE-756777.
// All rights reserved.
//
// This file is part of Aluminum GPU-aware Communication Library. For details, see
// http://software.llnl.gov/Aluminum or https://github.com/LLNL/Aluminum.
//
// Licensed under the Apache License, Version 2.0 (the "Licensee"); you
// may not use this file except in compliance with the License.  You may
// obtain a copy of the License at:
//
//    http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied.
See the License for the specific language governing 25 | // permissions and limitations under the license. 26 | //////////////////////////////////////////////////////////////////////////////// 27 | 28 | #pragma once 29 | 30 | #include "aluminum/cuda/cuda.hpp" 31 | #include "aluminum/ht/communicator.hpp" 32 | #include "aluminum/ht/base_state.hpp" 33 | 34 | namespace Al { 35 | namespace internal { 36 | namespace ht { 37 | 38 | class BarrierAlState : public HostTransferCollectiveSignalAtEndState { 39 | public: 40 | BarrierAlState(HostTransferCommunicator& comm_, AlGpuStream_t stream_) : 41 | HostTransferCollectiveSignalAtEndState(stream_), 42 | comm(comm_.get_comm()) { 43 | // Just wait until we should start this. 44 | start_event.record(stream_); 45 | 46 | // Have the device wait on the host. 47 | gpu_wait.wait(stream_); 48 | end_event.record(stream_); 49 | } 50 | 51 | std::string get_name() const override { return "HTBarrier"; } 52 | 53 | protected: 54 | void start_mpi_op() override { 55 | MPI_Ibarrier(comm, get_mpi_req()); 56 | } 57 | 58 | private: 59 | MPI_Comm comm; 60 | }; 61 | 62 | } // namespace ht 63 | } // namespace internal 64 | } // namespace Al 65 | -------------------------------------------------------------------------------- /include/aluminum/cuda/events.hpp: -------------------------------------------------------------------------------- 1 | //////////////////////////////////////////////////////////////////////////////// 2 | // Copyright (c) 2018, Lawrence Livermore National Security, LLC. Produced at the 3 | // Lawrence Livermore National Laboratory in collaboration with University of 4 | // Illinois Urbana-Champaign. 5 | // 6 | // Written by the LBANN Research Team (N. Dryden, N. Maruyama, et al.) listed in 7 | // the CONTRIBUTORS file. 8 | // 9 | // LLNL-CODE-756777. 10 | // All rights reserved. 11 | // 12 | // This file is part of Aluminum GPU-aware Communication Library. 
For details, see 13 | // http://software.llnl.gov/Aluminum or https://github.com/LLNL/Aluminum. 14 | // 15 | // Licensed under the Apache License, Version 2.0 (the "Licensee"); you 16 | // may not use this file except in compliance with the License. You may 17 | // obtain a copy of the License at: 18 | // 19 | // http://www.apache.org/licenses/LICENSE-2.0 20 | // 21 | // Unless required by applicable law or agreed to in writing, software 22 | // distributed under the License is distributed on an "AS IS" BASIS, 23 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 24 | // implied. See the License for the specific language governing 25 | // permissions and limitations under the license. 26 | //////////////////////////////////////////////////////////////////////////////// 27 | 28 | #pragma once 29 | 30 | #include "aluminum/utils/locked_resource_pool.hpp" 31 | #include "aluminum/cuda/cuda.hpp" 32 | 33 | namespace Al { 34 | namespace internal { 35 | namespace cuda { 36 | 37 | // TODO: May want to allocate larger chunks and partition. 38 | 39 | /** 40 | * Allocate CUDA pinned memory such that there is one allocation per 41 | * cache line. 42 | */ 43 | struct CUDAEventAllocator { 44 | AlGpuEvent_t allocate() { 45 | AlGpuEvent_t event; 46 | AL_CHECK_CUDA( 47 | AlGpuEventCreateWithFlags(&event, 48 | AlGpuNoTimingEventFlags)); 49 | return event; 50 | } 51 | 52 | void deallocate(AlGpuEvent_t event) { 53 | AL_CHECK_CUDA(AlGpuEventDestroy(event)); 54 | } 55 | }; 56 | 57 | /** Resource pool for synchronization memory. 
*/ 58 | extern Al::internal::LockedResourcePool event_pool; 60 | 61 | } // namespace cuda 62 | } // namespace internal 63 | } // namespace Al 64 | -------------------------------------------------------------------------------- /include/aluminum/mpi_cuda/rma_null.hpp: -------------------------------------------------------------------------------- 1 | //////////////////////////////////////////////////////////////////////////////// 2 | // Copyright (c) 2018, Lawrence Livermore National Security, LLC. Produced at the 3 | // Lawrence Livermore National Laboratory in collaboration with University of 4 | // Illinois Urbana-Champaign. 5 | // 6 | // Written by the LBANN Research Team (N. Dryden, N. Maruyama, et al.) listed in 7 | // the CONTRIBUTORS file. 8 | // 9 | // LLNL-CODE-756777. 10 | // All rights reserved. 11 | // 12 | // This file is part of Aluminum GPU-aware Communication Library. For details, see 13 | // http://software.llnl.gov/Aluminum or https://github.com/LLNL/Aluminum. 14 | // 15 | // Licensed under the Apache License, Version 2.0 (the "Licensee"); you 16 | // may not use this file except in compliance with the License. You may 17 | // obtain a copy of the License at: 18 | // 19 | // http://www.apache.org/licenses/LICENSE-2.0 20 | // 21 | // Unless required by applicable law or agreed to in writing, software 22 | // distributed under the License is distributed on an "AS IS" BASIS, 23 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 24 | // implied. See the License for the specific language governing 25 | // permissions and limitations under the license. 
26 | //////////////////////////////////////////////////////////////////////////////// 27 | 28 | #pragma once 29 | 30 | #include "aluminum/mpi_cuda/communicator.hpp" 31 | #include "aluminum/mpi_cuda/rma.hpp" 32 | 33 | namespace Al { 34 | namespace internal { 35 | namespace mpi_cuda { 36 | 37 | class ConnectionNULL: public Connection { 38 | public: 39 | ConnectionNULL(MPICUDACommunicator &comm, int peer): 40 | Connection(comm, peer) {} 41 | ~ConnectionNULL() {} 42 | void connect() {} 43 | void disconnect() {} 44 | void *attach_remote_buffer(void *) { 45 | return nullptr; 46 | } 47 | void detach_remote_buffer(void *) {} 48 | void detach_all_remote_buffers() {} 49 | void notify(mpi::AlMPIReq &req) { 50 | req->store(true, std::memory_order_release); 51 | } 52 | void wait(mpi::AlMPIReq &req) { 53 | req->store(true, std::memory_order_release); 54 | } 55 | void sync(mpi::AlMPIReq &req) { 56 | req->store(true, std::memory_order_release); 57 | } 58 | void put(const void *, void *, size_t) {} 59 | }; 60 | 61 | } // namespace mpi_cuda 62 | } // namespace internal 63 | } // namespace Al 64 | -------------------------------------------------------------------------------- /cmake/FindNCCL.cmake: -------------------------------------------------------------------------------- 1 | # Exports the following variables 2 | # 3 | # NCCL_FOUND 4 | # NCCL_INCLUDE_PATH 5 | # NCCL_LIBRARY 6 | # 7 | # Exports the following IMPORTED targets: 8 | # 9 | # cuda::nccl 10 | # 11 | 12 | find_path(NCCL_INCLUDE_PATH nccl.h 13 | HINTS ${NCCL_DIR} $ENV{NCCL_DIR} ${NCCL2_DIR} $ENV{NCCL2_DIR} 14 | ${CUDAToolkit_INCLUDE_DIRS} 15 | PATH_SUFFIXES include 16 | NO_DEFAULT_PATH 17 | DOC "The location of NCCL headers." 18 | ) 19 | find_path(NCCL_INCLUDE_PATH nccl.h) 20 | 21 | find_library(NCCL_LIBRARY nccl 22 | HINTS ${NCCL_DIR} $ENV{NCCL_DIR} ${NCCL2_DIR} $ENV{NCCL2_DIR} 23 | ${CUDAToolkit_LIBRARY_DIR} 24 | PATH_SUFFIXES lib64 lib 25 | NO_DEFAULT_PATH 26 | DOC "The NCCL library." 
27 | ) 28 | find_library(NCCL_LIBRARY nccl) 29 | 30 | # If the include path has been found, we can test the version. 31 | if (NCCL_INCLUDE_PATH) 32 | 33 | # Check the version. Note, this won't compile for NCCL1 34 | set(_NCCL_VERSION_TEST_SRC " 35 | #include 36 | #include 37 | 38 | int main() 39 | { 40 | std::cout << NCCL_MAJOR << \".\" << NCCL_MINOR << \".\" << NCCL_PATCH; 41 | return 0; 42 | } 43 | ") 44 | 45 | file(WRITE "${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/CMakeTmp/src.cxx" 46 | "${_NCCL_VERSION_TEST_SRC}\n") 47 | 48 | try_run(_NCCL_RUN_RESULT _NCCL_COMPILE_RESULT 49 | ${CMAKE_BINARY_DIR} 50 | ${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/CMakeTmp/src.cxx 51 | CMAKE_FLAGS "-DINCLUDE_DIRECTORIES:STRING=${CUDAToolkit_INCLUDE_DIRS};${NCCL_INCLUDE_PATH}" 52 | RUN_OUTPUT_VARIABLE _NCCL_VERSION_STRING 53 | COMPILE_OUTPUT_VARIABLE _NCCL_COMPILE_OUTPUT 54 | ) 55 | 56 | # Assume that if it didn't compile, we have NCCL1 57 | if (NOT _NCCL_COMPILE_RESULT) 58 | message(${_NCCL_COMPILE_OUTPUT}) 59 | set(_NCCL_VERSION_STRING 1.0.0) 60 | endif () 61 | endif () 62 | 63 | # Standard handling of the package arguments 64 | include(FindPackageHandleStandardArgs) 65 | find_package_handle_standard_args(NCCL 66 | REQUIRED_VARS NCCL_LIBRARY NCCL_INCLUDE_PATH 67 | VERSION_VAR _NCCL_VERSION_STRING) 68 | 69 | # Setup the imported target 70 | if (NCCL_FOUND AND NOT TARGET cuda::nccl) 71 | 72 | add_library(cuda::nccl INTERFACE IMPORTED) 73 | 74 | set_property(TARGET cuda::nccl PROPERTY 75 | INTERFACE_INCLUDE_DIRECTORIES ${NCCL_INCLUDE_PATH} ${CUDA_INCLUDE_DIRS}) 76 | 77 | set_property(TARGET cuda::nccl PROPERTY 78 | INTERFACE_LINK_LIBRARIES ${NCCL_LIBRARY}) 79 | 80 | endif (NCCL_FOUND AND NOT TARGET cuda::nccl) 81 | -------------------------------------------------------------------------------- /include/aluminum/mpi/barrier.hpp: -------------------------------------------------------------------------------- 1 | 
//////////////////////////////////////////////////////////////////////////////// 2 | // Copyright (c) 2018, Lawrence Livermore National Security, LLC. Produced at the 3 | // Lawrence Livermore National Laboratory in collaboration with University of 4 | // Illinois Urbana-Champaign. 5 | // 6 | // Written by the LBANN Research Team (N. Dryden, N. Maruyama, et al.) listed in 7 | // the CONTRIBUTORS file. 8 | // 9 | // LLNL-CODE-756777. 10 | // All rights reserved. 11 | // 12 | // This file is part of Aluminum GPU-aware Communication Library. For details, see 13 | // http://software.llnl.gov/Aluminum or https://github.com/LLNL/Aluminum. 14 | // 15 | // Licensed under the Apache License, Version 2.0 (the "Licensee"); you 16 | // may not use this file except in compliance with the License. You may 17 | // obtain a copy of the License at: 18 | // 19 | // http://www.apache.org/licenses/LICENSE-2.0 20 | // 21 | // Unless required by applicable law or agreed to in writing, software 22 | // distributed under the License is distributed on an "AS IS" BASIS, 23 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 24 | // implied. See the License for the specific language governing 25 | // permissions and limitations under the license. 
26 | //////////////////////////////////////////////////////////////////////////////// 27 | 28 | #pragma once 29 | 30 | #include "aluminum/progress.hpp" 31 | #include "aluminum/mpi/base_state.hpp" 32 | #include "aluminum/mpi/communicator.hpp" 33 | #include "aluminum/mpi/utils.hpp" 34 | 35 | namespace Al { 36 | namespace internal { 37 | namespace mpi { 38 | 39 | inline void passthrough_barrier(MPICommunicator& comm) { 40 | MPI_Barrier(comm.get_comm()); 41 | } 42 | 43 | class BarrierAlState : public MPIState { 44 | public: 45 | BarrierAlState(MPICommunicator& comm_, AlMPIReq req_) 46 | : MPIState(req_), comm(comm_.get_comm()) {} 47 | ~BarrierAlState() override {} 48 | 49 | std::string get_name() const override { return "MPIBarrier"; } 50 | 51 | protected: 52 | void start_mpi_op() override { 53 | MPI_Ibarrier(comm, get_mpi_req()); 54 | } 55 | 56 | private: 57 | MPI_Comm comm; 58 | }; 59 | 60 | inline void passthrough_nb_barrier(MPICommunicator& comm, AlMPIReq& req) { 61 | req = get_free_request(); 62 | internal::mpi::BarrierAlState* state = 63 | new internal::mpi::BarrierAlState(comm, req); 64 | get_progress_engine()->enqueue(state); 65 | } 66 | 67 | } // namespace mpi 68 | } // namespace internal 69 | } // namespace Al 70 | -------------------------------------------------------------------------------- /benchmark/benchmark_utils_nccl.hpp: -------------------------------------------------------------------------------- 1 | //////////////////////////////////////////////////////////////////////////////// 2 | // Copyright (c) 2018, Lawrence Livermore National Security, LLC. Produced at the 3 | // Lawrence Livermore National Laboratory in collaboration with University of 4 | // Illinois Urbana-Champaign. 5 | // 6 | // Written by the LBANN Research Team (N. Dryden, N. Maruyama, et al.) listed in 7 | // the CONTRIBUTORS file. 8 | // 9 | // LLNL-CODE-756777. 10 | // All rights reserved. 11 | // 12 | // This file is part of Aluminum GPU-aware Communication Library. 
For details, see 13 | // http://software.llnl.gov/Aluminum or https://github.com/LLNL/Aluminum. 14 | // 15 | // Licensed under the Apache License, Version 2.0 (the "Licensee"); you 16 | // may not use this file except in compliance with the License. You may 17 | // obtain a copy of the License at: 18 | // 19 | // http://www.apache.org/licenses/LICENSE-2.0 20 | // 21 | // Unless required by applicable law or agreed to in writing, software 22 | // distributed under the License is distributed on an "AS IS" BASIS, 23 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 24 | // implied. See the License for the specific language governing 25 | // permissions and limitations under the license. 26 | //////////////////////////////////////////////////////////////////////////////// 27 | 28 | #pragma once 29 | 30 | #include "Al.hpp" 31 | #include "benchmark_utils.hpp" 32 | 33 | 34 | template <> 35 | struct Timer { 36 | Timer() { 37 | AL_FORCE_CHECK_GPU(AlGpuEventCreateWithFlags(&start_event, AlGpuDefaultEventFlags)); 38 | AL_FORCE_CHECK_GPU(AlGpuEventCreateWithFlags(&end_event, AlGpuDefaultEventFlags)); 39 | } 40 | 41 | ~Timer() noexcept(false) { 42 | AL_FORCE_CHECK_GPU(AlGpuEventDestroy(start_event)); 43 | AL_FORCE_CHECK_GPU(AlGpuEventDestroy(end_event)); 44 | } 45 | 46 | void start_timer(typename Al::NCCLBackend::comm_type& comm) { 47 | AL_FORCE_CHECK_GPU_NOSYNC(AlGpuEventRecord(start_event, comm.get_stream())); 48 | } 49 | 50 | double end_timer(typename Al::NCCLBackend::comm_type &comm) { 51 | AL_FORCE_CHECK_GPU_NOSYNC(AlGpuEventRecord(end_event, comm.get_stream())); 52 | AL_FORCE_CHECK_GPU_NOSYNC(AlGpuEventSynchronize(end_event)); 53 | float elapsed_time; 54 | AL_FORCE_CHECK_GPU_NOSYNC(AlGpuEventElapsedTime( 55 | &elapsed_time, start_event, end_event)); 56 | // Convert milliseconds to seconds. 
57 | return elapsed_time / 1000.0; 58 | } 59 | 60 | AlGpuEvent_t start_event; 61 | AlGpuEvent_t end_event; 62 | }; 63 | -------------------------------------------------------------------------------- /benchmark/benchmark_utils_ht.hpp: -------------------------------------------------------------------------------- 1 | //////////////////////////////////////////////////////////////////////////////// 2 | // Copyright (c) 2018, Lawrence Livermore National Security, LLC. Produced at the 3 | // Lawrence Livermore National Laboratory in collaboration with University of 4 | // Illinois Urbana-Champaign. 5 | // 6 | // Written by the LBANN Research Team (N. Dryden, N. Maruyama, et al.) listed in 7 | // the CONTRIBUTORS file. 8 | // 9 | // LLNL-CODE-756777. 10 | // All rights reserved. 11 | // 12 | // This file is part of Aluminum GPU-aware Communication Library. For details, see 13 | // http://software.llnl.gov/Aluminum or https://github.com/LLNL/Aluminum. 14 | // 15 | // Licensed under the Apache License, Version 2.0 (the "Licensee"); you 16 | // may not use this file except in compliance with the License. You may 17 | // obtain a copy of the License at: 18 | // 19 | // http://www.apache.org/licenses/LICENSE-2.0 20 | // 21 | // Unless required by applicable law or agreed to in writing, software 22 | // distributed under the License is distributed on an "AS IS" BASIS, 23 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 24 | // implied. See the License for the specific language governing 25 | // permissions and limitations under the license. 
26 | //////////////////////////////////////////////////////////////////////////////// 27 | 28 | #pragma once 29 | 30 | #include "Al.hpp" 31 | #include "benchmark_utils.hpp" 32 | 33 | 34 | template <> 35 | struct Timer { 36 | Timer() { 37 | AL_FORCE_CHECK_GPU(AlGpuEventCreateWithFlags(&start_event, AlGpuDefaultEventFlags)); 38 | AL_FORCE_CHECK_GPU(AlGpuEventCreateWithFlags(&end_event, AlGpuDefaultEventFlags)); 39 | } 40 | 41 | ~Timer() noexcept(false) { 42 | AL_FORCE_CHECK_GPU(AlGpuEventDestroy(start_event)); 43 | AL_FORCE_CHECK_GPU(AlGpuEventDestroy(end_event)); 44 | } 45 | 46 | void start_timer(typename Al::HostTransferBackend::comm_type& comm) { 47 | AL_FORCE_CHECK_GPU_NOSYNC(AlGpuEventRecord(start_event, comm.get_stream())); 48 | } 49 | 50 | double end_timer(typename Al::HostTransferBackend::comm_type &comm) { 51 | AL_FORCE_CHECK_GPU_NOSYNC(AlGpuEventRecord(end_event, comm.get_stream())); 52 | AL_FORCE_CHECK_GPU_NOSYNC(AlGpuEventSynchronize(end_event)); 53 | float elapsed_time; 54 | AL_FORCE_CHECK_GPU_NOSYNC(AlGpuEventElapsedTime( 55 | &elapsed_time, start_event, end_event)); 56 | // Convert milliseconds to seconds. 57 | return elapsed_time / 1000.0; 58 | } 59 | 60 | AlGpuEvent_t start_event; 61 | AlGpuEvent_t end_event; 62 | }; 63 | -------------------------------------------------------------------------------- /include/aluminum/cuda/sync_memory.hpp: -------------------------------------------------------------------------------- 1 | //////////////////////////////////////////////////////////////////////////////// 2 | // Copyright (c) 2018, Lawrence Livermore National Security, LLC. Produced at the 3 | // Lawrence Livermore National Laboratory in collaboration with University of 4 | // Illinois Urbana-Champaign. 5 | // 6 | // Written by the LBANN Research Team (N. Dryden, N. Maruyama, et al.) listed in 7 | // the CONTRIBUTORS file. 8 | // 9 | // LLNL-CODE-756777. 10 | // All rights reserved. 
11 | // 12 | // This file is part of Aluminum GPU-aware Communication Library. For details, see 13 | // http://software.llnl.gov/Aluminum or https://github.com/LLNL/Aluminum. 14 | // 15 | // Licensed under the Apache License, Version 2.0 (the "Licensee"); you 16 | // may not use this file except in compliance with the License. You may 17 | // obtain a copy of the License at: 18 | // 19 | // http://www.apache.org/licenses/LICENSE-2.0 20 | // 21 | // Unless required by applicable law or agreed to in writing, software 22 | // distributed under the License is distributed on an "AS IS" BASIS, 23 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 24 | // implied. See the License for the specific language governing 25 | // permissions and limitations under the license. 26 | //////////////////////////////////////////////////////////////////////////////// 27 | 28 | #pragma once 29 | 30 | #include "aluminum/utils/locked_resource_pool.hpp" 31 | #include "aluminum/cuda/cuda.hpp" 32 | #include "aluminum/tuning_params.hpp" 33 | #include 34 | #include 35 | 36 | namespace Al { 37 | namespace internal { 38 | namespace cuda { 39 | 40 | // TODO: May want to allocate larger chunks and partition. 41 | 42 | /** 43 | * Allocate CUDA pinned memory such that there is one allocation per 44 | * cache line. 45 | */ 46 | struct CacheLinePinnedMemoryAllocator { 47 | int32_t *allocate() { 48 | // Overallocate to avoid interference. 49 | int32_t *mem = (int32_t *)std::aligned_alloc( 50 | AL_DESTRUCTIVE_INTERFERENCE_SIZE, AL_DESTRUCTIVE_INTERFERENCE_SIZE); 51 | AL_CHECK_CUDA(AlGpuHostRegister(mem, AL_DESTRUCTIVE_INTERFERENCE_SIZE, 52 | AlGpuHostRegisterDefault)); 53 | return mem; 54 | } 55 | 56 | void deallocate(int32_t* mem) { 57 | AL_CHECK_CUDA(AlGpuHostUnregister(mem)); 58 | std::free(mem); 59 | } 60 | }; 61 | 62 | /** Resource pool for synchronization memory. 
*/ 63 | extern Al::internal::LockedResourcePool sync_pool; 64 | 65 | } // namespace cuda 66 | } // namespace internal 67 | } // namespace Al 68 | -------------------------------------------------------------------------------- /test/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set_source_path(TEST_BASE_HDRS 2 | algo_support.hpp 3 | test_utils.hpp 4 | test_utils_mpi.hpp 5 | op_dispatcher.hpp 6 | op_runner.hpp 7 | hang_watchdog.hpp) 8 | if (AL_HAS_CUDA) 9 | set_source_path(TEST_CUDA_HDRS cuda_vector.hpp) 10 | endif () 11 | if (AL_HAS_MPI_CUDA) 12 | set_source_path(TEST_MPI_CUDA_HDRS test_utils_mpi_cuda.hpp) 13 | endif () 14 | if (AL_HAS_HOST_TRANSFER) 15 | set_source_path(TEST_HOST_TRANSFER_HDRS test_utils_ht.hpp) 16 | endif () 17 | if (AL_HAS_NCCL) 18 | set_source_path(TEST_NCCL_HDRS test_utils_nccl.hpp) 19 | endif () 20 | 21 | set(TEST_HEADERS 22 | ${TEST_BASE_HDRS} 23 | ${TEST_CUDA_HDRS} 24 | ${TEST_MPI_CUDA_HDRS} 25 | ${TEST_HOST_TRANSFER_HDRS} 26 | ${TEST_NCCL_HDRS}) 27 | 28 | # These headers are used in the benchmark/ directory 29 | add_library(aluminum_test_headers INTERFACE "${TEST_HEADERS}") 30 | target_include_directories( 31 | aluminum_test_headers INTERFACE "${CMAKE_CURRENT_SOURCE_DIR}") 32 | 33 | # Make a quick exit before we add any tests 34 | if (NOT AL_ENABLE_TESTS) 35 | return() 36 | endif () 37 | 38 | set_source_path(AL_TEST_SOURCES 39 | test_ops.cpp 40 | test_exchange.cpp 41 | ) 42 | 43 | set_source_path(AL_GPU_ONLY_TEST_SOURCES 44 | test_stream_mem_ops.cpp 45 | ) 46 | 47 | if (AL_HAS_CUDA OR AL_HAS_ROCM) 48 | list(APPEND AL_TEST_SOURCES ${AL_GPU_ONLY_TEST_SOURCES}) 49 | endif () 50 | 51 | foreach(src ${AL_TEST_SOURCES}) 52 | string(REPLACE ".cpp" "" _tmp_test_exe_name "${src}") 53 | get_filename_component(_test_exe_name "${_tmp_test_exe_name}" NAME) 54 | add_executable(${_test_exe_name} ${src}) 55 | target_include_directories(${_test_exe_name} 56 | SYSTEM PRIVATE 57 | 
${PROJECT_SOURCE_DIR}/third_party/cxxopts/include) 58 | target_link_libraries(${_test_exe_name} PRIVATE Al aluminum_test_headers) 59 | # FIXME: Hopefully this can be removed in a future version of ROCm. 60 | if (AL_HAS_ROCM AND AL_BUILD_TYPE_UPPER MATCHES "DEBUG") 61 | target_compile_options(${_test_exe_name} PRIVATE "-O0") 62 | endif () 63 | endforeach() 64 | 65 | if (AL_HAS_MPI_CUDA_RMA AND NOT AL_HAS_ROCM) 66 | add_executable(test_rma_ring.exe test_rma_ring.cpp ${TEST_HEADERS}) 67 | target_include_directories( 68 | test_rma_ring.exe SYSTEM PRIVATE ${PROJECT_SOURCE_DIR}/third_party/cxxopts/include) 69 | target_link_libraries(test_rma_ring.exe PRIVATE Al) 70 | add_executable(test_rma_halo_exchange.exe 71 | test_rma_halo_exchange.cpp ${TEST_HEADERS}) 72 | target_include_directories( 73 | test_rma_halo_exchange.exe SYSTEM PRIVATE ${PROJECT_SOURCE_DIR}/third_party/cxxopts/include) 74 | target_link_libraries(test_rma_halo_exchange.exe PRIVATE Al) 75 | endif () 76 | -------------------------------------------------------------------------------- /include/aluminum/profiling.hpp: -------------------------------------------------------------------------------- 1 | //////////////////////////////////////////////////////////////////////////////// 2 | // Copyright (c) 2018, Lawrence Livermore National Security, LLC. Produced at the 3 | // Lawrence Livermore National Laboratory in collaboration with University of 4 | // Illinois Urbana-Champaign. 5 | // 6 | // Written by the LBANN Research Team (N. Dryden, N. Maruyama, et al.) listed in 7 | // the CONTRIBUTORS file. 8 | // 9 | // LLNL-CODE-756777. 10 | // All rights reserved. 11 | // 12 | // This file is part of Aluminum GPU-aware Communication Library. For details, see 13 | // http://software.llnl.gov/Aluminum or https://github.com/LLNL/Aluminum. 14 | // 15 | // Licensed under the Apache License, Version 2.0 (the "Licensee"); you 16 | // may not use this file except in compliance with the License. 
You may 17 | // obtain a copy of the License at: 18 | // 19 | // http://www.apache.org/licenses/LICENSE-2.0 20 | // 21 | // Unless required by applicable law or agreed to in writing, software 22 | // distributed under the License is distributed on an "AS IS" BASIS, 23 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 24 | // implied. See the License for the specific language governing 25 | // permissions and limitations under the license. 26 | //////////////////////////////////////////////////////////////////////////////// 27 | 28 | #pragma once 29 | 30 | #include 31 | #include 32 | 33 | #include 34 | 35 | #ifdef AL_HAS_NVPROF 36 | #include 37 | #endif 38 | #ifdef AL_HAS_ROCTRACER 39 | #include 40 | #endif 41 | #if defined AL_HAS_ROCM 42 | #include 43 | #elif defined AL_HAS_CUDA 44 | #include 45 | #endif 46 | 47 | namespace Al { 48 | namespace internal { 49 | namespace profiling { 50 | 51 | /** Assign a name to the thread given by handle. */ 52 | void name_thread(std::thread::native_handle_type handle, std::string name); 53 | #ifdef AL_HAS_CUDA 54 | /** Assign a name to a CUDA stream. */ 55 | void name_stream(AlGpuStream_t stream, std::string name); 56 | #endif 57 | 58 | /** Create an instantaneous marker. */ 59 | void mark(std::string desc); 60 | 61 | /** Represent a range for profiling. */ 62 | struct ProfileRange { 63 | #ifdef AL_HAS_NVPROF 64 | nvtxRangeId_t nvtx_range; 65 | #endif 66 | #ifdef AL_HAS_ROCTRACER 67 | roctx_range_id_t roctx_range; 68 | #endif 69 | }; 70 | 71 | /** Start a profiling region with name. */ 72 | ProfileRange prof_start(std::string name); 73 | /** End a profiling region. 
*/ 74 | void prof_end(ProfileRange range); 75 | 76 | } // namespace profiling 77 | } // namespace internal 78 | } // namespace Al 79 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ![Al](al.svg) Aluminum 2 | ====================== 3 | 4 | **Aluminum** is a high-performance communication library for CPUs, GPUs, and other accelerator platforms. 5 | It leverages existing libraries, such as MPI, NCCL, and RCCL, plus its own infrastructure, to deliver performance and accelerator-centric communication. 6 | 7 | Aluminum is open-source and maintained by the Lawrence Livermore National Laboratory. 8 | If you use Aluminum, please cite [our paper](https://ieeexplore.ieee.org/document/8638639): 9 | ``` 10 | @inproceedings{dryden2018aluminum, 11 | title={Aluminum: An Asynchronous, {GPU}-Aware Communication Library Optimized for Large-Scale Training of Deep Neural Networks on {HPC} Systems}, 12 | author={Dryden, Nikoli and Maruyama, Naoya and Moon, Tim and Benson, Tom and Yoo, Andy and Snir, Marc and Van Essen, Brian}, 13 | booktitle={Proceedings of the Workshop on Machine Learning in HPC Environments (MLHPC)}, 14 | year={2018} 15 | } 16 | ``` 17 | 18 | ## Features 19 | 20 | * Support for blocking and non-blocking collective and point-to-point operations 21 | * Accelerator-centric communication 22 | * Supported communication backends: 23 | * `MPI`: Uses the Message Passing Interface and supports any hardware your underlying MPI library supports. 24 | * `NCCL`: Uses either Nvidia's [NCCL](https://developer.nvidia.com/nccl) library for Nvidia GPUs or AMD's [RCCL](https://github.com/ROCmSoftwarePlatform/rccl) library for AMD GPUs. 25 | * `HostTransfer`: Uses MPI plus the CUDA or HIP runtime to support Nvidia or AMD GPUs without specialized libraries. 
26 | 27 | ## Getting Started 28 | 29 | For full details, see the [Aluminum documentation](https://aluminum.readthedocs.io/). 30 | 31 | For basic usage examples, see the [examples](examples). 32 | 33 | ### Building and Installation 34 | 35 | Aluminum is available via [Spack](https://spack.io/) or can be installed manually from source. 36 | 37 | Source builds need a recent CMake, C++ compiler (with support for C++17), MPI, and hwloc. 38 | Accelerator backends need the appropriate runtime libraries. 39 | 40 | A basic out-of-source build can be done with 41 | ``` 42 | mkdir build && cd build 43 | cmake /path/to/Aluminum/source 44 | ``` 45 | 46 | For full details on building, configuration, testing, and benchmarking, see the [documentation](https://aluminum.readthedocs.io/en/latest/build.html). 47 | 48 | ## Authors 49 | 50 | * [Nikoli Dryden](https://github.com/ndryden) 51 | * [Naoya Maruyama](https://github.com/naoyam) 52 | * [Tom Benson](https://github.com/benson31) 53 | * Andy Yoo 54 | 55 | See also [contributors](https://github.com/ndryden/Aluminum/graphs/contributors). 56 | 57 | ## License 58 | 59 | Aluminum is licensed under the Apache License, Version 2.0. See [LICENSE](LICENSE) for details. 
--------------------------------------------------------------------------------
/cmake/FindRoctracer.cmake:
--------------------------------------------------------------------------------
# Sets the following variables
#
#   Roctracer_FOUND
#   Roctracer_LIBRARIES
#
# Defines the following imported targets:
#
#   roctracer::roctracer
#   roctracer::roctracer_api
#   roctracer::roctx_api
#

set(_supported_components roctracer roctx)
if (NOT Roctracer_FIND_COMPONENTS)
  set(Roctracer_FIND_COMPONENTS ${_supported_components})
endif ()

# Start from an empty list so that running this module more than once in the
# same scope does not accumulate duplicate entries.
set(_imported_libraries)

foreach (comp IN LISTS Roctracer_FIND_COMPONENTS)
  if (NOT ${comp} IN_LIST _supported_components)
    message(FATAL_ERROR
      "Cannot specify component \"${comp}\" for package Roctracer. "
      "Supported components are: ${_supported_components}.")
  endif ()

  set(_header_name "${comp}.h")
  set(_lib_name "${comp}64")

  # Try the known ROCm layouts first, then the default search paths. Each
  # later call is a no-op once the cache variable has been set.
  find_path(${comp}_INCLUDE_PATH ${_header_name}
    HINTS ${ROCM_PATH}/roctracer $ENV{ROCM_PATH}/roctracer
    PATH_SUFFIXES include
    DOC "The ${comp} include directory for roctracer."
    NO_DEFAULT_PATH)
  find_path(${comp}_INCLUDE_PATH ${_header_name}
    HINTS ${ROCM_PATH}/include/roctracer $ENV{ROCM_PATH}/include/roctracer
    DOC "The ${comp} include directory for roctracer."
    NO_DEFAULT_PATH)
  find_path(${comp}_INCLUDE_PATH ${_header_name})

  find_library(${comp}_LIBRARY ${_lib_name}
    HINTS ${ROCM_PATH}/roctracer $ENV{ROCM_PATH}/roctracer
    HINTS ${ROCM_PATH} $ENV{ROCM_PATH}
    PATH_SUFFIXES lib64 lib
    DOC "The ${comp} library for roctracer."
    NO_DEFAULT_PATH)
  find_library(${comp}_LIBRARY ${_lib_name})

  if (${comp}_LIBRARY AND ${comp}_INCLUDE_PATH)
    set(Roctracer_${comp}_FOUND TRUE)

    if (NOT TARGET roctracer::${comp}_api)
      add_library(roctracer::${comp}_api INTERFACE IMPORTED)
    endif ()
    target_link_libraries(roctracer::${comp}_api INTERFACE
      "${${comp}_LIBRARY}")
    target_include_directories(roctracer::${comp}_api INTERFACE
      "${${comp}_INCLUDE_PATH}")

    mark_as_advanced(${comp}_LIBRARY)
    mark_as_advanced(${comp}_INCLUDE_PATH)

    list(APPEND _imported_libraries roctracer::${comp}_api)
  else ()
    set(Roctracer_${comp}_FOUND FALSE)
  endif ()
endforeach ()

include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(Roctracer HANDLE_COMPONENTS)

if (Roctracer_FOUND)
  # Umbrella target that links every component target found above.
  if (NOT TARGET roctracer::roctracer)
    add_library(roctracer::roctracer INTERFACE IMPORTED)
  endif ()
  foreach (lib IN LISTS _imported_libraries)
    target_link_libraries(roctracer::roctracer INTERFACE ${lib})
  endforeach ()
  set(Roctracer_LIBRARIES roctracer::roctracer)
endif (Roctracer_FOUND)

--------------------------------------------------------------------------------
/include/aluminum/mpi_cuda/communicator.hpp:
--------------------------------------------------------------------------------
////////////////////////////////////////////////////////////////////////////////
// Copyright (c) 2018, Lawrence Livermore National Security, LLC. Produced at the
// Lawrence Livermore National Laboratory in collaboration with University of
// Illinois Urbana-Champaign.
//
// Written by the LBANN Research Team (N. Dryden, N. Maruyama, et al.) listed in
// the CONTRIBUTORS file.
//
// LLNL-CODE-756777.
// All rights reserved.
//
// This file is part of Aluminum GPU-aware Communication Library.
For details, see 13 | // http://software.llnl.gov/Aluminum or https://github.com/LLNL/Aluminum. 14 | // 15 | // Licensed under the Apache License, Version 2.0 (the "Licensee"); you 16 | // may not use this file except in compliance with the License. You may 17 | // obtain a copy of the License at: 18 | // 19 | // http://www.apache.org/licenses/LICENSE-2.0 20 | // 21 | // Unless required by applicable law or agreed to in writing, software 22 | // distributed under the License is distributed on an "AS IS" BASIS, 23 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 24 | // implied. See the License for the specific language governing 25 | // permissions and limitations under the license. 26 | //////////////////////////////////////////////////////////////////////////////// 27 | 28 | #pragma once 29 | 30 | #include 31 | #include "Al.hpp" 32 | #include "aluminum/mpi_comm_and_stream_wrapper.hpp" 33 | #include "aluminum/mpi/communicator.hpp" 34 | 35 | namespace Al { 36 | namespace internal { 37 | namespace mpi_cuda { 38 | 39 | #ifdef AL_HAS_MPI_CUDA_RMA 40 | class RMA; 41 | #endif 42 | 43 | class MPICUDACommunicator: public MPICommAndStreamWrapper { 44 | public: 45 | MPICUDACommunicator() 46 | : MPICUDACommunicator(mpi::get_world_comm().get_comm(), 0) {} 47 | MPICUDACommunicator(MPI_Comm comm_, cudaStream_t stream_) 48 | : MPICommAndStreamWrapper(comm_, stream_) 49 | #ifdef AL_HAS_MPI_CUDA_RMA 50 | , m_rma(nullptr) 51 | #endif 52 | {} 53 | MPICUDACommunicator(const MPICUDACommunicator& other) = delete; 54 | MPICUDACommunicator(MPICUDACommunicator&& other) = default; 55 | MPICUDACommunicator& operator=(const MPICUDACommunicator& other) = delete; 56 | MPICUDACommunicator& operator=(MPICUDACommunicator&& other) = default; 57 | 58 | #ifdef AL_HAS_MPI_CUDA_RMA 59 | RMA &get_rma(); 60 | #endif 61 | 62 | ~MPICUDACommunicator(); 63 | 64 | MPICUDACommunicator copy(cudaStream_t stream = 0) const { 65 | return MPICUDACommunicator(get_comm(), stream); 66 | } 67 | 68 
| protected: 69 | #ifdef AL_HAS_MPI_CUDA_RMA 70 | std::shared_ptr m_rma; 71 | #endif 72 | }; 73 | 74 | } // namespace mpi_cuda 75 | } // namespace internal 76 | } // namespace Al 77 | -------------------------------------------------------------------------------- /include/aluminum/mpi_cuda/rma_self.hpp: -------------------------------------------------------------------------------- 1 | //////////////////////////////////////////////////////////////////////////////// 2 | // Copyright (c) 2018, Lawrence Livermore National Security, LLC. Produced at the 3 | // Lawrence Livermore National Laboratory in collaboration with University of 4 | // Illinois Urbana-Champaign. 5 | // 6 | // Written by the LBANN Research Team (N. Dryden, N. Maruyama, et al.) listed in 7 | // the CONTRIBUTORS file. 8 | // 9 | // LLNL-CODE-756777. 10 | // All rights reserved. 11 | // 12 | // This file is part of Aluminum GPU-aware Communication Library. For details, see 13 | // http://software.llnl.gov/Aluminum or https://github.com/LLNL/Aluminum. 14 | // 15 | // Licensed under the Apache License, Version 2.0 (the "Licensee"); you 16 | // may not use this file except in compliance with the License. You may 17 | // obtain a copy of the License at: 18 | // 19 | // http://www.apache.org/licenses/LICENSE-2.0 20 | // 21 | // Unless required by applicable law or agreed to in writing, software 22 | // distributed under the License is distributed on an "AS IS" BASIS, 23 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 24 | // implied. See the License for the specific language governing 25 | // permissions and limitations under the license. 
26 | //////////////////////////////////////////////////////////////////////////////// 27 | 28 | #pragma once 29 | 30 | #include "aluminum/mpi_cuda/communicator.hpp" 31 | #include "aluminum/mpi_cuda/rma.hpp" 32 | 33 | namespace Al { 34 | namespace internal { 35 | namespace mpi_cuda { 36 | 37 | class ConnectionSelf: public Connection { 38 | public: 39 | ConnectionSelf(MPICUDACommunicator &comm, int peer): 40 | Connection(comm, peer) {} 41 | ~ConnectionSelf() {} 42 | void connect() {} 43 | void disconnect() {} 44 | void *attach_remote_buffer(void *local_addr) { 45 | return local_addr; 46 | } 47 | void detach_remote_buffer(void *) {} 48 | void detach_all_remote_buffers() {} 49 | void notify(mpi::AlMPIReq &req) { 50 | req->store(true, std::memory_order_release); 51 | } 52 | void wait(mpi::AlMPIReq &req) { 53 | req->store(true, std::memory_order_release); 54 | } 55 | void sync(mpi::AlMPIReq &req) { 56 | req->store(true, std::memory_order_release); 57 | } 58 | void put(const void *src, void *dst, 59 | size_t size) { 60 | if (size > 0) { 61 | if (src == nullptr) { 62 | throw_al_exception("Source buffer is null"); 63 | } 64 | if (dst == nullptr) { 65 | throw_al_exception("Destination buffer is null"); 66 | } 67 | AL_CHECK_CUDA(cudaMemcpyAsync( 68 | dst, src, size, cudaMemcpyDefault, m_comm.get_stream())); 69 | } 70 | } 71 | }; 72 | 73 | } // namespace mpi_cuda 74 | } // namespace internal 75 | } // namespace Al 76 | -------------------------------------------------------------------------------- /src/cuda/helper_kernels.cu: -------------------------------------------------------------------------------- 1 | //////////////////////////////////////////////////////////////////////////////// 2 | // Copyright (c) 2018, Lawrence Livermore National Security, LLC. Produced at the 3 | // Lawrence Livermore National Laboratory in collaboration with University of 4 | // Illinois Urbana-Champaign. 5 | // 6 | // Written by the LBANN Research Team (N. Dryden, N. Maruyama, et al.) 
listed in 7 | // the CONTRIBUTORS file. 8 | // 9 | // LLNL-CODE-756777. 10 | // All rights reserved. 11 | // 12 | // This file is part of Aluminum GPU-aware Communication Library. For details, see 13 | // http://software.llnl.gov/Aluminum or https://github.com/LLNL/Aluminum. 14 | // 15 | // Licensed under the Apache License, Version 2.0 (the "Licensee"); you 16 | // may not use this file except in compliance with the License. You may 17 | // obtain a copy of the License at: 18 | // 19 | // http://www.apache.org/licenses/LICENSE-2.0 20 | // 21 | // Unless required by applicable law or agreed to in writing, software 22 | // distributed under the License is distributed on an "AS IS" BASIS, 23 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 24 | // implied. See the License for the specific language governing 25 | // permissions and limitations under the license. 26 | //////////////////////////////////////////////////////////////////////////////// 27 | 28 | #include 29 | 30 | #if defined AL_HAS_ROCM 31 | #include 32 | #elif defined AL_HAS_CUDA 33 | #include 34 | #endif 35 | 36 | #include "aluminum/cuda/helper_kernels.hpp" 37 | 38 | namespace Al { 39 | namespace internal { 40 | namespace cuda { 41 | 42 | __global__ void spin_wait_kernel(int32_t wait_value, volatile int32_t* wait_mem) { 43 | for (;;) 44 | { 45 | __threadfence_system(); 46 | int32_t value = *wait_mem; 47 | if (value == wait_value) break; 48 | } 49 | } 50 | 51 | void launch_wait_kernel(AlGpuStream_t stream, 52 | int32_t wait_value, 53 | volatile int32_t* wait_mem) { 54 | spin_wait_kernel<<<1,1,0,stream>>>(wait_value, wait_mem); 55 | } 56 | 57 | #if defined AL_HAS_ROCM 58 | void launch_wait_kernel(hipStream_t stream, 59 | int32_t wait_value, 60 | hipDeviceptr_t wait_mem) { 61 | AL_CHECK_CUDA(hipStreamWaitValue32( 62 | stream, wait_mem, wait_value, hipStreamWaitValueEq)); 63 | } 64 | #elif defined AL_HAS_CUDA 65 | void launch_wait_kernel(cudaStream_t stream, 66 | int32_t wait_value, 67 | 
CUdeviceptr wait_mem) { 68 | AL_CHECK_CUDA_DRV(cuStreamWaitValue32( 69 | stream, wait_mem, wait_value, CU_STREAM_WAIT_VALUE_EQ)); 70 | } 71 | #endif 72 | 73 | } // namespace cuda 74 | } // namespace internal 75 | } // namespace Al 76 | -------------------------------------------------------------------------------- /include/aluminum/cuda/gpu_wait.hpp: -------------------------------------------------------------------------------- 1 | //////////////////////////////////////////////////////////////////////////////// 2 | // Copyright (c) 2018, Lawrence Livermore National Security, LLC. Produced at the 3 | // Lawrence Livermore National Laboratory in collaboration with University of 4 | // Illinois Urbana-Champaign. 5 | // 6 | // Written by the LBANN Research Team (N. Dryden, N. Maruyama, et al.) listed in 7 | // the CONTRIBUTORS file. 8 | // 9 | // LLNL-CODE-756777. 10 | // All rights reserved. 11 | // 12 | // This file is part of Aluminum GPU-aware Communication Library. For details, see 13 | // http://software.llnl.gov/Aluminum or https://github.com/LLNL/Aluminum. 14 | // 15 | // Licensed under the Apache License, Version 2.0 (the "Licensee"); you 16 | // may not use this file except in compliance with the License. You may 17 | // obtain a copy of the License at: 18 | // 19 | // http://www.apache.org/licenses/LICENSE-2.0 20 | // 21 | // Unless required by applicable law or agreed to in writing, software 22 | // distributed under the License is distributed on an "AS IS" BASIS, 23 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 24 | // implied. See the License for the specific language governing 25 | // permissions and limitations under the license. 
namespace Al {
namespace internal {
namespace cuda {

/**
 * Have a GPU stream block until signalled.
 * This essentially uses full/empty bit semantics to implement synchronization.
 * A wait enqueued on the stream (using the stream memory wait operation when
 * it is available) blocks the stream on a memory location until the host
 * writes to that location.
 *
 * If stream memory operations are not available, this will use a
 * spinning wait kernel. This can cause problems. It has a tendency to
 * lead to deadlock, especially in "debug" mode. Also, if kernel
 * timeout is enabled, this is likely to error out.
 *
 * NOTE(review): the class declares a destructor but does not delete its
 * copy operations; a copied object would share the same sync location.
 * Confirm that instances are never copied.
 */
class GPUWait {
 public:
  /** Set up the memory location that wait() and signal() operate on. */
  GPUWait();
  ~GPUWait();
  /** Enqueue a wait onto stream. */
  void wait(AlGpuStream_t stream);
  /** Signal the stream to continue. */
  void signal();
 private:
  /**
   * Host-side pointer to the flag that signal() writes.
   * The attribute aligns this pointer member itself, not the pointee.
   */
  int32_t* wait_sync __attribute__((aligned(64)));
  /**
   * Device-side address of the same flag. Only one member is meaningful at
   * a time: the typed pointer is for the spinning-kernel path, the
   * driver/HIP device pointer is for the stream memory operation path.
   */
  union {
    int32_t *wait_sync_dev_ptr_no_stream_mem_ops __attribute__((aligned(64)));
#if defined AL_HAS_ROCM
    hipDeviceptr_t wait_sync_dev_ptr;
#elif defined AL_HAS_CUDA
    CUdeviceptr wait_sync_dev_ptr;
#endif
  };
};

} // namespace cuda
} // namespace internal
} // namespace Al
5 | // 6 | // Written by the LBANN Research Team (N. Dryden, N. Maruyama, et al.) listed in 7 | // the CONTRIBUTORS file. 8 | // 9 | // LLNL-CODE-756777. 10 | // All rights reserved. 11 | // 12 | // This file is part of Aluminum GPU-aware Communication Library. For details, see 13 | // http://software.llnl.gov/Aluminum or https://github.com/LLNL/Aluminum. 14 | // 15 | // Licensed under the Apache License, Version 2.0 (the "Licensee"); you 16 | // may not use this file except in compliance with the License. You may 17 | // obtain a copy of the License at: 18 | // 19 | // http://www.apache.org/licenses/LICENSE-2.0 20 | // 21 | // Unless required by applicable law or agreed to in writing, software 22 | // distributed under the License is distributed on an "AS IS" BASIS, 23 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 24 | // implied. See the License for the specific language governing 25 | // permissions and limitations under the license. 26 | //////////////////////////////////////////////////////////////////////////////// 27 | 28 | #include "aluminum/cuda/gpu_wait.hpp" 29 | 30 | #include "aluminum/cuda/cuda.hpp" 31 | #include "aluminum/cuda/helper_kernels.hpp" 32 | #include "aluminum/cuda/sync_memory.hpp" 33 | 34 | namespace Al { 35 | namespace internal { 36 | namespace cuda { 37 | 38 | GPUWait::GPUWait() 39 | : wait_sync(sync_pool.get()) 40 | { 41 | // An atomic here may be overkill. 42 | // Can't use std::atomic because we need the actual address. 
43 | __atomic_store_n(wait_sync, 0, __ATOMIC_SEQ_CST); 44 | 45 | if (stream_memory_operations_supported()) { 46 | #if defined AL_HAS_ROCM 47 | AL_CHECK_CUDA( 48 | hipHostGetDevicePointer(&wait_sync_dev_ptr, wait_sync, 0)); 49 | #elif defined AL_HAS_CUDA 50 | AL_CHECK_CUDA_DRV( 51 | cuMemHostGetDevicePointer(&wait_sync_dev_ptr, wait_sync, 0)); 52 | #endif 53 | } else { 54 | AL_CHECK_CUDA(AlGpuHostGetDevicePointer( 55 | reinterpret_cast(&wait_sync_dev_ptr_no_stream_mem_ops), 56 | wait_sync, 0)); 57 | } 58 | } 59 | 60 | GPUWait::~GPUWait() { 61 | sync_pool.release(wait_sync); 62 | } 63 | 64 | void GPUWait::wait(AlGpuStream_t stream) { 65 | if (stream_memory_operations_supported()) { 66 | launch_wait_kernel(stream, 1, wait_sync_dev_ptr); 67 | } else { 68 | launch_wait_kernel(stream, 1, wait_sync_dev_ptr_no_stream_mem_ops); 69 | } 70 | } 71 | 72 | void GPUWait::signal() { 73 | __atomic_store_n(wait_sync, 1, __ATOMIC_SEQ_CST); 74 | } 75 | 76 | } // namespace cuda 77 | } // namespace internal 78 | } // namespace Al 79 | -------------------------------------------------------------------------------- /benchmark/wait.cu: -------------------------------------------------------------------------------- 1 | //////////////////////////////////////////////////////////////////////////////// 2 | // Copyright (c) 2018, Lawrence Livermore National Security, LLC. Produced at the 3 | // Lawrence Livermore National Laboratory in collaboration with University of 4 | // Illinois Urbana-Champaign. 5 | // 6 | // Written by the LBANN Research Team (N. Dryden, N. Maruyama, et al.) listed in 7 | // the CONTRIBUTORS file. 8 | // 9 | // LLNL-CODE-756777. 10 | // All rights reserved. 11 | // 12 | // This file is part of Aluminum GPU-aware Communication Library. For details, see 13 | // http://software.llnl.gov/Aluminum or https://github.com/LLNL/Aluminum. 
namespace {

/** Busy-loop on the device until at least `cycles` clock ticks have elapsed. */
__global__ void wait_kernel(long long int cycles) {
  // Doesn't handle the clock wrapping.
  // Seems to wait longer than expected, but not an issue right now.
  const long long int start = clock64();
  long long int cur;
  do {
    cur = clock64();
  } while (cur - start < cycles);
}

}  // anonymous namespace

#if defined AL_HAS_ROCM
#define AlGpuDevAttrClockRate hipDeviceAttributeClockRate
#elif defined AL_HAS_CUDA
#define AlGpuDevAttrClockRate cudaDevAttrClockRate
#endif

/**
 * Enqueue a kernel on stream that spins for roughly `length` seconds.
 *
 * @param length Time to wait, in seconds.
 * @param stream Stream the wait kernel is launched on.
 */
void gpu_wait(double length, AlGpuStream_t stream) {
  // Need to figure out frequency to convert seconds to cycles.
  // Might not be exactly accurate (especially w/ dynamic frequencies).
  // Cache this (unlikely we run on devices with different frequencies.)
  static long long int freq_hz = 0;
  if (freq_hz == 0) {
    // The return codes are deliberately discarded, so give both outputs a
    // defined value: if a query fails, freq_hz stays 0 (a zero-length wait)
    // and the lookup is retried on the next call instead of reading an
    // indeterminate int.
    int device = 0;
    int freq_khz = 0;
    static_cast<void>(AlGpuGetDevice(&device));
    static_cast<void>(AlGpuDeviceGetAttribute(&freq_khz,
                                              AlGpuDevAttrClockRate,
                                              device));
    freq_hz = static_cast<long long int>(freq_khz) * 1000;  // Convert from KHz.
  }
  const double cycles = length * freq_hz;
  wait_kernel<<<1, 1, 0, stream>>>(static_cast<long long int>(cycles));
}
*/ 42 | inline AlMPIReq get_free_request() { 43 | return std::make_shared>(false); 44 | } 45 | 46 | class MPIState : public AlState { 47 | public: 48 | MPIState(AlMPIReq req_) : req(req_) {} 49 | 50 | void start() override { 51 | AlState::start(); 52 | start_mpi_op(); 53 | } 54 | 55 | PEAction step() override { 56 | if (poll_mpi()) { 57 | // Mark the request as completed. 58 | req->store(true, std::memory_order_release); 59 | return PEAction::complete; 60 | } else { 61 | return PEAction::cont; 62 | } 63 | } 64 | 65 | protected: 66 | /** Start the MPI operation and set the request. */ 67 | virtual void start_mpi_op() = 0; 68 | /** Return the MPI request that will be polled on. */ 69 | MPI_Request* get_mpi_req() { return &mpi_req; } 70 | /** Return true when the MPI operation is complete. */ 71 | virtual bool poll_mpi() { 72 | int flag; 73 | MPI_Test(get_mpi_req(), &flag, MPI_STATUS_IGNORE); 74 | return flag; 75 | } 76 | 77 | private: 78 | /** Copy of the user's request object. */ 79 | AlMPIReq req; 80 | /** MPI request associated with the operation. */ 81 | MPI_Request mpi_req = MPI_REQUEST_NULL; 82 | }; 83 | 84 | } // namespace mpi 85 | } // namespace internal 86 | } // namespace Al 87 | -------------------------------------------------------------------------------- /test/test_utils_mpi_cuda.hpp: -------------------------------------------------------------------------------- 1 | //////////////////////////////////////////////////////////////////////////////// 2 | // Copyright (c) 2018, Lawrence Livermore National Security, LLC. Produced at the 3 | // Lawrence Livermore National Laboratory in collaboration with University of 4 | // Illinois Urbana-Champaign. 5 | // 6 | // Written by the LBANN Research Team (N. Dryden, N. Maruyama, et al.) listed in 7 | // the CONTRIBUTORS file. 8 | // 9 | // LLNL-CODE-756777. 10 | // All rights reserved. 11 | // 12 | // This file is part of Aluminum GPU-aware Communication Library. 
// NOTE(review): the template parameter and argument lists in this copy
// appear to be missing (e.g. "template struct VectorType", "CUDAVector",
// "std::make_unique("). Restore them from the upstream file before
// compiling; the code tokens below are left exactly as found.
//
// NOTE(review): this file spells the stream type and the synchronize call
// with CUDA names (cudaStream_t, cudaStreamSynchronize). Confirm whether the
// AlGpu* aliases should be used instead so that it also builds for ROCm.

// Test-data helper for this backend: vectors are CUDAVector objects.
template
struct VectorType {
  using type = CUDAVector;

  // Generate `count` elements with another VectorType's gen_data and build a
  // CUDAVector from them using `stream`.
  static type gen_data(size_t count, cudaStream_t stream = 0) {
    auto&& host_data = VectorType::gen_data(count);
    CUDAVector data(host_data, stream);
    return data;
  }

  // Return the contents of `v` as produced by its copyout() method.
  static std::vector copy_to_host(const type& v) {
    return v.copyout();
  }
};

// Specialize to use the Aluminum stream pool, and size it appropriately.
template <>
struct StreamManager {
  using StreamType = cudaStream_t;

  // Empty the library's stream pool and re-allocate it with num_streams.
  static void init(size_t num_streams) {
    Al::internal::cuda::stream_pool.clear();
    Al::internal::cuda::stream_pool.allocate(num_streams);
  }
  // Nothing to do here: the pool is not touched on finalize.
  static void finalize() {}
  // Hand out a stream from the library's stream pool.
  static StreamType get_stream() {
    return Al::internal::cuda::stream_pool.get_stream();
  }
};

// Specialize to create a CUDA stream with the communicator.
template <>
CommWrapper::CommWrapper(MPI_Comm mpi_comm) {
  comm_ = std::make_unique(
    mpi_comm, StreamManager::get_stream());
}

// Block the host until everything enqueued on the communicator's stream
// has finished.
template <>
void complete_operations(
  typename Al::MPICUDABackend::comm_type& comm) {
  AL_FORCE_CHECK_GPU_NOSYNC(cudaStreamSynchronize(comm.get_stream()));
}
namespace Al {
namespace internal {

/** Allocator for pinned host memory. */
struct CUDAPinnedMemoryAllocator {
  /** Allocate `bytes` bytes with AlGpuMallocHost and return the pointer. */
  void* allocate(size_t bytes) {
    void* ptr;
    AL_CHECK_CUDA(AlGpuMallocHost(&ptr, bytes));
    return ptr;
  }

  /** Free memory previously returned by allocate(). */
  void deallocate(void* ptr) {
    AL_CHECK_CUDA(AlGpuFreeHost(ptr));
  }
};

/**
 * Specialized caching allocator for CUDA using CUB.
 *
 * NOTE(review): the specialization argument list and the member template
 * parameter lists appear to be missing in this copy ("class
 * CachingAllocator {", "template T* allocate", "reinterpret_cast(&mem)").
 * Restore them from the upstream header before compiling; the code tokens
 * below are left exactly as found.
 */
template <>
class CachingAllocator {
public:
  // The argument is presumably CUB's bin growth factor -- TODO confirm
  // against the CachingDeviceAllocator constructor.
  CachingAllocator() : cub_pool(2u) {}

  /** Release everything the pool still has cached. */
  ~CachingAllocator() {
    clear();
  }

  /**
   * Allocate room for `size` objects of type T from the CUB pool,
   * associated with `stream`.
   */
  template
  T* allocate(size_t size, AlGpuStream_t stream) {
    T* mem = nullptr;
    AL_CHECK_CUDA(cub_pool.DeviceAllocate(reinterpret_cast(&mem),
                                          sizeof(T)*size, stream));
    return mem;
  }

  /** Return memory obtained from allocate() to the CUB pool. */
  template
  void release(T* ptr) {
    AL_CHECK_CUDA(cub_pool.DeviceFree(ptr));
  }

  /** Free all cached allocations; the status returned by CUB is ignored. */
  void clear() { AL_IGNORE_NODISCARD(cub_pool.FreeAllCached()); }

private:
  AL_CUB_NS::CachingDeviceAllocator cub_pool;
};

} // namespace internal
} // namespace Al
5 | // 6 | // Written by the LBANN Research Team (N. Dryden, N. Maruyama, et al.) listed in 7 | // the CONTRIBUTORS file. 8 | // 9 | // LLNL-CODE-756777. 10 | // All rights reserved. 11 | // 12 | // This file is part of Aluminum GPU-aware Communication Library. For details, see 13 | // http://software.llnl.gov/Aluminum or https://github.com/LLNL/Aluminum. 14 | // 15 | // Licensed under the Apache License, Version 2.0 (the "Licensee"); you 16 | // may not use this file except in compliance with the License. You may 17 | // obtain a copy of the License at: 18 | // 19 | // http://www.apache.org/licenses/LICENSE-2.0 20 | // 21 | // Unless required by applicable law or agreed to in writing, software 22 | // distributed under the License is distributed on an "AS IS" BASIS, 23 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 24 | // implied. See the License for the specific language governing 25 | // permissions and limitations under the license. 26 | //////////////////////////////////////////////////////////////////////////////// 27 | 28 | #include 29 | #include 30 | 31 | #include 32 | 33 | 34 | int main(int, char**) { 35 | std::cout << "Aluminum " 36 | << AL_VERSION 37 | << " (" << AL_GIT_VERSION << ")\n"; 38 | 39 | std::cout << "Backends:"; 40 | std::cout << " mpi"; // MPI is always present. 
41 | #ifdef AL_HAS_NCCL 42 | std::cout << " nccl"; 43 | #endif 44 | #ifdef AL_HAS_HOST_TRANSFER 45 | std::cout << " ht"; 46 | #endif 47 | #ifdef AL_HAS_MPI_CUDA 48 | std::cout << " mpi-cuda"; 49 | #endif 50 | std::cout << "\n"; 51 | std::cout << "Features:"; 52 | #ifdef AL_DEBUG 53 | std::cout << " debug"; 54 | #endif 55 | #ifdef AL_THREAD_MULTIPLE 56 | std::cout << " thread-multiple"; 57 | #endif 58 | #ifdef AL_HAS_CUDA 59 | std::cout << " cuda"; 60 | #endif 61 | #ifdef AL_HAS_ROCM 62 | std::cout << " rocm"; 63 | #endif 64 | #ifdef AL_HAS_MPI_CUDA_RMA 65 | std::cout << " mpi-cuda-rma"; 66 | #endif 67 | #ifdef AL_DEBUG_HANG_CHECK 68 | std::cout << " hang-check"; 69 | #endif 70 | #ifdef AL_HAS_PROF 71 | std::cout << " prof"; 72 | #endif 73 | #ifdef AL_HAS_NVPROF 74 | std::cout << " nvprof"; 75 | #endif 76 | #ifdef AL_HAS_ROCTRACER 77 | std::cout << " roctracer"; 78 | #endif 79 | #ifdef AL_TRACE 80 | std::cout << " trace"; 81 | #endif 82 | #ifdef AL_MPI_SERIALIZE 83 | std::cout << " mpi-serialize"; 84 | #endif 85 | #ifdef AL_HAS_HALF 86 | std::cout << " half"; 87 | #endif 88 | #ifdef AL_HAS_BFLOAT 89 | std::cout << " bfloat"; 90 | #endif 91 | #ifdef AL_HAS_LARGE_COUNT_MPI 92 | std::cout << " mpi-large-count"; 93 | #endif 94 | std::cout << std::endl; 95 | return 0; 96 | } 97 | -------------------------------------------------------------------------------- /include/aluminum/cuda/gpu_status_flag.hpp: -------------------------------------------------------------------------------- 1 | //////////////////////////////////////////////////////////////////////////////// 2 | // Copyright (c) 2018, Lawrence Livermore National Security, LLC. Produced at the 3 | // Lawrence Livermore National Laboratory in collaboration with University of 4 | // Illinois Urbana-Champaign. 5 | // 6 | // Written by the LBANN Research Team (N. Dryden, N. Maruyama, et al.) listed in 7 | // the CONTRIBUTORS file. 8 | // 9 | // LLNL-CODE-756777. 10 | // All rights reserved. 
namespace Al {
namespace internal {
namespace cuda {

/**
 * An optimized version of CUDA events that only supports polling from the host.
 * This essentially uses full/empty bit semantics to implement synchronization.
 * A memory location is polled on by the host and written to by the device
 * using the stream memory write operation.
 * This falls back to the usual CUDA events when stream memory operations are
 * not available.
 * @note This is currently always falling back on CUDA events to work around a
 * hang, the underlying cause of which has not been diagnosed.
 */
class GPUStatusFlag {
 public:
  /**
   * Allocate the event.
   */
  GPUStatusFlag();
  ~GPUStatusFlag();
  /** Record the event into stream. */
  void record(AlGpuStream_t stream);
  /** Return true if the event has completed. */
  bool query();
 private:
  /** State used when the flag is backed by a stream memory write. */
  struct stream_mem_t {
    /**
     * Host-side pointer to the flag location.
     * The attribute aligns this pointer member itself, not the pointee.
     */
    int32_t* sync_event __attribute__((aligned(64)));
    /** Device-side address of the flag (presumably the same location -- confirm). */
#if defined AL_HAS_ROCM
    hipDeviceptr_t sync_event_dev_ptr;
#elif defined AL_HAS_CUDA
    CUdeviceptr sync_event_dev_ptr;
#endif
  };
  /**
   * Backing storage: either the stream-memory state or a plain GPU event.
   * Only one member is meaningful for a given object; which one is decided
   * by the implementation file.
   */
  union {
    stream_mem_t stream_mem;
    AlGpuEvent_t plain_event;
  };
};

} // namespace cuda
} // namespace internal
} // namespace Al
26 | //////////////////////////////////////////////////////////////////////////////// 27 | 28 | #pragma once 29 | 30 | #include "aluminum/progress.hpp" 31 | #include "aluminum/mpi/base_state.hpp" 32 | #include "aluminum/mpi/communicator.hpp" 33 | #include "aluminum/mpi/utils.hpp" 34 | 35 | namespace Al { 36 | namespace internal { 37 | namespace mpi { 38 | 39 | template 40 | void passthrough_bcast(T* buf, size_t count, int root, 41 | MPICommunicator& comm) { 42 | AL_MPI_LARGE_COUNT_CALL(MPI_Bcast)( 43 | buf, count, TypeMap(), root, comm.get_comm()); 44 | } 45 | 46 | template 47 | class BcastAlState : public MPIState { 48 | public: 49 | BcastAlState(T* buf_, size_t count_, int root_, 50 | MPICommunicator& comm_, AlMPIReq req_) : 51 | MPIState(req_), 52 | buf(buf_), count(count_), root(root_), 53 | comm(comm_.get_comm()) {} 54 | 55 | ~BcastAlState() override {} 56 | 57 | std::string get_name() const override { return "MPIBcast"; } 58 | 59 | protected: 60 | void start_mpi_op() override { 61 | AL_MPI_LARGE_COUNT_CALL(MPI_Ibcast)( 62 | buf, count, TypeMap(), root, comm, get_mpi_req()); 63 | } 64 | 65 | private: 66 | T* buf; 67 | size_t count; 68 | int root; 69 | MPI_Comm comm; 70 | }; 71 | 72 | template 73 | void passthrough_nb_bcast(T* buf, size_t count, int root, 74 | MPICommunicator& comm, AlMPIReq& req) { 75 | req = get_free_request(); 76 | internal::mpi::BcastAlState* state = 77 | new internal::mpi::BcastAlState( 78 | buf, count, root, comm, req); 79 | get_progress_engine()->enqueue(state); 80 | } 81 | 82 | } // namespace mpi 83 | } // namespace internal 84 | } // namespace Al 85 | -------------------------------------------------------------------------------- /include/aluminum/ht/communicator.hpp: -------------------------------------------------------------------------------- 1 | //////////////////////////////////////////////////////////////////////////////// 2 | // Copyright (c) 2018, Lawrence Livermore National Security, LLC. 
namespace Al {
namespace internal {
namespace ht {

// Define the tag that point-to-point operations will use.
constexpr int pt2pt_tag = 2;

/**
 * Communicator for host-transfer operations.
 *
 * Pairs an MPI communicator with a GPU stream; both are stored by the
 * MPICommAndStreamWrapper base. Instances can be moved but not copied;
 * use copy() to get a new communicator over the same MPI communicator.
 */
class HostTransferCommunicator: public MPICommAndStreamWrapper {
 public:
  /** Use Aluminum's world and the default CUDA stream. */
  HostTransferCommunicator()
    : HostTransferCommunicator(mpi::get_world_comm().get_comm(), 0) {}
  /** Use a particular MPI communicator and stream. */
  HostTransferCommunicator(MPI_Comm comm_, AlGpuStream_t stream_ = 0)
    : MPICommAndStreamWrapper(comm_, stream_) {}
  /** Cannot copy this. */
  HostTransferCommunicator(const HostTransferCommunicator& other) = delete;
  /** Default move constructor. */
  HostTransferCommunicator(HostTransferCommunicator&& other) = default;
  /** Cannot copy-assign this. */
  HostTransferCommunicator& operator=(const HostTransferCommunicator& other) = delete;
  /** Default move assignment operator. */
  HostTransferCommunicator& operator=(HostTransferCommunicator&& other) = default;
  ~HostTransferCommunicator() {}

  /**
   * Create a new HostTransfer communicator from this one's MPI communicator
   * and the given stream (stream 0 if none is passed).
   */
  HostTransferCommunicator copy(AlGpuStream_t stream = 0) {
    return HostTransferCommunicator(get_comm(), stream);
  }
};

} // namespace ht
} // namespace internal
} // namespace Al
// NOTE(review): the template parameter and argument lists in this copy
// appear to be missing (e.g. "template struct VectorType", "CUDAVector",
// "std::make_unique("). Restore them from the upstream file before
// compiling; the code tokens below are left exactly as found.

// Test-data helper for this backend: vectors are CUDAVector objects.
template
struct VectorType {
  using type = CUDAVector;

  // Generate `count` elements with another VectorType's gen_data and build a
  // CUDAVector from them using `stream`.
  static type gen_data(size_t count, AlGpuStream_t stream = 0) {
    auto&& host_data = VectorType::gen_data(count);
    CUDAVector data(host_data, stream);
    return data;
  }

  // Return the contents of `v` as produced by its copyout() method.
  static std::vector copy_to_host(const type& v) {
    return v.copyout();
  }
};

// Specialize to use the Aluminum stream pool, and size it appropriately.
template <>
struct StreamManager {
  using StreamType = AlGpuStream_t;

  // Allocate num_streams streams in the test-local pool.
  static void init(size_t num_streams) {
    get_stream_pool().allocate(num_streams);
  }
  // Clear the test-local pool.
  static void finalize() {
    get_stream_pool().clear();
  }
  // Hand out a stream from the test-local pool.
  static StreamType get_stream() {
    return get_stream_pool().get_stream();
  }

private:
  // The pool is a function-local static, so it is constructed on first use
  // and shared by all callers of this specialization.
  static Al::internal::cuda::StreamPool& get_stream_pool() {
    static Al::internal::cuda::StreamPool streams;
    return streams;
  }
};

// Specialize to create a CUDA stream with the communicator.
template <>
CommWrapper::CommWrapper(MPI_Comm mpi_comm) {
  comm_ = std::make_unique(
    mpi_comm, StreamManager::get_stream());
}

// Block the host until everything enqueued on the communicator's stream
// has finished.
template <>
void complete_operations(
  typename Al::NCCLBackend::comm_type& comm) {
  AL_FORCE_CHECK_GPU_NOSYNC(AlGpuStreamSynchronize(comm.get_stream()));
}
26 | //////////////////////////////////////////////////////////////////////////////// 27 | 28 | #pragma once 29 | 30 | #include "Al.hpp" 31 | #include "aluminum/traits/traits.hpp" 32 | 33 | #include "test_utils.hpp" 34 | #include "test_utils_mpi.hpp" 35 | #include "cuda_vector.hpp" 36 | 37 | 38 | template 39 | struct VectorType { 40 | using type = CUDAVector; 41 | 42 | static type gen_data(size_t count, AlGpuStream_t stream = 0) { 43 | auto&& host_data = VectorType::gen_data(count); 44 | CUDAVector data(host_data, stream); 45 | return data; 46 | } 47 | 48 | static std::vector copy_to_host(const type& v) { 49 | return v.copyout(); 50 | } 51 | }; 52 | 53 | // Specialize to use the Aluminum stream pool, and size it appropriately. 54 | template <> 55 | struct StreamManager { 56 | using StreamType = AlGpuStream_t; 57 | 58 | static void init(size_t num_streams) { 59 | get_stream_pool().allocate(num_streams); 60 | } 61 | static void finalize() { 62 | get_stream_pool().clear(); 63 | } 64 | static StreamType get_stream() { 65 | return get_stream_pool().get_stream(); 66 | } 67 | 68 | private: 69 | static Al::internal::cuda::StreamPool& get_stream_pool() { 70 | static Al::internal::cuda::StreamPool streams; 71 | return streams; 72 | } 73 | }; 74 | 75 | // Specialize to create a CUDA stream with the communicator. 
76 | template <> 77 | CommWrapper::CommWrapper(MPI_Comm mpi_comm) { 78 | comm_ = std::make_unique( 79 | mpi_comm, StreamManager::get_stream()); 80 | } 81 | 82 | template <> 83 | void complete_operations( 84 | typename Al::HostTransferBackend::comm_type& comm) { 85 | AL_FORCE_CHECK_GPU_NOSYNC(AlGpuStreamSynchronize(comm.get_stream())); 86 | } 87 | -------------------------------------------------------------------------------- /include/aluminum/ht/alltoall.hpp: -------------------------------------------------------------------------------- 1 | //////////////////////////////////////////////////////////////////////////////// 2 | // Copyright (c) 2018, Lawrence Livermore National Security, LLC. Produced at the 3 | // Lawrence Livermore National Laboratory in collaboration with University of 4 | // Illinois Urbana-Champaign. 5 | // 6 | // Written by the LBANN Research Team (N. Dryden, N. Maruyama, et al.) listed in 7 | // the CONTRIBUTORS file. 8 | // 9 | // LLNL-CODE-756777. 10 | // All rights reserved. 11 | // 12 | // This file is part of Aluminum GPU-aware Communication Library. For details, see 13 | // http://software.llnl.gov/Aluminum or https://github.com/LLNL/Aluminum. 14 | // 15 | // Licensed under the Apache License, Version 2.0 (the "Licensee"); you 16 | // may not use this file except in compliance with the License. You may 17 | // obtain a copy of the License at: 18 | // 19 | // http://www.apache.org/licenses/LICENSE-2.0 20 | // 21 | // Unless required by applicable law or agreed to in writing, software 22 | // distributed under the License is distributed on an "AS IS" BASIS, 23 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 24 | // implied. See the License for the specific language governing 25 | // permissions and limitations under the license. 
26 | //////////////////////////////////////////////////////////////////////////////// 27 | 28 | #pragma once 29 | 30 | #include "aluminum/cuda/cuda.hpp" 31 | #include "aluminum/ht/communicator.hpp" 32 | #include "aluminum/ht/base_state.hpp" 33 | 34 | namespace Al { 35 | namespace internal { 36 | namespace ht { 37 | 38 | template 39 | class AlltoallAlState : public HostTransferCollectiveSignalAtEndState { 40 | public: 41 | AlltoallAlState(const T* sendbuf, T* recvbuf, size_t count_, 42 | HostTransferCommunicator& comm_, AlGpuStream_t stream_) : 43 | HostTransferCollectiveSignalAtEndState(stream_), 44 | host_mem(mempool.allocate(comm_.size()*count_)), 45 | count(count_), 46 | comm(comm_.get_comm()) { 47 | // Transfer data from device to host. 48 | AL_CHECK_CUDA(AlGpuMemcpyAsync(host_mem, sendbuf, sizeof(T)*count*comm_.size(), 49 | AlGpuMemcpyDeviceToHost, stream_)); 50 | start_event.record(stream_); 51 | 52 | // Have the device wait on the host. 53 | gpu_wait.wait(stream_); 54 | 55 | // Transfer completed buffer back to device. 
56 | AL_CHECK_CUDA(AlGpuMemcpyAsync(recvbuf, host_mem, sizeof(T)*count*comm_.size(), 57 | AlGpuMemcpyHostToDevice, stream_)); 58 | end_event.record(stream_); 59 | } 60 | 61 | ~AlltoallAlState() override { 62 | mempool.release(host_mem); 63 | } 64 | 65 | std::string get_name() const override { return "HTAlltoall"; } 66 | 67 | protected: 68 | void start_mpi_op() override { 69 | AL_MPI_LARGE_COUNT_CALL(MPI_Ialltoall)( 70 | MPI_IN_PLACE, count, mpi::TypeMap(), 71 | host_mem, count, mpi::TypeMap(), comm, get_mpi_req()); 72 | } 73 | 74 | private: 75 | T* host_mem; 76 | size_t count; 77 | MPI_Comm comm; 78 | }; 79 | 80 | } // namespace ht 81 | } // namespace internal 82 | } // namespace Al 83 | -------------------------------------------------------------------------------- /include/aluminum/mpi/alltoall.hpp: -------------------------------------------------------------------------------- 1 | //////////////////////////////////////////////////////////////////////////////// 2 | // Copyright (c) 2018, Lawrence Livermore National Security, LLC. Produced at the 3 | // Lawrence Livermore National Laboratory in collaboration with University of 4 | // Illinois Urbana-Champaign. 5 | // 6 | // Written by the LBANN Research Team (N. Dryden, N. Maruyama, et al.) listed in 7 | // the CONTRIBUTORS file. 8 | // 9 | // LLNL-CODE-756777. 10 | // All rights reserved. 11 | // 12 | // This file is part of Aluminum GPU-aware Communication Library. For details, see 13 | // http://software.llnl.gov/Aluminum or https://github.com/LLNL/Aluminum. 14 | // 15 | // Licensed under the Apache License, Version 2.0 (the "Licensee"); you 16 | // may not use this file except in compliance with the License. 
You may 17 | // obtain a copy of the License at: 18 | // 19 | // http://www.apache.org/licenses/LICENSE-2.0 20 | // 21 | // Unless required by applicable law or agreed to in writing, software 22 | // distributed under the License is distributed on an "AS IS" BASIS, 23 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 24 | // implied. See the License for the specific language governing 25 | // permissions and limitations under the license. 26 | //////////////////////////////////////////////////////////////////////////////// 27 | 28 | #pragma once 29 | 30 | #include "aluminum/progress.hpp" 31 | #include "aluminum/mpi/base_state.hpp" 32 | #include "aluminum/mpi/communicator.hpp" 33 | #include "aluminum/mpi/utils.hpp" 34 | 35 | namespace Al { 36 | namespace internal { 37 | namespace mpi { 38 | 39 | template 40 | void passthrough_alltoall(const T* sendbuf, T* recvbuf, size_t count, 41 | MPICommunicator& comm) { 42 | AL_MPI_LARGE_COUNT_CALL(MPI_Alltoall)( 43 | buf_or_inplace(sendbuf), count, TypeMap(), 44 | recvbuf, count, TypeMap(), comm.get_comm()); 45 | } 46 | 47 | template 48 | class AlltoallAlState : public MPIState { 49 | public: 50 | AlltoallAlState(const T* sendbuf_, T* recvbuf_, size_t count_, 51 | MPICommunicator& comm_, AlMPIReq req_) : 52 | MPIState(req_), 53 | sendbuf(sendbuf_), recvbuf(recvbuf_), count(count_), 54 | comm(comm_.get_comm()) {} 55 | 56 | ~AlltoallAlState() override {} 57 | 58 | std::string get_name() const override { return "MPIAlltoall"; } 59 | 60 | protected: 61 | void start_mpi_op() override { 62 | AL_MPI_LARGE_COUNT_CALL(MPI_Ialltoall)( 63 | buf_or_inplace(sendbuf), count, TypeMap(), 64 | recvbuf, count, TypeMap(), comm, get_mpi_req()); 65 | } 66 | 67 | private: 68 | const T* sendbuf; 69 | T* recvbuf; 70 | size_t count; 71 | MPI_Comm comm; 72 | }; 73 | 74 | template 75 | void passthrough_nb_alltoall(const T* sendbuf, T* recvbuf, size_t count, 76 | MPICommunicator& comm, AlMPIReq& req) { 77 | req = get_free_request(); 78 
| internal::mpi::AlltoallAlState* state = 79 | new internal::mpi::AlltoallAlState( 80 | sendbuf, recvbuf, count, comm, req); 81 | get_progress_engine()->enqueue(state); 82 | } 83 | 84 | } // namespace mpi 85 | } // namespace internal 86 | } // namespace Al 87 | -------------------------------------------------------------------------------- /include/aluminum/mpi/allgather.hpp: -------------------------------------------------------------------------------- 1 | //////////////////////////////////////////////////////////////////////////////// 2 | // Copyright (c) 2018, Lawrence Livermore National Security, LLC. Produced at the 3 | // Lawrence Livermore National Laboratory in collaboration with University of 4 | // Illinois Urbana-Champaign. 5 | // 6 | // Written by the LBANN Research Team (N. Dryden, N. Maruyama, et al.) listed in 7 | // the CONTRIBUTORS file. 8 | // 9 | // LLNL-CODE-756777. 10 | // All rights reserved. 11 | // 12 | // This file is part of Aluminum GPU-aware Communication Library. For details, see 13 | // http://software.llnl.gov/Aluminum or https://github.com/LLNL/Aluminum. 14 | // 15 | // Licensed under the Apache License, Version 2.0 (the "Licensee"); you 16 | // may not use this file except in compliance with the License. You may 17 | // obtain a copy of the License at: 18 | // 19 | // http://www.apache.org/licenses/LICENSE-2.0 20 | // 21 | // Unless required by applicable law or agreed to in writing, software 22 | // distributed under the License is distributed on an "AS IS" BASIS, 23 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 24 | // implied. See the License for the specific language governing 25 | // permissions and limitations under the license. 
26 | //////////////////////////////////////////////////////////////////////////////// 27 | 28 | #pragma once 29 | 30 | #include "aluminum/progress.hpp" 31 | #include "aluminum/mpi/base_state.hpp" 32 | #include "aluminum/mpi/communicator.hpp" 33 | #include "aluminum/mpi/utils.hpp" 34 | 35 | namespace Al { 36 | namespace internal { 37 | namespace mpi { 38 | 39 | template 40 | void passthrough_allgather(const T* sendbuf, T* recvbuf, size_t count, 41 | MPICommunicator& comm) { 42 | AL_MPI_LARGE_COUNT_CALL(MPI_Allgather)( 43 | buf_or_inplace(sendbuf), count, TypeMap(), 44 | recvbuf, count, TypeMap(), comm.get_comm()); 45 | } 46 | 47 | template 48 | class AllgatherAlState : public MPIState { 49 | public: 50 | AllgatherAlState(const T* sendbuf_, T* recvbuf_, size_t count_, 51 | MPICommunicator& comm_, AlMPIReq req_) : 52 | MPIState(req_), 53 | sendbuf(sendbuf_), recvbuf(recvbuf_), count(count_), 54 | comm(comm_.get_comm()) {} 55 | 56 | ~AllgatherAlState() override {} 57 | 58 | std::string get_name() const override { return "MPIAllgather"; } 59 | 60 | protected: 61 | void start_mpi_op() override { 62 | AL_MPI_LARGE_COUNT_CALL(MPI_Iallgather)( 63 | buf_or_inplace(sendbuf), count, TypeMap(), 64 | recvbuf, count, TypeMap(), comm, get_mpi_req()); 65 | } 66 | 67 | private: 68 | const T* sendbuf; 69 | T* recvbuf; 70 | size_t count; 71 | MPI_Comm comm; 72 | }; 73 | 74 | template 75 | void passthrough_nb_allgather(const T* sendbuf, T* recvbuf, size_t count, 76 | MPICommunicator& comm, AlMPIReq& req) { 77 | req = get_free_request(); 78 | internal::mpi::AllgatherAlState* state = 79 | new internal::mpi::AllgatherAlState( 80 | sendbuf, recvbuf, count, comm, req); 81 | get_progress_engine()->enqueue(state); 82 | } 83 | 84 | } // namespace mpi 85 | } // namespace internal 86 | } // namespace Al 87 | -------------------------------------------------------------------------------- /include/aluminum/ht/bcast.hpp: 
-------------------------------------------------------------------------------- 1 | //////////////////////////////////////////////////////////////////////////////// 2 | // Copyright (c) 2018, Lawrence Livermore National Security, LLC. Produced at the 3 | // Lawrence Livermore National Laboratory in collaboration with University of 4 | // Illinois Urbana-Champaign. 5 | // 6 | // Written by the LBANN Research Team (N. Dryden, N. Maruyama, et al.) listed in 7 | // the CONTRIBUTORS file. 8 | // 9 | // LLNL-CODE-756777. 10 | // All rights reserved. 11 | // 12 | // This file is part of Aluminum GPU-aware Communication Library. For details, see 13 | // http://software.llnl.gov/Aluminum or https://github.com/LLNL/Aluminum. 14 | // 15 | // Licensed under the Apache License, Version 2.0 (the "Licensee"); you 16 | // may not use this file except in compliance with the License. You may 17 | // obtain a copy of the License at: 18 | // 19 | // http://www.apache.org/licenses/LICENSE-2.0 20 | // 21 | // Unless required by applicable law or agreed to in writing, software 22 | // distributed under the License is distributed on an "AS IS" BASIS, 23 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 24 | // implied. See the License for the specific language governing 25 | // permissions and limitations under the license. 
26 | //////////////////////////////////////////////////////////////////////////////// 27 | 28 | #pragma once 29 | 30 | #include "aluminum/cuda/cuda.hpp" 31 | #include "aluminum/ht/communicator.hpp" 32 | #include "aluminum/ht/base_state.hpp" 33 | 34 | namespace Al { 35 | namespace internal { 36 | namespace ht { 37 | 38 | template 39 | class BcastAlState : public HostTransferCollectiveSignalRootEarlyState { 40 | public: 41 | BcastAlState(T* buf, size_t count_, int root_, 42 | HostTransferCommunicator& comm_, AlGpuStream_t stream_) : 43 | HostTransferCollectiveSignalRootEarlyState(comm_.rank() == root_, stream_), 44 | host_mem(mempool.allocate(count_)), 45 | count(count_), 46 | root(root_), 47 | comm(comm_.get_comm()) { 48 | // Transfer data from device to host. 49 | if (is_root) { 50 | AL_CHECK_CUDA(AlGpuMemcpyAsync( 51 | host_mem, buf, sizeof(T)*count, 52 | AlGpuMemcpyDeviceToHost, stream_)); 53 | } 54 | start_event.record(stream_); 55 | 56 | // Have the device wait on the host. 57 | gpu_wait.wait(stream_); 58 | 59 | if (!is_root) { 60 | // Transfer completed buffer back to device. 
61 | AL_CHECK_CUDA(AlGpuMemcpyAsync(buf, host_mem, sizeof(T)*count, 62 | AlGpuMemcpyHostToDevice, stream_)); 63 | } 64 | end_event.record(stream_); 65 | } 66 | 67 | ~BcastAlState() override { 68 | mempool.release(host_mem); 69 | } 70 | 71 | std::string get_name() const override { return "HTBcast"; } 72 | 73 | protected: 74 | void start_mpi_op() override { 75 | AL_MPI_LARGE_COUNT_CALL(MPI_Ibcast)( 76 | host_mem, count, mpi::TypeMap(), root, comm, get_mpi_req()); 77 | } 78 | 79 | private: 80 | T* host_mem; 81 | size_t count; 82 | int root; 83 | MPI_Comm comm; 84 | }; 85 | 86 | } // namespace ht 87 | } // namespace internal 88 | } // namespace Al 89 | -------------------------------------------------------------------------------- /src/profiling.cpp: -------------------------------------------------------------------------------- 1 | //////////////////////////////////////////////////////////////////////////////// 2 | // Copyright (c) 2018, Lawrence Livermore National Security, LLC. Produced at the 3 | // Lawrence Livermore National Laboratory in collaboration with University of 4 | // Illinois Urbana-Champaign. 5 | // 6 | // Written by the LBANN Research Team (N. Dryden, N. Maruyama, et al.) listed in 7 | // the CONTRIBUTORS file. 8 | // 9 | // LLNL-CODE-756777. 10 | // All rights reserved. 11 | // 12 | // This file is part of Aluminum GPU-aware Communication Library. For details, see 13 | // http://software.llnl.gov/Aluminum or https://github.com/LLNL/Aluminum. 14 | // 15 | // Licensed under the Apache License, Version 2.0 (the "Licensee"); you 16 | // may not use this file except in compliance with the License. You may 17 | // obtain a copy of the License at: 18 | // 19 | // http://www.apache.org/licenses/LICENSE-2.0 20 | // 21 | // Unless required by applicable law or agreed to in writing, software 22 | // distributed under the License is distributed on an "AS IS" BASIS, 23 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 24 | // implied. 
See the License for the specific language governing 25 | // permissions and limitations under the license. 26 | //////////////////////////////////////////////////////////////////////////////// 27 | 28 | #include "aluminum/profiling.hpp" 29 | 30 | #include 31 | 32 | #include 33 | 34 | #ifdef AL_HAS_NVPROF 35 | #include 36 | #include 37 | #endif 38 | 39 | namespace Al { 40 | namespace internal { 41 | namespace profiling { 42 | 43 | void name_thread([[maybe_unused]] std::thread::native_handle_type handle, 44 | [[maybe_unused]] std::string name) { 45 | #ifdef AL_HAS_NVPROF 46 | nvtxNameOsThreadA(handle, name.c_str()); 47 | #endif 48 | #ifdef _GNU_SOURCE 49 | // Subtract 1 to account for the terminating null. 50 | std::string name_resized = name.substr(0, AL_MAX_THREAD_NAME_LEN - 1); 51 | pthread_setname_np(handle, name_resized.c_str()); 52 | #endif 53 | } 54 | 55 | #ifdef AL_HAS_CUDA 56 | void name_stream(AlGpuStream_t stream, std::string name) { 57 | #ifdef AL_HAS_NVPROF 58 | nvtxNameCudaStreamA(stream, name.c_str()); 59 | #else 60 | (void) stream; 61 | (void) name; 62 | #endif 63 | } 64 | #endif 65 | 66 | void mark(std::string desc) { 67 | (void) desc; 68 | #ifdef AL_HAS_NVPROF 69 | nvtxMarkA(desc.c_str()); 70 | #endif 71 | #ifdef AL_HAS_ROCTRACER 72 | roctxMark(desc.c_str()); 73 | #endif 74 | } 75 | 76 | ProfileRange prof_start(std::string name) { 77 | (void) name; 78 | ProfileRange range; 79 | #ifdef AL_HAS_NVPROF 80 | range.nvtx_range = nvtxRangeStartA(name.c_str()); 81 | #endif 82 | #ifdef AL_HAS_ROCTRACER 83 | range.roctx_range = roctxRangeStart(name.c_str()); 84 | #endif 85 | return range; 86 | } 87 | 88 | void prof_end(ProfileRange range) { 89 | (void) range; 90 | #ifdef AL_HAS_NVPROF 91 | nvtxRangeEnd(range.nvtx_range); 92 | #endif 93 | #ifdef AL_HAS_ROCTRACER 94 | roctxRangeStop(range.roctx_range); 95 | #endif 96 | } 97 | 98 | } // namespace profiling 99 | } // namespace internal 100 | } // namespace Al 101 | 
-------------------------------------------------------------------------------- /include/aluminum/ht/reduce_scatter.hpp: -------------------------------------------------------------------------------- 1 | //////////////////////////////////////////////////////////////////////////////// 2 | // Copyright (c) 2018, Lawrence Livermore National Security, LLC. Produced at the 3 | // Lawrence Livermore National Laboratory in collaboration with University of 4 | // Illinois Urbana-Champaign. 5 | // 6 | // Written by the LBANN Research Team (N. Dryden, N. Maruyama, et al.) listed in 7 | // the CONTRIBUTORS file. 8 | // 9 | // LLNL-CODE-756777. 10 | // All rights reserved. 11 | // 12 | // This file is part of Aluminum GPU-aware Communication Library. For details, see 13 | // http://software.llnl.gov/Aluminum or https://github.com/LLNL/Aluminum. 14 | // 15 | // Licensed under the Apache License, Version 2.0 (the "Licensee"); you 16 | // may not use this file except in compliance with the License. You may 17 | // obtain a copy of the License at: 18 | // 19 | // http://www.apache.org/licenses/LICENSE-2.0 20 | // 21 | // Unless required by applicable law or agreed to in writing, software 22 | // distributed under the License is distributed on an "AS IS" BASIS, 23 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 24 | // implied. See the License for the specific language governing 25 | // permissions and limitations under the license. 
26 | //////////////////////////////////////////////////////////////////////////////// 27 | 28 | #pragma once 29 | 30 | #include "aluminum/cuda/cuda.hpp" 31 | #include "aluminum/ht/communicator.hpp" 32 | #include "aluminum/ht/base_state.hpp" 33 | 34 | namespace Al { 35 | namespace internal { 36 | namespace ht { 37 | 38 | template 39 | class ReduceScatterAlState : public HostTransferCollectiveSignalAtEndState { 40 | public: 41 | ReduceScatterAlState(const T* sendbuf, T* recvbuf, size_t count_, 42 | ReductionOperator op_, HostTransferCommunicator& comm_, 43 | AlGpuStream_t stream_) : 44 | HostTransferCollectiveSignalAtEndState(stream_), 45 | host_mem(mempool.allocate(comm_.size()*count_)), 46 | count(count_), 47 | op(mpi::ReductionOperator2MPI_Op(op_)), 48 | comm(comm_.get_comm()) { 49 | // Transfer data from device to host. 50 | AL_CHECK_CUDA(AlGpuMemcpyAsync(host_mem, sendbuf, sizeof(T)*count*comm_.size(), 51 | AlGpuMemcpyDeviceToHost, stream_)); 52 | start_event.record(stream_); 53 | 54 | // Have the device wait on the host. 55 | gpu_wait.wait(stream_); 56 | 57 | // Transfer completed buffer back to device. 
58 | AL_CHECK_CUDA(AlGpuMemcpyAsync(recvbuf, host_mem, sizeof(T)*count, 59 | AlGpuMemcpyHostToDevice, stream_)); 60 | end_event.record(stream_); 61 | } 62 | 63 | ~ReduceScatterAlState() override { 64 | mempool.release(host_mem); 65 | } 66 | 67 | std::string get_name() const override { return "HTReduceScatter"; } 68 | 69 | protected: 70 | void start_mpi_op() override { 71 | AL_MPI_LARGE_COUNT_CALL(MPI_Ireduce_scatter_block)( 72 | MPI_IN_PLACE, host_mem, count, 73 | mpi::TypeMap(), op, comm, get_mpi_req()); 74 | } 75 | 76 | private: 77 | T* host_mem; 78 | size_t count; 79 | MPI_Op op; 80 | MPI_Comm comm; 81 | }; 82 | 83 | } // namespace ht 84 | } // namespace internal 85 | } // namespace Al 86 | -------------------------------------------------------------------------------- /include/aluminum/mpi/allreduce.hpp: -------------------------------------------------------------------------------- 1 | //////////////////////////////////////////////////////////////////////////////// 2 | // Copyright (c) 2018, Lawrence Livermore National Security, LLC. Produced at the 3 | // Lawrence Livermore National Laboratory in collaboration with University of 4 | // Illinois Urbana-Champaign. 5 | // 6 | // Written by the LBANN Research Team (N. Dryden, N. Maruyama, et al.) listed in 7 | // the CONTRIBUTORS file. 8 | // 9 | // LLNL-CODE-756777. 10 | // All rights reserved. 11 | // 12 | // This file is part of Aluminum GPU-aware Communication Library. For details, see 13 | // http://software.llnl.gov/Aluminum or https://github.com/LLNL/Aluminum. 14 | // 15 | // Licensed under the Apache License, Version 2.0 (the "Licensee"); you 16 | // may not use this file except in compliance with the License. 
You may 17 | // obtain a copy of the License at: 18 | // 19 | // http://www.apache.org/licenses/LICENSE-2.0 20 | // 21 | // Unless required by applicable law or agreed to in writing, software 22 | // distributed under the License is distributed on an "AS IS" BASIS, 23 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 24 | // implied. See the License for the specific language governing 25 | // permissions and limitations under the license. 26 | //////////////////////////////////////////////////////////////////////////////// 27 | 28 | #pragma once 29 | 30 | #include "aluminum/progress.hpp" 31 | #include "aluminum/mpi/base_state.hpp" 32 | #include "aluminum/mpi/communicator.hpp" 33 | #include "aluminum/mpi/utils.hpp" 34 | 35 | namespace Al { 36 | namespace internal { 37 | namespace mpi { 38 | 39 | template 40 | void passthrough_allreduce(const T* sendbuf, T* recvbuf, size_t count, 41 | ReductionOperator op, MPICommunicator& comm) { 42 | AL_MPI_LARGE_COUNT_CALL(MPI_Allreduce)( 43 | buf_or_inplace(sendbuf), recvbuf, count, TypeMap(), 44 | ReductionOperator2MPI_Op(op), comm.get_comm()); 45 | } 46 | 47 | template 48 | class AllreduceAlState : public MPIState { 49 | public: 50 | AllreduceAlState(const T* sendbuf_, T* recvbuf_, size_t count_, 51 | ReductionOperator op_, MPICommunicator& comm_, AlMPIReq req_) : 52 | MPIState(req_), 53 | sendbuf(sendbuf_), recvbuf(recvbuf_), count(count_), 54 | op(ReductionOperator2MPI_Op(op_)), comm(comm_.get_comm()) {} 55 | 56 | ~AllreduceAlState() override {} 57 | 58 | std::string get_name() const override { return "MPIAllreduce"; } 59 | 60 | protected: 61 | void start_mpi_op() override { 62 | AL_MPI_LARGE_COUNT_CALL(MPI_Iallreduce)( 63 | buf_or_inplace(sendbuf), recvbuf, count, TypeMap(), op, 64 | comm, get_mpi_req()); 65 | } 66 | 67 | private: 68 | const T* sendbuf; 69 | T* recvbuf; 70 | size_t count; 71 | MPI_Op op; 72 | MPI_Comm comm; 73 | }; 74 | 75 | template 76 | void passthrough_nb_allreduce(const T* sendbuf, T* 
recvbuf, size_t count, 77 | ReductionOperator op, MPICommunicator& comm, 78 | AlMPIReq& req) { 79 | req = get_free_request(); 80 | internal::mpi::AllreduceAlState* state = 81 | new internal::mpi::AllreduceAlState( 82 | sendbuf, recvbuf, count, op, comm, req); 83 | get_progress_engine()->enqueue(state); 84 | } 85 | 86 | } // namespace mpi 87 | } // namespace internal 88 | } // namespace Al 89 | -------------------------------------------------------------------------------- /cmake/AluminumConfig.cmake.in: -------------------------------------------------------------------------------- 1 | # Aluminum currently has 4 known components: MPI, NCCL, HOST_TRANSFER, 2 | # and MPI_CUDA. "MPI" is always available. The others are found if 3 | # AL_HAS_. 4 | include(CMakeFindDependencyMacro) 5 | 6 | list(APPEND CMAKE_MODULE_PATH "@CMAKE_MODULE_LOCATION@") 7 | 8 | include(${CMAKE_CURRENT_LIST_DIR}/AluminumConfigVersion.cmake) 9 | set(ALUMINUM_VERSION ${PACKAGE_VERSION}) 10 | 11 | set(_AL_KNOWN_COMPONENTS 12 | MPI 13 | NCCL 14 | HOST_TRANSFER 15 | MPI_CUDA) 16 | 17 | set(AL_HAS_CALIPER @AL_HAS_CALIPER@) 18 | set(AL_HAS_CUDA @AL_HAS_CUDA@) 19 | set(AL_HAS_ROCM @AL_HAS_ROCM@) 20 | set(AL_HAS_MPI_CUDA @AL_HAS_MPI_CUDA@) 21 | set(AL_HAS_HOST_TRANSFER @AL_HAS_HOST_TRANSFER@) 22 | set(AL_HAS_NCCL @AL_HAS_NCCL@) 23 | set(AL_HAS_ROCTRACER @AL_HAS_ROCTRACER@) 24 | 25 | set(MPI_CXX_COMPILER "@MPI_CXX_COMPILER@" CACHE FILEPATH 26 | "The MPI CXX compiler wrapper.") 27 | find_package(MPI 3.0 REQUIRED COMPONENTS CXX) 28 | 29 | set(AL_USE_HWLOC @AL_USE_HWLOC@) 30 | if (AL_USE_HWLOC) 31 | find_dependency(HWLOC) 32 | endif () 33 | find_dependency(Threads) 34 | 35 | if (AL_HAS_CALIPER) 36 | find_dependency(caliper) 37 | endif () 38 | 39 | if (AL_HAS_ROCM) 40 | # The API for Aluminum does not require that HIP language support 41 | # be enabled; it only requires the host/device libraries be found. 
42 | find_dependency(hip) 43 | find_dependency(hipcub) 44 | find_dependency(rocm_smi) 45 | 46 | set(AL_ROCM_PATH "@AL_ROCM_PATH@") 47 | 48 | if (AL_HAS_NCCL) 49 | find_dependency(rccl) 50 | endif (AL_HAS_NCCL) 51 | 52 | if (AL_HAS_ROCTRACER) 53 | find_dependency(Roctracer COMPONENTS roctx) 54 | endif () 55 | elseif (AL_HAS_CUDA) 56 | 57 | # The API for Aluminum does not require that CUDA language support 58 | # be enabled; it only requires the CUDAToolkit. 59 | find_dependency(CUDAToolkit) 60 | 61 | if (AL_HAS_NCCL) 62 | find_dependency(NCCL) 63 | endif (AL_HAS_NCCL) 64 | 65 | set(AL_HAS_NVPROF @AL_HAS_NVPROF@) 66 | if (AL_HAS_NVPROF) 67 | set(AL_HAS_EXTERNAL_NVTX @AL_HAS_EXTERNAL_NVTX@) 68 | if (AL_HAS_EXTERNAL_NVTX) 69 | find_dependency(NVTX) 70 | endif () 71 | endif () 72 | 73 | # Because of their templated nature, the CUB-based memory allocator 74 | # is public. Therefore, this dependency is public and must be met 75 | # downstream. 76 | set(AL_HAS_EXTERNAL_CUB @AL_HAS_EXTERNAL_CUB@) 77 | if (AL_HAS_EXTERNAL_CUB) 78 | find_dependency(CUB) 79 | endif () 80 | endif (AL_HAS_ROCM) 81 | 82 | @PACKAGE_INIT@ 83 | 84 | set(_TMP_INCLUDE_DIRS "@PACKAGE_INCLUDE_INSTALL_DIRS@") 85 | foreach (_DIR ${_TMP_INCLUDE_DIRS}) 86 | set_and_check(_INCLUDE_DIR "${_DIR}") 87 | list(APPEND ALUMINUM_INCLUDE_DIRS "${_INCLUDE_DIR}") 88 | endforeach (_DIR "${_TMP_INCLUDE_DIRS}") 89 | 90 | set(_TMP_LIBRARY_DIRS "@PACKAGE_LIB_INSTALL_DIR@") 91 | foreach (_DIR ${_TMP_LIBRARY_DIRS}) 92 | set_and_check(_LIBRARY_DIR "${_DIR}") 93 | list(APPEND ALUMINUM_LIBRARY_DIRS "${_LIBRARY_DIR}") 94 | endforeach (_DIR ${_TMP_LIBRARY_DIRS}) 95 | 96 | if (NOT TARGET AL::Al) 97 | include(${CMAKE_CURRENT_LIST_DIR}/AluminumTargets.cmake) 98 | endif () 99 | 100 | foreach (comp ${_AL_KNOWN_COMPONENTS}) 101 | if (AL_HAS_${comp}) 102 | set(Aluminum_${comp}_FOUND 1) 103 | endif () 104 | endforeach () 105 | 106 | check_required_components(Aluminum) 107 | 108 | set(Aluminum_LIBRARIES AL::Al) 109 | 
-------------------------------------------------------------------------------- /src/cuda/cuda.cpp: -------------------------------------------------------------------------------- 1 | //////////////////////////////////////////////////////////////////////////////// 2 | // Copyright (c) 2018, Lawrence Livermore National Security, LLC. Produced at the 3 | // Lawrence Livermore National Laboratory in collaboration with University of 4 | // Illinois Urbana-Champaign. 5 | // 6 | // Written by the LBANN Research Team (N. Dryden, N. Maruyama, et al.) listed in 7 | // the CONTRIBUTORS file. 8 | // 9 | // LLNL-CODE-756777. 10 | // All rights reserved. 11 | // 12 | // This file is part of Aluminum GPU-aware Communication Library. For details, see 13 | // http://software.llnl.gov/Aluminum or https://github.com/LLNL/Aluminum. 14 | // 15 | // Licensed under the Apache License, Version 2.0 (the "Licensee"); you 16 | // may not use this file except in compliance with the License. You may 17 | // obtain a copy of the License at: 18 | // 19 | // http://www.apache.org/licenses/LICENSE-2.0 20 | // 21 | // Unless required by applicable law or agreed to in writing, software 22 | // distributed under the License is distributed on an "AS IS" BASIS, 23 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 24 | // implied. See the License for the specific language governing 25 | // permissions and limitations under the license. 26 | //////////////////////////////////////////////////////////////////////////////// 27 | 28 | #include "aluminum/cuda/cuda.hpp" 29 | 30 | #include 31 | #include 32 | 33 | #include "aluminum/cuda/sync_memory.hpp" 34 | #include "aluminum/cuda/events.hpp" 35 | #include "aluminum/cuda/streams.hpp" 36 | #include "aluminum/utils/locked_resource_pool.hpp" 37 | #include "aluminum/tuning_params.hpp" 38 | 39 | namespace Al { 40 | namespace internal { 41 | namespace cuda { 42 | 43 | // Define resource pools. 
44 | Al::internal::LockedResourcePool sync_pool; 45 | Al::internal::LockedResourcePool event_pool; 46 | 47 | namespace { 48 | // Whether stream memory operations are supported. 49 | bool stream_mem_ops_supported = false; 50 | } 51 | 52 | void init(int&, char**&) { 53 | // Initialize internal streams. 54 | stream_pool.allocate(AL_CUDA_STREAM_POOL_SIZE); 55 | // Check whether stream memory operations are supported. 56 | int attr = 0; 57 | #if defined AL_HAS_ROCM 58 | int dev; 59 | AL_CHECK_CUDA(hipGetDevice(&dev)); 60 | AL_CHECK_CUDA(hipDeviceGetAttribute( 61 | &attr, hipDeviceAttributeCanUseStreamWaitValue, dev)); 62 | #elif defined AL_HAS_CUDA 63 | // There was an API change to these in CUDA 11.7, and the flag to check 64 | // for support changed (to have _V1) in CUDA 12. But as of CUDA 12, 65 | // these are enabled by default, so we do not need to check. 66 | #if CUDA_VERSION >= 12000 67 | attr = 1; 68 | #else 69 | CUdevice dev; 70 | AL_CHECK_CUDA_DRV(cuCtxGetDevice(&dev)); 71 | AL_CHECK_CUDA_DRV(cuDeviceGetAttribute( 72 | &attr, CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_MEM_OPS, dev)); 73 | #endif 74 | #endif 75 | stream_mem_ops_supported = attr; 76 | // Preallocate memory for synchronization operations. 77 | sync_pool.preallocate(AL_SYNC_MEM_PREALLOC); 78 | } 79 | 80 | void finalize() { 81 | sync_pool.clear(); 82 | event_pool.clear(); 83 | stream_pool.clear(); 84 | } 85 | 86 | bool stream_memory_operations_supported() { 87 | return stream_mem_ops_supported; 88 | } 89 | 90 | } // namespace cuda 91 | } // namespace internal 92 | } // namespace Al 93 | -------------------------------------------------------------------------------- /benchmark/benchmark_events.cpp: -------------------------------------------------------------------------------- 1 | /** Benchmark different event implementations. 
*/ 2 | 3 | #include 4 | 5 | #include "Al.hpp" 6 | #include "benchmark_utils.hpp" 7 | #include "wait.hpp" 8 | 9 | #if defined AL_HAS_ROCM 10 | #include 11 | #elif defined AL_HAS_CUDA 12 | #include 13 | #endif 14 | 15 | class Event { 16 | public: 17 | virtual void record(AlGpuStream_t stream) = 0; 18 | virtual bool query() = 0; 19 | }; 20 | 21 | class CudaEvent : public Event { 22 | public: 23 | CudaEvent() { 24 | AL_CHECK_CUDA( 25 | AlGpuEventCreateWithFlags(&event, AlGpuNoTimingEventFlags)); 26 | } 27 | ~CudaEvent() { 28 | AL_IGNORE_NODISCARD(AlGpuEventDestroy(event)); 29 | } 30 | void record(AlGpuStream_t stream) override { 31 | AL_CHECK_CUDA(AlGpuEventRecord(event, stream)); 32 | } 33 | bool query() override { 34 | return AlGpuEventQuery(event) == AlGpuSuccess; 35 | } 36 | private: 37 | AlGpuEvent_t event; 38 | }; 39 | 40 | class CustomEvent : public Event { 41 | public: 42 | CustomEvent() { 43 | AL_CHECK_CUDA(AlGpuMallocHost(&event, sizeof(int32_t))); 44 | __atomic_store_n(event, 1, __ATOMIC_SEQ_CST); 45 | #if defined AL_HAS_ROCM 46 | AL_CHECK_CUDA(hipHostGetDevicePointer(&dev_ptr, event, 0)); 47 | #elif defined AL_HAS_CUDA 48 | AL_CHECK_CUDA_DRV(cuMemHostGetDevicePointer(&dev_ptr, event, 0)); 49 | #endif 50 | } 51 | ~CustomEvent() { 52 | AL_IGNORE_NODISCARD(AlGpuFreeHost(event)); 53 | } 54 | void record(AlGpuStream_t stream) override { 55 | __atomic_store_n(event, 0, __ATOMIC_SEQ_CST); 56 | #if defined AL_HAS_ROCM 57 | AL_CHECK_CUDA( 58 | hipStreamWriteValue32(stream, dev_ptr, 1, 0)); 59 | #elif defined AL_HAS_CUDA 60 | AL_CHECK_CUDA_DRV( 61 | cuStreamWriteValue32(stream, dev_ptr, 1, CU_STREAM_WRITE_VALUE_DEFAULT)); 62 | #endif 63 | } 64 | bool query() override { 65 | return __atomic_load_n(event, __ATOMIC_SEQ_CST); 66 | } 67 | private: 68 | int32_t* event __attribute__((aligned(64))); 69 | #if defined AL_HAS_ROCM 70 | hipDeviceptr_t dev_ptr; 71 | #elif defined AL_HAS_CUDA 72 | CUdeviceptr dev_ptr; 73 | #endif 74 | }; 75 | 76 | void do_benchmark(AlGpuStream_t 
stream, Event& event) { 77 | const double wait_time = 0.0001; 78 | std::vector times, launch_times; 79 | for (int i = 0; i < 100000; ++i) { 80 | double launch_start = Al::get_time(); 81 | gpu_wait(wait_time, stream); 82 | event.record(stream); 83 | double start = Al::get_time(); 84 | while (!event.query()) {} 85 | double end = Al::get_time(); 86 | launch_times.push_back(start - launch_start); 87 | times.push_back(end - start); 88 | AL_CHECK_CUDA(AlGpuStreamSynchronize(stream)); 89 | } 90 | std::cout << "Launch: " << SummaryStats(launch_times) << std::endl; 91 | std::cout << "Query: " << SummaryStats(times) << std::endl; 92 | } 93 | 94 | int main(int, char**) { 95 | AL_CHECK_CUDA(AlGpuSetDevice(0)); 96 | AlGpuStream_t stream; 97 | AL_CHECK_CUDA(AlGpuStreamCreate(&stream)); 98 | { 99 | CudaEvent cuda_event; 100 | CustomEvent custom_event; 101 | std::cout << "Custom event:" << std::endl; 102 | do_benchmark(stream, custom_event); 103 | std::cout << "CUDA Event:" << std::endl; 104 | do_benchmark(stream, cuda_event); 105 | } 106 | AL_CHECK_CUDA(AlGpuStreamSynchronize(stream)); 107 | AL_CHECK_CUDA(AlGpuStreamDestroy(stream)); 108 | return 0; 109 | } 110 | -------------------------------------------------------------------------------- /include/aluminum/ht/allreduce.hpp: -------------------------------------------------------------------------------- 1 | //////////////////////////////////////////////////////////////////////////////// 2 | // Copyright (c) 2018, Lawrence Livermore National Security, LLC. Produced at the 3 | // Lawrence Livermore National Laboratory in collaboration with University of 4 | // Illinois Urbana-Champaign. 5 | // 6 | // Written by the LBANN Research Team (N. Dryden, N. Maruyama, et al.) listed in 7 | // the CONTRIBUTORS file. 8 | // 9 | // LLNL-CODE-756777. 10 | // All rights reserved. 11 | // 12 | // This file is part of Aluminum GPU-aware Communication Library. 
For details, see 13 | // http://software.llnl.gov/Aluminum or https://github.com/LLNL/Aluminum. 14 | // 15 | // Licensed under the Apache License, Version 2.0 (the "Licensee"); you 16 | // may not use this file except in compliance with the License. You may 17 | // obtain a copy of the License at: 18 | // 19 | // http://www.apache.org/licenses/LICENSE-2.0 20 | // 21 | // Unless required by applicable law or agreed to in writing, software 22 | // distributed under the License is distributed on an "AS IS" BASIS, 23 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 24 | // implied. See the License for the specific language governing 25 | // permissions and limitations under the license. 26 | //////////////////////////////////////////////////////////////////////////////// 27 | 28 | #pragma once 29 | 30 | #include "aluminum/cuda/cuda.hpp" 31 | #include "aluminum/ht/communicator.hpp" 32 | #include "aluminum/ht/base_state.hpp" 33 | 34 | namespace Al { 35 | namespace internal { 36 | namespace ht { 37 | 38 | template 39 | class AllreduceAlState : public HostTransferCollectiveSignalAtEndState { 40 | public: 41 | AllreduceAlState(const T* sendbuf, T* recvbuf, size_t count_, 42 | ReductionOperator op_, HostTransferCommunicator& comm_, 43 | AlGpuStream_t stream_) : 44 | HostTransferCollectiveSignalAtEndState(stream_), 45 | host_mem(mempool.allocate(count_)), 46 | count(count_), 47 | op(mpi::ReductionOperator2MPI_Op(op_)), 48 | comm(comm_.get_comm()) { 49 | // Transfer data from device to host. 50 | if (sendbuf != recvbuf) { 51 | AL_CHECK_CUDA(AlGpuMemcpyAsync(host_mem, sendbuf, sizeof(T)*count, 52 | AlGpuMemcpyDeviceToHost, stream_)); 53 | } else { 54 | AL_CHECK_CUDA(AlGpuMemcpyAsync(host_mem, recvbuf, sizeof(T)*count, 55 | AlGpuMemcpyDeviceToHost, stream_)); 56 | } 57 | start_event.record(stream_); 58 | 59 | // Have the device wait on the host. 60 | gpu_wait.wait(stream_); 61 | 62 | // Transfer completed buffer back to device. 
63 | AL_CHECK_CUDA(AlGpuMemcpyAsync(recvbuf, host_mem, sizeof(T)*count, 64 | AlGpuMemcpyHostToDevice, stream_)); 65 | end_event.record(stream_); 66 | } 67 | 68 | ~AllreduceAlState() { 69 | mempool.release(host_mem); 70 | } 71 | 72 | std::string get_name() const override { return "HTAllreduce"; } 73 | 74 | protected: 75 | void start_mpi_op() override { 76 | AL_MPI_LARGE_COUNT_CALL(MPI_Iallreduce)( 77 | MPI_IN_PLACE, host_mem, count, mpi::TypeMap(), 78 | op, comm, get_mpi_req()); 79 | } 80 | 81 | private: 82 | T* host_mem; 83 | size_t count; 84 | MPI_Op op; 85 | MPI_Comm comm; 86 | }; 87 | 88 | } // namespace ht 89 | } // namespace internal 90 | } // namespace Al 91 | -------------------------------------------------------------------------------- /include/aluminum/mpi/reduce_scatter.hpp: -------------------------------------------------------------------------------- 1 | //////////////////////////////////////////////////////////////////////////////// 2 | // Copyright (c) 2018, Lawrence Livermore National Security, LLC. Produced at the 3 | // Lawrence Livermore National Laboratory in collaboration with University of 4 | // Illinois Urbana-Champaign. 5 | // 6 | // Written by the LBANN Research Team (N. Dryden, N. Maruyama, et al.) listed in 7 | // the CONTRIBUTORS file. 8 | // 9 | // LLNL-CODE-756777. 10 | // All rights reserved. 11 | // 12 | // This file is part of Aluminum GPU-aware Communication Library. For details, see 13 | // http://software.llnl.gov/Aluminum or https://github.com/LLNL/Aluminum. 14 | // 15 | // Licensed under the Apache License, Version 2.0 (the "Licensee"); you 16 | // may not use this file except in compliance with the License. 
You may 17 | // obtain a copy of the License at: 18 | // 19 | // http://www.apache.org/licenses/LICENSE-2.0 20 | // 21 | // Unless required by applicable law or agreed to in writing, software 22 | // distributed under the License is distributed on an "AS IS" BASIS, 23 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 24 | // implied. See the License for the specific language governing 25 | // permissions and limitations under the license. 26 | //////////////////////////////////////////////////////////////////////////////// 27 | 28 | #pragma once 29 | 30 | #include "aluminum/progress.hpp" 31 | #include "aluminum/mpi/base_state.hpp" 32 | #include "aluminum/mpi/communicator.hpp" 33 | #include "aluminum/mpi/utils.hpp" 34 | 35 | namespace Al { 36 | namespace internal { 37 | namespace mpi { 38 | 39 | template 40 | void passthrough_reduce_scatter(const T* sendbuf, T* recvbuf, size_t count, 41 | ReductionOperator op, MPICommunicator& comm) { 42 | AL_MPI_LARGE_COUNT_CALL(MPI_Reduce_scatter_block)( 43 | buf_or_inplace(sendbuf), recvbuf, count, 44 | TypeMap(), ReductionOperator2MPI_Op(op), 45 | comm.get_comm()); 46 | } 47 | 48 | template 49 | class ReduceScatterAlState : public MPIState { 50 | public: 51 | ReduceScatterAlState(const T* sendbuf_, T* recvbuf_, size_t count_, 52 | ReductionOperator op_, MPICommunicator& comm_, 53 | AlMPIReq req_) : 54 | MPIState(req_), 55 | sendbuf(sendbuf_), recvbuf(recvbuf_), count(count_), 56 | op(ReductionOperator2MPI_Op(op_)), 57 | comm(comm_.get_comm()) {} 58 | 59 | ~ReduceScatterAlState() override {} 60 | 61 | std::string get_name() const override { return "MPIReduceScatter"; } 62 | 63 | protected: 64 | void start_mpi_op() override { 65 | AL_MPI_LARGE_COUNT_CALL(MPI_Ireduce_scatter_block)( 66 | buf_or_inplace(sendbuf), recvbuf, count, 67 | TypeMap(), op, comm, get_mpi_req()); 68 | } 69 | 70 | private: 71 | const T* sendbuf; 72 | T* recvbuf; 73 | size_t count; 74 | MPI_Op op; 75 | MPI_Comm comm; 76 | }; 77 | 78 | 
template 79 | void passthrough_nb_reduce_scatter(const T* sendbuf, T* recvbuf, size_t count, 80 | ReductionOperator op, MPICommunicator& comm, 81 | AlMPIReq& req) { 82 | req = get_free_request(); 83 | internal::mpi::ReduceScatterAlState* state = 84 | new internal::mpi::ReduceScatterAlState( 85 | sendbuf, recvbuf, count, op, comm, req); 86 | get_progress_engine()->enqueue(state); 87 | } 88 | 89 | } // namespace mpi 90 | } // namespace internal 91 | } // namespace Al 92 | -------------------------------------------------------------------------------- /include/aluminum/ht/reduce.hpp: -------------------------------------------------------------------------------- 1 | //////////////////////////////////////////////////////////////////////////////// 2 | // Copyright (c) 2018, Lawrence Livermore National Security, LLC. Produced at the 3 | // Lawrence Livermore National Laboratory in collaboration with University of 4 | // Illinois Urbana-Champaign. 5 | // 6 | // Written by the LBANN Research Team (N. Dryden, N. Maruyama, et al.) listed in 7 | // the CONTRIBUTORS file. 8 | // 9 | // LLNL-CODE-756777. 10 | // All rights reserved. 11 | // 12 | // This file is part of Aluminum GPU-aware Communication Library. For details, see 13 | // http://software.llnl.gov/Aluminum or https://github.com/LLNL/Aluminum. 14 | // 15 | // Licensed under the Apache License, Version 2.0 (the "Licensee"); you 16 | // may not use this file except in compliance with the License. You may 17 | // obtain a copy of the License at: 18 | // 19 | // http://www.apache.org/licenses/LICENSE-2.0 20 | // 21 | // Unless required by applicable law or agreed to in writing, software 22 | // distributed under the License is distributed on an "AS IS" BASIS, 23 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 24 | // implied. See the License for the specific language governing 25 | // permissions and limitations under the license. 
26 | //////////////////////////////////////////////////////////////////////////////// 27 | 28 | #pragma once 29 | 30 | #include "aluminum/cuda/cuda.hpp" 31 | #include "aluminum/ht/communicator.hpp" 32 | #include "aluminum/ht/base_state.hpp" 33 | 34 | namespace Al { 35 | namespace internal { 36 | namespace ht { 37 | 38 | template 39 | class ReduceAlState : public HostTransferCollectiveSignalNonRootEarlyState { 40 | public: 41 | ReduceAlState(const T* sendbuf, T* recvbuf, size_t count_, ReductionOperator op_, 42 | int root_, HostTransferCommunicator& comm_, AlGpuStream_t stream_) : 43 | HostTransferCollectiveSignalNonRootEarlyState(comm_.rank() == root_, stream_), 44 | host_mem(mempool.allocate(count_)), 45 | count(count_), 46 | root(root_), 47 | op(mpi::ReductionOperator2MPI_Op(op_)), 48 | comm(comm_.get_comm()) { 49 | // Transfer data from device to host. 50 | AL_CHECK_CUDA(AlGpuMemcpyAsync(host_mem, sendbuf, sizeof(T)*count, 51 | AlGpuMemcpyDeviceToHost, stream_)); 52 | start_event.record(stream_); 53 | 54 | // Have the device wait on the host. 55 | gpu_wait.wait(stream_); 56 | 57 | if (is_root) { 58 | // Transfer completed buffer back to device. 
59 | AL_CHECK_CUDA(AlGpuMemcpyAsync(recvbuf, host_mem, sizeof(T)*count, 60 | AlGpuMemcpyHostToDevice, stream_)); 61 | } 62 | end_event.record(stream_); 63 | } 64 | 65 | ~ReduceAlState() override { 66 | mempool.release(host_mem); 67 | } 68 | 69 | std::string get_name() const override { return "HTReduce"; } 70 | 71 | protected: 72 | void start_mpi_op() override { 73 | if (is_root) { 74 | AL_MPI_LARGE_COUNT_CALL(MPI_Ireduce)( 75 | MPI_IN_PLACE, host_mem, count, mpi::TypeMap(), 76 | op, root, comm, get_mpi_req()); 77 | } else { 78 | AL_MPI_LARGE_COUNT_CALL(MPI_Ireduce)( 79 | host_mem, host_mem, count, mpi::TypeMap(), 80 | op, root, comm, get_mpi_req()); 81 | } 82 | } 83 | 84 | private: 85 | T* host_mem; 86 | size_t count; 87 | int root; 88 | MPI_Op op; 89 | MPI_Comm comm; 90 | }; 91 | 92 | } // namespace ht 93 | } // namespace internal 94 | } // namespace Al 95 | -------------------------------------------------------------------------------- /benchmark/benchmark_waits.cpp: -------------------------------------------------------------------------------- 1 | /** Benchmark different wait implementations. 
*/ 2 | 3 | #include 4 | #include "Al.hpp" 5 | #include "aluminum/cuda/helper_kernels.hpp" 6 | #include "benchmark_utils.hpp" 7 | #include "wait.hpp" 8 | 9 | #if defined AL_HAS_ROCM 10 | #include 11 | #elif defined AL_HAS_CUDA 12 | #include 13 | #endif 14 | 15 | class Wait { 16 | public: 17 | Wait() { 18 | AL_CHECK_CUDA(AlGpuMallocHost(&wait_sync, sizeof(int32_t))); 19 | __atomic_store_n(wait_sync, 0, __ATOMIC_SEQ_CST); 20 | } 21 | ~Wait() { 22 | AL_IGNORE_NODISCARD(AlGpuFreeHost(wait_sync)); 23 | } 24 | virtual void wait(AlGpuStream_t stream) = 0; 25 | virtual void signal() { 26 | __atomic_store_n(wait_sync, 1, __ATOMIC_SEQ_CST); 27 | } 28 | 29 | int32_t* wait_sync __attribute__((aligned(64))); 30 | }; 31 | 32 | #if defined AL_HAS_ROCM 33 | class StreamOpWait : public Wait { 34 | public: 35 | StreamOpWait() : Wait() { 36 | AL_CHECK_CUDA(hipHostGetDevicePointer(&dev_ptr, wait_sync, 0)); 37 | } 38 | ~StreamOpWait() {} 39 | void wait(AlGpuStream_t stream) override { 40 | Al::internal::cuda::launch_wait_kernel(stream, 1, dev_ptr); 41 | } 42 | hipDeviceptr_t dev_ptr; 43 | }; 44 | #elif defined AL_HAS_CUDA 45 | class StreamOpWait : public Wait { 46 | public: 47 | StreamOpWait() : Wait() { 48 | AL_CHECK_CUDA_DRV(cuMemHostGetDevicePointer(&dev_ptr, wait_sync, 0)); 49 | } 50 | ~StreamOpWait() {} 51 | void wait(AlGpuStream_t stream) override { 52 | Al::internal::cuda::launch_wait_kernel(stream, 1, dev_ptr); 53 | } 54 | CUdeviceptr dev_ptr; 55 | }; 56 | #endif 57 | 58 | class KernelWait : public Wait { 59 | public: 60 | KernelWait() : Wait() { 61 | AL_CHECK_CUDA(AlGpuHostGetDevicePointer( 62 | reinterpret_cast(&dev_ptr), wait_sync, 0)); 63 | } 64 | ~KernelWait() {} 65 | void wait(AlGpuStream_t stream) override { 66 | Al::internal::cuda::launch_wait_kernel(stream, 1, dev_ptr); 67 | } 68 | int32_t* dev_ptr __attribute__((aligned(64))); 69 | }; 70 | 71 | void do_benchmark(AlGpuStream_t stream, Wait& wait) { 72 | AlGpuEvent_t e; 73 | AL_CHECK_CUDA(AlGpuEventCreateWithFlags(&e, 
AlGpuNoTimingEventFlags)); 74 | std::vector times, launch_times; 75 | for (int i = 0; i < 100000; ++i) { 76 | double launch_start = Al::get_time(); 77 | wait.wait(stream); 78 | double launch_end = Al::get_time(); 79 | AL_CHECK_CUDA(AlGpuEventRecord(e, stream)); 80 | double start = Al::get_time(); 81 | wait.signal(); 82 | while (AlGpuEventQuery(e) == AlGpuErrorNotReady) {} 83 | double end = Al::get_time(); 84 | launch_times.push_back(launch_end - launch_start); 85 | times.push_back(end - start); 86 | AL_CHECK_CUDA(AlGpuStreamSynchronize(stream)); 87 | } 88 | std::cout << "Launch: " << SummaryStats(launch_times) << std::endl; 89 | std::cout << "Signal: " << SummaryStats(times) << std::endl; 90 | AL_CHECK_CUDA(AlGpuEventDestroy(e)); 91 | } 92 | 93 | int main(int, char**) { 94 | AL_CHECK_CUDA(AlGpuSetDevice(0)); 95 | AlGpuStream_t stream; 96 | AL_CHECK_CUDA(AlGpuStreamCreate(&stream)); 97 | { 98 | StreamOpWait stream_op_wait; 99 | KernelWait kernel_wait; 100 | std::cout << "StreamOp wait:" << std::endl; 101 | do_benchmark(stream, stream_op_wait); 102 | std::cout << "Kernel wait:" << std::endl; 103 | do_benchmark(stream, kernel_wait); 104 | } 105 | AL_CHECK_CUDA(AlGpuStreamSynchronize(stream)); 106 | AL_CHECK_CUDA(AlGpuStreamDestroy(stream)); 107 | return 0; 108 | } 109 | -------------------------------------------------------------------------------- /include/aluminum/ht/reduce_scatterv.hpp: -------------------------------------------------------------------------------- 1 | //////////////////////////////////////////////////////////////////////////////// 2 | // Copyright (c) 2018, Lawrence Livermore National Security, LLC. Produced at the 3 | // Lawrence Livermore National Laboratory in collaboration with University of 4 | // Illinois Urbana-Champaign. 5 | // 6 | // Written by the LBANN Research Team (N. Dryden, N. Maruyama, et al.) listed in 7 | // the CONTRIBUTORS file. 8 | // 9 | // LLNL-CODE-756777. 10 | // All rights reserved. 
11 | // 12 | // This file is part of Aluminum GPU-aware Communication Library. For details, see 13 | // http://software.llnl.gov/Aluminum or https://github.com/LLNL/Aluminum. 14 | // 15 | // Licensed under the Apache License, Version 2.0 (the "Licensee"); you 16 | // may not use this file except in compliance with the License. You may 17 | // obtain a copy of the License at: 18 | // 19 | // http://www.apache.org/licenses/LICENSE-2.0 20 | // 21 | // Unless required by applicable law or agreed to in writing, software 22 | // distributed under the License is distributed on an "AS IS" BASIS, 23 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 24 | // implied. See the License for the specific language governing 25 | // permissions and limitations under the license. 26 | //////////////////////////////////////////////////////////////////////////////// 27 | 28 | #pragma once 29 | 30 | #include "aluminum/cuda/cuda.hpp" 31 | #include "aluminum/ht/communicator.hpp" 32 | #include "aluminum/ht/base_state.hpp" 33 | 34 | #include // std::accumulate 35 | 36 | namespace Al { 37 | namespace internal { 38 | namespace ht { 39 | 40 | template 41 | class ReduceScattervAlState : public HostTransferCollectiveSignalAtEndState { 42 | public: 43 | ReduceScattervAlState(const T* sendbuf, T* recvbuf, 44 | std::vector counts_, 45 | ReductionOperator op_, HostTransferCommunicator& comm_, 46 | AlGpuStream_t stream_) : 47 | HostTransferCollectiveSignalAtEndState(stream_), 48 | total_size(std::accumulate(counts_.begin(), counts_.end(), size_t{0})), 49 | host_mem(mempool.allocate(total_size)), 50 | counts(mpi::countify_size_t_vector(counts_)), 51 | op(mpi::ReductionOperator2MPI_Op(op_)), 52 | comm(comm_.get_comm()) { 53 | // Transfer data from device to host. 54 | AL_CHECK_CUDA(AlGpuMemcpyAsync(host_mem, sendbuf, sizeof(T)*total_size, 55 | AlGpuMemcpyDeviceToHost, stream_)); 56 | start_event.record(stream_); 57 | 58 | // Have the device wait on the host. 
59 | gpu_wait.wait(stream_); 60 | 61 | // Transfer completed buffer back to device. 62 | AL_CHECK_CUDA(AlGpuMemcpyAsync(recvbuf, host_mem, 63 | sizeof(T)*counts_[comm_.rank()], 64 | AlGpuMemcpyHostToDevice, stream_)); 65 | end_event.record(stream_); 66 | } 67 | 68 | ~ReduceScattervAlState() override { 69 | mempool.release(host_mem); 70 | } 71 | 72 | std::string get_name() const override { return "HTReduceScatterv"; } 73 | 74 | protected: 75 | void start_mpi_op() override { 76 | AL_MPI_LARGE_COUNT_CALL(MPI_Ireduce_scatter)( 77 | MPI_IN_PLACE, host_mem, counts.data(), 78 | mpi::TypeMap(), op, comm, get_mpi_req()); 79 | } 80 | 81 | private: 82 | size_t total_size; 83 | T* host_mem; 84 | mpi::Al_mpi_count_vector_t counts; 85 | MPI_Op op; 86 | MPI_Comm comm; 87 | }; 88 | 89 | } // namespace ht 90 | } // namespace internal 91 | } // namespace Al 92 | -------------------------------------------------------------------------------- /include/aluminum/mpi/gather.hpp: -------------------------------------------------------------------------------- 1 | //////////////////////////////////////////////////////////////////////////////// 2 | // Copyright (c) 2018, Lawrence Livermore National Security, LLC. Produced at the 3 | // Lawrence Livermore National Laboratory in collaboration with University of 4 | // Illinois Urbana-Champaign. 5 | // 6 | // Written by the LBANN Research Team (N. Dryden, N. Maruyama, et al.) listed in 7 | // the CONTRIBUTORS file. 8 | // 9 | // LLNL-CODE-756777. 10 | // All rights reserved. 11 | // 12 | // This file is part of Aluminum GPU-aware Communication Library. For details, see 13 | // http://software.llnl.gov/Aluminum or https://github.com/LLNL/Aluminum. 14 | // 15 | // Licensed under the Apache License, Version 2.0 (the "Licensee"); you 16 | // may not use this file except in compliance with the License. 
You may 17 | // obtain a copy of the License at: 18 | // 19 | // http://www.apache.org/licenses/LICENSE-2.0 20 | // 21 | // Unless required by applicable law or agreed to in writing, software 22 | // distributed under the License is distributed on an "AS IS" BASIS, 23 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 24 | // implied. See the License for the specific language governing 25 | // permissions and limitations under the license. 26 | //////////////////////////////////////////////////////////////////////////////// 27 | 28 | #pragma once 29 | 30 | #include "aluminum/progress.hpp" 31 | #include "aluminum/mpi/base_state.hpp" 32 | #include "aluminum/mpi/communicator.hpp" 33 | #include "aluminum/mpi/utils.hpp" 34 | 35 | namespace Al { 36 | namespace internal { 37 | namespace mpi { 38 | 39 | // Data is passed in recvbuf on non-root processes when in-place. 40 | template 41 | void passthrough_gather(const T* sendbuf, T* recvbuf, size_t count, int root, 42 | MPICommunicator& comm) { 43 | if (sendbuf == IN_PLACE() && comm.rank() != root) { 44 | sendbuf = recvbuf; 45 | } 46 | AL_MPI_LARGE_COUNT_CALL(MPI_Gather)( 47 | buf_or_inplace(sendbuf), count, TypeMap(), 48 | recvbuf, count, TypeMap(), root, comm.get_comm()); 49 | } 50 | 51 | template 52 | class GatherAlState : public MPIState { 53 | public: 54 | GatherAlState(const T* sendbuf_, T* recvbuf_, size_t count_, int root_, 55 | MPICommunicator& comm_, AlMPIReq req_) : 56 | MPIState(req_), 57 | sendbuf(sendbuf_), recvbuf(recvbuf_), count(count_), root(root_), 58 | comm(comm_.get_comm()), rank(comm_.rank()) {} 59 | 60 | ~GatherAlState() override {} 61 | 62 | std::string get_name() const override { return "MPIGather"; } 63 | 64 | protected: 65 | void start_mpi_op() override { 66 | if (sendbuf == IN_PLACE() && rank != root) { 67 | sendbuf = recvbuf; 68 | } 69 | AL_MPI_LARGE_COUNT_CALL(MPI_Igather)( 70 | buf_or_inplace(sendbuf), count, TypeMap(), 71 | recvbuf, count, TypeMap(), root, comm, 
get_mpi_req()); 72 | } 73 | 74 | private: 75 | const T* sendbuf; 76 | T* recvbuf; 77 | size_t count; 78 | int root; 79 | MPI_Comm comm; 80 | int rank; 81 | }; 82 | 83 | // Data is passed in recvbuf on non-root processes when in-place. 84 | template 85 | void passthrough_nb_gather(const T* sendbuf, T* recvbuf, size_t count, int root, 86 | MPICommunicator& comm, AlMPIReq& req) { 87 | req = get_free_request(); 88 | internal::mpi::GatherAlState* state = 89 | new internal::mpi::GatherAlState( 90 | sendbuf, recvbuf, count, root, comm, req); 91 | get_progress_engine()->enqueue(state); 92 | } 93 | 94 | } // namespace mpi 95 | } // namespace internal 96 | } // namespace Al 97 | -------------------------------------------------------------------------------- /include/aluminum/ht/allgather.hpp: -------------------------------------------------------------------------------- 1 | //////////////////////////////////////////////////////////////////////////////// 2 | // Copyright (c) 2018, Lawrence Livermore National Security, LLC. Produced at the 3 | // Lawrence Livermore National Laboratory in collaboration with University of 4 | // Illinois Urbana-Champaign. 5 | // 6 | // Written by the LBANN Research Team (N. Dryden, N. Maruyama, et al.) listed in 7 | // the CONTRIBUTORS file. 8 | // 9 | // LLNL-CODE-756777. 10 | // All rights reserved. 11 | // 12 | // This file is part of Aluminum GPU-aware Communication Library. For details, see 13 | // http://software.llnl.gov/Aluminum or https://github.com/LLNL/Aluminum. 14 | // 15 | // Licensed under the Apache License, Version 2.0 (the "Licensee"); you 16 | // may not use this file except in compliance with the License. 
You may 17 | // obtain a copy of the License at: 18 | // 19 | // http://www.apache.org/licenses/LICENSE-2.0 20 | // 21 | // Unless required by applicable law or agreed to in writing, software 22 | // distributed under the License is distributed on an "AS IS" BASIS, 23 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 24 | // implied. See the License for the specific language governing 25 | // permissions and limitations under the license. 26 | //////////////////////////////////////////////////////////////////////////////// 27 | 28 | #pragma once 29 | 30 | #include "aluminum/cuda/cuda.hpp" 31 | #include "aluminum/ht/communicator.hpp" 32 | #include "aluminum/ht/base_state.hpp" 33 | 34 | namespace Al { 35 | namespace internal { 36 | namespace ht { 37 | 38 | template 39 | class AllgatherAlState : public HostTransferCollectiveSignalAtEndState { 40 | public: 41 | AllgatherAlState(const T* sendbuf, T* recvbuf, size_t count_, 42 | HostTransferCommunicator& comm_, AlGpuStream_t stream_) : 43 | HostTransferCollectiveSignalAtEndState(stream_), 44 | host_mem(mempool.allocate(comm_.size()*count_)), 45 | count(count_), 46 | comm(comm_.get_comm()) { 47 | // Transfer data from device to host. 48 | if (sendbuf == recvbuf) { 49 | AL_CHECK_CUDA(AlGpuMemcpyAsync(host_mem + comm_.rank()*count, 50 | sendbuf + comm_.rank()*count, 51 | sizeof(T)*count, AlGpuMemcpyDeviceToHost, 52 | stream_)); 53 | } else { 54 | AL_CHECK_CUDA(AlGpuMemcpyAsync(host_mem + comm_.rank()*count, 55 | sendbuf, sizeof(T)*count, 56 | AlGpuMemcpyDeviceToHost, stream_)); 57 | } 58 | start_event.record(stream_); 59 | 60 | // Have the device wait on the host. 61 | gpu_wait.wait(stream_); 62 | 63 | // Transfer completed buffer back to device. 
64 | AL_CHECK_CUDA(AlGpuMemcpyAsync(recvbuf, host_mem, 65 | sizeof(T)*count_*comm_.size(), 66 | AlGpuMemcpyHostToDevice, stream_)); 67 | end_event.record(stream_); 68 | } 69 | 70 | ~AllgatherAlState() override { 71 | mempool.release(host_mem); 72 | } 73 | 74 | std::string get_name() const override { return "HTAllgather"; } 75 | 76 | protected: 77 | void start_mpi_op() override { 78 | AL_MPI_LARGE_COUNT_CALL(MPI_Iallgather)( 79 | MPI_IN_PLACE, count, mpi::TypeMap(), 80 | host_mem, count, mpi::TypeMap(), comm, get_mpi_req()); 81 | } 82 | 83 | private: 84 | T* host_mem; 85 | size_t count; 86 | MPI_Comm comm; 87 | }; 88 | 89 | } // namespace ht 90 | } // namespace internal 91 | } // namespace Al 92 | -------------------------------------------------------------------------------- /include/aluminum/cuda/streams.hpp: -------------------------------------------------------------------------------- 1 | //////////////////////////////////////////////////////////////////////////////// 2 | // Copyright (c) 2018, Lawrence Livermore National Security, LLC. Produced at the 3 | // Lawrence Livermore National Laboratory in collaboration with University of 4 | // Illinois Urbana-Champaign. 5 | // 6 | // Written by the LBANN Research Team (N. Dryden, N. Maruyama, et al.) listed in 7 | // the CONTRIBUTORS file. 8 | // 9 | // LLNL-CODE-756777. 10 | // All rights reserved. 11 | // 12 | // This file is part of Aluminum GPU-aware Communication Library. For details, see 13 | // http://software.llnl.gov/Aluminum or https://github.com/LLNL/Aluminum. 14 | // 15 | // Licensed under the Apache License, Version 2.0 (the "Licensee"); you 16 | // may not use this file except in compliance with the License. 
You may 17 | // obtain a copy of the License at: 18 | // 19 | // http://www.apache.org/licenses/LICENSE-2.0 20 | // 21 | // Unless required by applicable law or agreed to in writing, software 22 | // distributed under the License is distributed on an "AS IS" BASIS, 23 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 24 | // implied. See the License for the specific language governing 25 | // permissions and limitations under the license. 26 | //////////////////////////////////////////////////////////////////////////////// 27 | 28 | #pragma once 29 | 30 | #include 31 | 32 | #include 33 | #include 34 | #include 35 | #include 36 | #include 37 | 38 | #if defined AL_HAS_ROCM 39 | #include 40 | #elif defined AL_HAS_CUDA 41 | #include 42 | #endif 43 | 44 | namespace Al { 45 | namespace internal { 46 | namespace cuda { 47 | 48 | /** 49 | * Manages a set of CUDA streams, accessed in round-robin order. 50 | * 51 | * Streams are either default priority or high priority. 52 | * 53 | * It is safe for multiple threads to call get_stream concurrently. 54 | */ 55 | class StreamPool { 56 | public: 57 | /** Create pool with num_streams default and high priority streams. */ 58 | StreamPool(size_t num_streams = 0); 59 | ~StreamPool(); 60 | 61 | /** Explicitly allocate streams. */ 62 | void allocate(size_t num_streams); 63 | 64 | /** Delete all streams in the pool. */ 65 | void clear(); 66 | 67 | /** Return a default-priority CUDA stream. */ 68 | AlGpuStream_t get_stream(); 69 | 70 | /** 71 | * Return a high-priority CUDA stream. 72 | * 73 | * If high-priority streams are not supported, returns a default-priority 74 | * stream. 75 | */ 76 | AlGpuStream_t get_high_priority_stream(); 77 | 78 | /** 79 | * Replace all streams in the pool with streams from an external source. 80 | * 81 | * Streams provided this way will not be freed by Aluminum. 82 | * 83 | * @param stream_getter Return the next stream to use in the pool. 
This 84 | * may be called an arbitrary number of times. It takes a boolean argument 85 | * for whether to return a default (false) or high (true) priority stream. 86 | */ 87 | void replace_streams(std::function stream_getter); 88 | 89 | private: 90 | std::vector default_streams; 91 | std::atomic default_idx{0}; 92 | std::vector high_priority_streams; 93 | std::atomic high_priority_idx{0}; 94 | /** Whether streams were replaced; we do not free these streams. */ 95 | bool external_streams = false; 96 | }; 97 | 98 | /** Default internal stream pool for Aluminum. */ 99 | extern StreamPool stream_pool; 100 | 101 | } // namespace cuda 102 | } // namespace internal 103 | } // namespace Al 104 | -------------------------------------------------------------------------------- /src/trace.cpp: -------------------------------------------------------------------------------- 1 | //////////////////////////////////////////////////////////////////////////////// 2 | // Copyright (c) 2018, Lawrence Livermore National Security, LLC. Produced at the 3 | // Lawrence Livermore National Laboratory in collaboration with University of 4 | // Illinois Urbana-Champaign. 5 | // 6 | // Written by the LBANN Research Team (N. Dryden, N. Maruyama, et al.) listed in 7 | // the CONTRIBUTORS file. 8 | // 9 | // LLNL-CODE-756777. 10 | // All rights reserved. 11 | // 12 | // This file is part of Aluminum GPU-aware Communication Library. For details, see 13 | // http://software.llnl.gov/Aluminum or https://github.com/LLNL/Aluminum. 14 | // 15 | // Licensed under the Apache License, Version 2.0 (the "Licensee"); you 16 | // may not use this file except in compliance with the License. 
You may 17 | // obtain a copy of the License at: 18 | // 19 | // http://www.apache.org/licenses/LICENSE-2.0 20 | // 21 | // Unless required by applicable law or agreed to in writing, software 22 | // distributed under the License is distributed on an "AS IS" BASIS, 23 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 24 | // implied. See the License for the specific language governing 25 | // permissions and limitations under the license. 26 | //////////////////////////////////////////////////////////////////////////////// 27 | 28 | #include 29 | #include "aluminum/trace.hpp" 30 | 31 | #include 32 | #include 33 | 34 | #include 35 | #include 36 | #ifdef AL_THREAD_MULTIPLE 37 | #include 38 | #endif 39 | 40 | #include "aluminum/state.hpp" 41 | 42 | namespace Al { 43 | namespace internal { 44 | namespace trace { 45 | 46 | namespace { 47 | #ifdef AL_THREAD_MULTIPLE 48 | std::mutex log_mutex; 49 | #endif 50 | std::vector trace_log; 51 | std::vector pe_trace_log; 52 | } 53 | 54 | void save_trace_entry(std::string entry, bool progress) { 55 | #ifdef AL_THREAD_MULTIPLE 56 | std::lock_guard lock(log_mutex); 57 | #endif 58 | if (progress) { 59 | pe_trace_log.push_back(std::move(entry)); 60 | } else { 61 | trace_log.push_back(std::move(entry)); 62 | } 63 | } 64 | 65 | void record_pe_start(const AlState& state) { 66 | #ifdef AL_TRACE 67 | std::stringstream ss; 68 | ss << get_time() << ": PE START " 69 | << state.get_name() << " " 70 | << state.get_desc(); 71 | save_trace_entry(ss.str(), true); 72 | #else 73 | (void) state; 74 | #endif 75 | } 76 | 77 | void record_pe_done(const AlState& state) { 78 | #ifdef AL_TRACE 79 | std::stringstream ss; 80 | ss << get_time() << ": PE DONE " 81 | << state.get_name() << " " 82 | << state.get_desc(); 83 | save_trace_entry(ss.str(), true); 84 | #else 85 | (void) state; 86 | #endif 87 | } 88 | 89 | std::ostream& write_trace_log(std::ostream& os) { 90 | #ifdef AL_TRACE 91 | #ifdef AL_THREAD_MULTIPLE 92 | std::lock_guard 
lock(log_mutex); 93 | #endif 94 | os << "Trace:\n"; 95 | for (const auto& entry : trace_log) os << entry << "\n"; 96 | os << "Progress engine trace:\n"; 97 | for (const auto& entry : pe_trace_log) os << entry << "\n"; 98 | return os; 99 | #else 100 | return os; 101 | #endif 102 | } 103 | 104 | void write_trace_to_file() { 105 | #ifdef AL_TRACE 106 | char hostname[HOST_NAME_MAX]; 107 | gethostname(hostname, HOST_NAME_MAX); 108 | pid_t pid = getpid(); 109 | std::string filename = std::string(hostname) + "." + std::to_string(pid) 110 | + ".trace.txt"; 111 | std::ofstream trace_file(filename); 112 | write_trace_log(trace_file); 113 | #endif 114 | } 115 | 116 | } // namespace trace 117 | } // namespace internal 118 | } // namespace Al 119 | -------------------------------------------------------------------------------- /include/aluminum/utils/locked_resource_pool.hpp: -------------------------------------------------------------------------------- 1 | //////////////////////////////////////////////////////////////////////////////// 2 | // Copyright (c) 2018, Lawrence Livermore National Security, LLC. Produced at the 3 | // Lawrence Livermore National Laboratory in collaboration with University of 4 | // Illinois Urbana-Champaign. 5 | // 6 | // Written by the LBANN Research Team (N. Dryden, N. Maruyama, et al.) listed in 7 | // the CONTRIBUTORS file. 8 | // 9 | // LLNL-CODE-756777. 10 | // All rights reserved. 11 | // 12 | // This file is part of Aluminum GPU-aware Communication Library. For details, see 13 | // http://software.llnl.gov/Aluminum or https://github.com/LLNL/Aluminum. 14 | // 15 | // Licensed under the Apache License, Version 2.0 (the "Licensee"); you 16 | // may not use this file except in compliance with the License. 
You may 17 | // obtain a copy of the License at: 18 | // 19 | // http://www.apache.org/licenses/LICENSE-2.0 20 | // 21 | // Unless required by applicable law or agreed to in writing, software 22 | // distributed under the License is distributed on an "AS IS" BASIS, 23 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 24 | // implied. See the License for the specific language governing 25 | // permissions and limitations under the license. 26 | //////////////////////////////////////////////////////////////////////////////// 27 | 28 | #pragma once 29 | 30 | #include 31 | 32 | #include 33 | #include 34 | #include 35 | 36 | namespace Al { 37 | namespace internal { 38 | 39 | /** 40 | * Provides thread-safe access to a set of identical resources. 41 | * 42 | * These resources could be things like a fixed-size chunk of memory. 43 | * 44 | * ResourceAllocator must provide allocate and deallocate methods to 45 | * create and destroy (resp.) the managed resource (T). 46 | * 47 | * LockedResourcePool will guarantee that each instance of the 48 | * ResourceAllocator will be accessed by only a single thread at a 49 | * time. If additional locking is necessary for correctness, the 50 | * ResourceAllocator must provide it. 51 | */ 52 | template 53 | class LockedResourcePool { 54 | public: 55 | /** Initialize the resource pool. */ 56 | LockedResourcePool(){}; 57 | 58 | ~LockedResourcePool() { 59 | clear(); 60 | } 61 | 62 | /** Preallocate this many instances of the resource. */ 63 | void preallocate(size_t prealloc) { 64 | for (size_t i = 0; i < prealloc; ++i) { 65 | resources.push_back(allocator.allocate()); 66 | } 67 | } 68 | 69 | /** Get one instance of the resource. */ 70 | T get() { 71 | std::lock_guard lg(lock); 72 | if (resources.empty()) { 73 | return allocator.allocate(); 74 | } else { 75 | T resource = resources.back(); 76 | resources.pop_back(); 77 | return resource; 78 | } 79 | } 80 | 81 | /** Return an instance of the resource to the pool. 
*/ 82 | void release(T resource) { 83 | std::lock_guard lg(lock); 84 | resources.push_back(resource); 85 | } 86 | 87 | /** Clear all instances left in the pool. */ 88 | void clear() { 89 | std::lock_guard lg(lock); 90 | for (auto&& resource : resources) { 91 | allocator.deallocate(resource); 92 | } 93 | resources.clear(); 94 | } 95 | 96 | private: 97 | /** Protects access to allocator and resource. */ 98 | std::mutex lock; 99 | /** Allocator for the resource. */ 100 | ResourceAllocator allocator; 101 | /** Currently available resources. */ 102 | std::vector resources; 103 | }; 104 | 105 | } // namespace internal 106 | } // namespace Al 107 | -------------------------------------------------------------------------------- /include/aluminum/mpi/communicator.hpp: -------------------------------------------------------------------------------- 1 | //////////////////////////////////////////////////////////////////////////////// 2 | // Copyright (c) 2018, Lawrence Livermore National Security, LLC. Produced at the 3 | // Lawrence Livermore National Laboratory in collaboration with University of 4 | // Illinois Urbana-Champaign. 5 | // 6 | // Written by the LBANN Research Team (N. Dryden, N. Maruyama, et al.) listed in 7 | // the CONTRIBUTORS file. 8 | // 9 | // LLNL-CODE-756777. 10 | // All rights reserved. 11 | // 12 | // This file is part of Aluminum GPU-aware Communication Library. For details, see 13 | // http://software.llnl.gov/Aluminum or https://github.com/LLNL/Aluminum. 14 | // 15 | // Licensed under the Apache License, Version 2.0 (the "Licensee"); you 16 | // may not use this file except in compliance with the License. 
You may 17 | // obtain a copy of the License at: 18 | // 19 | // http://www.apache.org/licenses/LICENSE-2.0 20 | // 21 | // Unless required by applicable law or agreed to in writing, software 22 | // distributed under the License is distributed on an "AS IS" BASIS, 23 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 24 | // implied. See the License for the specific language governing 25 | // permissions and limitations under the license. 26 | //////////////////////////////////////////////////////////////////////////////// 27 | 28 | #pragma once 29 | 30 | #include 31 | #include "aluminum/mpi_comm_and_stream_wrapper.hpp" 32 | 33 | namespace Al { 34 | namespace internal { 35 | namespace mpi { 36 | 37 | int get_max_tag(); 38 | 39 | // Define the tag that point-to-point operations will use. 40 | constexpr int pt2pt_tag = 2; 41 | 42 | // Forward-declare. 43 | class MPICommunicator; 44 | 45 | /** Retrieve Aluminum's world MPI communicator. */ 46 | const MPICommunicator& get_world_comm(); 47 | 48 | /** Communicator for MPI-based operations. */ 49 | class MPICommunicator : public MPICommAndStreamWrapper { 50 | public: 51 | /** Default constructor; use Aluminum's world. */ 52 | MPICommunicator() : MPICommunicator(get_world_comm().get_comm()) {} 53 | /** 54 | * Use a particular MPI communicator and stream. 55 | * 56 | * The MPI backend currently ignores streams. 57 | */ 58 | MPICommunicator(MPI_Comm comm_, int = 0) : 59 | MPICommAndStreamWrapper(comm_, 0) {} 60 | /** Cannot copy this. */ 61 | MPICommunicator(const MPICommunicator& other) = delete; 62 | /** Default move constructor. */ 63 | MPICommunicator(MPICommunicator&& other) = default; 64 | /** Cannot copy this. */ 65 | MPICommunicator& operator=(MPICommunicator& other) = delete; 66 | /** Default move assignment operator. */ 67 | MPICommunicator& operator=(MPICommunicator&& other) = default; 68 | ~MPICommunicator() {} 69 | 70 | /** Create a new MPICommunicator with the same processes. 
*/ 71 | MPICommunicator copy(int stream = 0) const { 72 | return MPICommunicator(get_comm(), stream); 73 | } 74 | 75 | /** 76 | * Return the next free tag on this communicator. 77 | * 78 | * TODO: This is meant for internal use and should be moved / eliminted. 79 | */ 80 | int get_free_tag() { 81 | int tag = free_tag++; 82 | if (free_tag >= internal::mpi::get_max_tag() 83 | || free_tag < starting_free_tag) { 84 | free_tag = starting_free_tag; 85 | } 86 | return tag; 87 | } 88 | 89 | private: 90 | /** 91 | * Starting tag to use for non-blocking operations. 92 | * No other operations should use any tag >= to this one. 93 | */ 94 | static constexpr int starting_free_tag = 10; 95 | /** Free tag for communication. */ 96 | int free_tag = starting_free_tag; 97 | }; 98 | 99 | } // namespace mpi 100 | } // namespace internal 101 | } // namespace Al 102 | -------------------------------------------------------------------------------- /include/aluminum/mpi/scatter.hpp: -------------------------------------------------------------------------------- 1 | //////////////////////////////////////////////////////////////////////////////// 2 | // Copyright (c) 2018, Lawrence Livermore National Security, LLC. Produced at the 3 | // Lawrence Livermore National Laboratory in collaboration with University of 4 | // Illinois Urbana-Champaign. 5 | // 6 | // Written by the LBANN Research Team (N. Dryden, N. Maruyama, et al.) listed in 7 | // the CONTRIBUTORS file. 8 | // 9 | // LLNL-CODE-756777. 10 | // All rights reserved. 11 | // 12 | // This file is part of Aluminum GPU-aware Communication Library. For details, see 13 | // http://software.llnl.gov/Aluminum or https://github.com/LLNL/Aluminum. 14 | // 15 | // Licensed under the Apache License, Version 2.0 (the "Licensee"); you 16 | // may not use this file except in compliance with the License. 
You may 17 | // obtain a copy of the License at: 18 | // 19 | // http://www.apache.org/licenses/LICENSE-2.0 20 | // 21 | // Unless required by applicable law or agreed to in writing, software 22 | // distributed under the License is distributed on an "AS IS" BASIS, 23 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 24 | // implied. See the License for the specific language governing 25 | // permissions and limitations under the license. 26 | //////////////////////////////////////////////////////////////////////////////// 27 | 28 | #pragma once 29 | 30 | #include "aluminum/progress.hpp" 31 | #include "aluminum/mpi/base_state.hpp" 32 | #include "aluminum/mpi/communicator.hpp" 33 | #include "aluminum/mpi/utils.hpp" 34 | 35 | namespace Al { 36 | namespace internal { 37 | namespace mpi { 38 | 39 | // Data is passed in recvbuf on root processes when in-place. 40 | template 41 | void passthrough_scatter(const T* sendbuf, T* recvbuf, size_t count, int root, 42 | MPICommunicator& comm) { 43 | if (sendbuf == IN_PLACE() && comm.rank() == root) { 44 | sendbuf = recvbuf; 45 | recvbuf = IN_PLACE(); 46 | } 47 | AL_MPI_LARGE_COUNT_CALL(MPI_Scatter)( 48 | sendbuf, count, TypeMap(), 49 | buf_or_inplace(recvbuf), count, TypeMap(), 50 | root, comm.get_comm()); 51 | } 52 | 53 | template 54 | class ScatterAlState : public MPIState { 55 | public: 56 | ScatterAlState(const T* sendbuf_, T* recvbuf_, size_t count_, int root_, 57 | MPICommunicator& comm_, AlMPIReq req_) : 58 | MPIState(req_), 59 | sendbuf(sendbuf_), recvbuf(recvbuf_), count(count_), root(root_), 60 | comm(comm_.get_comm()), rank(comm_.rank()) {} 61 | 62 | ~ScatterAlState() override {} 63 | 64 | std::string get_name() const override { return "MPIScatter"; } 65 | 66 | protected: 67 | void start_mpi_op() override { 68 | if (sendbuf == IN_PLACE() && rank == root) { 69 | sendbuf = recvbuf; 70 | recvbuf = IN_PLACE(); 71 | } 72 | AL_MPI_LARGE_COUNT_CALL(MPI_Iscatter)( 73 | sendbuf, count, TypeMap(), 74 | 
buf_or_inplace(recvbuf), count, TypeMap(), 75 | root, comm, get_mpi_req()); 76 | } 77 | 78 | private: 79 | const T* sendbuf; 80 | T* recvbuf; 81 | size_t count; 82 | int root; 83 | MPI_Comm comm; 84 | int rank; 85 | }; 86 | 87 | // When in-place, it is recvbuf that uses IN_PLACE. 88 | template 89 | void passthrough_nb_scatter(const T* sendbuf, T* recvbuf, size_t count, 90 | int root, MPICommunicator& comm, AlMPIReq& req) { 91 | req = get_free_request(); 92 | internal::mpi::ScatterAlState* state = 93 | new internal::mpi::ScatterAlState( 94 | sendbuf, recvbuf, count, root, comm, req); 95 | get_progress_engine()->enqueue(state); 96 | } 97 | 98 | } // namespace mpi 99 | } // namespace internal 100 | } // namespace Al 101 | -------------------------------------------------------------------------------- /cmake/tuning_params.hpp.in: -------------------------------------------------------------------------------- 1 | //////////////////////////////////////////////////////////////////////////////// 2 | // Copyright (c) 2018, Lawrence Livermore National Security, LLC. Produced at the 3 | // Lawrence Livermore National Laboratory in collaboration with University of 4 | // Illinois Urbana-Champaign. 5 | // 6 | // Written by the LBANN Research Team (N. Dryden, N. Maruyama, et al.) listed in 7 | // the CONTRIBUTORS file. 8 | // 9 | // LLNL-CODE-756777. 10 | // All rights reserved. 11 | // 12 | // This file is part of Aluminum GPU-aware Communication Library. For details, see 13 | // http://software.llnl.gov/Aluminum or https://github.com/LLNL/Aluminum. 14 | // 15 | // Licensed under the Apache License, Version 2.0 (the "Licensee"); you 16 | // may not use this file except in compliance with the License. 
You may 17 | // obtain a copy of the License at: 18 | // 19 | // http://www.apache.org/licenses/LICENSE-2.0 20 | // 21 | // Unless required by applicable law or agreed to in writing, software 22 | // distributed under the License is distributed on an "AS IS" BASIS, 23 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 24 | // implied. See the License for the specific language governing 25 | // permissions and limitations under the license. 26 | //////////////////////////////////////////////////////////////////////////////// 27 | 28 | /** 29 | * These are used to tune various algorithmic choices. 30 | * You should probably choose them based on benchmarks for your particular 31 | * configuration. 32 | */ 33 | #pragma once 34 | 35 | /** Number of concurrent operations the progress engine will perform. */ 36 | #define AL_PE_NUM_CONCURRENT_OPS @AL_PE_NUM_CONCURRENT_OPS@ 37 | /** Max number of streams the progress engine supports. */ 38 | #define AL_PE_NUM_STREAMS @AL_PE_NUM_STREAMS@ 39 | /** Max number of pipeline stages the progress engine supports. */ 40 | #define AL_PE_NUM_PIPELINE_STAGES @AL_PE_NUM_PIPELINE_STAGES@ 41 | /** Max number of entries in each stream's input queue. */ 42 | #define AL_PE_INPUT_QUEUE_SIZE @AL_PE_INPUT_QUEUE_SIZE@ 43 | /** 44 | * Whether to have a default stream entry for the progress engine 45 | * added automatically. 46 | * 47 | * This makes sense when using MPI, but not so when using the 48 | * host-transfer backend, which does not use the default stream. 49 | */ 50 | #cmakedefine AL_PE_ADD_DEFAULT_STREAM 51 | /** 52 | * Whether to use a thread-local cache to map streams to input queues 53 | * for the progress engine. 54 | * 55 | * If you expect to have only a small number of streams, using a cache 56 | * is unlikely to help, since searching it will take as long as 57 | * searching the actual list. 
58 | */ 59 | #cmakedefine AL_PE_STREAM_QUEUE_CACHE 60 | 61 | /** 62 | * Whether to delay starting the progress engine until it is actually 63 | * needed. This results in a one-time penalty on the first call to an 64 | * operation that uses the progress engine, but only a quick check 65 | * thereafter. 66 | */ 67 | #cmakedefine AL_PE_START_ON_DEMAND 68 | 69 | /** Amount of sync object memory to preallocate in the pool. */ 70 | #define AL_SYNC_MEM_PREALLOC @AL_SYNC_MEM_PREALLOC@ 71 | 72 | /** 73 | * Cache line size in bytes. 74 | * 75 | * On x86 this is usually 64. On POWER this is 128. On A64FX this is 256. 76 | */ 77 | #define AL_CACHE_LINE_SIZE @AL_CACHE_LINE_SIZE@ 78 | 79 | /** 80 | * Minimum size in bytes to avoid destructive interference. 81 | * 82 | * This is generally AL_CACHE_LINE_SIZE, except on x86, where it should 83 | * be twice the cache line size, because Intel processors can fetch 84 | * two adjacent cache lines (see Intel Optimization Manual, 3.7.3). 85 | */ 86 | #define AL_DESTRUCTIVE_INTERFERENCE_SIZE @AL_DESTRUCTIVE_INTERFERENCE_SIZE@ 87 | 88 | /** Number of CUDA streams in the default stream pool. */ 89 | #define AL_CUDA_STREAM_POOL_SIZE @AL_CUDA_STREAM_POOL_SIZE@ 90 | -------------------------------------------------------------------------------- /include/aluminum/base.hpp: -------------------------------------------------------------------------------- 1 | //////////////////////////////////////////////////////////////////////////////// 2 | // Copyright (c) 2018, Lawrence Livermore National Security, LLC. Produced at the 3 | // Lawrence Livermore National Laboratory in collaboration with University of 4 | // Illinois Urbana-Champaign. 5 | // 6 | // Written by the LBANN Research Team (N. Dryden, N. Maruyama, et al.) listed in 7 | // the CONTRIBUTORS file. 8 | // 9 | // LLNL-CODE-756777. 10 | // All rights reserved. 11 | // 12 | // This file is part of Aluminum GPU-aware Communication Library. 
// For details, see
// http://software.llnl.gov/Aluminum or https://github.com/LLNL/Aluminum.
//
// Licensed under the Apache License, Version 2.0 (the "Licensee"); you
// may not use this file except in compliance with the License. You may
// obtain a copy of the License at:
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the license.
////////////////////////////////////////////////////////////////////////////////

#pragma once

// NOTE(review): the <...> include targets and template tokens in this file
// were lost in extraction and have been restored from how the code uses them;
// confirm against the upstream file.
#include <climits>
#include <exception>
#include <iostream>
#include <sstream>
#include <string>
#include <utility>

/** HOST_NAME_MAX is a linux only define */
#ifndef HOST_NAME_MAX
# if defined(_POSIX_HOST_NAME_MAX)
#  define HOST_NAME_MAX _POSIX_HOST_NAME_MAX
# elif defined(MAXHOSTNAMELEN)
#  define HOST_NAME_MAX MAXHOSTNAMELEN
# else
/* Last resort so users of HOST_NAME_MAX always compile; 255 is the value
 * POSIX assigns to _POSIX_HOST_NAME_MAX. */
#  define HOST_NAME_MAX 255
# endif
#endif /* HOST_NAME_MAX */

/** Intentionally ignore results of [[nodiscard]] functions. */
#define AL_IGNORE_NODISCARD(fcall) static_cast<void>((fcall))

namespace Al {

/**
 * Base Aluminum exception class.
 *
 * what() returns "<file>:<line> - <message>".
 */
class al_exception : public std::exception {
public:
  /**
   * @param m Exception message.
   * @param f File the exception occurred in.
   * @param l Line the exception occurred at.
   */
  al_exception(std::string m, std::string f, const int l) :
    msg(std::move(m)), file(std::move(f)), line(l),
    // Members are initialized in declaration order, so file, line and msg
    // are already set when err is built.
    err(file + ":" + std::to_string(line) + " - " + msg) {}
  const char* what() const noexcept override {
    return err.c_str();
  }
private:
  /** Exception message. */
  const std::string msg;
  /** File exception occurred in. */
  const std::string file;
  /** Line exception occurred at. */
  const int line;
  /** Constructed error message. */
  std::string err;
};

/**
 * Construct a single string from concatenating all arguments.
 *
 * Arguments must support operator<<. With no arguments the result is empty.
 */
template <typename... Args>
std::string build_string(Args&&... args) {
  std::ostringstream oss;
  (oss << ... << args);
  return oss.str();
}

/** Throw an Aluminum exception. */
#define throw_al_exception(...)                                         \
  throw Al::al_exception(Al::build_string(__VA_ARGS__), __FILE__, __LINE__)

/**
 * Output an error and then terminate Aluminum.
 *
 * This is primarily useful for handling errors in destructors.
 */
#define terminate_al(...)                       \
  do {                                          \
    std::cerr << __FILE__ << ":"                \
              << __LINE__ << " - "              \
              << Al::build_string(__VA_ARGS__)  \
              << std::endl;                     \
    std::terminate();                           \
  } while (0)

/** Predefined reduction operations. */
enum class ReductionOperator {
  sum, prod, min, max, lor, land, lxor, bor, band, bxor, avg
};

} // namespace Al
// --------------------------------------------------------------------------------
// /include/aluminum/mpi/reduce.hpp:
// --------------------------------------------------------------------------------
////////////////////////////////////////////////////////////////////////////////
// Copyright (c) 2018, Lawrence Livermore National Security, LLC. Produced at the
// Lawrence Livermore National Laboratory in collaboration with University of
// Illinois Urbana-Champaign.
//
// Written by the LBANN Research Team (N. Dryden, N. Maruyama, et al.) listed in
// the CONTRIBUTORS file.
//
// LLNL-CODE-756777.
// All rights reserved.
//
// This file is part of Aluminum GPU-aware Communication Library. For details, see
// http://software.llnl.gov/Aluminum or https://github.com/LLNL/Aluminum.
14 | // 15 | // Licensed under the Apache License, Version 2.0 (the "Licensee"); you 16 | // may not use this file except in compliance with the License. You may 17 | // obtain a copy of the License at: 18 | // 19 | // http://www.apache.org/licenses/LICENSE-2.0 20 | // 21 | // Unless required by applicable law or agreed to in writing, software 22 | // distributed under the License is distributed on an "AS IS" BASIS, 23 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 24 | // implied. See the License for the specific language governing 25 | // permissions and limitations under the license. 26 | //////////////////////////////////////////////////////////////////////////////// 27 | 28 | #pragma once 29 | 30 | #include "aluminum/progress.hpp" 31 | #include "aluminum/mpi/base_state.hpp" 32 | #include "aluminum/mpi/communicator.hpp" 33 | #include "aluminum/mpi/utils.hpp" 34 | 35 | namespace Al { 36 | namespace internal { 37 | namespace mpi { 38 | 39 | // Data is passed on recvbuf on non-root processes when in-place. 
40 | template 41 | void passthrough_reduce(const T* sendbuf, T* recvbuf, size_t count, 42 | ReductionOperator op, int root, MPICommunicator& comm) { 43 | if (sendbuf == IN_PLACE() && comm.rank() != root) { 44 | sendbuf = recvbuf; 45 | } 46 | AL_MPI_LARGE_COUNT_CALL(MPI_Reduce)( 47 | buf_or_inplace(sendbuf), recvbuf, count, TypeMap(), 48 | ReductionOperator2MPI_Op(op), root, comm.get_comm()); 49 | } 50 | 51 | template 52 | class ReduceAlState : public MPIState { 53 | public: 54 | ReduceAlState(const T* sendbuf_, T* recvbuf_, size_t count_, ReductionOperator op_, 55 | int root_, MPICommunicator& comm_, AlMPIReq req_) : 56 | MPIState(req_), 57 | sendbuf(sendbuf_), recvbuf(recvbuf_), count(count_), 58 | op(ReductionOperator2MPI_Op(op_)), root(root_), 59 | comm(comm_.get_comm()), rank(comm_.rank()) {} 60 | 61 | ~ReduceAlState() override {} 62 | 63 | std::string get_name() const override { return "MPIReduce"; } 64 | 65 | protected: 66 | void start_mpi_op() override { 67 | if (sendbuf == IN_PLACE() && rank != root) { 68 | sendbuf = recvbuf; 69 | } 70 | AL_MPI_LARGE_COUNT_CALL(MPI_Ireduce)( 71 | buf_or_inplace(sendbuf), recvbuf, count, TypeMap(), 72 | op, root, comm, get_mpi_req()); 73 | } 74 | 75 | private: 76 | const T* sendbuf; 77 | T* recvbuf; 78 | size_t count; 79 | MPI_Op op; 80 | int root; 81 | MPI_Comm comm; 82 | int rank; 83 | }; 84 | 85 | // Data is passed in recvbuf on non-root processes when in-place. 
86 | template 87 | void passthrough_nb_reduce(const T* sendbuf, T* recvbuf, size_t count, 88 | ReductionOperator op, int root, 89 | MPICommunicator& comm, AlMPIReq& req) { 90 | req = get_free_request(); 91 | internal::mpi::ReduceAlState* state = 92 | new internal::mpi::ReduceAlState( 93 | sendbuf, recvbuf, count, op, root, comm, req); 94 | get_progress_engine()->enqueue(state); 95 | } 96 | 97 | } // namespace mpi 98 | } // namespace internal 99 | } // namespace Al 100 | -------------------------------------------------------------------------------- /src/cuda/gpu_status_flag.cpp: -------------------------------------------------------------------------------- 1 | //////////////////////////////////////////////////////////////////////////////// 2 | // Copyright (c) 2018, Lawrence Livermore National Security, LLC. Produced at the 3 | // Lawrence Livermore National Laboratory in collaboration with University of 4 | // Illinois Urbana-Champaign. 5 | // 6 | // Written by the LBANN Research Team (N. Dryden, N. Maruyama, et al.) listed in 7 | // the CONTRIBUTORS file. 8 | // 9 | // LLNL-CODE-756777. 10 | // All rights reserved. 11 | // 12 | // This file is part of Aluminum GPU-aware Communication Library. For details, see 13 | // http://software.llnl.gov/Aluminum or https://github.com/LLNL/Aluminum. 14 | // 15 | // Licensed under the Apache License, Version 2.0 (the "Licensee"); you 16 | // may not use this file except in compliance with the License. You may 17 | // obtain a copy of the License at: 18 | // 19 | // http://www.apache.org/licenses/LICENSE-2.0 20 | // 21 | // Unless required by applicable law or agreed to in writing, software 22 | // distributed under the License is distributed on an "AS IS" BASIS, 23 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 24 | // implied. See the License for the specific language governing 25 | // permissions and limitations under the license. 
26 | //////////////////////////////////////////////////////////////////////////////// 27 | 28 | #include "aluminum/cuda/gpu_status_flag.hpp" 29 | 30 | #include "aluminum/cuda/cuda.hpp" 31 | #include "aluminum/cuda/sync_memory.hpp" 32 | #include "aluminum/cuda/events.hpp" 33 | 34 | namespace Al { 35 | namespace internal { 36 | namespace cuda { 37 | 38 | GPUStatusFlag::GPUStatusFlag() { 39 | if (stream_memory_operations_supported()) { 40 | stream_mem.sync_event = sync_pool.get(); 41 | // Initialize to completed to match CUDA event semantics. 42 | __atomic_store_n(stream_mem.sync_event, 1, __ATOMIC_SEQ_CST); 43 | #if defined AL_HAS_ROCM 44 | AL_CHECK_CUDA(hipHostGetDevicePointer( 45 | &stream_mem.sync_event_dev_ptr, 46 | stream_mem.sync_event, 0)); 47 | #elif defined AL_HAS_CUDA 48 | AL_CHECK_CUDA_DRV(cuMemHostGetDevicePointer( 49 | &stream_mem.sync_event_dev_ptr, 50 | stream_mem.sync_event, 0)); 51 | #endif 52 | } else { 53 | plain_event = event_pool.get(); 54 | } 55 | } 56 | 57 | GPUStatusFlag::~GPUStatusFlag() { 58 | if (stream_memory_operations_supported()) { 59 | sync_pool.release(stream_mem.sync_event); 60 | } else { 61 | event_pool.release(plain_event); 62 | } 63 | } 64 | 65 | void GPUStatusFlag::record(AlGpuStream_t stream) { 66 | if (stream_memory_operations_supported()) { 67 | // We cannot use std::atomic because we need the actual address of 68 | // the memory. 
69 | __atomic_store_n(stream_mem.sync_event, 0, __ATOMIC_SEQ_CST); 70 | #if defined AL_HAS_ROCM 71 | AL_CHECK_CUDA(hipStreamWriteValue32( 72 | stream, stream_mem.sync_event_dev_ptr, 1, 73 | 0)); 74 | #elif defined AL_HAS_CUDA 75 | AL_CHECK_CUDA_DRV(cuStreamWriteValue32( 76 | stream, stream_mem.sync_event_dev_ptr, 1, 77 | CU_STREAM_WRITE_VALUE_DEFAULT)); 78 | #endif 79 | } else { 80 | AL_CHECK_CUDA(AlGpuEventRecord(plain_event, stream)); 81 | } 82 | } 83 | 84 | bool GPUStatusFlag::query() { 85 | if (stream_memory_operations_supported()) { 86 | return __atomic_load_n(stream_mem.sync_event, __ATOMIC_SEQ_CST); 87 | } else { 88 | auto r = AlGpuEventQuery(plain_event); 89 | if (r == AlGpuSuccess) { 90 | return true; 91 | } else if (r != AlGpuErrorNotReady) { 92 | AL_CHECK_CUDA(r); 93 | return false; // Never reached. 94 | } else { 95 | return false; 96 | } 97 | } 98 | } 99 | 100 | } // namespace cuda 101 | } // namespace internal 102 | } // namespace Al 103 | -------------------------------------------------------------------------------- /include/aluminum/mpi/reduce_scatterv.hpp: -------------------------------------------------------------------------------- 1 | //////////////////////////////////////////////////////////////////////////////// 2 | // Copyright (c) 2018, Lawrence Livermore National Security, LLC. Produced at the 3 | // Lawrence Livermore National Laboratory in collaboration with University of 4 | // Illinois Urbana-Champaign. 5 | // 6 | // Written by the LBANN Research Team (N. Dryden, N. Maruyama, et al.) listed in 7 | // the CONTRIBUTORS file. 8 | // 9 | // LLNL-CODE-756777. 10 | // All rights reserved. 11 | // 12 | // This file is part of Aluminum GPU-aware Communication Library. For details, see 13 | // http://software.llnl.gov/Aluminum or https://github.com/LLNL/Aluminum. 14 | // 15 | // Licensed under the Apache License, Version 2.0 (the "Licensee"); you 16 | // may not use this file except in compliance with the License. 
You may 17 | // obtain a copy of the License at: 18 | // 19 | // http://www.apache.org/licenses/LICENSE-2.0 20 | // 21 | // Unless required by applicable law or agreed to in writing, software 22 | // distributed under the License is distributed on an "AS IS" BASIS, 23 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 24 | // implied. See the License for the specific language governing 25 | // permissions and limitations under the license. 26 | //////////////////////////////////////////////////////////////////////////////// 27 | 28 | #pragma once 29 | 30 | #include "aluminum/progress.hpp" 31 | #include "aluminum/mpi/base_state.hpp" 32 | #include "aluminum/mpi/communicator.hpp" 33 | #include "aluminum/mpi/utils.hpp" 34 | 35 | namespace Al { 36 | namespace internal { 37 | namespace mpi { 38 | 39 | template 40 | void passthrough_reduce_scatterv(const T* sendbuf, T* recvbuf, 41 | std::vector counts, 42 | ReductionOperator op, 43 | MPICommunicator& comm) { 44 | auto counts_ = countify_size_t_vector(counts); 45 | AL_MPI_LARGE_COUNT_CALL(MPI_Reduce_scatter)( 46 | buf_or_inplace(sendbuf), recvbuf, 47 | counts_.data(), TypeMap(), 48 | ReductionOperator2MPI_Op(op), comm.get_comm()); 49 | } 50 | 51 | template 52 | class ReduceScattervAlState : public MPIState { 53 | public: 54 | ReduceScattervAlState(const T* sendbuf_, T* recvbuf_, 55 | std::vector counts_, 56 | ReductionOperator op_, MPICommunicator& comm_, 57 | AlMPIReq req_) : 58 | MPIState(req_), 59 | sendbuf(sendbuf_), recvbuf(recvbuf_), 60 | counts(countify_size_t_vector(counts_)), 61 | op(ReductionOperator2MPI_Op(op_)), 62 | comm(comm_.get_comm()) {} 63 | 64 | ~ReduceScattervAlState() override {} 65 | 66 | std::string get_name() const override { return "MPIReduceScatterv"; } 67 | 68 | protected: 69 | void start_mpi_op() override { 70 | AL_MPI_LARGE_COUNT_CALL(MPI_Ireduce_scatter)( 71 | buf_or_inplace(sendbuf), recvbuf, 72 | counts.data(), TypeMap(), op, comm, 73 | get_mpi_req()); 74 | } 75 | 76 | 
private: 77 | const T* sendbuf; 78 | T* recvbuf; 79 | Al_mpi_count_vector_t counts; 80 | MPI_Op op; 81 | MPI_Comm comm; 82 | }; 83 | 84 | template 85 | void passthrough_nb_reduce_scatterv(const T* sendbuf, T* recvbuf, 86 | std::vector counts, 87 | ReductionOperator op, 88 | MPICommunicator& comm, 89 | AlMPIReq& req) { 90 | req = get_free_request(); 91 | internal::mpi::ReduceScattervAlState* state = 92 | new internal::mpi::ReduceScattervAlState( 93 | sendbuf, recvbuf, counts, op, comm, req); 94 | get_progress_engine()->enqueue(state); 95 | } 96 | 97 | } // namespace mpi 98 | } // namespace internal 99 | } // namespace Al 100 | -------------------------------------------------------------------------------- /include/aluminum/trace.hpp: -------------------------------------------------------------------------------- 1 | //////////////////////////////////////////////////////////////////////////////// 2 | // Copyright (c) 2018, Lawrence Livermore National Security, LLC. Produced at the 3 | // Lawrence Livermore National Laboratory in collaboration with University of 4 | // Illinois Urbana-Champaign. 5 | // 6 | // Written by the LBANN Research Team (N. Dryden, N. Maruyama, et al.) listed in 7 | // the CONTRIBUTORS file. 8 | // 9 | // LLNL-CODE-756777. 10 | // All rights reserved. 11 | // 12 | // This file is part of Aluminum GPU-aware Communication Library. For details, see 13 | // http://software.llnl.gov/Aluminum or https://github.com/LLNL/Aluminum. 14 | // 15 | // Licensed under the Apache License, Version 2.0 (the "Licensee"); you 16 | // may not use this file except in compliance with the License. You may 17 | // obtain a copy of the License at: 18 | // 19 | // http://www.apache.org/licenses/LICENSE-2.0 20 | // 21 | // Unless required by applicable law or agreed to in writing, software 22 | // distributed under the License is distributed on an "AS IS" BASIS, 23 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 24 | // implied. 
See the License for the specific language governing 25 | // permissions and limitations under the license. 26 | //////////////////////////////////////////////////////////////////////////////// 27 | 28 | #pragma once 29 | 30 | #include 31 | #include 32 | #include 33 | #include 34 | #include 35 | #include 36 | 37 | #include 38 | #include "aluminum/utils/utils.hpp" 39 | 40 | namespace Al { 41 | namespace internal { 42 | // Forward declaration. 43 | class AlState; 44 | namespace trace { 45 | 46 | #ifdef AL_TRACE 47 | // Need to be able to print vectors. 48 | template 49 | std::ostream& operator<<(std::ostream& os, const std::vector& v) { 50 | os << "["; 51 | if (!v.empty()) { 52 | for (size_t i = 0; i < v.size() - 1; ++i) { 53 | os << v[i] << ", "; 54 | } 55 | os << v[v.size() - 1]; 56 | } 57 | os << "]"; 58 | return os; 59 | } 60 | #endif 61 | 62 | /** 63 | * Save entry to the trace log. 64 | * progress is whether this comes from the progress engine, which is recorded 65 | * separately. 66 | */ 67 | void save_trace_entry(std::string entry, bool progress = false); 68 | 69 | /** Record an operation to the trace log. */ 70 | template 71 | #ifdef AL_TRACE 72 | void record_op(std::string const& op, 73 | typename Backend::comm_type const& comm, 74 | Args&&... args) { 75 | std::ostringstream ss; 76 | ss << static_cast(get_time()) << ": " 77 | << Backend::Name() << " " 78 | << comm.get_stream() << " " 79 | << typeid(T).name() << " " 80 | << op << " " 81 | << comm.rank() << " " << comm.size() << " "; 82 | 83 | // See: 84 | // https://stackoverflow.com/questions/27375089/what-is-the-easiest-way-to-print-a-variadic-parameter-pack-using-stdostream 85 | using expander = int[]; 86 | (void) expander{0, (void(ss << " " << std::forward(args)), 0)...}; 87 | save_trace_entry(ss.str(), false); 88 | } 89 | #else // AL_TRACE 90 | void record_op(std::string const&, 91 | typename Backend::comm_type const&, 92 | Args&&...) 
{ 93 | } 94 | #endif // AL_TRACE 95 | 96 | /** Record a progress engine operation start to the trace log. */ 97 | void record_pe_start(const AlState& state); 98 | /** Record a progress engine operation completion to the trace log. */ 99 | void record_pe_done(const AlState& state); 100 | 101 | /** Write trace logs to os. */ 102 | std::ostream& write_trace_log(std::ostream& os); 103 | /** Write trace logs to hostname.pid.trace.txt. */ 104 | void write_trace_to_file(); 105 | 106 | } // namespace trace 107 | } // namespace internal 108 | } // namespace Al 109 | --------------------------------------------------------------------------------