├── docs
    ├── favicon.ico
    ├── requirements.txt
    ├── api.rst
    ├── Makefile
    ├── make.bat
    ├── start.rst
    └── conf.py
├── .gitmodules
├── include
    └── aluminum
    │   ├── traits
    │       ├── CMakeLists.txt
    │       └── traits.hpp
    │   ├── cuda
    │       ├── CMakeLists.txt
    │       ├── helper_kernels.hpp
    │       ├── events.hpp
    │       ├── sync_memory.hpp
    │       ├── gpu_wait.hpp
    │       ├── cuda_mempool.hpp
    │       ├── gpu_status_flag.hpp
    │       └── streams.hpp
    │   ├── utils
    │       ├── CMakeLists.txt
    │       ├── utils.hpp
    │       └── locked_resource_pool.hpp
    │   ├── mpi_cuda
    │       ├── CMakeLists.txt
    │       ├── rma_null.hpp
    │       ├── communicator.hpp
    │       └── rma_self.hpp
    │   ├── ht
    │       ├── CMakeLists.txt
    │       ├── barrier.hpp
    │       ├── communicator.hpp
    │       ├── alltoall.hpp
    │       ├── bcast.hpp
    │       ├── reduce_scatter.hpp
    │       ├── allreduce.hpp
    │       ├── reduce.hpp
    │       ├── reduce_scatterv.hpp
    │       └── allgather.hpp
    │   ├── mpi
    │       ├── CMakeLists.txt
    │       ├── barrier.hpp
    │       ├── base_state.hpp
    │       ├── bcast.hpp
    │       ├── alltoall.hpp
    │       ├── allgather.hpp
    │       ├── allreduce.hpp
    │       ├── reduce_scatter.hpp
    │       ├── gather.hpp
    │       ├── communicator.hpp
    │       ├── scatter.hpp
    │       ├── reduce.hpp
    │       └── reduce_scatterv.hpp
    │   ├── CMakeLists.txt
    │   ├── internal.hpp
    │   ├── datatypes.hpp
    │   ├── profiling.hpp
    │   ├── base.hpp
    │   └── trace.hpp
├── CONTRIBUTORS
├── util
    ├── CMakeLists.txt
    └── al_info.cpp
├── src
    ├── mpi_cuda
    │   ├── CMakeLists.txt
    │   └── communicator.cpp
    ├── cuda
    │   ├── CMakeLists.txt
    │   ├── helper_kernels.cu
    │   ├── gpu_wait.cpp
    │   ├── cuda.cpp
    │   └── gpu_status_flag.cpp
    ├── mempool.cpp
    ├── mpi_cuda_impl.cpp
    ├── ht_impl.cpp
    ├── profiling.cpp
    └── trace.cpp
├── .readthedocs.yaml
├── .gitignore
├── examples
    ├── CMakeLists.txt
    └── README.md
├── cmake
    ├── FindNVTX.cmake
    ├── FindCUB.cmake
    ├── FindHWLOC.cmake
    ├── FindNCCL.cmake
    ├── FindRoctracer.cmake
    ├── AluminumConfig.cmake.in
    └── tuning_params.hpp.in
├── LICENSE
├── CITATION.cff
├── al.svg
├── test
    ├── test_utils_mpi.hpp
    ├── test_stream_mem_ops.cpp
    ├── CMakeLists.txt
    ├── test_utils_mpi_cuda.hpp
    ├── test_utils_nccl.hpp
    └── test_utils_ht.hpp
├── benchmark
    ├── wait.hpp
    ├── benchmark_utils_mpi.hpp
    ├── CMakeLists.txt
    ├── benchmark_utils_nccl.hpp
    ├── benchmark_utils_ht.hpp
    ├── wait.cu
    ├── benchmark_events.cpp
    └── benchmark_waits.cpp
└── README.md


/docs/favicon.ico:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/llnl/Aluminum/HEAD/docs/favicon.ico


--------------------------------------------------------------------------------
/docs/requirements.txt:
--------------------------------------------------------------------------------
1 | sphinx==6.1.3
2 | sphinx-rtd-theme==1.2.0
3 | breathe==4.35.0
4 | 


--------------------------------------------------------------------------------
/docs/api.rst:
--------------------------------------------------------------------------------
1 | Aluminum API Documentation
2 | ==========================
3 | 
4 | .. doxygenfile:: Al.hpp
5 | 


--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
1 | [submodule "third_party/cxxopts"]
2 | 	path = third_party/cxxopts
3 | 	url = https://github.com/jarro2783/cxxopts.git
4 | 


--------------------------------------------------------------------------------
/include/aluminum/traits/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | set_source_path(THIS_DIR_HEADERS
 2 |   ht_traits.hpp
 3 |   mpi_traits.hpp
 4 |   nccl_traits.hpp
 5 |   traits_base.hpp
 6 |   traits.hpp
 7 | )
 8 | 
 9 | set(THIS_DIR_HEADERS_TO_INSTALL ${THIS_DIR_HEADERS})
10 | 
11 | set(HEADERS ${HEADERS} ${THIS_DIR_HEADERS} PARENT_SCOPE)
12 | 


--------------------------------------------------------------------------------
/include/aluminum/cuda/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | set_source_path(THIS_DIR_HEADERS
 2 |   cuda.hpp
 3 |   cuda_mempool.hpp
 4 |   events.hpp
 5 |   gpu_status_flag.hpp
 6 |   gpu_wait.hpp
 7 |   helper_kernels.hpp
 8 |   streams.hpp
 9 |   sync_memory.hpp
10 |   )
11 | 
12 | set(HEADERS "${HEADERS}" "${THIS_DIR_HEADERS}" PARENT_SCOPE)
13 | 


--------------------------------------------------------------------------------
/include/aluminum/utils/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | set_source_path(THIS_DIR_HEADERS
 2 |   caching_allocator.hpp
 3 |   locked_resource_pool.hpp
 4 |   meta.hpp
 5 |   mpsc_queue.hpp
 6 |   spsc_queue.hpp
 7 |   utils.hpp
 8 |   )
 9 | 
10 | set(THIS_DIR_HEADERS_TO_INSTALL ${THIS_DIR_HEADERS})
11 | 
12 | # Propagate the files up the tree
13 | set(HEADERS ${HEADERS} ${THIS_DIR_HEADERS} PARENT_SCOPE)
14 | 


--------------------------------------------------------------------------------
/CONTRIBUTORS:
--------------------------------------------------------------------------------
 1 | LLNL Team
 2 |   Nikoli Dryden <dryden1@llnl.gov>
 3 |   Naoya Maruyama <maruyama3@llnl.gov>
 4 |   Tim Moon <moon13@llnl.gov>
 5 |   Tom Benson <benson31@llnl.gov>
 6 |   Andy Yoo <yoo2@llnl.gov>
 7 |   Brian Van Essen <vanessen1@llnl.gov>
 8 |   Corey McNeish <mcneish1@llnl.gov>
 9 | 
10 | UIUC Team
11 |   Nikoli Dryden <dryden2@illinois.edu>
12 |   Marc Snir <snir@illinois.edu>
13 | 


--------------------------------------------------------------------------------
/util/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | # This is a simple target. Just needs Al_config.hpp in its include path.
 2 | add_executable(al_info al_info.cpp)
 3 | target_link_libraries(al_info PRIVATE Al)
 4 | install(
 5 |   TARGETS al_info
 6 |   RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
 7 |   ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
 8 |   LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
 9 |   INCLUDES DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
10 | 


--------------------------------------------------------------------------------
/include/aluminum/mpi_cuda/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | set_source_path(THIS_DIR_HEADERS
 2 |   communicator.hpp
 3 |   util.hpp
 4 |   )
 5 | 
 6 | set_source_path(THIS_DIR_CUDA_RMA_HEADERS
 7 |   rma.hpp
 8 |   rma_ipc.hpp
 9 |   rma_null.hpp
10 |   rma_self.hpp
11 |   )
12 | 
13 | if (AL_HAS_MPI_CUDA_RMA)
14 |   list(APPEND THIS_DIR_HEADERS "${THIS_DIR_CUDA_RMA_HEADERS}")
15 | endif ()
16 | 
17 | # Propagate the files up the tree
18 | set(HEADERS ${HEADERS} ${THIS_DIR_HEADERS} PARENT_SCOPE)
19 | 


--------------------------------------------------------------------------------
/src/mpi_cuda/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | set_source_path(THIS_DIR_CXX_SOURCES
 2 |   communicator.cpp
 3 |   )
 4 | 
 5 | set_source_path(THIS_DIR_CUDA_RMA_CXX_SOURCES
 6 |   rma.cpp
 7 |   )
 8 | 
 9 | if (AL_HAS_MPI_CUDA_RMA)
10 |   list(APPEND THIS_DIR_CXX_SOURCES "${THIS_DIR_CUDA_RMA_CXX_SOURCES}")
11 | endif ()
12 | 
13 | # Propagate the files up the tree
14 | set(CXX_SOURCES ${CXX_SOURCES} ${THIS_DIR_CXX_SOURCES} PARENT_SCOPE)
15 | set(CUDA_SOURCES ${CUDA_SOURCES} ${THIS_DIR_CUDA_SOURCES} PARENT_SCOPE)
16 | 


--------------------------------------------------------------------------------
/src/cuda/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | set_source_path(THIS_DIR_CUDA_CXX_SOURCES
 2 |   cuda.cpp
 3 |   gpu_status_flag.cpp
 4 |   gpu_wait.cpp
 5 |   streams.cpp
 6 |   )
 7 | set_source_path(THIS_DIR_CUDA_SOURCES
 8 |   helper_kernels.cu
 9 |   )
10 | 
11 | if (AL_HAS_CUDA)
12 |   list(APPEND THIS_DIR_CXX_SOURCES "${THIS_DIR_CUDA_CXX_SOURCES}")
13 | endif ()
14 | 
15 | set(CXX_SOURCES "${CXX_SOURCES}" "${THIS_DIR_CXX_SOURCES}" PARENT_SCOPE)
16 | set(CUDA_SOURCES "${CUDA_SOURCES}" "${THIS_DIR_CUDA_SOURCES}" PARENT_SCOPE)
17 | 


--------------------------------------------------------------------------------
/include/aluminum/ht/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | set_source_path(THIS_DIR_HEADERS
 2 |   allgather.hpp
 3 |   allgatherv.hpp
 4 |   allreduce.hpp
 5 |   alltoall.hpp
 6 |   alltoallv.hpp
 7 |   barrier.hpp
 8 |   base_state.hpp
 9 |   bcast.hpp
10 |   communicator.hpp
11 |   gather.hpp
12 |   gatherv.hpp
13 |   multisendrecv.hpp
14 |   reduce.hpp
15 |   reduce_scatter.hpp
16 |   reduce_scatterv.hpp
17 |   scatter.hpp
18 |   scatterv.hpp
19 |   pt2pt.hpp
20 |   )
21 | 
22 | # Propagate the files up the tree
23 | set(HEADERS ${HEADERS} ${THIS_DIR_HEADERS} PARENT_SCOPE)
24 | 


--------------------------------------------------------------------------------
/.readthedocs.yaml:
--------------------------------------------------------------------------------
 1 | # .readthedocs.yaml
 2 | # Read the Docs configuration file
 3 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details
 4 | 
 5 | # Required
 6 | version: 2
 7 | 
 8 | # Set the version of Python and other tools you might need
 9 | build:
10 |   os: ubuntu-22.04
11 |   tools:
12 |     python: "3.11"
13 | 
14 | # Build documentation in the docs/ directory with Sphinx
15 | sphinx:
16 |    configuration: docs/conf.py
17 | 
18 | # Optionally declare the Python requirements required to build your docs
19 | python:
20 |    install:
21 |    - requirements: docs/requirements.txt
22 | 


--------------------------------------------------------------------------------
/include/aluminum/mpi/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | set_source_path(THIS_DIR_HEADERS
 2 |   allgather.hpp
 3 |   allgatherv.hpp
 4 |   allreduce.hpp
 5 |   alltoall.hpp
 6 |   alltoallv.hpp
 7 |   base_state.hpp
 8 |   barrier.hpp
 9 |   bcast.hpp
10 |   communicator.hpp
11 |   gather.hpp
12 |   gatherv.hpp
13 |   multisendrecv.hpp
14 |   reduce.hpp
15 |   reduce_scatter.hpp
16 |   reduce_scatterv.hpp
17 |   scatter.hpp
18 |   scatterv.hpp
19 |   pt2pt.hpp
20 |   utils.hpp
21 |   )
22 | 
23 | set(THIS_DIR_HEADERS_TO_INSTALL ${THIS_DIR_HEADERS})
24 | 
25 | # Propagate the files up the tree
26 | set(HEADERS ${HEADERS} ${THIS_DIR_HEADERS} PARENT_SCOPE)
27 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | # Prerequisites
 2 | *.d
 3 | 
 4 | # Compiled Object files
 5 | *.slo
 6 | *.lo
 7 | *.o
 8 | *.obj
 9 | 
10 | # Precompiled Headers
11 | *.gch
12 | *.pch
13 | 
14 | # Compiled Dynamic libraries
15 | *.so
16 | *.dylib
17 | *.dll
18 | 
19 | # Fortran module files
20 | *.mod
21 | *.smod
22 | 
23 | # Compiled Static libraries
24 | *.lai
25 | *.la
26 | *.a
27 | *.lib
28 | 
29 | # Executables
30 | *.exe
31 | *.out
32 | *.app
33 | 
34 | # Emacs stuff
35 | *~
36 | \#*\#
37 | 
38 | # Python stuff
39 | __pycache__/
40 | *.py[cod]
41 | *$py.class
42 | 
43 | # Dev stuff
44 | build*/
45 | compile_commands.json
46 | .ccls-root
47 | .ccls-cache
48 | 


--------------------------------------------------------------------------------
/docs/Makefile:
--------------------------------------------------------------------------------
 1 | # Minimal makefile for Sphinx documentation
 2 | #
 3 | 
 4 | # You can set these variables from the command line, and also
 5 | # from the environment for the first two.
 6 | SPHINXOPTS    ?=
 7 | SPHINXBUILD   ?= sphinx-build
 8 | SOURCEDIR     = .
 9 | BUILDDIR      = _build
10 | 
11 | # Put it first so that "make" without argument is like "make help".
12 | help:
13 | 	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
14 | 
15 | .PHONY: help Makefile
16 | 
17 | # Catch-all target: route all unknown targets to Sphinx using the new
18 | # "make mode" option.  $(O) is meant as a shortcut for $(SPHINXOPTS).
19 | %: Makefile
20 | 	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
21 | 


--------------------------------------------------------------------------------
/examples/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | cmake_minimum_required(VERSION 3.21)
 2 | 
 3 | project(AlExamples
 4 |   VERSION 0.0.1
 5 |   DESCRIPTION "Examples of how to use Aluminum"
 6 |   LANGUAGES CXX
 7 | )
 8 | 
 9 | if (NOT CMAKE_BUILD_TYPE)
10 |   set(CMAKE_BUILD_TYPE Release CACHE STRING "Configuration type" FORCE)
11 | endif ()
12 | 
13 | find_package(Aluminum 1.0.0 REQUIRED)
14 | 
15 | set(SOURCES
16 |   hello_world.cpp
17 |   allreduce.cpp
18 |   pingpong.cpp
19 | )
20 | 
21 | foreach (src ${SOURCES})
22 |   string(REPLACE ".cpp" "" _tmp_exe_name "${src}")
23 |   get_filename_component(_exe_name "${_tmp_exe_name}" NAME)
24 |   add_executable(${_exe_name} ${src})
25 |   target_link_libraries(${_exe_name} PUBLIC ${Aluminum_LIBRARIES})
26 | endforeach ()
27 | 


--------------------------------------------------------------------------------
/examples/README.md:
--------------------------------------------------------------------------------
 1 | ![Al](../al.svg) Aluminum Examples
 2 | ==================================
 3 | 
 4 | These are simple examples of how to use Aluminum.
 5 | 
 6 | Current examples:
 7 | * `hello_world`: Initialize Aluminum and have each process print its rank.
 8 | * `allreduce`: Allreduce a buffer of data and print it afterward.
 9 | * `pingpong`: Ranks alternate sending and receiving data.
10 | 
11 | ## Building
12 | 
13 | You can build the examples as follows.
14 | This assumes Aluminum has already been installed.
15 | 
16 | ```
17 | mkdir build
18 | cd build
19 | cmake ..
20 | make
21 | ```
22 | 
23 | If CMake cannot find the Aluminum library automatically, pass `-D Aluminum_DIR=/path/to/Aluminum`.
24 | 
25 | You can edit the example codes to change certain parameters, like the Aluminum backend used.
26 | 


--------------------------------------------------------------------------------
/cmake/FindNVTX.cmake:
--------------------------------------------------------------------------------
 1 | # Sets the following variables
 2 | #
 3 | #   NVTX_FOUND
 4 | #   NVTX_LIBRARY
 5 | #
 6 | # Defines the following imported target:
 7 | #
 8 | #   cuda::nvtx
 9 | #
10 | 
11 | find_library(NVTX_LIBRARY nvToolsExt
12 |   HINTS ${NVTX_DIR} $ENV{NVTX_DIR}
13 |   ${CUDAToolkit_LIBRARY_DIR}
14 |   PATH_SUFFIXES lib64
15 |   DOC "The nvtx library."
16 |   NO_DEFAULT_PATH)
17 | find_library(NVTX_LIBRARY nvToolsExt)
18 | 
19 | include(FindPackageHandleStandardArgs)
20 | find_package_handle_standard_args(NVTX
21 |   DEFAULT_MSG NVTX_LIBRARY)
22 | 
23 | if (NOT TARGET cuda::nvtx)
24 | 
25 |   add_library(cuda::nvtx INTERFACE IMPORTED)
26 | 
27 |   set_property(TARGET cuda::nvtx PROPERTY
28 |     INTERFACE_INCLUDE_DIRECTORIES "${CUDA_INCLUDE_DIRS}")
29 | 
30 |   set_property(TARGET cuda::nvtx PROPERTY
31 |     INTERFACE_LINK_LIBRARIES "${NVTX_LIBRARY}")
32 | 
33 | endif (NOT TARGET cuda::nvtx)
34 | 


--------------------------------------------------------------------------------
/docs/make.bat:
--------------------------------------------------------------------------------
 1 | @ECHO OFF
 2 | 
 3 | pushd %~dp0
 4 | 
 5 | REM Command file for Sphinx documentation
 6 | 
 7 | if "%SPHINXBUILD%" == "" (
 8 | 	set SPHINXBUILD=sphinx-build
 9 | )
10 | set SOURCEDIR=.
11 | set BUILDDIR=_build
12 | 
13 | %SPHINXBUILD% >NUL 2>NUL
14 | if errorlevel 9009 (
15 | 	echo.
16 | 	echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
17 | 	echo.installed, then set the SPHINXBUILD environment variable to point
18 | 	echo.to the full path of the 'sphinx-build' executable. Alternatively you
19 | 	echo.may add the Sphinx directory to PATH.
20 | 	echo.
21 | 	echo.If you don't have Sphinx installed, grab it from
22 | 	echo.https://www.sphinx-doc.org/
23 | 	exit /b 1
24 | )
25 | 
26 | if "%1" == "" goto help
27 | 
28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
29 | goto end
30 | 
31 | :help
32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
33 | 
34 | :end
35 | popd
36 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | Copyright (c) 2018, Lawrence Livermore National Security, LLC.  Produced at the
 2 | Lawrence Livermore National Laboratory in collaboration with University of
 3 | Illinois Urbana-Champaign.
 4 | 
 5 | Written by the LBANN Research Team (N. Dryden, N. Maruyama, et al.) listed in
 6 | the CONTRIBUTORS file. <lbann-dev@llnl.gov>
 7 | 
 8 | LLNL-CODE-756777.
 9 | All rights reserved.
10 | 
11 | This file is part of Aluminum GPU-aware Communication Library. For details, see
12 | http://software.llnl.gov/Aluminum or https://github.com/LLNL/Aluminum.
13 | 
14 | Licensed under the Apache License, Version 2.0 (the "License"); you
15 | may not use this file except in compliance with the License.  You may
16 | obtain a copy of the License at:
17 | 
18 | http://www.apache.org/licenses/LICENSE-2.0
19 | 
20 | Unless required by applicable law or agreed to in writing, software
21 | distributed under the License is distributed on an "AS IS" BASIS,
22 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
23 | implied. See the License for the specific language governing
24 | permissions and limitations under the license.
25 | 
26 | 


--------------------------------------------------------------------------------
/CITATION.cff:
--------------------------------------------------------------------------------
 1 | cff-version: 1.2.0
 2 | title: "Aluminum Communication Library"
 3 | message: "If you use Aluminum, please cite it as"
 4 | authors:
 5 |   - family-names: Dryden
 6 |     given-names: Nikoli
 7 |   - family-names: Maruyama
 8 |     given-names: Naoya
 9 |   - family-names: Moon
10 |     given-names: Tim
11 |   - family-names: Benson
12 |     given-names: Tom
13 |   - family-names: Yoo
14 |     given-names: Andy
15 |   - family-names: Van Essen
16 |     given-names: Brian
17 |   - family-names: McNeish
18 |     given-names: Corey
19 |   - family-names: Snir
20 |     given-names: Marc
21 | preferred-citation:
22 |   title: "Aluminum: An Asynchronous, GPU-Aware Communication Library Optimized for Large-Scale Training of Deep Neural Networks on HPC Systems"
23 |   year: "2018"
24 |   type: conference-paper
25 |   collection-title: "Proceedings of the Workshop on Machine Learning in HPC Environments"
26 |   authors:
27 |     - family-names: Dryden
28 |       given-names: Nikoli
29 |     - family-names: Maruyama
30 |       given-names: Naoya
31 |     - family-names: Moon
32 |       given-names: Tim
33 |     - family-names: Benson
34 |       given-names: Tom
35 |     - family-names: Yoo
36 |       given-names: Andy
37 |     - family-names: Snir
38 |       given-names: Marc
39 |     - family-names: Van Essen
40 |       given-names: Brian
41 | 


--------------------------------------------------------------------------------
/al.svg:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?><svg id="Layer_1" xmlns="http://www.w3.org/2000/svg" viewBox="0 0 86.89 97.97" width="64px" height="64px"><defs><style>.cls-1{fill:#fff;}.cls-2{fill:#545454;}.cls-3{fill:#919191;}.cls-4{fill:#1e1e1e;}</style></defs><g><path class="cls-3" d="M86.6,70.19V27.78c0-2.35-1.25-4.52-3.29-5.69L46.59,.88c-2.03-1.17-4.54-1.17-6.58,0L3.29,22.08c-2.03,1.17-3.29,3.35-3.29,5.69v42.41c0,2.35,1.25,4.52,3.29,5.69l36.73,21.2c2.03,1.17,4.54,1.17,6.58,0l36.73-21.2c2.03-1.17,3.29-3.35,3.29-5.69Z"/><path class="cls-2" d="M.89,72.87l39.41,24.21c2.03,1.17,4.54,1.17,6.58,0l36.73-21.2c2.03-1.17,3.29-3.35,3.29-5.69V29.87c0-2.91-1-5-1-5"/><path class="cls-4" d="M86.89,70.87c0,2.35-1.54,3.83-3.58,5.01l-36.73,21.2c-2.03,1.17-4.54,1.17-6.58,0L3.29,75.88c-2.4-2.01-3.4-3.01-2.4-4.01l42-22.72,44,21.72Z"/></g><g><path class="cls-1" d="M10.79,68.11v-2l2.72-1.2c.66-.3,1.11-.79,1.36-1.47l10.85-27.79,3.15-6.37h3.01l13.47,34.24c.12,.28,.3,.57,.52,.87s.48,.49,.76,.6l2.83,1.12v2h-14.13v-2l4.48-1.55-9.55-24.45-1.04-3.6h-.45l-10.53,27.92,4.53,1.68v2H10.79Zm10.67-13.41l1.31-3.71h12.29l1.31,3.71h-14.91Z"/><path class="cls-1" d="M51.16,68.11v-1.95l4.03-1.07c.3-.09,.56-.24,.76-.44,.2-.2,.31-.49,.31-.87v-31.63c0-.39-.12-.69-.36-.91-.24-.21-.51-.38-.81-.51l-4.11-1.52v-1.89l10.59-.48V63.79c0,.37,.11,.66,.33,.87,.22,.21,.48,.35,.76,.44l3.89,1.07v1.95h-15.39Z"/></g></svg>
2 | 


--------------------------------------------------------------------------------
/test/test_utils_mpi.hpp:
--------------------------------------------------------------------------------
 1 | ////////////////////////////////////////////////////////////////////////////////
 2 | // Copyright (c) 2018, Lawrence Livermore National Security, LLC.  Produced at the
 3 | // Lawrence Livermore National Laboratory in collaboration with University of
 4 | // Illinois Urbana-Champaign.
 5 | //
 6 | // Written by the LBANN Research Team (N. Dryden, N. Maruyama, et al.) listed in
 7 | // the CONTRIBUTORS file. <lbann-dev@llnl.gov>
 8 | //
 9 | // LLNL-CODE-756777.
10 | // All rights reserved.
11 | //
12 | // This file is part of Aluminum GPU-aware Communication Library. For details, see
13 | // http://software.llnl.gov/Aluminum or https://github.com/LLNL/Aluminum.
14 | //
15 | // Licensed under the Apache License, Version 2.0 (the "Licensee"); you
16 | // may not use this file except in compliance with the License.  You may
17 | // obtain a copy of the License at:
18 | //
19 | // http://www.apache.org/licenses/LICENSE-2.0
20 | //
21 | // Unless required by applicable law or agreed to in writing, software
22 | // distributed under the License is distributed on an "AS IS" BASIS,
23 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
24 | // implied. See the License for the specific language governing
25 | // permissions and limitations under the license.
26 | ////////////////////////////////////////////////////////////////////////////////
27 | 
28 | #pragma once
29 | 


--------------------------------------------------------------------------------
/docs/start.rst:
--------------------------------------------------------------------------------
 1 | Getting Started with Aluminum
 2 | =============================
 3 | 
 4 | Once you have :doc:`built <build>` Aluminum, you probably want to use it.
 5 | Aluminum is in many respects quite similar to MPI, so if you have experience with that, things should be familiar.
 6 | 
 7 | This is a simple "Hello, world" program that shows key aspects of Aluminum:
 8 | 
 9 | .. code-block:: c++
10 | 
11 |    #include <Al.hpp>
12 |    #include <iostream>
13 | 
14 |    int main(int argc, char** argv) {
15 |      // Initialize Aluminum.
16 |      // Must be called before any other Aluminum calls.
17 |      Al::Initialize(argc, argv);
18 | 
19 |      // Create a communicator with all processes.
20 |      typename Al::MPIBackend::comm_type comm;
21 | 
22 |      // Each process prints its rank and the communicator size.
23 |      std::cout << "Hello, world, from rank "
24 |                << comm.rank() << " of "
25 |                << comm.size() << std::endl;
26 | 
27 |      // Do a simple (in-place) allreduce.
28 |      int rank = comm.rank();
29 |      Al::Allreduce<MPIBackend>(&rank, 1, Al::ReductionOperator::sum, comm);
30 |      std::cout << "The sum of ranks is "
31 |                << rank << std::endl;
32 | 
33 |      // Clean up Aluminum.
34 |      Al::Finalize();
35 | 
36 |      return 0;
37 |    }
38 | 
39 | For additional examples and more detail (including accelerator backends), see the `Aluminum examples <https://github.com/LLNL/Aluminum/tree/master/examples>`_.
40 | 


--------------------------------------------------------------------------------
/src/mempool.cpp:
--------------------------------------------------------------------------------
 1 | ////////////////////////////////////////////////////////////////////////////////
 2 | // Copyright (c) 2018, Lawrence Livermore National Security, LLC.  Produced at the
 3 | // Lawrence Livermore National Laboratory in collaboration with University of
 4 | // Illinois Urbana-Champaign.
 5 | //
 6 | // Written by the LBANN Research Team (N. Dryden, N. Maruyama, et al.) listed in
 7 | // the CONTRIBUTORS file. <lbann-dev@llnl.gov>
 8 | //
 9 | // LLNL-CODE-756777.
10 | // All rights reserved.
11 | //
12 | // This file is part of Aluminum GPU-aware Communication Library. For details, see
13 | // http://software.llnl.gov/Aluminum or https://github.com/LLNL/Aluminum.
14 | //
15 | // Licensed under the Apache License, Version 2.0 (the "Licensee"); you
16 | // may not use this file except in compliance with the License.  You may
17 | // obtain a copy of the License at:
18 | //
19 | // http://www.apache.org/licenses/LICENSE-2.0
20 | //
21 | // Unless required by applicable law or agreed to in writing, software
22 | // distributed under the License is distributed on an "AS IS" BASIS,
23 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
24 | // implied. See the License for the specific language governing
25 | // permissions and limitations under the license.
26 | ////////////////////////////////////////////////////////////////////////////////
27 | 
28 | #include "aluminum/mempool.hpp"
29 | 
30 | namespace Al {
31 | namespace internal {
32 | 
33 | MemoryPool mempool;
34 | 
35 | }  // namespace internal
36 | }  // namespace Al
37 | 


--------------------------------------------------------------------------------
/docs/conf.py:
--------------------------------------------------------------------------------
 1 | # Configuration file for the Sphinx documentation builder.
 2 | #
 3 | # For the full list of built-in configuration values, see the documentation:
 4 | # https://www.sphinx-doc.org/en/master/usage/configuration.html
 5 | 
 6 | import subprocess
 7 | import os
 8 | import os.path
 9 | 
10 | if not os.path.isdir('_static'):
11 |     os.makedirs('_static')
12 | 
13 | # Generate Doxygen docs.
14 | subprocess.run(['doxygen', 'Doxyfile.in'])
15 | 
16 | # -- Project information -----------------------------------------------------
17 | # https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information
18 | 
19 | project = 'Aluminum'
20 | copyright = '2018, Lawrence Livermore National Security'
21 | author = 'Lawrence Livermore National Laboratory'
22 | 
23 | # -- General configuration ---------------------------------------------------
24 | # https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration
25 | 
26 | extensions = ['breathe']
27 | 
28 | templates_path = ['_templates']
29 | exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store']
30 | 
31 | rst_prolog = """
32 | .. |AlLogo| image:: ../al.svg
33 | """
34 | 
35 | # -- Options for HTML output -------------------------------------------------
36 | # https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output
37 | 
38 | html_theme = 'sphinx_rtd_theme'
39 | html_static_path = ['_static']
40 | html_favicon = 'favicon.ico'
41 | 
42 | # Breathe configuration
43 | 
44 | breathe_projects = {'Aluminum': '_doxyout/xml/'}
45 | breathe_default_project = 'Aluminum'
46 | 


--------------------------------------------------------------------------------
/cmake/FindCUB.cmake:
--------------------------------------------------------------------------------
 1 | #[=============[.rst
 2 | FindCUB
 3 | ==========
 4 | 
 5 | Finds the CUB library.
 6 | 
 7 | The following variables will be defined::
 8 | 
 9 |   CUB_FOUND          - True if the system has the CUB library.
10 |   CUB_INCLUDE_DIRS   - The include directory needed for CUB.
11 | 
12 | The following cache variable will be set and marked as "advanced"::
13 | 
14 |   CUB_INCLUDE_DIR - The include directory needed for CUB.
15 | 
16 | In addition, the :prop_tgt:`IMPORTED` target ``cuda::CUB`` will
17 | be created.
18 | 
19 | #]=============]
20 | 
21 | 
22 | find_path(CUB_INCLUDE_PATH cub/cub.cuh
23 |   HINTS ${CUB_DIR} $ENV{CUB_DIR}
24 |   ${CUDAToolkit_INCLUDE_DIRS}
25 |   PATH_SUFFIXES include
26 |   NO_DEFAULT_PATH
27 |   DOC "The CUB header directory."
28 |   )
29 | find_path(CUB_INCLUDE_PATH cub/cub.cuh)
30 | 
31 | set(CUB_INCLUDE_DIRS "${CUB_INCLUDE_PATH}")
32 | 
33 | # Standard handling of the package arguments
34 | include(FindPackageHandleStandardArgs)
35 | find_package_handle_standard_args(CUB
36 |   DEFAULT_MSG CUB_INCLUDE_PATH)
37 | 
38 | # Setup the imported target
39 | if (NOT TARGET cuda::CUB)
40 |   add_library(cuda::CUB INTERFACE IMPORTED)
41 | endif (NOT TARGET cuda::CUB)
42 | 
43 | # Set the include directories for the target
44 | if (CUB_INCLUDE_PATH)
45 |   set_property(TARGET cuda::CUB
46 |     PROPERTY INTERFACE_INCLUDE_DIRECTORIES ${CUB_INCLUDE_PATH})
47 | endif ()
48 | 
49 | #
50 | # Cleanup
51 | #
52 | 
53 | # Set the include directories
54 | mark_as_advanced(FORCE CUB_INCLUDE_PATH)
55 | 
56 | # Set the libraries
57 | set(CUB_LIBRARIES cuda::CUB)
58 | 


--------------------------------------------------------------------------------
/include/aluminum/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | set_source_path(THIS_DIR_HEADERS
 2 |   base.hpp
 3 |   datatypes.hpp
 4 |   internal.hpp
 5 |   mempool.hpp
 6 |   mpi_comm_and_stream_wrapper.hpp
 7 |   mpi_impl.hpp
 8 |   profiling.hpp
 9 |   progress.hpp
10 |   state.hpp
11 |   trace.hpp
12 |   )
13 | set_source_path(THIS_DIR_CUDA_HEADERS
14 |   cuda.hpp
15 |   helper_kernels.hpp
16 |   )
17 | set_source_path(THIS_DIR_MPI_CUDA_HEADERS
18 |   mpi_cuda_impl.hpp
19 |   )
20 | set_source_path(THIS_DIR_HOST_TRANSFER_HEADERS
21 |   ht_impl.hpp
22 |   )
23 | set_source_path(THIS_DIR_NCCL_HEADERS
24 |   nccl_impl.hpp
25 | )
26 | 
27 | add_subdirectory(utils)
28 | 
29 | add_subdirectory(mpi)
30 | 
31 | if (AL_HAS_CUDA)
32 |   add_subdirectory(cuda)
33 | endif ()
34 | 
35 | if (AL_HAS_MPI_CUDA)
36 |   list(APPEND THIS_DIR_HEADERS "${THIS_DIR_MPI_CUDA_HEADERS}")
37 | 
38 |   add_subdirectory(mpi_cuda)
39 | endif ()
40 | 
41 | if (AL_HAS_HOST_TRANSFER)
42 |   list(APPEND THIS_DIR_HEADERS "${THIS_DIR_HOST_TRANSFER_HEADERS}")
43 |   add_subdirectory(ht)
44 | endif ()
45 | 
46 | if (AL_HAS_NCCL)
47 |   list(APPEND THIS_DIR_HEADERS "${THIS_DIR_NCCL_HEADERS}")
48 | endif ()
49 | 
50 | add_subdirectory(traits)
51 | 
52 | set(ALUMINUM_HEADERS "${HEADERS}" "${THIS_DIR_HEADERS}" PARENT_SCOPE)
53 | 
54 | # Note (trb 07/27/2020): The way generated files work in CMake is a
55 | # bit tricky -- they only work properly if they're used in the same
56 | # scope that they are created. So I'm moving all the install logic to
57 | # "/src/CMakeLists.txt", including the header installation. This
58 | # doesn't really matter for CUDA, but it's a problem for HIP/ROCm.
59 | 


--------------------------------------------------------------------------------
/benchmark/wait.hpp:
--------------------------------------------------------------------------------
 1 | ////////////////////////////////////////////////////////////////////////////////
 2 | // Copyright (c) 2018, Lawrence Livermore National Security, LLC.  Produced at the
 3 | // Lawrence Livermore National Laboratory in collaboration with University of
 4 | // Illinois Urbana-Champaign.
 5 | //
 6 | // Written by the LBANN Research Team (N. Dryden, N. Maruyama, et al.) listed in
 7 | // the CONTRIBUTORS file. <lbann-dev@llnl.gov>
 8 | //
 9 | // LLNL-CODE-756777.
10 | // All rights reserved.
11 | //
12 | // This file is part of Aluminum GPU-aware Communication Library. For details, see
13 | // http://software.llnl.gov/Aluminum or https://github.com/LLNL/Aluminum.
14 | //
15 | // Licensed under the Apache License, Version 2.0 (the "Licensee"); you
16 | // may not use this file except in compliance with the License.  You may
17 | // obtain a copy of the License at:
18 | //
19 | // http://www.apache.org/licenses/LICENSE-2.0
20 | //
21 | // Unless required by applicable law or agreed to in writing, software
22 | // distributed under the License is distributed on an "AS IS" BASIS,
23 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
24 | // implied. See the License for the specific language governing
25 | // permissions and limitations under the license.
26 | ////////////////////////////////////////////////////////////////////////////////
27 | 
28 | #pragma once
29 | 
30 | #include <Al_config.hpp>
31 | 
32 | #if defined AL_HAS_ROCM
33 | #include <hip/hip_runtime.h>
34 | #elif defined AL_HAS_CUDA
35 | #include <cuda_runtime.h>
36 | #endif
37 | 
38 | /** Cause the stream to wait for length seconds. */
39 | void gpu_wait(double length, AlGpuStream_t stream);
40 | 


--------------------------------------------------------------------------------
/src/mpi_cuda_impl.cpp:
--------------------------------------------------------------------------------
 1 | ////////////////////////////////////////////////////////////////////////////////
 2 | // Copyright (c) 2018, Lawrence Livermore National Security, LLC.  Produced at the
 3 | // Lawrence Livermore National Laboratory in collaboration with University of
 4 | // Illinois Urbana-Champaign.
 5 | //
 6 | // Written by the LBANN Research Team (N. Dryden, N. Maruyama, et al.) listed in
 7 | // the CONTRIBUTORS file. <lbann-dev@llnl.gov>
 8 | //
 9 | // LLNL-CODE-756777.
10 | // All rights reserved.
11 | //
12 | // This file is part of Aluminum GPU-aware Communication Library. For details, see
13 | // http://software.llnl.gov/Aluminum or https://github.com/LLNL/Aluminum.
14 | //
15 | // Licensed under the Apache License, Version 2.0 (the "Licensee"); you
16 | // may not use this file except in compliance with the License.  You may
17 | // obtain a copy of the License at:
18 | //
19 | // http://www.apache.org/licenses/LICENSE-2.0
20 | //
21 | // Unless required by applicable law or agreed to in writing, software
22 | // distributed under the License is distributed on an "AS IS" BASIS,
23 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
24 | // implied. See the License for the specific language governing
25 | // permissions and limitations under the license.
26 | ////////////////////////////////////////////////////////////////////////////////
27 | 
28 | #include "aluminum/mpi_cuda_impl.hpp"
29 | 
30 | namespace Al {
31 | 
32 | namespace internal {
33 | namespace mpi_cuda {
34 | 
35 | void init(int&, char**&) {
36 |   
37 | }
38 | 
39 | void finalize() {
40 |   
41 | }
42 | 
43 | }  // namespace mpi_cuda
44 | }  // namespace internal
45 | }  // namespace Al
46 | 


--------------------------------------------------------------------------------
/include/aluminum/traits/traits.hpp:
--------------------------------------------------------------------------------
 1 | ////////////////////////////////////////////////////////////////////////////////
 2 | // Copyright (c) 2018, Lawrence Livermore National Security, LLC.  Produced at the
 3 | // Lawrence Livermore National Laboratory in collaboration with University of
 4 | // Illinois Urbana-Champaign.
 5 | //
 6 | // Written by the LBANN Research Team (N. Dryden, N. Maruyama, et al.) listed in
 7 | // the CONTRIBUTORS file. <lbann-dev@llnl.gov>
 8 | //
 9 | // LLNL-CODE-756777.
10 | // All rights reserved.
11 | //
12 | // This file is part of Aluminum GPU-aware Communication Library. For details, see
13 | // http://software.llnl.gov/Aluminum or https://github.com/LLNL/Aluminum.
14 | //
15 | // Licensed under the Apache License, Version 2.0 (the "Licensee"); you
16 | // may not use this file except in compliance with the License.  You may
17 | // obtain a copy of the License at:
18 | //
19 | // http://www.apache.org/licenses/LICENSE-2.0
20 | //
21 | // Unless required by applicable law or agreed to in writing, software
22 | // distributed under the License is distributed on an "AS IS" BASIS,
23 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
24 | // implied. See the License for the specific language governing
25 | // permissions and limitations under the license.
26 | ////////////////////////////////////////////////////////////////////////////////
27 | 
28 | /**
29 |  * @file
30 |  * Compile-time traits describing Aluminum communication options.
31 |  */
32 | 
33 | #pragma once
34 | 
35 | #include <Al_config.hpp>
36 | 
37 | #include "aluminum/traits/mpi_traits.hpp"
38 | 
39 | #ifdef AL_HAS_NCCL
40 | #include "aluminum/traits/nccl_traits.hpp"
41 | #endif
42 | #ifdef AL_HAS_HOST_TRANSFER
43 | #include "aluminum/traits/ht_traits.hpp"
44 | #endif
45 | 


--------------------------------------------------------------------------------
/test/test_stream_mem_ops.cpp:
--------------------------------------------------------------------------------
 1 | ////////////////////////////////////////////////////////////////////////////////
 2 | // Copyright (c) 2018, Lawrence Livermore National Security, LLC.  Produced at the
 3 | // Lawrence Livermore National Laboratory in collaboration with University of
 4 | // Illinois Urbana-Champaign.
 5 | //
 6 | // Written by the LBANN Research Team (N. Dryden, N. Maruyama, et al.) listed in
 7 | // the CONTRIBUTORS file. <lbann-dev@llnl.gov>
 8 | //
 9 | // LLNL-CODE-756777.
10 | // All rights reserved.
11 | //
12 | // This file is part of Aluminum GPU-aware Communication Library. For details, see
13 | // http://software.llnl.gov/Aluminum or https://github.com/LLNL/Aluminum.
14 | //
15 | // Licensed under the Apache License, Version 2.0 (the "Licensee"); you
16 | // may not use this file except in compliance with the License.  You may
17 | // obtain a copy of the License at:
18 | //
19 | // http://www.apache.org/licenses/LICENSE-2.0
20 | //
21 | // Unless required by applicable law or agreed to in writing, software
22 | // distributed under the License is distributed on an "AS IS" BASIS,
23 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
24 | // implied. See the License for the specific language governing
25 | // permissions and limitations under the license.
26 | ////////////////////////////////////////////////////////////////////////////////
27 | 
28 | #include <iostream>
29 | #include "Al.hpp"
30 | 
31 | int main(int argc, char** argv) {
32 |   AL_CHECK_CUDA(AlGpuSetDevice(0));
33 |   Al::Initialize(argc, argv);
34 |   if (Al::internal::cuda::stream_memory_operations_supported()) {
35 |     std::cout << "Supported!" << std::endl;
36 |   } else {
37 |     std::cout << "Not supported! :(" << std::endl;
38 |   }
39 |   Al::Finalize();
40 |   return 0;
41 | }
42 | 


--------------------------------------------------------------------------------
/benchmark/benchmark_utils_mpi.hpp:
--------------------------------------------------------------------------------
 1 | ////////////////////////////////////////////////////////////////////////////////
 2 | // Copyright (c) 2018, Lawrence Livermore National Security, LLC.  Produced at the
 3 | // Lawrence Livermore National Laboratory in collaboration with University of
 4 | // Illinois Urbana-Champaign.
 5 | //
 6 | // Written by the LBANN Research Team (N. Dryden, N. Maruyama, et al.) listed in
 7 | // the CONTRIBUTORS file. <lbann-dev@llnl.gov>
 8 | //
 9 | // LLNL-CODE-756777.
10 | // All rights reserved.
11 | //
12 | // This file is part of Aluminum GPU-aware Communication Library. For details, see
13 | // http://software.llnl.gov/Aluminum or https://github.com/LLNL/Aluminum.
14 | //
15 | // Licensed under the Apache License, Version 2.0 (the "Licensee"); you
16 | // may not use this file except in compliance with the License.  You may
17 | // obtain a copy of the License at:
18 | //
19 | // http://www.apache.org/licenses/LICENSE-2.0
20 | //
21 | // Unless required by applicable law or agreed to in writing, software
22 | // distributed under the License is distributed on an "AS IS" BASIS,
23 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
24 | // implied. See the License for the specific language governing
25 | // permissions and limitations under the license.
26 | ////////////////////////////////////////////////////////////////////////////////
27 | 
28 | #pragma once
29 | 
30 | #include "Al.hpp"
31 | #include "benchmark_utils.hpp"
32 | 
33 | 
34 | template <>
35 | struct Timer<Al::MPIBackend> {
36 |   void start_timer(typename Al::MPIBackend::comm_type&) {
37 |     start_time = Al::get_time();
38 |   }
39 | 
40 |   double end_timer(typename Al::MPIBackend::comm_type&) {
41 |     return Al::get_time() - start_time;
42 |   }
43 | 
44 |   double start_time = 0.0;
45 | };
46 | 


--------------------------------------------------------------------------------
/cmake/FindHWLOC.cmake:
--------------------------------------------------------------------------------
 1 | # Output variables
 2 | #
 3 | #   HWLOC_FOUND
 4 | #   HWLOC_LIBRARIES
 5 | #   HWLOC_INCLUDE_PATH
 6 | #
 7 | # Also creates an imported target HWLOC::hwloc
 8 | 
 9 | if (MPI_FOUND)
10 |   list(APPEND _TMP_MPI_LIBS "${MPI_C_LIBRARIES}" "${MPI_CXX_LIBRARIES}")
11 |   foreach (lib IN LISTS _TMP_MPI_LIBS)
12 |     get_filename_component(_TMP_MPI_LIB_DIR "${lib}" DIRECTORY)
13 |     list(APPEND _TMP_MPI_LIBRARY_DIRS ${_TMP_MPI_LIB_DIR})
14 |   endforeach ()
15 | 
16 |   if (_TMP_MPI_LIBRARY_DIRS)
17 |     list(REMOVE_DUPLICATES _TMP_MPI_LIBRARY_DIRS)
18 |   endif ()
19 | endif (MPI_FOUND)
20 | 
21 | # Find the library
22 | find_library(HWLOC_LIBRARY hwloc
23 |   HINTS ${HWLOC_DIR} $ENV{HWLOC_DIR} ${_TMP_MPI_LIBRARY_DIRS}
24 |   PATH_SUFFIXES lib64 lib
25 |   NO_DEFAULT_PATH)
26 | find_library(HWLOC_LIBRARY hwloc)
27 | 
28 | # Find the header
29 | find_path(HWLOC_INCLUDE_PATH hwloc.h
30 |   HINTS ${HWLOC_DIR} $ENV{HWLOC_DIR}
31 |   ${MPI_C_INCLUDE_PATH} ${MPI_CXX_INCLUDE_PATH}
32 |   PATH_SUFFIXES include
33 |   NO_DEFAULT_PATH)
34 | find_path(HWLOC_INCLUDE_PATH hwloc.h)
35 | 
36 | # Handle the find_package arguments
37 | include(FindPackageHandleStandardArgs)
38 | find_package_handle_standard_args(
39 |   HWLOC DEFAULT_MSG HWLOC_LIBRARY HWLOC_INCLUDE_PATH)
40 | 
41 | # Build the imported target
42 | if (NOT TARGET HWLOC::hwloc)
43 |   add_library(HWLOC::hwloc INTERFACE IMPORTED)
44 | endif()
45 | 
46 | set_property(TARGET HWLOC::hwloc
47 |   PROPERTY INTERFACE_LINK_LIBRARIES ${HWLOC_LIBRARY})
48 | 
49 | if (NOT "/usr/include" STREQUAL "${HWLOC_INCLUDE_PATH}")
50 |   set_property(TARGET HWLOC::hwloc
51 |     PROPERTY INTERFACE_INCLUDE_DIRECTORIES
52 |     ${HWLOC_INCLUDE_PATH})
53 | endif ()
54 | 
55 | # Set the last of the output variables
56 | set(HWLOC_LIBRARIES HWLOC::hwloc)
57 | 
58 | # Cleanup
59 | mark_as_advanced(FORCE HWLOC_INCLUDE_PATH)
60 | mark_as_advanced(FORCE HWLOC_LIBRARY)
61 | 


--------------------------------------------------------------------------------
/src/mpi_cuda/communicator.cpp:
--------------------------------------------------------------------------------
 1 | ////////////////////////////////////////////////////////////////////////////////
 2 | // Copyright (c) 2018, Lawrence Livermore National Security, LLC.  Produced at the
 3 | // Lawrence Livermore National Laboratory in collaboration with University of
 4 | // Illinois Urbana-Champaign.
 5 | //
 6 | // Written by the LBANN Research Team (N. Dryden, N. Maruyama, et al.) listed in
 7 | // the CONTRIBUTORS file. <lbann-dev@llnl.gov>
 8 | //
 9 | // LLNL-CODE-756777.
10 | // All rights reserved.
11 | //
12 | // This file is part of Aluminum GPU-aware Communication Library. For details, see
13 | // http://software.llnl.gov/Aluminum or https://github.com/LLNL/Aluminum.
14 | //
15 | // Licensed under the Apache License, Version 2.0 (the "Licensee"); you
16 | // may not use this file except in compliance with the License.  You may
17 | // obtain a copy of the License at:
18 | //
19 | // http://www.apache.org/licenses/LICENSE-2.0
20 | //
21 | // Unless required by applicable law or agreed to in writing, software
22 | // distributed under the License is distributed on an "AS IS" BASIS,
23 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
24 | // implied. See the License for the specific language governing
25 | // permissions and limitations under the license.
26 | ////////////////////////////////////////////////////////////////////////////////
27 | 
28 | #include "Al.hpp"
29 | #include "aluminum/mpi_cuda/communicator.hpp"
30 | #ifdef AL_HAS_MPI_CUDA_RMA
31 | #include "aluminum/mpi_cuda/rma.hpp"
32 | #endif
33 | 
34 | namespace Al {
35 | namespace internal {
36 | namespace mpi_cuda {
37 | 
38 | #ifdef AL_HAS_MPI_CUDA_RMA
39 | RMA &MPICUDACommunicator::get_rma() {
40 |   if (!m_rma)
41 |     m_rma = std::make_shared<RMA>(*this);
42 |   return *m_rma;
43 | }
44 | #endif
45 | 
46 | MPICUDACommunicator::~MPICUDACommunicator() {}
47 | 
48 | } // namespace mpi_cuda
49 | } // namespace internal
50 | } // namespace Al
51 | 


--------------------------------------------------------------------------------
/benchmark/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | set_source_path(AL_BENCHMARK_HEADERS
 2 |   benchmark_utils.hpp
 3 |   benchmark_utils_mpi.hpp
 4 |   benchmark_utils_nccl.hpp
 5 |   benchmark_utils_ht.hpp
 6 |   wait.hpp)
 7 | 
 8 | set_source_path(AL_BENCHMARK_SOURCES
 9 |   benchmark_ops.cpp
10 |   bandwidth.cpp)
11 | 
12 | if (AL_HAS_CUDA OR AL_HAS_ROCM)
13 |   set_source_path(AL_GPU_BENCHMARK_SOURCES
14 |     benchmark_waits.cpp
15 |     benchmark_events.cpp)
16 | 
17 |   set_source_path(AL_GPU_BENCHMARK_HELPER_SOURCES
18 |     wait.cu
19 |     wait.hpp)
20 | endif ()
21 | 
22 | foreach(src IN LISTS AL_BENCHMARK_SOURCES AL_GPU_BENCHMARK_SOURCES)
23 |   string(REPLACE ".cpp" "" _tmp_benchmark_exe_name "${src}")
24 |   get_filename_component(_benchmark_exe_name
25 |     "${_tmp_benchmark_exe_name}" NAME)
26 |   add_executable(${_benchmark_exe_name} ${src} ${AL_BENCHMARK_HEADERS})
27 | 
28 |   # Get the test headers
29 |   target_link_libraries(${_benchmark_exe_name}
30 |     PRIVATE Al aluminum_test_headers)
31 |   target_include_directories(
32 |     ${_benchmark_exe_name} SYSTEM PRIVATE
33 |     ${PROJECT_SOURCE_DIR}/third_party/cxxopts/include)
34 | 
35 |   # FIXME: Hopefully this can be removed in a future version of ROCm.
36 |   if (AL_HAS_ROCM AND AL_BUILD_TYPE_UPPER MATCHES "DEBUG")
37 |     target_compile_options(${_benchmark_exe_name} PRIVATE "-O0")
38 |   endif ()
39 | endforeach()
40 | 
41 | # Handle the GPU-specific ones
42 | foreach (src ${AL_GPU_BENCHMARK_SOURCES})
43 |   string(REPLACE ".cpp" "" _tmp_benchmark_exe_name "${src}")
44 |   get_filename_component(_benchmark_exe_name
45 |     "${_tmp_benchmark_exe_name}" NAME)
46 |   target_sources(${_benchmark_exe_name}
47 |     PUBLIC "${AL_GPU_BENCHMARK_HELPER_SOURCES}")
48 | endforeach ()
49 | 
50 | # Get languages right
51 | if (AL_HAS_ROCM)
52 |   list(FILTER AL_GPU_BENCHMARK_HELPER_SOURCES INCLUDE REGEX "cu$")
53 |   set_source_files_properties(${AL_GPU_BENCHMARK_HELPER_SOURCES}
54 |     PROPERTIES LANGUAGE HIP)
55 | endif ()
56 | 


--------------------------------------------------------------------------------
/include/aluminum/internal.hpp:
--------------------------------------------------------------------------------
 1 | ////////////////////////////////////////////////////////////////////////////////
 2 | // Copyright (c) 2018, Lawrence Livermore National Security, LLC.  Produced at the
 3 | // Lawrence Livermore National Laboratory in collaboration with University of
 4 | // Illinois Urbana-Champaign.
 5 | //
 6 | // Written by the LBANN Research Team (N. Dryden, N. Maruyama, et al.) listed in
 7 | // the CONTRIBUTORS file. <lbann-dev@llnl.gov>
 8 | //
 9 | // LLNL-CODE-756777.
10 | // All rights reserved.
11 | //
12 | // This file is part of Aluminum GPU-aware Communication Library. For details, see
13 | // http://software.llnl.gov/Aluminum or https://github.com/LLNL/Aluminum.
14 | //
15 | // Licensed under the Apache License, Version 2.0 (the "Licensee"); you
16 | // may not use this file except in compliance with the License.  You may
17 | // obtain a copy of the License at:
18 | //
19 | // http://www.apache.org/licenses/LICENSE-2.0
20 | //
21 | // Unless required by applicable law or agreed to in writing, software
22 | // distributed under the License is distributed on an "AS IS" BASIS,
23 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
24 | // implied. See the License for the specific language governing
25 | // permissions and limitations under the license.
26 | ////////////////////////////////////////////////////////////////////////////////
27 | 
28 | #pragma once
29 | 
30 | #include "aluminum/progress.hpp"
31 | 
32 | namespace Al {
33 | 
34 | /**
35 |  * Internal implementations.
36 |  * Generic code for all collective implementations is in here.
37 |  * Implementation-specific code is in separate namespaces inside internal.
38 |  */
39 | namespace internal {
40 | 
41 | // Would be nice to replace this with a C++14 variable template...
42 | /** Indicator that an in-place allreduce is requested. */
43 | template <typename T>
44 | inline T* IN_PLACE() { return (T*) (-1); }
45 | 
46 | }  // namespace internal
47 | }  // namespace Al
48 | 


--------------------------------------------------------------------------------
/src/ht_impl.cpp:
--------------------------------------------------------------------------------
 1 | ////////////////////////////////////////////////////////////////////////////////
 2 | // Copyright (c) 2018, Lawrence Livermore National Security, LLC.  Produced at the
 3 | // Lawrence Livermore National Laboratory in collaboration with University of
 4 | // Illinois Urbana-Champaign.
 5 | //
 6 | // Written by the LBANN Research Team (N. Dryden, N. Maruyama, et al.) listed in
 7 | // the CONTRIBUTORS file. <lbann-dev@llnl.gov>
 8 | //
 9 | // LLNL-CODE-756777.
10 | // All rights reserved.
11 | //
12 | // This file is part of Aluminum GPU-aware Communication Library. For details, see
13 | // http://software.llnl.gov/Aluminum or https://github.com/LLNL/Aluminum.
14 | //
15 | // Licensed under the Apache License, Version 2.0 (the "Licensee"); you
16 | // may not use this file except in compliance with the License.  You may
17 | // obtain a copy of the License at:
18 | //
19 | // http://www.apache.org/licenses/LICENSE-2.0
20 | //
21 | // Unless required by applicable law or agreed to in writing, software
22 | // distributed under the License is distributed on an "AS IS" BASIS,
23 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
24 | // implied. See the License for the specific language governing
25 | // permissions and limitations under the license.
26 | ////////////////////////////////////////////////////////////////////////////////
27 | 
28 | #include "aluminum/ht_impl.hpp"
29 | 
30 | namespace Al {
31 | 
32 | // Initialize this.
33 | AlGpuEvent_t HostTransferBackend::sync_event = (AlGpuEvent_t) 0;
34 | 
35 | namespace internal {
36 | namespace ht {
37 | 
38 | void init(int&, char**&) {
39 |   AL_CHECK_CUDA(AlGpuEventCreateWithFlags(&HostTransferBackend::sync_event,
40 |                                           AlGpuNoTimingEventFlags));
41 | }
42 | 
43 | void finalize() {
44 |   AL_CHECK_CUDA(AlGpuEventDestroy(HostTransferBackend::sync_event));
45 | }
46 | 
47 | }  // namespace ht
48 | }  // namespace internal
49 | }  // namespace Al
50 | 


--------------------------------------------------------------------------------
/include/aluminum/cuda/helper_kernels.hpp:
--------------------------------------------------------------------------------
 1 | ////////////////////////////////////////////////////////////////////////////////
 2 | // Copyright (c) 2018, Lawrence Livermore National Security, LLC.  Produced at the
 3 | // Lawrence Livermore National Laboratory in collaboration with University of
 4 | // Illinois Urbana-Champaign.
 5 | //
 6 | // Written by the LBANN Research Team (N. Dryden, N. Maruyama, et al.) listed in
 7 | // the CONTRIBUTORS file. <lbann-dev@llnl.gov>
 8 | //
 9 | // LLNL-CODE-756777.
10 | // All rights reserved.
11 | //
12 | // This file is part of Aluminum GPU-aware Communication Library. For details, see
13 | // http://software.llnl.gov/Aluminum or https://github.com/LLNL/Aluminum.
14 | //
15 | // Licensed under the Apache License, Version 2.0 (the "Licensee"); you
16 | // may not use this file except in compliance with the License.  You may
17 | // obtain a copy of the License at:
18 | //
19 | // http://www.apache.org/licenses/LICENSE-2.0
20 | //
21 | // Unless required by applicable law or agreed to in writing, software
22 | // distributed under the License is distributed on an "AS IS" BASIS,
23 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
24 | // implied. See the License for the specific language governing
25 | // permissions and limitations under the license.
26 | ////////////////////////////////////////////////////////////////////////////////
27 | 
28 | #pragma once
29 | 
30 | #include "aluminum/base.hpp"
31 | #include "aluminum/cuda/cuda.hpp"
32 | 
33 | namespace Al {
34 | namespace internal {
35 | namespace cuda {
36 | 
37 | void launch_wait_kernel(AlGpuStream_t stream, int32_t wait_value,
38 |                         volatile int32_t* wait_mem);
39 | 
40 | #if defined AL_HAS_ROCM
41 | void launch_wait_kernel(hipStream_t stream,
42 |                         int32_t wait_value,
43 |                         hipDeviceptr_t wait_mem);
44 | #elif defined AL_HAS_CUDA
45 | void launch_wait_kernel(cudaStream_t stream, int32_t wait_value,
46 |                         CUdeviceptr wait_mem);
47 | #endif
48 | 
49 | } // namespace cuda
50 | } // namespace internal
51 | } // namespace Al
52 | 


--------------------------------------------------------------------------------
/include/aluminum/datatypes.hpp:
--------------------------------------------------------------------------------
 1 | ////////////////////////////////////////////////////////////////////////////////
 2 | // Copyright (c) 2018, Lawrence Livermore National Security, LLC.  Produced at the
 3 | // Lawrence Livermore National Laboratory in collaboration with University of
 4 | // Illinois Urbana-Champaign.
 5 | //
 6 | // Written by the LBANN Research Team (N. Dryden, N. Maruyama, et al.) listed in
 7 | // the CONTRIBUTORS file. <lbann-dev@llnl.gov>
 8 | //
 9 | // LLNL-CODE-756777.
10 | // All rights reserved.
11 | //
12 | // This file is part of Aluminum GPU-aware Communication Library. For details, see
13 | // http://software.llnl.gov/Aluminum or https://github.com/LLNL/Aluminum.
14 | //
15 | // Licensed under the Apache License, Version 2.0 (the "Licensee"); you
16 | // may not use this file except in compliance with the License.  You may
17 | // obtain a copy of the License at:
18 | //
19 | // http://www.apache.org/licenses/LICENSE-2.0
20 | //
21 | // Unless required by applicable law or agreed to in writing, software
22 | // distributed under the License is distributed on an "AS IS" BASIS,
23 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
24 | // implied. See the License for the specific language governing
25 | // permissions and limitations under the license.
26 | ////////////////////////////////////////////////////////////////////////////////
27 | 
28 | #pragma once
29 | 
30 | // This file identifies support for different specialized datatypes and
31 | // provides some basic things for them as needed.
32 | 
33 | #include <Al_config.hpp>
34 | 
35 | // IEEE 16 bit floating point (i.e., fp16 or half).
36 | 
37 | #if defined AL_HAS_ROCM
38 | #include <hip/hip_fp16.h>
39 | #define AL_HAS_HALF 1
40 | #elif defined AL_HAS_CUDA
41 | #include <cuda_fp16.h>
42 | #define AL_HAS_HALF 1
43 | #endif
44 | 
45 | // Brain floating point 16 (bfloat16).
46 | 
47 | #if defined AL_HAS_ROCM
48 | #include <hip/hip_bf16.h>
49 | #define AL_HAS_BFLOAT 1
50 | using al_bfloat16 = __hip_bfloat16;
51 | 
52 | #elif defined AL_HAS_CUDA
53 | #include <cuda_bf16.h>
54 | #define AL_HAS_BFLOAT 1
55 | using al_bfloat16 = __nv_bfloat16;
56 | #endif
57 | 


--------------------------------------------------------------------------------
/include/aluminum/utils/utils.hpp:
--------------------------------------------------------------------------------
 1 | ////////////////////////////////////////////////////////////////////////////////
 2 | // Copyright (c) 2018, Lawrence Livermore National Security, LLC.  Produced at the
 3 | // Lawrence Livermore National Laboratory in collaboration with University of
 4 | // Illinois Urbana-Champaign.
 5 | //
 6 | // Written by the LBANN Research Team (N. Dryden, N. Maruyama, et al.) listed in
 7 | // the CONTRIBUTORS file. <lbann-dev@llnl.gov>
 8 | //
 9 | // LLNL-CODE-756777.
10 | // All rights reserved.
11 | //
12 | // This file is part of Aluminum GPU-aware Communication Library. For details, see
13 | // http://software.llnl.gov/Aluminum or https://github.com/LLNL/Aluminum.
14 | //
15 | // Licensed under the Apache License, Version 2.0 (the "Licensee"); you
16 | // may not use this file except in compliance with the License.  You may
17 | // obtain a copy of the License at:
18 | //
19 | // http://www.apache.org/licenses/LICENSE-2.0
20 | //
21 | // Unless required by applicable law or agreed to in writing, software
22 | // distributed under the License is distributed on an "AS IS" BASIS,
23 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
24 | // implied. See the License for the specific language governing
25 | // permissions and limitations under the license.
26 | ////////////////////////////////////////////////////////////////////////////////
27 | 
28 | #pragma once
29 | 
30 | #include <chrono>
31 | #include <vector>
32 | 
33 | namespace Al {
34 | 
35 | /** Return time, in seconds (with decimal), since a fixed epoch. */
36 | inline double get_time() {
37 |   using namespace std::chrono;
38 |   return duration_cast<duration<double>>(
39 |     steady_clock::now().time_since_epoch()).count();
40 | }
41 | 
42 | /**
43 |  * Compute an exclusive prefix sum.
44 |  *
45 |  * This is mostly meant to help with vector collectives.
46 |  */
47 | template <typename T>
48 | inline std::vector<T> excl_prefix_sum(const std::vector<T>& v) {
49 |   auto r = std::vector<T>(v.size(), T{0});
50 |   for (size_t i = 1; i < v.size(); ++i) {
51 |     r[i] = v[i-1] + r[i-1];
52 |   }
53 |   return r;
54 | }
55 | 
56 | }  // namespace Al
57 | 


--------------------------------------------------------------------------------
/include/aluminum/ht/barrier.hpp:
--------------------------------------------------------------------------------
 1 | ////////////////////////////////////////////////////////////////////////////////
 2 | // Copyright (c) 2018, Lawrence Livermore National Security, LLC.  Produced at the
 3 | // Lawrence Livermore National Laboratory in collaboration with University of
 4 | // Illinois Urbana-Champaign.
 5 | //
 6 | // Written by the LBANN Research Team (N. Dryden, N. Maruyama, et al.) listed in
 7 | // the CONTRIBUTORS file. <lbann-dev@llnl.gov>
 8 | //
 9 | // LLNL-CODE-756777.
10 | // All rights reserved.
11 | //
12 | // This file is part of Aluminum GPU-aware Communication Library. For details, see
13 | // http://software.llnl.gov/Aluminum or https://github.com/LLNL/Aluminum.
14 | //
15 | // Licensed under the Apache License, Version 2.0 (the "Licensee"); you
16 | // may not use this file except in compliance with the License.  You may
17 | // obtain a copy of the License at:
18 | //
19 | // http://www.apache.org/licenses/LICENSE-2.0
20 | //
21 | // Unless required by applicable law or agreed to in writing, software
22 | // distributed under the License is distributed on an "AS IS" BASIS,
23 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
24 | // implied. See the License for the specific language governing
25 | // permissions and limitations under the license.
26 | ////////////////////////////////////////////////////////////////////////////////
27 | 
28 | #pragma once
29 | 
30 | #include "aluminum/cuda/cuda.hpp"
31 | #include "aluminum/ht/communicator.hpp"
32 | #include "aluminum/ht/base_state.hpp"
33 | 
34 | namespace Al {
35 | namespace internal {
36 | namespace ht {
37 | 
38 | class BarrierAlState : public HostTransferCollectiveSignalAtEndState {
39 | public:
40 |   BarrierAlState(HostTransferCommunicator& comm_, AlGpuStream_t stream_) :
41 |     HostTransferCollectiveSignalAtEndState(stream_),
42 |     comm(comm_.get_comm()) {
43 |     // Just wait until we should start this.
44 |     start_event.record(stream_);
45 | 
46 |     // Have the device wait on the host.
47 |     gpu_wait.wait(stream_);
48 |     end_event.record(stream_);
49 |   }
50 | 
51 |   std::string get_name() const override { return "HTBarrier"; }
52 | 
53 | protected:
54 |   void start_mpi_op() override {
55 |     MPI_Ibarrier(comm, get_mpi_req());
56 |   }
57 | 
58 | private:
59 |   MPI_Comm comm;
60 | };
61 | 
62 | }  // namespace ht
63 | }  // namespace internal
64 | }  // namespace Al
65 | 


--------------------------------------------------------------------------------
/include/aluminum/cuda/events.hpp:
--------------------------------------------------------------------------------
 1 | ////////////////////////////////////////////////////////////////////////////////
 2 | // Copyright (c) 2018, Lawrence Livermore National Security, LLC.  Produced at the
 3 | // Lawrence Livermore National Laboratory in collaboration with University of
 4 | // Illinois Urbana-Champaign.
 5 | //
 6 | // Written by the LBANN Research Team (N. Dryden, N. Maruyama, et al.) listed in
 7 | // the CONTRIBUTORS file. <lbann-dev@llnl.gov>
 8 | //
 9 | // LLNL-CODE-756777.
10 | // All rights reserved.
11 | //
12 | // This file is part of Aluminum GPU-aware Communication Library. For details, see
13 | // http://software.llnl.gov/Aluminum or https://github.com/LLNL/Aluminum.
14 | //
15 | // Licensed under the Apache License, Version 2.0 (the "Licensee"); you
16 | // may not use this file except in compliance with the License.  You may
17 | // obtain a copy of the License at:
18 | //
19 | // http://www.apache.org/licenses/LICENSE-2.0
20 | //
21 | // Unless required by applicable law or agreed to in writing, software
22 | // distributed under the License is distributed on an "AS IS" BASIS,
23 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
24 | // implied. See the License for the specific language governing
25 | // permissions and limitations under the license.
26 | ////////////////////////////////////////////////////////////////////////////////
27 | 
28 | #pragma once
29 | 
30 | #include "aluminum/utils/locked_resource_pool.hpp"
31 | #include "aluminum/cuda/cuda.hpp"
32 | 
33 | namespace Al {
34 | namespace internal {
35 | namespace cuda {
36 | 
37 | // TODO: May want to allocate larger chunks and partition.
38 | 
39 | /**
40 |  * Allocate CUDA pinned memory such that there is one allocation per
41 |  * cache line.
42 |  */
43 | struct CUDAEventAllocator {
44 |   AlGpuEvent_t allocate() {
45 |     AlGpuEvent_t event;
46 |     AL_CHECK_CUDA(
47 |       AlGpuEventCreateWithFlags(&event,
48 |                                 AlGpuNoTimingEventFlags));
49 |     return event;
50 |   }
51 | 
52 |   void deallocate(AlGpuEvent_t event) {
53 |     AL_CHECK_CUDA(AlGpuEventDestroy(event));
54 |   }
55 | };
56 | 
57 | /** Resource pool for synchronization memory. */
58 | extern Al::internal::LockedResourcePool<AlGpuEvent_t,
59 |                                         CUDAEventAllocator> event_pool;
60 | 
61 | }  // namespace cuda
62 | }  // namespace internal
63 | }  // namespace Al
64 | 


--------------------------------------------------------------------------------
/include/aluminum/mpi_cuda/rma_null.hpp:
--------------------------------------------------------------------------------
 1 | ////////////////////////////////////////////////////////////////////////////////
 2 | // Copyright (c) 2018, Lawrence Livermore National Security, LLC.  Produced at the
 3 | // Lawrence Livermore National Laboratory in collaboration with University of
 4 | // Illinois Urbana-Champaign.
 5 | //
 6 | // Written by the LBANN Research Team (N. Dryden, N. Maruyama, et al.) listed in
 7 | // the CONTRIBUTORS file. <lbann-dev@llnl.gov>
 8 | //
 9 | // LLNL-CODE-756777.
10 | // All rights reserved.
11 | //
12 | // This file is part of Aluminum GPU-aware Communication Library. For details, see
13 | // http://software.llnl.gov/Aluminum or https://github.com/LLNL/Aluminum.
14 | //
15 | // Licensed under the Apache License, Version 2.0 (the "Licensee"); you
16 | // may not use this file except in compliance with the License.  You may
17 | // obtain a copy of the License at:
18 | //
19 | // http://www.apache.org/licenses/LICENSE-2.0
20 | //
21 | // Unless required by applicable law or agreed to in writing, software
22 | // distributed under the License is distributed on an "AS IS" BASIS,
23 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
24 | // implied. See the License for the specific language governing
25 | // permissions and limitations under the license.
26 | ////////////////////////////////////////////////////////////////////////////////
27 | 
28 | #pragma once
29 | 
30 | #include "aluminum/mpi_cuda/communicator.hpp"
31 | #include "aluminum/mpi_cuda/rma.hpp"
32 | 
33 | namespace Al {
34 | namespace internal {
35 | namespace mpi_cuda {
36 | 
37 | class ConnectionNULL: public Connection {
38 |  public:
39 |   ConnectionNULL(MPICUDACommunicator &comm, int peer):
40 |       Connection(comm, peer) {}
41 |   ~ConnectionNULL() {}
42 |   void connect() {}
43 |   void disconnect() {}
44 |   void *attach_remote_buffer(void *) {
45 |     return nullptr;
46 |   }
47 |   void detach_remote_buffer(void *) {}
48 |   void detach_all_remote_buffers() {}
49 |   void notify(mpi::AlMPIReq &req) {
50 |     req->store(true, std::memory_order_release);
51 |   }
52 |   void wait(mpi::AlMPIReq &req) {
53 |     req->store(true, std::memory_order_release);
54 |   }
55 |   void sync(mpi::AlMPIReq &req) {
56 |     req->store(true, std::memory_order_release);
57 |   }
58 |   void put(const void *, void *, size_t) {}
59 | };
60 | 
61 | } // namespace mpi_cuda
62 | } // namespace internal
63 | } // namespace Al
64 | 


--------------------------------------------------------------------------------
/cmake/FindNCCL.cmake:
--------------------------------------------------------------------------------
 1 | # Exports the following variables
 2 | #
 3 | #   NCCL_FOUND
 4 | #   NCCL_INCLUDE_PATH
 5 | #   NCCL_LIBRARY
 6 | #
 7 | # Exports the following IMPORTED targets:
 8 | #
 9 | #   cuda::nccl
10 | #
11 | 
12 | find_path(NCCL_INCLUDE_PATH nccl.h
13 |   HINTS ${NCCL_DIR} $ENV{NCCL_DIR} ${NCCL2_DIR} $ENV{NCCL2_DIR}
14 |   ${CUDAToolkit_INCLUDE_DIRS}
15 |   PATH_SUFFIXES include
16 |   NO_DEFAULT_PATH
17 |   DOC "The location of NCCL headers."
18 |   )
19 | find_path(NCCL_INCLUDE_PATH nccl.h)
20 | 
21 | find_library(NCCL_LIBRARY nccl
22 |   HINTS ${NCCL_DIR} $ENV{NCCL_DIR} ${NCCL2_DIR} $ENV{NCCL2_DIR}
23 |   ${CUDAToolkit_LIBRARY_DIR}
24 |   PATH_SUFFIXES lib64 lib
25 |   NO_DEFAULT_PATH
26 |   DOC "The NCCL library."
27 |   )
28 | find_library(NCCL_LIBRARY nccl)
29 | 
30 | # If the include path has been found, we can test the version.
31 | if (NCCL_INCLUDE_PATH)
32 | 
33 |   # Check the version. Note, this won't compile for NCCL1
34 |   set(_NCCL_VERSION_TEST_SRC "
35 | #include <iostream>
36 | #include <nccl.h>
37 | 
38 | int main()
39 | {
40 |     std::cout << NCCL_MAJOR << \".\" << NCCL_MINOR << \".\" << NCCL_PATCH;
41 |     return 0;
42 | }
43 | ")
44 | 
45 |   file(WRITE "${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/CMakeTmp/src.cxx"
46 |     "${_NCCL_VERSION_TEST_SRC}\n")
47 | 
48 |   try_run(_NCCL_RUN_RESULT _NCCL_COMPILE_RESULT
49 |     ${CMAKE_BINARY_DIR}
50 |     ${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/CMakeTmp/src.cxx
51 |     CMAKE_FLAGS "-DINCLUDE_DIRECTORIES:STRING=${CUDAToolkit_INCLUDE_DIRS};${NCCL_INCLUDE_PATH}"
52 |     RUN_OUTPUT_VARIABLE _NCCL_VERSION_STRING
53 |     COMPILE_OUTPUT_VARIABLE _NCCL_COMPILE_OUTPUT
54 |     )
55 | 
56 |   # Assume that if it didn't compile, we have NCCL1
57 |   if (NOT _NCCL_COMPILE_RESULT)
58 |     message(${_NCCL_COMPILE_OUTPUT})
59 |     set(_NCCL_VERSION_STRING 1.0.0)
60 |   endif ()
61 | endif ()
62 | 
63 | # Standard handling of the package arguments
64 | include(FindPackageHandleStandardArgs)
65 | find_package_handle_standard_args(NCCL
66 |   REQUIRED_VARS NCCL_LIBRARY NCCL_INCLUDE_PATH
67 |   VERSION_VAR _NCCL_VERSION_STRING)
68 | 
69 | # Setup the imported target
70 | if (NCCL_FOUND AND NOT TARGET cuda::nccl)
71 | 
72 |   add_library(cuda::nccl INTERFACE IMPORTED)
73 | 
74 |   set_property(TARGET cuda::nccl PROPERTY
75 |     INTERFACE_INCLUDE_DIRECTORIES ${NCCL_INCLUDE_PATH} ${CUDA_INCLUDE_DIRS})
76 | 
77 |   set_property(TARGET cuda::nccl PROPERTY
78 |     INTERFACE_LINK_LIBRARIES ${NCCL_LIBRARY})
79 | 
80 | endif (NCCL_FOUND AND NOT TARGET cuda::nccl)
81 | 


--------------------------------------------------------------------------------
/include/aluminum/mpi/barrier.hpp:
--------------------------------------------------------------------------------
 1 | ////////////////////////////////////////////////////////////////////////////////
 2 | // Copyright (c) 2018, Lawrence Livermore National Security, LLC.  Produced at the
 3 | // Lawrence Livermore National Laboratory in collaboration with University of
 4 | // Illinois Urbana-Champaign.
 5 | //
 6 | // Written by the LBANN Research Team (N. Dryden, N. Maruyama, et al.) listed in
 7 | // the CONTRIBUTORS file. <lbann-dev@llnl.gov>
 8 | //
 9 | // LLNL-CODE-756777.
10 | // All rights reserved.
11 | //
12 | // This file is part of Aluminum GPU-aware Communication Library. For details, see
13 | // http://software.llnl.gov/Aluminum or https://github.com/LLNL/Aluminum.
14 | //
15 | // Licensed under the Apache License, Version 2.0 (the "Licensee"); you
16 | // may not use this file except in compliance with the License.  You may
17 | // obtain a copy of the License at:
18 | //
19 | // http://www.apache.org/licenses/LICENSE-2.0
20 | //
21 | // Unless required by applicable law or agreed to in writing, software
22 | // distributed under the License is distributed on an "AS IS" BASIS,
23 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
24 | // implied. See the License for the specific language governing
25 | // permissions and limitations under the license.
26 | ////////////////////////////////////////////////////////////////////////////////
27 | 
28 | #pragma once
29 | 
30 | #include "aluminum/progress.hpp"
31 | #include "aluminum/mpi/base_state.hpp"
32 | #include "aluminum/mpi/communicator.hpp"
33 | #include "aluminum/mpi/utils.hpp"
34 | 
35 | namespace Al {
36 | namespace internal {
37 | namespace mpi {
38 | 
39 | inline void passthrough_barrier(MPICommunicator& comm) {
40 |   MPI_Barrier(comm.get_comm());
41 | }
42 | 
43 | class BarrierAlState : public MPIState {
44 | public:
45 |   BarrierAlState(MPICommunicator& comm_, AlMPIReq req_)
46 |       : MPIState(req_), comm(comm_.get_comm()) {}
47 |   ~BarrierAlState() override {}
48 | 
49 |   std::string get_name() const override { return "MPIBarrier"; }
50 | 
51 | protected:
52 |   void start_mpi_op() override {
53 |     MPI_Ibarrier(comm, get_mpi_req());
54 |   }
55 | 
56 | private:
57 |   MPI_Comm comm;
58 | };
59 | 
60 | inline void passthrough_nb_barrier(MPICommunicator& comm, AlMPIReq& req) {
61 |   req = get_free_request();
62 |   internal::mpi::BarrierAlState* state =
63 |     new internal::mpi::BarrierAlState(comm, req);
64 |   get_progress_engine()->enqueue(state);
65 | }
66 | 
67 | } // namespace mpi
68 | } // namespace internal
69 | } // namespace Al
70 | 


--------------------------------------------------------------------------------
/benchmark/benchmark_utils_nccl.hpp:
--------------------------------------------------------------------------------
 1 | ////////////////////////////////////////////////////////////////////////////////
 2 | // Copyright (c) 2018, Lawrence Livermore National Security, LLC.  Produced at the
 3 | // Lawrence Livermore National Laboratory in collaboration with University of
 4 | // Illinois Urbana-Champaign.
 5 | //
 6 | // Written by the LBANN Research Team (N. Dryden, N. Maruyama, et al.) listed in
 7 | // the CONTRIBUTORS file. <lbann-dev@llnl.gov>
 8 | //
 9 | // LLNL-CODE-756777.
10 | // All rights reserved.
11 | //
12 | // This file is part of Aluminum GPU-aware Communication Library. For details, see
13 | // http://software.llnl.gov/Aluminum or https://github.com/LLNL/Aluminum.
14 | //
15 | // Licensed under the Apache License, Version 2.0 (the "Licensee"); you
16 | // may not use this file except in compliance with the License.  You may
17 | // obtain a copy of the License at:
18 | //
19 | // http://www.apache.org/licenses/LICENSE-2.0
20 | //
21 | // Unless required by applicable law or agreed to in writing, software
22 | // distributed under the License is distributed on an "AS IS" BASIS,
23 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
24 | // implied. See the License for the specific language governing
25 | // permissions and limitations under the license.
26 | ////////////////////////////////////////////////////////////////////////////////
27 | 
28 | #pragma once
29 | 
30 | #include "Al.hpp"
31 | #include "benchmark_utils.hpp"
32 | 
33 | 
34 | template <>
35 | struct Timer<Al::NCCLBackend> {
36 |   Timer() {
37 |     AL_FORCE_CHECK_GPU(AlGpuEventCreateWithFlags(&start_event, AlGpuDefaultEventFlags));
38 |     AL_FORCE_CHECK_GPU(AlGpuEventCreateWithFlags(&end_event, AlGpuDefaultEventFlags));
39 |   }
40 | 
41 |   ~Timer() noexcept(false) {
42 |     AL_FORCE_CHECK_GPU(AlGpuEventDestroy(start_event));
43 |     AL_FORCE_CHECK_GPU(AlGpuEventDestroy(end_event));
44 |   }
45 | 
46 |   void start_timer(typename Al::NCCLBackend::comm_type& comm) {
47 |     AL_FORCE_CHECK_GPU_NOSYNC(AlGpuEventRecord(start_event, comm.get_stream()));
48 |   }
49 | 
50 |   double end_timer(typename Al::NCCLBackend::comm_type &comm) {
51 |     AL_FORCE_CHECK_GPU_NOSYNC(AlGpuEventRecord(end_event, comm.get_stream()));
52 |     AL_FORCE_CHECK_GPU_NOSYNC(AlGpuEventSynchronize(end_event));
53 |     float elapsed_time;
54 |     AL_FORCE_CHECK_GPU_NOSYNC(AlGpuEventElapsedTime(
55 |                                  &elapsed_time, start_event, end_event));
56 |     // Convert milliseconds to seconds.
57 |     return elapsed_time / 1000.0;
58 |   }
59 | 
60 |   AlGpuEvent_t start_event;
61 |   AlGpuEvent_t end_event;
62 | };
63 | 


--------------------------------------------------------------------------------
/benchmark/benchmark_utils_ht.hpp:
--------------------------------------------------------------------------------
 1 | ////////////////////////////////////////////////////////////////////////////////
 2 | // Copyright (c) 2018, Lawrence Livermore National Security, LLC.  Produced at the
 3 | // Lawrence Livermore National Laboratory in collaboration with University of
 4 | // Illinois Urbana-Champaign.
 5 | //
 6 | // Written by the LBANN Research Team (N. Dryden, N. Maruyama, et al.) listed in
 7 | // the CONTRIBUTORS file. <lbann-dev@llnl.gov>
 8 | //
 9 | // LLNL-CODE-756777.
10 | // All rights reserved.
11 | //
12 | // This file is part of Aluminum GPU-aware Communication Library. For details, see
13 | // http://software.llnl.gov/Aluminum or https://github.com/LLNL/Aluminum.
14 | //
15 | // Licensed under the Apache License, Version 2.0 (the "Licensee"); you
16 | // may not use this file except in compliance with the License.  You may
17 | // obtain a copy of the License at:
18 | //
19 | // http://www.apache.org/licenses/LICENSE-2.0
20 | //
21 | // Unless required by applicable law or agreed to in writing, software
22 | // distributed under the License is distributed on an "AS IS" BASIS,
23 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
24 | // implied. See the License for the specific language governing
25 | // permissions and limitations under the license.
26 | ////////////////////////////////////////////////////////////////////////////////
27 | 
28 | #pragma once
29 | 
30 | #include "Al.hpp"
31 | #include "benchmark_utils.hpp"
32 | 
33 | 
34 | template <>
35 | struct Timer<Al::HostTransferBackend> {
36 |   Timer() {
37 |     AL_FORCE_CHECK_GPU(AlGpuEventCreateWithFlags(&start_event, AlGpuDefaultEventFlags));
38 |     AL_FORCE_CHECK_GPU(AlGpuEventCreateWithFlags(&end_event, AlGpuDefaultEventFlags));
39 |   }
40 | 
41 |   ~Timer() noexcept(false) {
42 |     AL_FORCE_CHECK_GPU(AlGpuEventDestroy(start_event));
43 |     AL_FORCE_CHECK_GPU(AlGpuEventDestroy(end_event));
44 |   }
45 | 
46 |   void start_timer(typename Al::HostTransferBackend::comm_type& comm) {
47 |     AL_FORCE_CHECK_GPU_NOSYNC(AlGpuEventRecord(start_event, comm.get_stream()));
48 |   }
49 | 
50 |   double end_timer(typename Al::HostTransferBackend::comm_type &comm) {
51 |     AL_FORCE_CHECK_GPU_NOSYNC(AlGpuEventRecord(end_event, comm.get_stream()));
52 |     AL_FORCE_CHECK_GPU_NOSYNC(AlGpuEventSynchronize(end_event));
53 |     float elapsed_time;
54 |     AL_FORCE_CHECK_GPU_NOSYNC(AlGpuEventElapsedTime(
55 |                                  &elapsed_time, start_event, end_event));
56 |     // Convert milliseconds to seconds.
57 |     return elapsed_time / 1000.0;
58 |   }
59 | 
60 |   AlGpuEvent_t start_event;
61 |   AlGpuEvent_t end_event;
62 | };
63 | 


--------------------------------------------------------------------------------
/include/aluminum/cuda/sync_memory.hpp:
--------------------------------------------------------------------------------
 1 | ////////////////////////////////////////////////////////////////////////////////
 2 | // Copyright (c) 2018, Lawrence Livermore National Security, LLC.  Produced at the
 3 | // Lawrence Livermore National Laboratory in collaboration with University of
 4 | // Illinois Urbana-Champaign.
 5 | //
 6 | // Written by the LBANN Research Team (N. Dryden, N. Maruyama, et al.) listed in
 7 | // the CONTRIBUTORS file. <lbann-dev@llnl.gov>
 8 | //
 9 | // LLNL-CODE-756777.
10 | // All rights reserved.
11 | //
12 | // This file is part of Aluminum GPU-aware Communication Library. For details, see
13 | // http://software.llnl.gov/Aluminum or https://github.com/LLNL/Aluminum.
14 | //
15 | // Licensed under the Apache License, Version 2.0 (the "Licensee"); you
16 | // may not use this file except in compliance with the License.  You may
17 | // obtain a copy of the License at:
18 | //
19 | // http://www.apache.org/licenses/LICENSE-2.0
20 | //
21 | // Unless required by applicable law or agreed to in writing, software
22 | // distributed under the License is distributed on an "AS IS" BASIS,
23 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
24 | // implied. See the License for the specific language governing
25 | // permissions and limitations under the license.
26 | ////////////////////////////////////////////////////////////////////////////////
27 | 
28 | #pragma once
29 | 
30 | #include "aluminum/utils/locked_resource_pool.hpp"
31 | #include "aluminum/cuda/cuda.hpp"
32 | #include "aluminum/tuning_params.hpp"
33 | #include <cstdint>
34 | #include <cstdlib>
35 | 
36 | namespace Al {
37 | namespace internal {
38 | namespace cuda {
39 | 
40 | // TODO: May want to allocate larger chunks and partition.
41 | 
42 | /**
43 |  * Allocate CUDA pinned memory such that there is one allocation per
44 |  * cache line.
45 |  */
46 | struct CacheLinePinnedMemoryAllocator {
47 |   int32_t *allocate() {
48 |     // Overallocate to avoid interference.
49 |     int32_t *mem = (int32_t *)std::aligned_alloc(
50 |         AL_DESTRUCTIVE_INTERFERENCE_SIZE, AL_DESTRUCTIVE_INTERFERENCE_SIZE);
51 |     AL_CHECK_CUDA(AlGpuHostRegister(mem, AL_DESTRUCTIVE_INTERFERENCE_SIZE,
52 |                                    AlGpuHostRegisterDefault));
53 |     return mem;
54 |   }
55 | 
56 |   void deallocate(int32_t* mem) {
57 |     AL_CHECK_CUDA(AlGpuHostUnregister(mem));
58 |     std::free(mem);
59 |   }
60 | };
61 | 
62 | /** Resource pool for synchronization memory. */
63 | extern Al::internal::LockedResourcePool<int32_t*, CacheLinePinnedMemoryAllocator> sync_pool;
64 | 
65 | }  // namespace cuda
66 | }  // namespace internal
67 | }  // namespace Al
68 | 


--------------------------------------------------------------------------------
/test/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | set_source_path(TEST_BASE_HDRS
 2 |   algo_support.hpp
 3 |   test_utils.hpp
 4 |   test_utils_mpi.hpp
 5 |   op_dispatcher.hpp
 6 |   op_runner.hpp
 7 |   hang_watchdog.hpp)
 8 | if (AL_HAS_CUDA)
 9 |   set_source_path(TEST_CUDA_HDRS cuda_vector.hpp)
10 | endif ()
11 | if (AL_HAS_MPI_CUDA)
12 |   set_source_path(TEST_MPI_CUDA_HDRS test_utils_mpi_cuda.hpp)
13 | endif ()
14 | if (AL_HAS_HOST_TRANSFER)
15 |   set_source_path(TEST_HOST_TRANSFER_HDRS test_utils_ht.hpp)
16 | endif ()
17 | if (AL_HAS_NCCL)
18 |   set_source_path(TEST_NCCL_HDRS test_utils_nccl.hpp)
19 | endif ()
20 | 
21 | set(TEST_HEADERS
22 |   ${TEST_BASE_HDRS}
23 |   ${TEST_CUDA_HDRS}
24 |   ${TEST_MPI_CUDA_HDRS}
25 |   ${TEST_HOST_TRANSFER_HDRS}
26 |   ${TEST_NCCL_HDRS})
27 | 
28 | # These headers are used in the benchmark/ directory
29 | add_library(aluminum_test_headers INTERFACE "${TEST_HEADERS}")
30 | target_include_directories(
31 |   aluminum_test_headers INTERFACE "${CMAKE_CURRENT_SOURCE_DIR}")
32 | 
33 | # Make a quick exit before we add any tests
34 | if (NOT AL_ENABLE_TESTS)
35 |   return()
36 | endif ()
37 | 
38 | set_source_path(AL_TEST_SOURCES
39 |   test_ops.cpp
40 |   test_exchange.cpp
41 | )
42 | 
43 | set_source_path(AL_GPU_ONLY_TEST_SOURCES
44 |   test_stream_mem_ops.cpp
45 | )
46 | 
47 | if (AL_HAS_CUDA OR AL_HAS_ROCM)
48 |   list(APPEND AL_TEST_SOURCES ${AL_GPU_ONLY_TEST_SOURCES})
49 | endif ()
50 | 
51 | foreach(src ${AL_TEST_SOURCES})
52 |   string(REPLACE ".cpp" "" _tmp_test_exe_name "${src}")
53 |   get_filename_component(_test_exe_name "${_tmp_test_exe_name}" NAME)
54 |   add_executable(${_test_exe_name} ${src})
55 |   target_include_directories(${_test_exe_name}
56 |     SYSTEM PRIVATE
57 |     ${PROJECT_SOURCE_DIR}/third_party/cxxopts/include)
58 |   target_link_libraries(${_test_exe_name} PRIVATE Al aluminum_test_headers)
59 |   # FIXME: Hopefully this can be removed in a future version of ROCm.
60 |   if (AL_HAS_ROCM AND AL_BUILD_TYPE_UPPER MATCHES "DEBUG")
61 |     target_compile_options(${_test_exe_name} PRIVATE "-O0")
62 |   endif ()
63 | endforeach()
64 | 
65 | if (AL_HAS_MPI_CUDA_RMA AND NOT AL_HAS_ROCM)
66 |   add_executable(test_rma_ring.exe test_rma_ring.cpp ${TEST_HEADERS})
67 |   target_include_directories(
68 |     test_rma_ring.exe SYSTEM PRIVATE ${PROJECT_SOURCE_DIR}/third_party/cxxopts/include)
69 |   target_link_libraries(test_rma_ring.exe PRIVATE Al)
70 |   add_executable(test_rma_halo_exchange.exe
71 |     test_rma_halo_exchange.cpp ${TEST_HEADERS})
72 |   target_include_directories(
73 |     test_rma_halo_exchange.exe SYSTEM PRIVATE ${PROJECT_SOURCE_DIR}/third_party/cxxopts/include)
74 |   target_link_libraries(test_rma_halo_exchange.exe PRIVATE Al)
75 | endif ()
76 | 


--------------------------------------------------------------------------------
/include/aluminum/profiling.hpp:
--------------------------------------------------------------------------------
 1 | ////////////////////////////////////////////////////////////////////////////////
 2 | // Copyright (c) 2018, Lawrence Livermore National Security, LLC.  Produced at the
 3 | // Lawrence Livermore National Laboratory in collaboration with University of
 4 | // Illinois Urbana-Champaign.
 5 | //
 6 | // Written by the LBANN Research Team (N. Dryden, N. Maruyama, et al.) listed in
 7 | // the CONTRIBUTORS file. <lbann-dev@llnl.gov>
 8 | //
 9 | // LLNL-CODE-756777.
10 | // All rights reserved.
11 | //
12 | // This file is part of Aluminum GPU-aware Communication Library. For details, see
13 | // http://software.llnl.gov/Aluminum or https://github.com/LLNL/Aluminum.
14 | //
15 | // Licensed under the Apache License, Version 2.0 (the "Licensee"); you
16 | // may not use this file except in compliance with the License.  You may
17 | // obtain a copy of the License at:
18 | //
19 | // http://www.apache.org/licenses/LICENSE-2.0
20 | //
21 | // Unless required by applicable law or agreed to in writing, software
22 | // distributed under the License is distributed on an "AS IS" BASIS,
23 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
24 | // implied. See the License for the specific language governing
25 | // permissions and limitations under the license.
26 | ////////////////////////////////////////////////////////////////////////////////
27 | 
28 | #pragma once
29 | 
30 | #include <thread>
31 | #include <string>
32 | 
33 | #include <Al_config.hpp>
34 | 
35 | #ifdef AL_HAS_NVPROF
36 | #include <nvToolsExt.h>
37 | #endif
38 | #ifdef AL_HAS_ROCTRACER
39 | #include <roctx.h>
40 | #endif
41 | #if defined AL_HAS_ROCM
42 | #include <hip/hip_runtime.h>
43 | #elif defined AL_HAS_CUDA
44 | #include <cuda_runtime.h>
45 | #endif
46 | 
47 | namespace Al {
48 | namespace internal {
49 | namespace profiling {
50 | 
51 | /** Assign a name to the thread given by handle. */
52 | void name_thread(std::thread::native_handle_type handle, std::string name);
53 | #ifdef AL_HAS_CUDA
54 | /** Assign a name to a CUDA stream. */
55 | void name_stream(AlGpuStream_t stream, std::string name);
56 | #endif
57 | 
58 | /** Create an instantaneous marker. */
59 | void mark(std::string desc);
60 | 
61 | /** Represent a range for profiling. */
62 | struct ProfileRange {
63 | #ifdef AL_HAS_NVPROF
64 |   nvtxRangeId_t nvtx_range;
65 | #endif
66 | #ifdef AL_HAS_ROCTRACER
67 |   roctx_range_id_t roctx_range;
68 | #endif
69 | };
70 | 
71 | /** Start a profiling region with name. */
72 | ProfileRange prof_start(std::string name);
73 | /** End a profiling region. */
74 | void prof_end(ProfileRange range);
75 | 
76 | }  // namespace profiling
77 | }  // namespace internal
78 | }  // namespace Al
79 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | ![Al](al.svg) Aluminum
 2 | ======================
 3 | 
 4 | **Aluminum** is a high-performance communication library for CPUs, GPUs, and other accelerator platforms.
 5 | It leverages existing libraries, such as MPI, NCCL, and RCCL, plus its own infrastructure, to deliver performance and accelerator-centric communication.
 6 | 
 7 | Aluminum is open-source and maintained by the Lawrence Livermore National Laboratory.
 8 | If you use Aluminum, please cite [our paper](https://ieeexplore.ieee.org/document/8638639):
 9 | ```
10 | @inproceedings{dryden2018aluminum,
11 |   title={Aluminum: An Asynchronous, {GPU}-Aware Communication Library Optimized for Large-Scale Training of Deep Neural Networks on {HPC} Systems},
12 |   author={Dryden, Nikoli and Maruyama, Naoya and Moon, Tim and Benson, Tom and Yoo, Andy and Snir, Marc and Van Essen, Brian},
13 |   booktitle={Proceedings of the Workshop on Machine Learning in HPC Environments (MLHPC)},
14 |   year={2018}
15 | }
16 | ```
17 | 
18 | ## Features
19 | 
20 | * Support for blocking and non-blocking collective and point-to-point operations
21 | * Accelerator-centric communication
22 | * Supported communication backends:
23 |   * `MPI`: Uses the Message Passing Interface and supports any hardware your underlying MPI library supports.
24 |   * `NCCL`: Uses either Nvidia's [NCCL](https://developer.nvidia.com/nccl) library for Nvidia GPUs or AMD's [RCCL](https://github.com/ROCmSoftwarePlatform/rccl) library for AMD GPUs.
25 |   * `HostTransfer`: Uses MPI plus the CUDA or HIP runtime to support Nvidia or AMD GPUs without specialized libraries.
26 | 
27 | ## Getting Started
28 | 
29 | For full details, see the [Aluminum documentation](https://aluminum.readthedocs.io/).
30 | 
31 | For basic usage examples, see the [examples](examples).
32 | 
33 | ### Building and Installation
34 | 
35 | Aluminum is available via [Spack](https://spack.io/) or can be installed manually from source.
36 | 
37 | Source builds need a recent CMake, C++ compiler (with support for C++17), MPI, and hwloc.
38 | Accelerator backends need the appropriate runtime libraries.
39 | 
40 | A basic out-of-source build can be done with
41 | ```
42 | mkdir build && cd build
43 | cmake /path/to/Aluminum/source
44 | ```
45 | 
46 | For full details on building, configuration, testing, and benchmarking, see the [documentation](https://aluminum.readthedocs.io/en/latest/build.html).
47 | 
48 | ## Authors
49 | 
50 | * [Nikoli Dryden](https://github.com/ndryden)
51 | * [Naoya Maruyama](https://github.com/naoyam)
52 | * [Tom Benson](https://github.com/benson31)
53 | * Andy Yoo
54 | 
55 | See also [contributors](https://github.com/ndryden/Aluminum/graphs/contributors).
56 | 
57 | ## License
58 | 
59 | Aluminum is licensed under the Apache License, Version 2.0. See [LICENSE](LICENSE) for details.
60 | 


--------------------------------------------------------------------------------
/cmake/FindRoctracer.cmake:
--------------------------------------------------------------------------------
 1 | # Sets the following variables
 2 | #
 3 | #   Roctracer_FOUND
 4 | #   Roctracer_LIBRARIES
 5 | #
 6 | # Defines the following imported target:
 7 | #
 8 | #   roctracer::roctracer
 9 | #   roctracer::roctracer_api
10 | #   roctracer::roctx_api
11 | #
12 | 
13 | set(_supported_components roctracer roctx)
14 | if (NOT Roctracer_FIND_COMPONENTS)
15 |   set(Roctracer_FIND_COMPONENTS ${_supported_components})
16 | endif ()
17 | 
18 | foreach (comp IN LISTS Roctracer_FIND_COMPONENTS)
19 |   if (NOT ${comp} IN_LIST _supported_components)
20 |     message(FATAL_ERROR
21 |       "Cannot specify component \"${comp}\" for package Roctracer. "
22 |       "Supported components are: ${_supported_components}.")
23 |   endif ()
24 | 
25 |   set(_header_name "${comp}.h")
26 |   set(_lib_name "${comp}64")
27 | 
28 |   find_path(${comp}_INCLUDE_PATH ${_header_name}
29 |     HINTS ${ROCM_PATH}/roctracer $ENV{ROCM_PATH}/roctracer
30 |     PATH_SUFFIXES include
31 |     DOC "The ${comp} include directory for roctracer."
32 |     NO_DEFAULT_PATH)
33 |   find_path(${comp}_INCLUDE_PATH ${_header_name}
34 |     HINTS ${ROCM_PATH}/include/roctracer $ENV{ROCM_PATH}/include/roctracer
35 |     DOC "The ${comp} include directory for roctracer."
36 |     NO_DEFAULT_PATH)
37 |   find_path(${comp}_INCLUDE_PATH ${_header_name})
38 | 
39 |   find_library(${comp}_LIBRARY ${_lib_name}
40 |     HINTS ${ROCM_PATH}/roctracer $ENV{ROCM_PATH}/roctracer
41 |     HINTS ${ROCM_PATH} $ENV{ROCM_PATH}
42 |     PATH_SUFFIXES lib64 lib
43 |     DOC "The ${comp} library for roctracer."
44 |     NO_DEFAULT_PATH)
45 |   find_library(${comp}_LIBRARY ${_lib_name})
46 | 
47 |   if (${comp}_LIBRARY AND ${comp}_INCLUDE_PATH)
48 |     set(Roctracer_${comp}_FOUND TRUE)
49 | 
50 |     if (NOT TARGET roctracer::${comp}_api)
51 |       add_library(roctracer::${comp}_api INTERFACE IMPORTED)
52 |     endif ()
53 |     target_link_libraries(roctracer::${comp}_api INTERFACE
54 |       "${${comp}_LIBRARY}")
55 |     target_include_directories(roctracer::${comp}_api INTERFACE
56 |       "${${comp}_INCLUDE_PATH}")
57 | 
58 |     mark_as_advanced(${comp}_LIBRARY)
59 |     mark_as_advanced(${comp}_INCLUDE_PATH)
60 | 
61 |     list(APPEND _imported_libraries roctracer::${comp}_api)
62 |   else ()
63 |     set(Roctracer_${comp}_FOUND FALSE)
64 |   endif ()
65 | endforeach ()
66 | 
67 | include(FindPackageHandleStandardArgs)
68 | find_package_handle_standard_args(Roctracer HANDLE_COMPONENTS)
69 | 
70 | if (Roctracer_FOUND)
71 |   if (NOT TARGET roctracer::roctracer)
72 |     add_library(roctracer::roctracer INTERFACE IMPORTED)
73 |   endif ()
74 |   foreach (lib IN LISTS _imported_libraries)
75 |     target_link_libraries(roctracer::roctracer INTERFACE ${lib})
76 |   endforeach ()
77 |   set(Roctracer_LIBRARIES roctracer::roctracer)
78 | endif (Roctracer_FOUND)
79 | 


--------------------------------------------------------------------------------
/include/aluminum/mpi_cuda/communicator.hpp:
--------------------------------------------------------------------------------
 1 | ////////////////////////////////////////////////////////////////////////////////
 2 | // Copyright (c) 2018, Lawrence Livermore National Security, LLC.  Produced at the
 3 | // Lawrence Livermore National Laboratory in collaboration with University of
 4 | // Illinois Urbana-Champaign.
 5 | //
 6 | // Written by the LBANN Research Team (N. Dryden, N. Maruyama, et al.) listed in
 7 | // the CONTRIBUTORS file. <lbann-dev@llnl.gov>
 8 | //
 9 | // LLNL-CODE-756777.
10 | // All rights reserved.
11 | //
12 | // This file is part of Aluminum GPU-aware Communication Library. For details, see
13 | // http://software.llnl.gov/Aluminum or https://github.com/LLNL/Aluminum.
14 | //
15 | // Licensed under the Apache License, Version 2.0 (the "Licensee"); you
16 | // may not use this file except in compliance with the License.  You may
17 | // obtain a copy of the License at:
18 | //
19 | // http://www.apache.org/licenses/LICENSE-2.0
20 | //
21 | // Unless required by applicable law or agreed to in writing, software
22 | // distributed under the License is distributed on an "AS IS" BASIS,
23 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
24 | // implied. See the License for the specific language governing
25 | // permissions and limitations under the license.
26 | ////////////////////////////////////////////////////////////////////////////////
27 | 
28 | #pragma once
29 | 
30 | #include <memory>
31 | #include "Al.hpp"
32 | #include "aluminum/mpi_comm_and_stream_wrapper.hpp"
33 | #include "aluminum/mpi/communicator.hpp"
34 | 
35 | namespace Al {
36 | namespace internal {
37 | namespace mpi_cuda {
38 | 
39 | #ifdef AL_HAS_MPI_CUDA_RMA
40 | class RMA;
41 | #endif
42 | 
43 | class MPICUDACommunicator: public MPICommAndStreamWrapper<cudaStream_t> {
44 |  public:
45 |   MPICUDACommunicator()
46 |     : MPICUDACommunicator(mpi::get_world_comm().get_comm(), 0) {}
47 |   MPICUDACommunicator(MPI_Comm comm_, cudaStream_t stream_)
48 |     : MPICommAndStreamWrapper(comm_, stream_)
49 | #ifdef AL_HAS_MPI_CUDA_RMA
50 |     , m_rma(nullptr)
51 | #endif
52 |   {}
53 |   MPICUDACommunicator(const MPICUDACommunicator& other) = delete;
54 |   MPICUDACommunicator(MPICUDACommunicator&& other) = default;
55 |   MPICUDACommunicator& operator=(const MPICUDACommunicator& other) = delete;
56 |   MPICUDACommunicator& operator=(MPICUDACommunicator&& other) = default;
57 | 
58 | #ifdef AL_HAS_MPI_CUDA_RMA
59 |   RMA &get_rma();
60 | #endif
61 | 
62 |   ~MPICUDACommunicator();
63 | 
64 |   MPICUDACommunicator copy(cudaStream_t stream = 0) const {
65 |     return MPICUDACommunicator(get_comm(), stream);
66 |   }
67 | 
68 |  protected:
69 | #ifdef AL_HAS_MPI_CUDA_RMA
70 |   std::shared_ptr<RMA> m_rma;
71 | #endif
72 | };
73 | 
74 | } // namespace mpi_cuda
75 | } // namespace internal
76 | } // namespace Al
77 | 


--------------------------------------------------------------------------------
/include/aluminum/mpi_cuda/rma_self.hpp:
--------------------------------------------------------------------------------
 1 | ////////////////////////////////////////////////////////////////////////////////
 2 | // Copyright (c) 2018, Lawrence Livermore National Security, LLC.  Produced at the
 3 | // Lawrence Livermore National Laboratory in collaboration with University of
 4 | // Illinois Urbana-Champaign.
 5 | //
 6 | // Written by the LBANN Research Team (N. Dryden, N. Maruyama, et al.) listed in
 7 | // the CONTRIBUTORS file. <lbann-dev@llnl.gov>
 8 | //
 9 | // LLNL-CODE-756777.
10 | // All rights reserved.
11 | //
12 | // This file is part of Aluminum GPU-aware Communication Library. For details, see
13 | // http://software.llnl.gov/Aluminum or https://github.com/LLNL/Aluminum.
14 | //
15 | // Licensed under the Apache License, Version 2.0 (the "Licensee"); you
16 | // may not use this file except in compliance with the License.  You may
17 | // obtain a copy of the License at:
18 | //
19 | // http://www.apache.org/licenses/LICENSE-2.0
20 | //
21 | // Unless required by applicable law or agreed to in writing, software
22 | // distributed under the License is distributed on an "AS IS" BASIS,
23 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
24 | // implied. See the License for the specific language governing
25 | // permissions and limitations under the license.
26 | ////////////////////////////////////////////////////////////////////////////////
27 | 
28 | #pragma once
29 | 
30 | #include "aluminum/mpi_cuda/communicator.hpp"
31 | #include "aluminum/mpi_cuda/rma.hpp"
32 | 
33 | namespace Al {
34 | namespace internal {
35 | namespace mpi_cuda {
36 | 
37 | class ConnectionSelf: public Connection {
38 |  public:
39 |   ConnectionSelf(MPICUDACommunicator &comm, int peer):
40 |       Connection(comm, peer) {}
41 |   ~ConnectionSelf() {}
42 |   void connect() {}
43 |   void disconnect() {}
44 |   void *attach_remote_buffer(void *local_addr) {
45 |     return local_addr;
46 |   }
47 |   void detach_remote_buffer(void *) {}
48 |   void detach_all_remote_buffers() {}
49 |   void notify(mpi::AlMPIReq &req) {
50 |     req->store(true, std::memory_order_release);
51 |   }
52 |   void wait(mpi::AlMPIReq &req) {
53 |     req->store(true, std::memory_order_release);
54 |   }
55 |   void sync(mpi::AlMPIReq &req) {
56 |     req->store(true, std::memory_order_release);
57 |   }
58 |   void put(const void *src, void *dst,
59 |            size_t size) {
60 |     if (size > 0) {
61 |       if (src == nullptr) {
62 |         throw_al_exception("Source buffer is null");
63 |       }
64 |       if (dst == nullptr) {
65 |         throw_al_exception("Destination buffer is null");
66 |       }
67 |       AL_CHECK_CUDA(cudaMemcpyAsync(
68 |           dst, src, size, cudaMemcpyDefault, m_comm.get_stream()));
69 |     }
70 |   }
71 | };
72 | 
73 | } // namespace mpi_cuda
74 | } // namespace internal
75 | } // namespace Al
76 | 


--------------------------------------------------------------------------------
/src/cuda/helper_kernels.cu:
--------------------------------------------------------------------------------
 1 | ////////////////////////////////////////////////////////////////////////////////
 2 | // Copyright (c) 2018, Lawrence Livermore National Security, LLC.  Produced at the
 3 | // Lawrence Livermore National Laboratory in collaboration with University of
 4 | // Illinois Urbana-Champaign.
 5 | //
 6 | // Written by the LBANN Research Team (N. Dryden, N. Maruyama, et al.) listed in
 7 | // the CONTRIBUTORS file. <lbann-dev@llnl.gov>
 8 | //
 9 | // LLNL-CODE-756777.
10 | // All rights reserved.
11 | //
12 | // This file is part of Aluminum GPU-aware Communication Library. For details, see
13 | // http://software.llnl.gov/Aluminum or https://github.com/LLNL/Aluminum.
14 | //
15 | // Licensed under the Apache License, Version 2.0 (the "Licensee"); you
16 | // may not use this file except in compliance with the License.  You may
17 | // obtain a copy of the License at:
18 | //
19 | // http://www.apache.org/licenses/LICENSE-2.0
20 | //
21 | // Unless required by applicable law or agreed to in writing, software
22 | // distributed under the License is distributed on an "AS IS" BASIS,
23 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
24 | // implied. See the License for the specific language governing
25 | // permissions and limitations under the license.
26 | ////////////////////////////////////////////////////////////////////////////////
27 | 
28 | #include <Al_config.hpp>
29 | 
30 | #if defined AL_HAS_ROCM
31 | #include <hip/hip_runtime.h>
32 | #elif defined AL_HAS_CUDA
33 | #include <cuda_runtime.h>
34 | #endif
35 | 
36 | #include "aluminum/cuda/helper_kernels.hpp"
37 | 
38 | namespace Al {
39 | namespace internal {
40 | namespace cuda {
41 | 
42 | __global__ void spin_wait_kernel(int32_t wait_value, volatile int32_t* wait_mem) {
43 |   for (;;)
44 |   {
45 |     __threadfence_system();
46 |     int32_t value = *wait_mem;
47 |     if (value == wait_value) break;
48 |   }
49 | }
50 | 
51 | void launch_wait_kernel(AlGpuStream_t stream,
52 |                         int32_t wait_value,
53 |                         volatile int32_t* wait_mem) {
54 |   spin_wait_kernel<<<1,1,0,stream>>>(wait_value, wait_mem);
55 | }
56 | 
57 | #if defined AL_HAS_ROCM
58 | void launch_wait_kernel(hipStream_t stream,
59 |                         int32_t wait_value,
60 |                         hipDeviceptr_t wait_mem) {
61 |   AL_CHECK_CUDA(hipStreamWaitValue32(
62 |                       stream, wait_mem, wait_value, hipStreamWaitValueEq));
63 | }
64 | #elif defined AL_HAS_CUDA
65 | void launch_wait_kernel(cudaStream_t stream,
66 |                         int32_t wait_value,
67 |                         CUdeviceptr wait_mem) {
68 |   AL_CHECK_CUDA_DRV(cuStreamWaitValue32(
69 |                       stream, wait_mem, wait_value, CU_STREAM_WAIT_VALUE_EQ));
70 | }
71 | #endif
72 | 
73 | } // namespace cuda
74 | } // namespace internal
75 | } // namespace Al
76 | 


--------------------------------------------------------------------------------
/include/aluminum/cuda/gpu_wait.hpp:
--------------------------------------------------------------------------------
 1 | ////////////////////////////////////////////////////////////////////////////////
 2 | // Copyright (c) 2018, Lawrence Livermore National Security, LLC.  Produced at the
 3 | // Lawrence Livermore National Laboratory in collaboration with University of
 4 | // Illinois Urbana-Champaign.
 5 | //
 6 | // Written by the LBANN Research Team (N. Dryden, N. Maruyama, et al.) listed in
 7 | // the CONTRIBUTORS file. <lbann-dev@llnl.gov>
 8 | //
 9 | // LLNL-CODE-756777.
10 | // All rights reserved.
11 | //
12 | // This file is part of Aluminum GPU-aware Communication Library. For details, see
13 | // http://software.llnl.gov/Aluminum or https://github.com/LLNL/Aluminum.
14 | //
15 | // Licensed under the Apache License, Version 2.0 (the "Licensee"); you
16 | // may not use this file except in compliance with the License.  You may
17 | // obtain a copy of the License at:
18 | //
19 | // http://www.apache.org/licenses/LICENSE-2.0
20 | //
21 | // Unless required by applicable law or agreed to in writing, software
22 | // distributed under the License is distributed on an "AS IS" BASIS,
23 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
24 | // implied. See the License for the specific language governing
25 | // permissions and limitations under the license.
26 | ////////////////////////////////////////////////////////////////////////////////
27 | 
28 | #pragma once
29 | 
30 | #include <Al_config.hpp>
31 | 
32 | #include <cstdint>
33 | 
34 | #if defined AL_HAS_ROCM
35 | #include <hip/hip_runtime.h>
36 | #elif defined AL_HAS_CUDA
37 | #include <cuda.h>
38 | #include <cuda_runtime.h>
39 | #endif
40 | 
41 | namespace Al {
42 | namespace internal {
43 | namespace cuda {
44 | 
45 | /**
46 |  * Have a GPU stream block until signalled.
47 |  * This essentially uses full/empty bit semantics to implement synchronization.
48 |  * The GPU will wait on a memory location until the host writes to it using the
49 |  * stream memory wait operation.
50 |  *
51 |  * If stream memory operations are not available, this will use a
52 |  * spinning wait kernel. This can cause problems. It has a tendency to
53 |  * lead to deadlock, especially in "debug" mode. Also, if kernel
54 |  * timeout is enabled, this is likely to error out.
55 |  */
56 | class GPUWait {
57 |  public:
58 |   GPUWait();
59 |   ~GPUWait();
60 |   /** Enqueue a wait onto stream. */
61 |   void wait(AlGpuStream_t stream);
62 |   /** Signal the stream to continue. */
63 |   void signal();
64 |  private:
65 |   int32_t* wait_sync __attribute__((aligned(64)));
66 |   union {
67 |     int32_t *wait_sync_dev_ptr_no_stream_mem_ops __attribute__((aligned(64)));
68 | #if defined AL_HAS_ROCM
69 |     hipDeviceptr_t wait_sync_dev_ptr;
70 | #elif defined AL_HAS_CUDA
71 |     CUdeviceptr wait_sync_dev_ptr;
72 | #endif
73 |   };
74 | };
75 | 
76 | }  // namespace cuda
77 | }  // namespace internal
78 | }  // namespace Al
79 | 


--------------------------------------------------------------------------------
/src/cuda/gpu_wait.cpp:
--------------------------------------------------------------------------------
 1 | ////////////////////////////////////////////////////////////////////////////////
 2 | // Copyright (c) 2018, Lawrence Livermore National Security, LLC.  Produced at the
 3 | // Lawrence Livermore National Laboratory in collaboration with University of
 4 | // Illinois Urbana-Champaign.
 5 | //
 6 | // Written by the LBANN Research Team (N. Dryden, N. Maruyama, et al.) listed in
 7 | // the CONTRIBUTORS file. <lbann-dev@llnl.gov>
 8 | //
 9 | // LLNL-CODE-756777.
10 | // All rights reserved.
11 | //
12 | // This file is part of Aluminum GPU-aware Communication Library. For details, see
13 | // http://software.llnl.gov/Aluminum or https://github.com/LLNL/Aluminum.
14 | //
15 | // Licensed under the Apache License, Version 2.0 (the "Licensee"); you
16 | // may not use this file except in compliance with the License.  You may
17 | // obtain a copy of the License at:
18 | //
19 | // http://www.apache.org/licenses/LICENSE-2.0
20 | //
21 | // Unless required by applicable law or agreed to in writing, software
22 | // distributed under the License is distributed on an "AS IS" BASIS,
23 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
24 | // implied. See the License for the specific language governing
25 | // permissions and limitations under the license.
26 | ////////////////////////////////////////////////////////////////////////////////
27 | 
28 | #include "aluminum/cuda/gpu_wait.hpp"
29 | 
30 | #include "aluminum/cuda/cuda.hpp"
31 | #include "aluminum/cuda/helper_kernels.hpp"
32 | #include "aluminum/cuda/sync_memory.hpp"
33 | 
34 | namespace Al {
35 | namespace internal {
36 | namespace cuda {
37 | 
38 | GPUWait::GPUWait()
39 |   : wait_sync(sync_pool.get())
40 | {
41 |   // An atomic here may be overkill.
42 |   // Can't use std::atomic because we need the actual address.
43 |   __atomic_store_n(wait_sync, 0, __ATOMIC_SEQ_CST);
44 | 
45 |   if (stream_memory_operations_supported()) {
46 | #if defined AL_HAS_ROCM
47 |     AL_CHECK_CUDA(
48 |       hipHostGetDevicePointer(&wait_sync_dev_ptr, wait_sync, 0));
49 | #elif defined AL_HAS_CUDA
50 |     AL_CHECK_CUDA_DRV(
51 |         cuMemHostGetDevicePointer(&wait_sync_dev_ptr, wait_sync, 0));
52 | #endif
53 |   } else {
54 |     AL_CHECK_CUDA(AlGpuHostGetDevicePointer(
55 |         reinterpret_cast<void **>(&wait_sync_dev_ptr_no_stream_mem_ops),
56 |         wait_sync, 0));
57 |   }
58 | }
59 | 
60 | GPUWait::~GPUWait() {
61 |   sync_pool.release(wait_sync);
62 | }
63 | 
64 | void GPUWait::wait(AlGpuStream_t stream) {
65 |   if (stream_memory_operations_supported()) {
66 |     launch_wait_kernel(stream, 1, wait_sync_dev_ptr);
67 |   } else {
68 |     launch_wait_kernel(stream, 1, wait_sync_dev_ptr_no_stream_mem_ops);
69 |   }
70 | }
71 | 
72 | void GPUWait::signal() {
73 |   __atomic_store_n(wait_sync, 1, __ATOMIC_SEQ_CST);
74 | }
75 | 
76 | }  // namespace cuda
77 | }  // namespace internal
78 | }  // namespace Al
79 | 


--------------------------------------------------------------------------------
/benchmark/wait.cu:
--------------------------------------------------------------------------------
 1 | ////////////////////////////////////////////////////////////////////////////////
 2 | // Copyright (c) 2018, Lawrence Livermore National Security, LLC.  Produced at the
 3 | // Lawrence Livermore National Laboratory in collaboration with University of
 4 | // Illinois Urbana-Champaign.
 5 | //
 6 | // Written by the LBANN Research Team (N. Dryden, N. Maruyama, et al.) listed in
 7 | // the CONTRIBUTORS file. <lbann-dev@llnl.gov>
 8 | //
 9 | // LLNL-CODE-756777.
10 | // All rights reserved.
11 | //
12 | // This file is part of Aluminum GPU-aware Communication Library. For details, see
13 | // http://software.llnl.gov/Aluminum or https://github.com/LLNL/Aluminum.
14 | //
15 | // Licensed under the Apache License, Version 2.0 (the "Licensee"); you
16 | // may not use this file except in compliance with the License.  You may
17 | // obtain a copy of the License at:
18 | //
19 | // http://www.apache.org/licenses/LICENSE-2.0
20 | //
21 | // Unless required by applicable law or agreed to in writing, software
22 | // distributed under the License is distributed on an "AS IS" BASIS,
23 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
24 | // implied. See the License for the specific language governing
25 | // permissions and limitations under the license.
26 | ////////////////////////////////////////////////////////////////////////////////
27 | #include <Al_config.hpp>
28 | 
29 | #if defined AL_HAS_ROCM
30 | #include <hip/hip_runtime.h>
31 | #elif defined AL_HAS_CUDA
32 | #include <cuda.h>
33 | #include <cuda_runtime.h>
34 | #endif
35 | 
36 | namespace {
37 | 
38 | __global__ void wait_kernel(long long int cycles) {
39 |   // Doesn't handle the clock wrapping.
40 |   // Seems to wait longer than expected, but not an issue right now.
41 |   const long long int start = clock64();
42 |   long long int cur;
43 |   do {
44 |     cur = clock64();
45 |   } while (cur - start < cycles);
46 | }
47 | 
48 | }  // anonymous namespace
49 | 
50 | #if defined AL_HAS_ROCM
51 | #define AlGpuDevAttrClockRate hipDeviceAttributeClockRate
52 | #elif defined AL_HAS_CUDA
53 | #define AlGpuDevAttrClockRate cudaDevAttrClockRate
54 | #endif
55 | 
56 | void gpu_wait(double length, AlGpuStream_t stream) {
57 |   // Need to figure out frequency to convert seconds to cycles.
58 |   // Might not be exactly accurate (especially w/ dynamic frequencies).
59 |   // Cache this (unlikely we run on devices with different frequencies.)
60 |   static long long int freq_hz = 0;
61 |   if (freq_hz == 0) {
62 |     int device, freq_khz;
63 |     static_cast<void>(AlGpuGetDevice(&device));
64 |     static_cast<void>(AlGpuDeviceGetAttribute(&freq_khz,
65 |                                               AlGpuDevAttrClockRate,
66 |                                               device));
67 |     freq_hz = (long long int) freq_khz * 1000;  // Convert from KHz.
68 |   }
69 |   double cycles = length * freq_hz;
70 |   wait_kernel<<<1, 1, 0, stream>>>((long long int) cycles);
71 | }
72 | 


--------------------------------------------------------------------------------
/include/aluminum/mpi/base_state.hpp:
--------------------------------------------------------------------------------
 1 | ////////////////////////////////////////////////////////////////////////////////
 2 | // Copyright (c) 2018, Lawrence Livermore National Security, LLC.  Produced at the
 3 | // Lawrence Livermore National Laboratory in collaboration with University of
 4 | // Illinois Urbana-Champaign.
 5 | //
 6 | // Written by the LBANN Research Team (N. Dryden, N. Maruyama, et al.) listed in
 7 | // the CONTRIBUTORS file. <lbann-dev@llnl.gov>
 8 | //
 9 | // LLNL-CODE-756777.
10 | // All rights reserved.
11 | //
12 | // This file is part of Aluminum GPU-aware Communication Library. For details, see
13 | // http://software.llnl.gov/Aluminum or https://github.com/LLNL/Aluminum.
14 | //
15 | // Licensed under the Apache License, Version 2.0 (the "Licensee"); you
16 | // may not use this file except in compliance with the License.  You may
17 | // obtain a copy of the License at:
18 | //
19 | // http://www.apache.org/licenses/LICENSE-2.0
20 | //
21 | // Unless required by applicable law or agreed to in writing, software
22 | // distributed under the License is distributed on an "AS IS" BASIS,
23 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
24 | // implied. See the License for the specific language governing
25 | // permissions and limitations under the license.
26 | ////////////////////////////////////////////////////////////////////////////////
27 | 
28 | #pragma once
29 | 
30 | #include <atomic>
31 | #include <memory>
32 | #include <mpi.h>
33 | #include "aluminum/progress.hpp"
34 | 
35 | namespace Al {
36 | namespace internal {
37 | namespace mpi {
38 | 
39 | using AlMPIReq = std::shared_ptr<std::atomic<bool>>;
40 | 
41 | /** Return a free request for use. */
42 | inline AlMPIReq get_free_request() {
43 |   return std::make_shared<std::atomic<bool>>(false);
44 | }
45 | 
46 | class MPIState : public AlState {
47 | public:
48 |   MPIState(AlMPIReq req_) : req(req_) {}
49 | 
50 |   void start() override {
51 |     AlState::start();
52 |     start_mpi_op();
53 |   }
54 | 
55 |   PEAction step() override {
56 |     if (poll_mpi()) {
57 |       // Mark the request as completed.
58 |       req->store(true, std::memory_order_release);
59 |       return PEAction::complete;
60 |     } else {
61 |       return PEAction::cont;
62 |     }
63 |   }
64 | 
65 | protected:
66 |   /** Start the MPI operation and set the request. */
67 |   virtual void start_mpi_op() = 0;
68 |   /** Return the MPI request that will be polled on. */
69 |   MPI_Request* get_mpi_req() { return &mpi_req; }
70 |   /** Return true when the MPI operation is complete. */
71 |   virtual bool poll_mpi() {
72 |     int flag;
73 |     MPI_Test(get_mpi_req(), &flag, MPI_STATUS_IGNORE);
74 |     return flag;
75 |   }
76 | 
77 | private:
78 |   /** Copy of the user's request object. */
79 |   AlMPIReq req;
80 |   /** MPI request associated with the operation. */
81 |   MPI_Request mpi_req = MPI_REQUEST_NULL;
82 | };
83 | 
84 | }  // namespace mpi
85 | }  // namespace internal
86 | }  // namespace Al
87 | 


--------------------------------------------------------------------------------
/test/test_utils_mpi_cuda.hpp:
--------------------------------------------------------------------------------
 1 | ////////////////////////////////////////////////////////////////////////////////
 2 | // Copyright (c) 2018, Lawrence Livermore National Security, LLC.  Produced at the
 3 | // Lawrence Livermore National Laboratory in collaboration with University of
 4 | // Illinois Urbana-Champaign.
 5 | //
 6 | // Written by the LBANN Research Team (N. Dryden, N. Maruyama, et al.) listed in
 7 | // the CONTRIBUTORS file. <lbann-dev@llnl.gov>
 8 | //
 9 | // LLNL-CODE-756777.
10 | // All rights reserved.
11 | //
12 | // This file is part of Aluminum GPU-aware Communication Library. For details, see
13 | // http://software.llnl.gov/Aluminum or https://github.com/LLNL/Aluminum.
14 | //
15 | // Licensed under the Apache License, Version 2.0 (the "Licensee"); you
16 | // may not use this file except in compliance with the License.  You may
17 | // obtain a copy of the License at:
18 | //
19 | // http://www.apache.org/licenses/LICENSE-2.0
20 | //
21 | // Unless required by applicable law or agreed to in writing, software
22 | // distributed under the License is distributed on an "AS IS" BASIS,
23 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
24 | // implied. See the License for the specific language governing
25 | // permissions and limitations under the license.
26 | ////////////////////////////////////////////////////////////////////////////////
27 | 
28 | #pragma once
29 | 
30 | #include "Al.hpp"
31 | 
32 | #include "test_utils.hpp"
33 | #include "test_utils_mpi.hpp"
34 | #include "cuda_vector.hpp"
35 | 
36 | 
37 | template <typename T>
38 | struct VectorType<T, Al::MPICUDABackend> {
39 |   using type = CUDAVector<T>;
40 | 
41 |   static type gen_data(size_t count, cudaStream_t stream = 0) {
42 |     auto&& host_data = VectorType<T, Al::MPIBackend>::gen_data(count);
43 |     CUDAVector<T> data(host_data, stream);
44 |     return data;
45 |   }
46 | 
47 |   static std::vector<T> copy_to_host(const type& v) {
48 |     return v.copyout();
49 |   }
50 | };
51 | 
52 | // Specialize to use the Aluminum stream pool, and size it appropriately.
53 | template <>
54 | struct StreamManager<Al::MPICUDABackend> {
55 |   using StreamType = cudaStream_t;
56 | 
57 |   static void init(size_t num_streams) {
58 |     Al::internal::cuda::stream_pool.clear();
59 |     Al::internal::cuda::stream_pool.allocate(num_streams);
60 |   }
61 |   static void finalize() {}
62 |   static StreamType get_stream() {
63 |     return Al::internal::cuda::stream_pool.get_stream();
64 |   }
65 | };
66 | 
67 | // Specialize to create a CUDA stream with the communicator.
68 | template <>
69 | CommWrapper<Al::MPICUDABackend>::CommWrapper(MPI_Comm mpi_comm) {
70 |   comm_ = std::make_unique<typename Al::MPICUDABackend::comm_type>(
71 |     mpi_comm, StreamManager<Al::MPICUDABackend>::get_stream());
72 | }
73 | 
74 | template <>
75 | void complete_operations<Al::MPICUDABackend>(
76 |   typename Al::MPICUDABackend::comm_type& comm) {
77 |   AL_FORCE_CHECK_GPU_NOSYNC(cudaStreamSynchronize(comm.get_stream()));
78 | }
79 | 


--------------------------------------------------------------------------------
/include/aluminum/cuda/cuda_mempool.hpp:
--------------------------------------------------------------------------------
 1 | ////////////////////////////////////////////////////////////////////////////////
 2 | // Copyright (c) 2018, Lawrence Livermore National Security, LLC.  Produced at the
 3 | // Lawrence Livermore National Laboratory in collaboration with University of
 4 | // Illinois Urbana-Champaign.
 5 | //
 6 | // Written by the LBANN Research Team (N. Dryden, N. Maruyama, et al.) listed in
 7 | // the CONTRIBUTORS file. <lbann-dev@llnl.gov>
 8 | //
 9 | // LLNL-CODE-756777.
10 | // All rights reserved.
11 | //
12 | // This file is part of Aluminum GPU-aware Communication Library. For details, see
13 | // http://software.llnl.gov/Aluminum or https://github.com/LLNL/Aluminum.
14 | //
15 | // Licensed under the Apache License, Version 2.0 (the "Licensee"); you
16 | // may not use this file except in compliance with the License.  You may
17 | // obtain a copy of the License at:
18 | //
19 | // http://www.apache.org/licenses/LICENSE-2.0
20 | //
21 | // Unless required by applicable law or agreed to in writing, software
22 | // distributed under the License is distributed on an "AS IS" BASIS,
23 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
24 | // implied. See the License for the specific language governing
25 | // permissions and limitations under the license.
26 | ////////////////////////////////////////////////////////////////////////////////
27 | 
28 | #pragma once
29 | 
30 | #include <Al_config.hpp>
31 | 
32 | #include "aluminum/utils/caching_allocator.hpp"
33 | #include "aluminum/cuda/cuda.hpp"
34 | #if defined AL_HAS_ROCM
35 | #include <hipcub/hipcub.hpp>
36 | #define AL_CUB_NS hipcub
37 | #elif defined AL_HAS_CUDA
38 | #include <cub/util_allocator.cuh>
39 | #define AL_CUB_NS cub
40 | #endif
41 | 
42 | namespace Al {
43 | namespace internal {
44 | 
45 | /** Allocator for pinned host memory. */
46 | struct CUDAPinnedMemoryAllocator {
47 |   void* allocate(size_t bytes) {
48 |     void* ptr;
49 |     AL_CHECK_CUDA(AlGpuMallocHost(&ptr, bytes));
50 |     return ptr;
51 |   }
52 | 
53 |   void deallocate(void* ptr) {
54 |     AL_CHECK_CUDA(AlGpuFreeHost(ptr));
55 |   }
56 | };
57 | 
58 | /** Specialized caching allocator for CUDA using CUB. */
59 | template <>
60 | class CachingAllocator<MemoryType::CUDA, void, void> {
61 | public:
62 |   CachingAllocator() : cub_pool(2u) {}
63 | 
64 |   ~CachingAllocator() {
65 |     clear();
66 |   }
67 | 
68 |   template <typename T>
69 |   T* allocate(size_t size, AlGpuStream_t stream) {
70 |     T* mem = nullptr;
71 |     AL_CHECK_CUDA(cub_pool.DeviceAllocate(reinterpret_cast<void**>(&mem),
72 |                                           sizeof(T)*size, stream));
73 |     return mem;
74 |   }
75 | 
76 |   template <typename T>
77 |   void release(T* ptr) {
78 |     AL_CHECK_CUDA(cub_pool.DeviceFree(ptr));
79 |   }
80 | 
81 |   void clear() { AL_IGNORE_NODISCARD(cub_pool.FreeAllCached()); }
82 | 
83 | private:
84 |   AL_CUB_NS::CachingDeviceAllocator cub_pool;
85 | };
86 | 
87 | }  // namespace internal
88 | }  // namespace Al
89 | 


--------------------------------------------------------------------------------
/util/al_info.cpp:
--------------------------------------------------------------------------------
 1 | ////////////////////////////////////////////////////////////////////////////////
 2 | // Copyright (c) 2018, Lawrence Livermore National Security, LLC.  Produced at the
 3 | // Lawrence Livermore National Laboratory in collaboration with University of
 4 | // Illinois Urbana-Champaign.
 5 | //
 6 | // Written by the LBANN Research Team (N. Dryden, N. Maruyama, et al.) listed in
 7 | // the CONTRIBUTORS file. <lbann-dev@llnl.gov>
 8 | //
 9 | // LLNL-CODE-756777.
10 | // All rights reserved.
11 | //
12 | // This file is part of Aluminum GPU-aware Communication Library. For details, see
13 | // http://software.llnl.gov/Aluminum or https://github.com/LLNL/Aluminum.
14 | //
15 | // Licensed under the Apache License, Version 2.0 (the "Licensee"); you
16 | // may not use this file except in compliance with the License.  You may
17 | // obtain a copy of the License at:
18 | //
19 | // http://www.apache.org/licenses/LICENSE-2.0
20 | //
21 | // Unless required by applicable law or agreed to in writing, software
22 | // distributed under the License is distributed on an "AS IS" BASIS,
23 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
24 | // implied. See the License for the specific language governing
25 | // permissions and limitations under the license.
26 | ////////////////////////////////////////////////////////////////////////////////
27 | 
28 | #include <Al_config.hpp>
29 | #include <aluminum/datatypes.hpp>
30 | 
31 | #include <iostream>
32 | 
33 | 
34 | int main(int, char**) {
35 |   std::cout << "Aluminum "
36 |             << AL_VERSION
37 |             << " (" << AL_GIT_VERSION << ")\n";
38 | 
39 |   std::cout << "Backends:";
40 |   std::cout << " mpi";  // MPI is always present.
41 | #ifdef AL_HAS_NCCL
42 |   std::cout << " nccl";
43 | #endif
44 | #ifdef AL_HAS_HOST_TRANSFER
45 |   std::cout << " ht";
46 | #endif
47 | #ifdef AL_HAS_MPI_CUDA
48 |   std::cout << " mpi-cuda";
49 | #endif
50 |   std::cout << "\n";
51 |   std::cout << "Features:";
52 | #ifdef AL_DEBUG
53 |   std::cout << " debug";
54 | #endif
55 | #ifdef AL_THREAD_MULTIPLE
56 |   std::cout << " thread-multiple";
57 | #endif
58 | #ifdef AL_HAS_CUDA
59 |   std::cout << " cuda";
60 | #endif
61 | #ifdef AL_HAS_ROCM
62 |   std::cout << " rocm";
63 | #endif
64 | #ifdef AL_HAS_MPI_CUDA_RMA
65 |   std::cout << " mpi-cuda-rma";
66 | #endif
67 | #ifdef AL_DEBUG_HANG_CHECK
68 |   std::cout << " hang-check";
69 | #endif
70 | #ifdef AL_HAS_PROF
71 |   std::cout << " prof";
72 | #endif
73 | #ifdef AL_HAS_NVPROF
74 |   std::cout << " nvprof";
75 | #endif
76 | #ifdef AL_HAS_ROCTRACER
77 |   std::cout << " roctracer";
78 | #endif
79 | #ifdef AL_TRACE
80 |   std::cout << " trace";
81 | #endif
82 | #ifdef AL_MPI_SERIALIZE
83 |   std::cout << " mpi-serialize";
84 | #endif
85 | #ifdef AL_HAS_HALF
86 |   std::cout << " half";
87 | #endif
88 | #ifdef AL_HAS_BFLOAT
89 |   std::cout << " bfloat";
90 | #endif
91 | #ifdef AL_HAS_LARGE_COUNT_MPI
92 |   std::cout << " mpi-large-count";
93 | #endif
94 |   std::cout << std::endl;
95 |   return 0;
96 | }
97 | 


--------------------------------------------------------------------------------
/include/aluminum/cuda/gpu_status_flag.hpp:
--------------------------------------------------------------------------------
 1 | ////////////////////////////////////////////////////////////////////////////////
 2 | // Copyright (c) 2018, Lawrence Livermore National Security, LLC.  Produced at the
 3 | // Lawrence Livermore National Laboratory in collaboration with University of
 4 | // Illinois Urbana-Champaign.
 5 | //
 6 | // Written by the LBANN Research Team (N. Dryden, N. Maruyama, et al.) listed in
 7 | // the CONTRIBUTORS file. <lbann-dev@llnl.gov>
 8 | //
 9 | // LLNL-CODE-756777.
10 | // All rights reserved.
11 | //
12 | // This file is part of Aluminum GPU-aware Communication Library. For details, see
13 | // http://software.llnl.gov/Aluminum or https://github.com/LLNL/Aluminum.
14 | //
15 | // Licensed under the Apache License, Version 2.0 (the "Licensee"); you
16 | // may not use this file except in compliance with the License.  You may
17 | // obtain a copy of the License at:
18 | //
19 | // http://www.apache.org/licenses/LICENSE-2.0
20 | //
21 | // Unless required by applicable law or agreed to in writing, software
22 | // distributed under the License is distributed on an "AS IS" BASIS,
23 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
24 | // implied. See the License for the specific language governing
25 | // permissions and limitations under the license.
26 | ////////////////////////////////////////////////////////////////////////////////
27 | 
28 | #pragma once
29 | 
30 | #include <Al_config.hpp>
31 | 
32 | #include <cstdint>
33 | 
34 | #if defined AL_HAS_ROCM
35 | #include <hip/hip_runtime.h>
36 | #elif defined AL_HAS_CUDA
37 | #include <cuda.h>
38 | #include <cuda_runtime.h>
39 | #endif
40 | 
41 | namespace Al {
42 | namespace internal {
43 | namespace cuda {
44 | 
45 | /**
46 |  * An optimized version of CUDA events that only supports polling from the host.
47 |  * This essentially uses full/empty bit semantics to implement synchronization.
48 |  * A memory location is polled on by the host and written to by the device
49 |  * using the stream memory write operation.
50 |  * This falls back to the usual CUDA events when stream memory operations are
51 |  * not available.
52 |  * @note This is currently always falling back on CUDA events to work around a
53 |  * hang, the underlying cause of which has not been diagnosed.
54 |  */
55 | class GPUStatusFlag {
56 |  public:
57 |   /**
58 |    * Allocate the event.
59 |    */
60 |   GPUStatusFlag();
61 |   ~GPUStatusFlag();
62 |   /** Record the event into stream. */
63 |   void record(AlGpuStream_t stream);
64 |   /** Return true if the event has completed. */
65 |   bool query();
66 |  private:
67 |   struct stream_mem_t {
68 |     int32_t* sync_event __attribute__((aligned(64)));
69 | #if defined AL_HAS_ROCM
70 |     hipDeviceptr_t sync_event_dev_ptr;
71 | #elif defined AL_HAS_CUDA
72 |     CUdeviceptr sync_event_dev_ptr;
73 | #endif
74 |   };
75 |   union {
76 |     stream_mem_t stream_mem;
77 |     AlGpuEvent_t plain_event;
78 |   };
79 | };
80 | 
81 | }  // namespace cuda
82 | }  // namespace internal
83 | }  // namespace Al
84 | 


--------------------------------------------------------------------------------
/include/aluminum/mpi/bcast.hpp:
--------------------------------------------------------------------------------
 1 | ////////////////////////////////////////////////////////////////////////////////
 2 | // Copyright (c) 2018, Lawrence Livermore National Security, LLC.  Produced at the
 3 | // Lawrence Livermore National Laboratory in collaboration with University of
 4 | // Illinois Urbana-Champaign.
 5 | //
 6 | // Written by the LBANN Research Team (N. Dryden, N. Maruyama, et al.) listed in
 7 | // the CONTRIBUTORS file. <lbann-dev@llnl.gov>
 8 | //
 9 | // LLNL-CODE-756777.
10 | // All rights reserved.
11 | //
12 | // This file is part of Aluminum GPU-aware Communication Library. For details, see
13 | // http://software.llnl.gov/Aluminum or https://github.com/LLNL/Aluminum.
14 | //
15 | // Licensed under the Apache License, Version 2.0 (the "Licensee"); you
16 | // may not use this file except in compliance with the License.  You may
17 | // obtain a copy of the License at:
18 | //
19 | // http://www.apache.org/licenses/LICENSE-2.0
20 | //
21 | // Unless required by applicable law or agreed to in writing, software
22 | // distributed under the License is distributed on an "AS IS" BASIS,
23 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
24 | // implied. See the License for the specific language governing
25 | // permissions and limitations under the license.
26 | ////////////////////////////////////////////////////////////////////////////////
27 | 
28 | #pragma once
29 | 
30 | #include "aluminum/progress.hpp"
31 | #include "aluminum/mpi/base_state.hpp"
32 | #include "aluminum/mpi/communicator.hpp"
33 | #include "aluminum/mpi/utils.hpp"
34 | 
35 | namespace Al {
36 | namespace internal {
37 | namespace mpi {
38 | 
39 | template <typename T>
40 | void passthrough_bcast(T* buf, size_t count, int root,
41 |                        MPICommunicator& comm) {
42 |   AL_MPI_LARGE_COUNT_CALL(MPI_Bcast)(
43 |     buf, count, TypeMap<T>(), root, comm.get_comm());
44 | }
45 | 
46 | template <typename T>
47 | class BcastAlState : public MPIState {
48 | public:
49 |   BcastAlState(T* buf_, size_t count_, int root_,
50 |                MPICommunicator& comm_, AlMPIReq req_) :
51 |     MPIState(req_),
52 |     buf(buf_), count(count_), root(root_),
53 |     comm(comm_.get_comm()) {}
54 | 
55 |   ~BcastAlState() override {}
56 | 
57 |   std::string get_name() const override { return "MPIBcast"; }
58 | 
59 | protected:
60 |   void start_mpi_op() override {
61 |     AL_MPI_LARGE_COUNT_CALL(MPI_Ibcast)(
62 |       buf, count, TypeMap<T>(), root, comm, get_mpi_req());
63 |   }
64 | 
65 | private:
66 |   T* buf;
67 |   size_t count;
68 |   int root;
69 |   MPI_Comm comm;
70 | };
71 | 
72 | template <typename T>
73 | void passthrough_nb_bcast(T* buf, size_t count, int root,
74 |                           MPICommunicator& comm, AlMPIReq& req) {
75 |   req = get_free_request();
76 |   internal::mpi::BcastAlState<T>* state =
77 |     new internal::mpi::BcastAlState<T>(
78 |       buf, count, root, comm, req);
79 |   get_progress_engine()->enqueue(state);
80 | }
81 | 
82 | }  // namespace mpi
83 | }  // namespace internal
84 | }  // namespace Al
85 | 


--------------------------------------------------------------------------------
/include/aluminum/ht/communicator.hpp:
--------------------------------------------------------------------------------
 1 | ////////////////////////////////////////////////////////////////////////////////
 2 | // Copyright (c) 2018, Lawrence Livermore National Security, LLC.  Produced at the
 3 | // Lawrence Livermore National Laboratory in collaboration with University of
 4 | // Illinois Urbana-Champaign.
 5 | //
 6 | // Written by the LBANN Research Team (N. Dryden, N. Maruyama, et al.) listed in
 7 | // the CONTRIBUTORS file. <lbann-dev@llnl.gov>
 8 | //
 9 | // LLNL-CODE-756777.
10 | // All rights reserved.
11 | //
12 | // This file is part of Aluminum GPU-aware Communication Library. For details, see
13 | // http://software.llnl.gov/Aluminum or https://github.com/LLNL/Aluminum.
14 | //
15 | // Licensed under the Apache License, Version 2.0 (the "Licensee"); you
16 | // may not use this file except in compliance with the License.  You may
17 | // obtain a copy of the License at:
18 | //
19 | // http://www.apache.org/licenses/LICENSE-2.0
20 | //
21 | // Unless required by applicable law or agreed to in writing, software
22 | // distributed under the License is distributed on an "AS IS" BASIS,
23 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
24 | // implied. See the License for the specific language governing
25 | // permissions and limitations under the license.
26 | ////////////////////////////////////////////////////////////////////////////////
27 | 
28 | #pragma once
29 | 
30 | #include <memory>
31 | #include <Al_config.hpp>
32 | #include "aluminum/mpi_comm_and_stream_wrapper.hpp"
33 | #include "aluminum/mpi/communicator.hpp"
34 | #include "aluminum/mpi/utils.hpp"
35 | 
36 | namespace Al {
37 | namespace internal {
38 | namespace ht {
39 | 
40 | // Define the tag that point-to-point operations will use.
41 | constexpr int pt2pt_tag = 2;
42 | 
43 | /** Communicator for host-transfer operations. */
44 | class HostTransferCommunicator: public MPICommAndStreamWrapper<AlGpuStream_t> {
45 |  public:
46 |   /** Use Aluminum's world and the default CUDA stream. */
47 |   HostTransferCommunicator()
48 |     : HostTransferCommunicator(mpi::get_world_comm().get_comm(), 0) {}
49 |   /** Use a particular MPI communicator and stream. */
50 |   HostTransferCommunicator(MPI_Comm comm_, AlGpuStream_t stream_ = 0)
51 |     : MPICommAndStreamWrapper(comm_, stream_) {}
52 |   /** Cannot copy this. */
53 |   HostTransferCommunicator(const HostTransferCommunicator& other) = delete;
54 |   /** Default move constructor. */
55 |   HostTransferCommunicator(HostTransferCommunicator&& other) = default;
56 |   /** Cannot copy this. */
57 |   HostTransferCommunicator& operator=(const HostTransferCommunicator& other) = delete;
58 |   /** Default move assignment operator. */
59 |   HostTransferCommunicator& operator=(HostTransferCommunicator&& other) = default;
60 |   ~HostTransferCommunicator() {}
61 | 
62 |   /**
63 |    * Create a new HostTransfer communicator with the same processes
64 |    * and a new stream.
65 |    */
66 |   HostTransferCommunicator copy(AlGpuStream_t stream = 0) {
67 |     return HostTransferCommunicator(get_comm(), stream);
68 |   }
69 | };
70 | 
71 | } // namespace ht
72 | } // namespace internal
73 | } // namespace Al
74 | 


--------------------------------------------------------------------------------
/test/test_utils_nccl.hpp:
--------------------------------------------------------------------------------
 1 | ////////////////////////////////////////////////////////////////////////////////
 2 | // Copyright (c) 2018, Lawrence Livermore National Security, LLC.  Produced at the
 3 | // Lawrence Livermore National Laboratory in collaboration with University of
 4 | // Illinois Urbana-Champaign.
 5 | //
 6 | // Written by the LBANN Research Team (N. Dryden, N. Maruyama, et al.) listed in
 7 | // the CONTRIBUTORS file. <lbann-dev@llnl.gov>
 8 | //
 9 | // LLNL-CODE-756777.
10 | // All rights reserved.
11 | //
12 | // This file is part of Aluminum GPU-aware Communication Library. For details, see
13 | // http://software.llnl.gov/Aluminum or https://github.com/LLNL/Aluminum.
14 | //
15 | // Licensed under the Apache License, Version 2.0 (the "Licensee"); you
16 | // may not use this file except in compliance with the License.  You may
17 | // obtain a copy of the License at:
18 | //
19 | // http://www.apache.org/licenses/LICENSE-2.0
20 | //
21 | // Unless required by applicable law or agreed to in writing, software
22 | // distributed under the License is distributed on an "AS IS" BASIS,
23 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
24 | // implied. See the License for the specific language governing
25 | // permissions and limitations under the license.
26 | ////////////////////////////////////////////////////////////////////////////////
27 | 
28 | #pragma once
29 | 
30 | #include "Al.hpp"
31 | #include "aluminum/traits/traits.hpp"
32 | 
33 | #include "test_utils.hpp"
34 | #include "test_utils_mpi.hpp"
35 | #include "cuda_vector.hpp"
36 | 
37 | 
38 | template <typename T>
39 | struct VectorType<T, Al::NCCLBackend> {
40 |   using type = CUDAVector<T>;
41 | 
42 |   static type gen_data(size_t count, AlGpuStream_t stream = 0) {
43 |     auto&& host_data = VectorType<T, Al::MPIBackend>::gen_data(count);
44 |     CUDAVector<T> data(host_data, stream);
45 |     return data;
46 |   }
47 | 
48 |   static std::vector<T> copy_to_host(const type& v) {
49 |     return v.copyout();
50 |   }
51 | };
52 | 
53 | // Specialize to use the Aluminum stream pool, and size it appropriately.
54 | template <>
55 | struct StreamManager<Al::NCCLBackend> {
56 |   using StreamType = AlGpuStream_t;
57 | 
58 |   static void init(size_t num_streams) {
59 |     get_stream_pool().allocate(num_streams);
60 |   }
61 |   static void finalize() {
62 |     get_stream_pool().clear();
63 |   }
64 |   static StreamType get_stream() {
65 |     return get_stream_pool().get_stream();
66 |   }
67 | 
68 | private:
69 |   static Al::internal::cuda::StreamPool& get_stream_pool() {
70 |    static Al::internal::cuda::StreamPool streams;
71 |    return streams;
72 |   }
73 | };
74 | 
75 | // Specialize to create a CUDA stream with the communicator.
76 | template <>
77 | CommWrapper<Al::NCCLBackend>::CommWrapper(MPI_Comm mpi_comm) {
78 |   comm_ = std::make_unique<typename Al::NCCLBackend::comm_type>(
79 |     mpi_comm, StreamManager<Al::NCCLBackend>::get_stream());
80 | }
81 | 
82 | template <>
83 | void complete_operations<Al::NCCLBackend>(
84 |   typename Al::NCCLBackend::comm_type& comm) {
85 |   AL_FORCE_CHECK_GPU_NOSYNC(AlGpuStreamSynchronize(comm.get_stream()));
86 | }
87 | 


--------------------------------------------------------------------------------
/test/test_utils_ht.hpp:
--------------------------------------------------------------------------------
 1 | ////////////////////////////////////////////////////////////////////////////////
 2 | // Copyright (c) 2018, Lawrence Livermore National Security, LLC.  Produced at the
 3 | // Lawrence Livermore National Laboratory in collaboration with University of
 4 | // Illinois Urbana-Champaign.
 5 | //
 6 | // Written by the LBANN Research Team (N. Dryden, N. Maruyama, et al.) listed in
 7 | // the CONTRIBUTORS file. <lbann-dev@llnl.gov>
 8 | //
 9 | // LLNL-CODE-756777.
10 | // All rights reserved.
11 | //
12 | // This file is part of Aluminum GPU-aware Communication Library. For details, see
13 | // http://software.llnl.gov/Aluminum or https://github.com/LLNL/Aluminum.
14 | //
15 | // Licensed under the Apache License, Version 2.0 (the "Licensee"); you
16 | // may not use this file except in compliance with the License.  You may
17 | // obtain a copy of the License at:
18 | //
19 | // http://www.apache.org/licenses/LICENSE-2.0
20 | //
21 | // Unless required by applicable law or agreed to in writing, software
22 | // distributed under the License is distributed on an "AS IS" BASIS,
23 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
24 | // implied. See the License for the specific language governing
25 | // permissions and limitations under the license.
26 | ////////////////////////////////////////////////////////////////////////////////
27 | 
28 | #pragma once
29 | 
30 | #include "Al.hpp"
31 | #include "aluminum/traits/traits.hpp"
32 | 
33 | #include "test_utils.hpp"
34 | #include "test_utils_mpi.hpp"
35 | #include "cuda_vector.hpp"
36 | 
37 | 
38 | template <typename T>
39 | struct VectorType<T, Al::HostTransferBackend> {
40 |   using type = CUDAVector<T>;
41 | 
42 |   static type gen_data(size_t count, AlGpuStream_t stream = 0) {
43 |     auto&& host_data = VectorType<T, Al::MPIBackend>::gen_data(count);
44 |     CUDAVector<T> data(host_data, stream);
45 |     return data;
46 |   }
47 | 
48 |   static std::vector<T> copy_to_host(const type& v) {
49 |     return v.copyout();
50 |   }
51 | };
52 | 
53 | // Specialize to use the Aluminum stream pool, and size it appropriately.
54 | template <>
55 | struct StreamManager<Al::HostTransferBackend> {
56 |   using StreamType = AlGpuStream_t;
57 | 
58 |   static void init(size_t num_streams) {
59 |     get_stream_pool().allocate(num_streams);
60 |   }
61 |   static void finalize() {
62 |     get_stream_pool().clear();
63 |   }
64 |   static StreamType get_stream() {
65 |     return get_stream_pool().get_stream();
66 |   }
67 | 
68 | private:
69 |   static Al::internal::cuda::StreamPool& get_stream_pool() {
70 |    static Al::internal::cuda::StreamPool streams;
71 |    return streams;
72 |   }
73 | };
74 | 
75 | // Specialize to create a CUDA stream with the communicator.
76 | template <>
77 | CommWrapper<Al::HostTransferBackend>::CommWrapper(MPI_Comm mpi_comm) {
78 |   comm_ = std::make_unique<typename Al::HostTransferBackend::comm_type>(
79 |     mpi_comm, StreamManager<Al::HostTransferBackend>::get_stream());
80 | }
81 | 
82 | template <>
83 | void complete_operations<Al::HostTransferBackend>(
84 |   typename Al::HostTransferBackend::comm_type& comm) {
85 |   AL_FORCE_CHECK_GPU_NOSYNC(AlGpuStreamSynchronize(comm.get_stream()));
86 | }
87 | 


--------------------------------------------------------------------------------
/include/aluminum/ht/alltoall.hpp:
--------------------------------------------------------------------------------
 1 | ////////////////////////////////////////////////////////////////////////////////
 2 | // Copyright (c) 2018, Lawrence Livermore National Security, LLC.  Produced at the
 3 | // Lawrence Livermore National Laboratory in collaboration with University of
 4 | // Illinois Urbana-Champaign.
 5 | //
 6 | // Written by the LBANN Research Team (N. Dryden, N. Maruyama, et al.) listed in
 7 | // the CONTRIBUTORS file. <lbann-dev@llnl.gov>
 8 | //
 9 | // LLNL-CODE-756777.
10 | // All rights reserved.
11 | //
12 | // This file is part of Aluminum GPU-aware Communication Library. For details, see
13 | // http://software.llnl.gov/Aluminum or https://github.com/LLNL/Aluminum.
14 | //
15 | // Licensed under the Apache License, Version 2.0 (the "Licensee"); you
16 | // may not use this file except in compliance with the License.  You may
17 | // obtain a copy of the License at:
18 | //
19 | // http://www.apache.org/licenses/LICENSE-2.0
20 | //
21 | // Unless required by applicable law or agreed to in writing, software
22 | // distributed under the License is distributed on an "AS IS" BASIS,
23 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
24 | // implied. See the License for the specific language governing
25 | // permissions and limitations under the license.
26 | ////////////////////////////////////////////////////////////////////////////////
27 | 
28 | #pragma once
29 | 
30 | #include "aluminum/cuda/cuda.hpp"
31 | #include "aluminum/ht/communicator.hpp"
32 | #include "aluminum/ht/base_state.hpp"
33 | 
34 | namespace Al {
35 | namespace internal {
36 | namespace ht {
37 | 
38 | template <typename T>
39 | class AlltoallAlState : public HostTransferCollectiveSignalAtEndState {
40 | public:
41 |   AlltoallAlState(const T* sendbuf, T* recvbuf, size_t count_,
42 |                   HostTransferCommunicator& comm_, AlGpuStream_t stream_) :
43 |     HostTransferCollectiveSignalAtEndState(stream_),
44 |     host_mem(mempool.allocate<MemoryType::CUDA_PINNED_HOST, T>(comm_.size()*count_)),
45 |     count(count_),
46 |     comm(comm_.get_comm()) {
47 |     // Transfer data from device to host.
48 |     AL_CHECK_CUDA(AlGpuMemcpyAsync(host_mem, sendbuf, sizeof(T)*count*comm_.size(),
49 |                                   AlGpuMemcpyDeviceToHost, stream_));
50 |     start_event.record(stream_);
51 | 
52 |     // Have the device wait on the host.
53 |     gpu_wait.wait(stream_);
54 | 
55 |     // Transfer completed buffer back to device.
56 |     AL_CHECK_CUDA(AlGpuMemcpyAsync(recvbuf, host_mem, sizeof(T)*count*comm_.size(),
57 |                                   AlGpuMemcpyHostToDevice, stream_));
58 |     end_event.record(stream_);
59 |   }
60 | 
61 |   ~AlltoallAlState() override {
62 |     mempool.release<MemoryType::CUDA_PINNED_HOST>(host_mem);
63 |   }
64 | 
65 |   std::string get_name() const override { return "HTAlltoall"; }
66 | 
67 | protected:
68 |   void start_mpi_op() override {
69 |     AL_MPI_LARGE_COUNT_CALL(MPI_Ialltoall)(
70 |       MPI_IN_PLACE, count, mpi::TypeMap<T>(),
71 |       host_mem, count, mpi::TypeMap<T>(), comm, get_mpi_req());
72 |   }
73 | 
74 | private:
75 |   T* host_mem;
76 |   size_t count;
77 |   MPI_Comm comm;
78 | };
79 | 
80 | }  // namespace ht
81 | }  // namespace internal
82 | }  // namespace Al
83 | 


--------------------------------------------------------------------------------
/include/aluminum/mpi/alltoall.hpp:
--------------------------------------------------------------------------------
 1 | ////////////////////////////////////////////////////////////////////////////////
 2 | // Copyright (c) 2018, Lawrence Livermore National Security, LLC.  Produced at the
 3 | // Lawrence Livermore National Laboratory in collaboration with University of
 4 | // Illinois Urbana-Champaign.
 5 | //
 6 | // Written by the LBANN Research Team (N. Dryden, N. Maruyama, et al.) listed in
 7 | // the CONTRIBUTORS file. <lbann-dev@llnl.gov>
 8 | //
 9 | // LLNL-CODE-756777.
10 | // All rights reserved.
11 | //
12 | // This file is part of Aluminum GPU-aware Communication Library. For details, see
13 | // http://software.llnl.gov/Aluminum or https://github.com/LLNL/Aluminum.
14 | //
15 | // Licensed under the Apache License, Version 2.0 (the "Licensee"); you
16 | // may not use this file except in compliance with the License.  You may
17 | // obtain a copy of the License at:
18 | //
19 | // http://www.apache.org/licenses/LICENSE-2.0
20 | //
21 | // Unless required by applicable law or agreed to in writing, software
22 | // distributed under the License is distributed on an "AS IS" BASIS,
23 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
24 | // implied. See the License for the specific language governing
25 | // permissions and limitations under the license.
26 | ////////////////////////////////////////////////////////////////////////////////
27 | 
28 | #pragma once
29 | 
30 | #include "aluminum/progress.hpp"
31 | #include "aluminum/mpi/base_state.hpp"
32 | #include "aluminum/mpi/communicator.hpp"
33 | #include "aluminum/mpi/utils.hpp"
34 | 
35 | namespace Al {
36 | namespace internal {
37 | namespace mpi {
38 | 
39 | template <typename T>
40 | void passthrough_alltoall(const T* sendbuf, T* recvbuf, size_t count,
41 |                           MPICommunicator& comm) {
42 |   AL_MPI_LARGE_COUNT_CALL(MPI_Alltoall)(
43 |     buf_or_inplace(sendbuf), count, TypeMap<T>(),
44 |     recvbuf, count, TypeMap<T>(), comm.get_comm());
45 | }
46 | 
47 | template <typename T>
48 | class AlltoallAlState : public MPIState {
49 | public:
50 |   AlltoallAlState(const T* sendbuf_, T* recvbuf_, size_t count_,
51 |                   MPICommunicator& comm_, AlMPIReq req_) :
52 |     MPIState(req_),
53 |     sendbuf(sendbuf_), recvbuf(recvbuf_), count(count_),
54 |     comm(comm_.get_comm()) {}
55 | 
56 |   ~AlltoallAlState() override {}
57 | 
58 |   std::string get_name() const override { return "MPIAlltoall"; }
59 | 
60 | protected:
61 |   void start_mpi_op() override {
62 |     AL_MPI_LARGE_COUNT_CALL(MPI_Ialltoall)(
63 |       buf_or_inplace(sendbuf), count, TypeMap<T>(),
64 |       recvbuf, count, TypeMap<T>(), comm, get_mpi_req());
65 |   }
66 | 
67 | private:
68 |   const T* sendbuf;
69 |   T* recvbuf;
70 |   size_t count;
71 |   MPI_Comm comm;
72 | };
73 | 
74 | template <typename T>
75 | void passthrough_nb_alltoall(const T* sendbuf, T* recvbuf, size_t count,
76 |                               MPICommunicator& comm, AlMPIReq& req) {
77 |   req = get_free_request();
78 |   internal::mpi::AlltoallAlState<T>* state =
79 |     new internal::mpi::AlltoallAlState<T>(
80 |       sendbuf, recvbuf, count, comm, req);
81 |   get_progress_engine()->enqueue(state);
82 | }
83 | 
84 | }  // namespace mpi
85 | }  // namespace internal
86 | }  // namespace Al
87 | 


--------------------------------------------------------------------------------
/include/aluminum/mpi/allgather.hpp:
--------------------------------------------------------------------------------
 1 | ////////////////////////////////////////////////////////////////////////////////
 2 | // Copyright (c) 2018, Lawrence Livermore National Security, LLC.  Produced at the
 3 | // Lawrence Livermore National Laboratory in collaboration with University of
 4 | // Illinois Urbana-Champaign.
 5 | //
 6 | // Written by the LBANN Research Team (N. Dryden, N. Maruyama, et al.) listed in
 7 | // the CONTRIBUTORS file. <lbann-dev@llnl.gov>
 8 | //
 9 | // LLNL-CODE-756777.
10 | // All rights reserved.
11 | //
12 | // This file is part of Aluminum GPU-aware Communication Library. For details, see
13 | // http://software.llnl.gov/Aluminum or https://github.com/LLNL/Aluminum.
14 | //
15 | // Licensed under the Apache License, Version 2.0 (the "Licensee"); you
16 | // may not use this file except in compliance with the License.  You may
17 | // obtain a copy of the License at:
18 | //
19 | // http://www.apache.org/licenses/LICENSE-2.0
20 | //
21 | // Unless required by applicable law or agreed to in writing, software
22 | // distributed under the License is distributed on an "AS IS" BASIS,
23 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
24 | // implied. See the License for the specific language governing
25 | // permissions and limitations under the license.
26 | ////////////////////////////////////////////////////////////////////////////////
27 | 
28 | #pragma once
29 | 
30 | #include "aluminum/progress.hpp"
31 | #include "aluminum/mpi/base_state.hpp"
32 | #include "aluminum/mpi/communicator.hpp"
33 | #include "aluminum/mpi/utils.hpp"
34 | 
35 | namespace Al {
36 | namespace internal {
37 | namespace mpi {
38 | 
39 | template <typename T>
40 | void passthrough_allgather(const T* sendbuf, T* recvbuf, size_t count,
41 |                            MPICommunicator& comm) {
42 |   AL_MPI_LARGE_COUNT_CALL(MPI_Allgather)(
43 |     buf_or_inplace(sendbuf), count, TypeMap<T>(),
44 |     recvbuf, count, TypeMap<T>(), comm.get_comm());
45 | }
46 | 
47 | template <typename T>
48 | class AllgatherAlState : public MPIState {
49 | public:
50 |   AllgatherAlState(const T* sendbuf_, T* recvbuf_, size_t count_,
51 |                    MPICommunicator& comm_, AlMPIReq req_) :
52 |     MPIState(req_),
53 |     sendbuf(sendbuf_), recvbuf(recvbuf_), count(count_),
54 |     comm(comm_.get_comm()) {}
55 | 
56 |   ~AllgatherAlState() override {}
57 | 
58 |   std::string get_name() const override { return "MPIAllgather"; }
59 | 
60 | protected:
61 |   void start_mpi_op() override {
62 |     AL_MPI_LARGE_COUNT_CALL(MPI_Iallgather)(
63 |       buf_or_inplace(sendbuf), count, TypeMap<T>(),
64 |       recvbuf, count, TypeMap<T>(), comm, get_mpi_req());
65 |   }
66 | 
67 | private:
68 |   const T* sendbuf;
69 |   T* recvbuf;
70 |   size_t count;
71 |   MPI_Comm comm;
72 | };
73 | 
74 | template <typename T>
75 | void passthrough_nb_allgather(const T* sendbuf, T* recvbuf, size_t count,
76 |                               MPICommunicator& comm, AlMPIReq& req) {
77 |   req = get_free_request();
78 |   internal::mpi::AllgatherAlState<T>* state =
79 |     new internal::mpi::AllgatherAlState<T>(
80 |       sendbuf, recvbuf, count, comm, req);
81 |   get_progress_engine()->enqueue(state);
82 | }
83 | 
84 | }  // namespace mpi
85 | }  // namespace internal
86 | }  // namespace Al
87 | 


--------------------------------------------------------------------------------
/include/aluminum/ht/bcast.hpp:
--------------------------------------------------------------------------------
 1 | ////////////////////////////////////////////////////////////////////////////////
 2 | // Copyright (c) 2018, Lawrence Livermore National Security, LLC.  Produced at the
 3 | // Lawrence Livermore National Laboratory in collaboration with University of
 4 | // Illinois Urbana-Champaign.
 5 | //
 6 | // Written by the LBANN Research Team (N. Dryden, N. Maruyama, et al.) listed in
 7 | // the CONTRIBUTORS file. <lbann-dev@llnl.gov>
 8 | //
 9 | // LLNL-CODE-756777.
10 | // All rights reserved.
11 | //
12 | // This file is part of Aluminum GPU-aware Communication Library. For details, see
13 | // http://software.llnl.gov/Aluminum or https://github.com/LLNL/Aluminum.
14 | //
15 | // Licensed under the Apache License, Version 2.0 (the "Licensee"); you
16 | // may not use this file except in compliance with the License.  You may
17 | // obtain a copy of the License at:
18 | //
19 | // http://www.apache.org/licenses/LICENSE-2.0
20 | //
21 | // Unless required by applicable law or agreed to in writing, software
22 | // distributed under the License is distributed on an "AS IS" BASIS,
23 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
24 | // implied. See the License for the specific language governing
25 | // permissions and limitations under the license.
26 | ////////////////////////////////////////////////////////////////////////////////
27 | 
28 | #pragma once
29 | 
30 | #include "aluminum/cuda/cuda.hpp"
31 | #include "aluminum/ht/communicator.hpp"
32 | #include "aluminum/ht/base_state.hpp"
33 | 
34 | namespace Al {
35 | namespace internal {
36 | namespace ht {
37 | 
38 | template <typename T>
39 | class BcastAlState : public HostTransferCollectiveSignalRootEarlyState {
40 | public:
41 |   BcastAlState(T* buf, size_t count_, int root_,
42 |                HostTransferCommunicator& comm_, AlGpuStream_t stream_) :
43 |     HostTransferCollectiveSignalRootEarlyState(comm_.rank() == root_, stream_),
44 |     host_mem(mempool.allocate<MemoryType::CUDA_PINNED_HOST, T>(count_)),
45 |     count(count_),
46 |     root(root_),
47 |     comm(comm_.get_comm()) {
48 |     // Transfer data from device to host.
49 |     if (is_root) {
50 |       AL_CHECK_CUDA(AlGpuMemcpyAsync(
51 |                       host_mem, buf, sizeof(T)*count,
52 |                       AlGpuMemcpyDeviceToHost, stream_));
53 |     }
54 |     start_event.record(stream_);
55 | 
56 |     // Have the device wait on the host.
57 |     gpu_wait.wait(stream_);
58 | 
59 |     if (!is_root) {
60 |       // Transfer completed buffer back to device.
61 |       AL_CHECK_CUDA(AlGpuMemcpyAsync(buf, host_mem, sizeof(T)*count,
62 |                                     AlGpuMemcpyHostToDevice, stream_));
63 |     }
64 |     end_event.record(stream_);
65 |   }
66 | 
67 |   ~BcastAlState() override {
68 |     mempool.release<MemoryType::CUDA_PINNED_HOST>(host_mem);
69 |   }
70 | 
71 |   std::string get_name() const override { return "HTBcast"; }
72 | 
73 | protected:
74 |   void start_mpi_op() override {
75 |     AL_MPI_LARGE_COUNT_CALL(MPI_Ibcast)(
76 |       host_mem, count, mpi::TypeMap<T>(), root, comm, get_mpi_req());
77 |   }
78 | 
79 | private:
80 |   T* host_mem;
81 |   size_t count;
82 |   int root;
83 |   MPI_Comm comm;
84 | };
85 | 
86 | }  // namespace ht
87 | }  // namespace internal
88 | }  // namespace Al
89 | 


--------------------------------------------------------------------------------
/src/profiling.cpp:
--------------------------------------------------------------------------------
  1 | ////////////////////////////////////////////////////////////////////////////////
  2 | // Copyright (c) 2018, Lawrence Livermore National Security, LLC.  Produced at the
  3 | // Lawrence Livermore National Laboratory in collaboration with University of
  4 | // Illinois Urbana-Champaign.
  5 | //
  6 | // Written by the LBANN Research Team (N. Dryden, N. Maruyama, et al.) listed in
  7 | // the CONTRIBUTORS file. <lbann-dev@llnl.gov>
  8 | //
  9 | // LLNL-CODE-756777.
 10 | // All rights reserved.
 11 | //
 12 | // This file is part of Aluminum GPU-aware Communication Library. For details, see
 13 | // http://software.llnl.gov/Aluminum or https://github.com/LLNL/Aluminum.
 14 | //
 15 | // Licensed under the Apache License, Version 2.0 (the "Licensee"); you
 16 | // may not use this file except in compliance with the License.  You may
 17 | // obtain a copy of the License at:
 18 | //
 19 | // http://www.apache.org/licenses/LICENSE-2.0
 20 | //
 21 | // Unless required by applicable law or agreed to in writing, software
 22 | // distributed under the License is distributed on an "AS IS" BASIS,
 23 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 24 | // implied. See the License for the specific language governing
 25 | // permissions and limitations under the license.
 26 | ////////////////////////////////////////////////////////////////////////////////
 27 | 
 28 | #include "aluminum/profiling.hpp"
 29 | 
 30 | #include <Al_config.hpp>
 31 | 
 32 | #include <pthread.h>
 33 | 
 34 | #ifdef AL_HAS_NVPROF
 35 | #include <nvToolsExtCuda.h>
 36 | #include <nvToolsExtCudaRt.h>
 37 | #endif
 38 | 
 39 | namespace Al {
 40 | namespace internal {
 41 | namespace profiling {
 42 | 
 43 | void name_thread([[maybe_unused]] std::thread::native_handle_type handle,
 44 |                  [[maybe_unused]] std::string name) {
 45 | #ifdef AL_HAS_NVPROF
 46 |   nvtxNameOsThreadA(handle, name.c_str());
 47 | #endif
 48 | #ifdef _GNU_SOURCE
 49 |   // Subtract 1 to account for the terminating null.
 50 |   std::string name_resized = name.substr(0, AL_MAX_THREAD_NAME_LEN - 1);
 51 |   pthread_setname_np(handle, name_resized.c_str());
 52 | #endif
 53 | }
 54 | 
 55 | #ifdef AL_HAS_CUDA
 56 | void name_stream(AlGpuStream_t stream, std::string name) {
 57 | #ifdef AL_HAS_NVPROF
 58 |   nvtxNameCudaStreamA(stream, name.c_str());
 59 | #else
 60 |   (void) stream;
 61 |   (void) name;
 62 | #endif
 63 | }
 64 | #endif
 65 | 
 66 | void mark(std::string desc) {
 67 |   (void) desc;
 68 | #ifdef AL_HAS_NVPROF
 69 |   nvtxMarkA(desc.c_str());
 70 | #endif
 71 | #ifdef AL_HAS_ROCTRACER
 72 |   roctxMark(desc.c_str());
 73 | #endif
 74 | }
 75 | 
 76 | ProfileRange prof_start(std::string name) {
 77 |   (void) name;
 78 |   ProfileRange range;
 79 | #ifdef AL_HAS_NVPROF
 80 |   range.nvtx_range = nvtxRangeStartA(name.c_str());
 81 | #endif
 82 | #ifdef AL_HAS_ROCTRACER
 83 |   range.roctx_range = roctxRangeStart(name.c_str());
 84 | #endif
 85 |   return range;
 86 | }
 87 | 
 88 | void prof_end(ProfileRange range) {
 89 |   (void) range;
 90 | #ifdef AL_HAS_NVPROF
 91 |   nvtxRangeEnd(range.nvtx_range);
 92 | #endif
 93 | #ifdef AL_HAS_ROCTRACER
 94 |   roctxRangeStop(range.roctx_range);
 95 | #endif
 96 | }
 97 | 
 98 | }  // namespace profiling
 99 | }  // namespace internal
100 | }  // namespace Al
101 | 


--------------------------------------------------------------------------------
/include/aluminum/ht/reduce_scatter.hpp:
--------------------------------------------------------------------------------
 1 | ////////////////////////////////////////////////////////////////////////////////
 2 | // Copyright (c) 2018, Lawrence Livermore National Security, LLC.  Produced at the
 3 | // Lawrence Livermore National Laboratory in collaboration with University of
 4 | // Illinois Urbana-Champaign.
 5 | //
 6 | // Written by the LBANN Research Team (N. Dryden, N. Maruyama, et al.) listed in
 7 | // the CONTRIBUTORS file. <lbann-dev@llnl.gov>
 8 | //
 9 | // LLNL-CODE-756777.
10 | // All rights reserved.
11 | //
12 | // This file is part of Aluminum GPU-aware Communication Library. For details, see
13 | // http://software.llnl.gov/Aluminum or https://github.com/LLNL/Aluminum.
14 | //
15 | // Licensed under the Apache License, Version 2.0 (the "Licensee"); you
16 | // may not use this file except in compliance with the License.  You may
17 | // obtain a copy of the License at:
18 | //
19 | // http://www.apache.org/licenses/LICENSE-2.0
20 | //
21 | // Unless required by applicable law or agreed to in writing, software
22 | // distributed under the License is distributed on an "AS IS" BASIS,
23 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
24 | // implied. See the License for the specific language governing
25 | // permissions and limitations under the license.
26 | ////////////////////////////////////////////////////////////////////////////////
27 | 
28 | #pragma once
29 | 
30 | #include "aluminum/cuda/cuda.hpp"
31 | #include "aluminum/ht/communicator.hpp"
32 | #include "aluminum/ht/base_state.hpp"
33 | 
34 | namespace Al {
35 | namespace internal {
36 | namespace ht {
37 | 
38 | template <typename T>
39 | class ReduceScatterAlState : public HostTransferCollectiveSignalAtEndState {
40 | public:
41 |   ReduceScatterAlState(const T* sendbuf, T* recvbuf, size_t count_,
42 |                        ReductionOperator op_, HostTransferCommunicator& comm_,
43 |                        AlGpuStream_t stream_) :
44 |     HostTransferCollectiveSignalAtEndState(stream_),
45 |     host_mem(mempool.allocate<MemoryType::CUDA_PINNED_HOST, T>(comm_.size()*count_)),
46 |     count(count_),
47 |     op(mpi::ReductionOperator2MPI_Op<T>(op_)),
48 |     comm(comm_.get_comm()) {
49 |     // Transfer data from device to host.
50 |     AL_CHECK_CUDA(AlGpuMemcpyAsync(host_mem, sendbuf, sizeof(T)*count*comm_.size(),
51 |                                   AlGpuMemcpyDeviceToHost, stream_));
52 |     start_event.record(stream_);
53 | 
54 |     // Have the device wait on the host.
55 |     gpu_wait.wait(stream_);
56 | 
57 |     // Transfer completed buffer back to device.
58 |     AL_CHECK_CUDA(AlGpuMemcpyAsync(recvbuf, host_mem, sizeof(T)*count,
59 |                                   AlGpuMemcpyHostToDevice, stream_));
60 |     end_event.record(stream_);
61 |   }
62 | 
63 |   ~ReduceScatterAlState() override {
64 |     mempool.release<MemoryType::CUDA_PINNED_HOST>(host_mem);
65 |   }
66 | 
67 |   std::string get_name() const override { return "HTReduceScatter"; }
68 | 
69 | protected:
70 |   void start_mpi_op() override {
71 |     AL_MPI_LARGE_COUNT_CALL(MPI_Ireduce_scatter_block)(
72 |       MPI_IN_PLACE, host_mem, count,
73 |       mpi::TypeMap<T>(), op, comm, get_mpi_req());
74 |   }
75 | 
76 | private:
77 |   T* host_mem;
78 |   size_t count;
79 |   MPI_Op op;
80 |   MPI_Comm comm;
81 | };
82 | 
83 | }  // namespace ht
84 | }  // namespace internal
85 | }  // namespace Al
86 | 


--------------------------------------------------------------------------------
/include/aluminum/mpi/allreduce.hpp:
--------------------------------------------------------------------------------
 1 | ////////////////////////////////////////////////////////////////////////////////
 2 | // Copyright (c) 2018, Lawrence Livermore National Security, LLC.  Produced at the
 3 | // Lawrence Livermore National Laboratory in collaboration with University of
 4 | // Illinois Urbana-Champaign.
 5 | //
 6 | // Written by the LBANN Research Team (N. Dryden, N. Maruyama, et al.) listed in
 7 | // the CONTRIBUTORS file. <lbann-dev@llnl.gov>
 8 | //
 9 | // LLNL-CODE-756777.
10 | // All rights reserved.
11 | //
12 | // This file is part of Aluminum GPU-aware Communication Library. For details, see
13 | // http://software.llnl.gov/Aluminum or https://github.com/LLNL/Aluminum.
14 | //
15 | // Licensed under the Apache License, Version 2.0 (the "Licensee"); you
16 | // may not use this file except in compliance with the License.  You may
17 | // obtain a copy of the License at:
18 | //
19 | // http://www.apache.org/licenses/LICENSE-2.0
20 | //
21 | // Unless required by applicable law or agreed to in writing, software
22 | // distributed under the License is distributed on an "AS IS" BASIS,
23 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
24 | // implied. See the License for the specific language governing
25 | // permissions and limitations under the license.
26 | ////////////////////////////////////////////////////////////////////////////////
27 | 
28 | #pragma once
29 | 
30 | #include "aluminum/progress.hpp"
31 | #include "aluminum/mpi/base_state.hpp"
32 | #include "aluminum/mpi/communicator.hpp"
33 | #include "aluminum/mpi/utils.hpp"
34 | 
35 | namespace Al {
36 | namespace internal {
37 | namespace mpi {
38 | 
39 | template <typename T>
40 | void passthrough_allreduce(const T* sendbuf, T* recvbuf, size_t count,
41 |                            ReductionOperator op, MPICommunicator& comm) {
42 |   AL_MPI_LARGE_COUNT_CALL(MPI_Allreduce)(
43 |     buf_or_inplace(sendbuf), recvbuf, count, TypeMap<T>(),
44 |     ReductionOperator2MPI_Op<T>(op), comm.get_comm());
45 | }
46 | 
47 | template <typename T>
48 | class AllreduceAlState : public MPIState {
49 | public:
50 |   AllreduceAlState(const T* sendbuf_, T* recvbuf_, size_t count_,
51 |                    ReductionOperator op_, MPICommunicator& comm_, AlMPIReq req_) :
52 |     MPIState(req_),
53 |     sendbuf(sendbuf_), recvbuf(recvbuf_), count(count_),
54 |     op(ReductionOperator2MPI_Op<T>(op_)), comm(comm_.get_comm()) {}
55 | 
56 |   ~AllreduceAlState() override {}
57 | 
58 |   std::string get_name() const override { return "MPIAllreduce"; }
59 | 
60 | protected:
61 |   void start_mpi_op() override {
62 |     AL_MPI_LARGE_COUNT_CALL(MPI_Iallreduce)(
63 |       buf_or_inplace(sendbuf), recvbuf, count, TypeMap<T>(), op,
64 |       comm, get_mpi_req());
65 |   }
66 | 
67 | private:
68 |   const T* sendbuf;
69 |   T* recvbuf;
70 |   size_t count;
71 |   MPI_Op op;
72 |   MPI_Comm comm;
73 | };
74 | 
75 | template <typename T>
76 | void passthrough_nb_allreduce(const T* sendbuf, T* recvbuf, size_t count,
77 |                               ReductionOperator op, MPICommunicator& comm,
78 |                               AlMPIReq& req) {
79 |   req = get_free_request();
80 |   internal::mpi::AllreduceAlState<T>* state =
81 |     new internal::mpi::AllreduceAlState<T>(
82 |       sendbuf, recvbuf, count, op, comm, req);
83 |   get_progress_engine()->enqueue(state);
84 | }
85 | 
86 | } // namespace mpi
87 | } // namespace internal
88 | } // namespace Al
89 | 


--------------------------------------------------------------------------------
/cmake/AluminumConfig.cmake.in:
--------------------------------------------------------------------------------
  1 | # Aluminum currently has 4 known components: MPI, NCCL, HOST_TRANSFER,
  2 | # and MPI_CUDA. "MPI" is always available. The others are found if
  3 | # AL_HAS_<COMP>.
  4 | include(CMakeFindDependencyMacro)
  5 | 
  6 | list(APPEND CMAKE_MODULE_PATH "@CMAKE_MODULE_LOCATION@")
  7 | 
  8 | include(${CMAKE_CURRENT_LIST_DIR}/AluminumConfigVersion.cmake)
  9 | set(ALUMINUM_VERSION ${PACKAGE_VERSION})
 10 | 
 11 | set(_AL_KNOWN_COMPONENTS
 12 |   MPI
 13 |   NCCL
 14 |   HOST_TRANSFER
 15 |   MPI_CUDA)
 16 | 
 17 | set(AL_HAS_CALIPER @AL_HAS_CALIPER@)
 18 | set(AL_HAS_CUDA @AL_HAS_CUDA@)
 19 | set(AL_HAS_ROCM @AL_HAS_ROCM@)
 20 | set(AL_HAS_MPI_CUDA @AL_HAS_MPI_CUDA@)
 21 | set(AL_HAS_HOST_TRANSFER @AL_HAS_HOST_TRANSFER@)
 22 | set(AL_HAS_NCCL @AL_HAS_NCCL@)
 23 | set(AL_HAS_ROCTRACER @AL_HAS_ROCTRACER@)
 24 | 
 25 | set(MPI_CXX_COMPILER "@MPI_CXX_COMPILER@" CACHE FILEPATH
 26 |   "The MPI CXX compiler wrapper.")
 27 | find_package(MPI 3.0 REQUIRED COMPONENTS CXX)
 28 | 
 29 | set(AL_USE_HWLOC @AL_USE_HWLOC@)
 30 | if (AL_USE_HWLOC)
 31 |   find_dependency(HWLOC)
 32 | endif ()
 33 | find_dependency(Threads)
 34 | 
 35 | if (AL_HAS_CALIPER)
 36 |   find_dependency(caliper)
 37 | endif ()
 38 | 
 39 | if (AL_HAS_ROCM)
 40 |   # The API for Aluminum does not require that HIP language support
 41 |   # be enabled; it only requires the host/device libraries be found.
 42 |   find_dependency(hip)
 43 |   find_dependency(hipcub)
 44 |   find_dependency(rocm_smi)
 45 | 
 46 |   set(AL_ROCM_PATH "@AL_ROCM_PATH@")
 47 | 
 48 |   if (AL_HAS_NCCL)
 49 |     find_dependency(rccl)
 50 |   endif (AL_HAS_NCCL)
 51 | 
 52 |   if (AL_HAS_ROCTRACER)
 53 |     find_dependency(Roctracer COMPONENTS roctx)
 54 |   endif ()
 55 | elseif (AL_HAS_CUDA)
 56 | 
 57 |   # The API for Aluminum does not require that CUDA language support
 58 |   # be enabled; it only requires the CUDAToolkit.
 59 |   find_dependency(CUDAToolkit)
 60 | 
 61 |   if (AL_HAS_NCCL)
 62 |     find_dependency(NCCL)
 63 |   endif (AL_HAS_NCCL)
 64 | 
 65 |   set(AL_HAS_NVPROF @AL_HAS_NVPROF@)
 66 |   if (AL_HAS_NVPROF)
 67 |     set(AL_HAS_EXTERNAL_NVTX @AL_HAS_EXTERNAL_NVTX@)
 68 |     if (AL_HAS_EXTERNAL_NVTX)
 69 |       find_dependency(NVTX)
 70 |     endif ()
 71 |   endif ()
 72 | 
 73 |   # Because of their templated nature, the CUB-based memory allocator
 74 |   # is public. Therefore, this dependency is public and must be met
 75 |   # downstream.
 76 |   set(AL_HAS_EXTERNAL_CUB @AL_HAS_EXTERNAL_CUB@)
 77 |   if (AL_HAS_EXTERNAL_CUB)
 78 |     find_dependency(CUB)
 79 |   endif ()
 80 | endif (AL_HAS_ROCM)
 81 | 
 82 | @PACKAGE_INIT@
 83 | 
 84 | set(_TMP_INCLUDE_DIRS "@PACKAGE_INCLUDE_INSTALL_DIRS@")
 85 | foreach (_DIR ${_TMP_INCLUDE_DIRS})
 86 |   set_and_check(_INCLUDE_DIR "${_DIR}")
 87 |   list(APPEND ALUMINUM_INCLUDE_DIRS "${_INCLUDE_DIR}")
 88 | endforeach (_DIR "${_TMP_INCLUDE_DIRS}")
 89 | 
 90 | set(_TMP_LIBRARY_DIRS "@PACKAGE_LIB_INSTALL_DIR@")
 91 | foreach (_DIR ${_TMP_LIBRARY_DIRS})
 92 |   set_and_check(_LIBRARY_DIR "${_DIR}")
 93 |   list(APPEND ALUMINUM_LIBRARY_DIRS "${_LIBRARY_DIR}")
 94 | endforeach (_DIR ${_TMP_LIBRARY_DIRS})
 95 | 
 96 | if (NOT TARGET AL::Al)
 97 |   include(${CMAKE_CURRENT_LIST_DIR}/AluminumTargets.cmake)
 98 | endif ()
 99 | 
100 | foreach (comp ${_AL_KNOWN_COMPONENTS})
101 |   if (AL_HAS_${comp})
102 |     set(Aluminum_${comp}_FOUND 1)
103 |   endif ()
104 | endforeach ()
105 | 
106 | check_required_components(Aluminum)
107 | 
108 | set(Aluminum_LIBRARIES AL::Al)
109 | 


--------------------------------------------------------------------------------
/src/cuda/cuda.cpp:
--------------------------------------------------------------------------------
 1 | ////////////////////////////////////////////////////////////////////////////////
 2 | // Copyright (c) 2018, Lawrence Livermore National Security, LLC.  Produced at the
 3 | // Lawrence Livermore National Laboratory in collaboration with University of
 4 | // Illinois Urbana-Champaign.
 5 | //
 6 | // Written by the LBANN Research Team (N. Dryden, N. Maruyama, et al.) listed in
 7 | // the CONTRIBUTORS file. <lbann-dev@llnl.gov>
 8 | //
 9 | // LLNL-CODE-756777.
10 | // All rights reserved.
11 | //
12 | // This file is part of Aluminum GPU-aware Communication Library. For details, see
13 | // http://software.llnl.gov/Aluminum or https://github.com/LLNL/Aluminum.
14 | //
15 | // Licensed under the Apache License, Version 2.0 (the "Licensee"); you
16 | // may not use this file except in compliance with the License.  You may
17 | // obtain a copy of the License at:
18 | //
19 | // http://www.apache.org/licenses/LICENSE-2.0
20 | //
21 | // Unless required by applicable law or agreed to in writing, software
22 | // distributed under the License is distributed on an "AS IS" BASIS,
23 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
24 | // implied. See the License for the specific language governing
25 | // permissions and limitations under the license.
26 | ////////////////////////////////////////////////////////////////////////////////
27 | 
28 | #include "aluminum/cuda/cuda.hpp"
29 | 
30 | #include <cstdint>
31 | #include <vector>
32 | 
33 | #include "aluminum/cuda/sync_memory.hpp"
34 | #include "aluminum/cuda/events.hpp"
35 | #include "aluminum/cuda/streams.hpp"
36 | #include "aluminum/utils/locked_resource_pool.hpp"
37 | #include "aluminum/tuning_params.hpp"
38 | 
39 | namespace Al {
40 | namespace internal {
41 | namespace cuda {
42 | 
43 | // Define resource pools.
44 | Al::internal::LockedResourcePool<int32_t*, CacheLinePinnedMemoryAllocator> sync_pool;
45 | Al::internal::LockedResourcePool<AlGpuEvent_t, CUDAEventAllocator> event_pool;
46 | 
47 | namespace {
48 | // Whether stream memory operations are supported.
49 | bool stream_mem_ops_supported = false;
50 | }
51 | 
52 | void init(int&, char**&) {
53 |   // Initialize internal streams.
54 |   stream_pool.allocate(AL_CUDA_STREAM_POOL_SIZE);
55 |   // Check whether stream memory operations are supported.
56 |   int attr = 0;
57 | #if defined AL_HAS_ROCM
58 |   int dev;
59 |   AL_CHECK_CUDA(hipGetDevice(&dev));
60 |   AL_CHECK_CUDA(hipDeviceGetAttribute(
61 |                     &attr, hipDeviceAttributeCanUseStreamWaitValue, dev));
62 | #elif defined AL_HAS_CUDA
63 |   // There was an API change to these in CUDA 11.7, and the flag to check
64 |   // for support changed (to have _V1) in CUDA 12. But as of CUDA 12,
65 |   // these are enabled by default, so we do not need to check.
66 | #if CUDA_VERSION >= 12000
67 |   attr = 1;
68 | #else
69 |   CUdevice dev;
70 |   AL_CHECK_CUDA_DRV(cuCtxGetDevice(&dev));
71 |   AL_CHECK_CUDA_DRV(cuDeviceGetAttribute(
72 |                       &attr, CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_MEM_OPS, dev));
73 | #endif
74 | #endif
75 |   stream_mem_ops_supported = attr;
76 |   // Preallocate memory for synchronization operations.
77 |   sync_pool.preallocate(AL_SYNC_MEM_PREALLOC);
78 | }
79 | 
80 | void finalize() {
81 |   sync_pool.clear();
82 |   event_pool.clear();
83 |   stream_pool.clear();
84 | }
85 | 
86 | bool stream_memory_operations_supported() {
87 |   return stream_mem_ops_supported;
88 | }
89 | 
90 | }  // namespace cuda
91 | }  // namespace internal
92 | }  // namespace Al
93 | 


--------------------------------------------------------------------------------
/benchmark/benchmark_events.cpp:
--------------------------------------------------------------------------------
  1 | /** Benchmark different event implementations. */
  2 | 
  3 | #include <iostream>
  4 | 
  5 | #include "Al.hpp"
  6 | #include "benchmark_utils.hpp"
  7 | #include "wait.hpp"
  8 | 
  9 | #if defined AL_HAS_ROCM
 10 | #include <hip/hip_runtime.h>
 11 | #elif defined AL_HAS_CUDA
 12 | #include <cuda.h>
 13 | #endif
 14 | 
 15 | class Event {
 16 | public:
 17 |   virtual void record(AlGpuStream_t stream) = 0;
 18 |   virtual bool query() = 0;
 19 | };
 20 | 
 21 | class CudaEvent : public Event {
 22 | public:
 23 |   CudaEvent() {
 24 |     AL_CHECK_CUDA(
 25 |       AlGpuEventCreateWithFlags(&event, AlGpuNoTimingEventFlags));
 26 |   }
 27 |   ~CudaEvent() {
 28 |     AL_IGNORE_NODISCARD(AlGpuEventDestroy(event));
 29 |   }
 30 |   void record(AlGpuStream_t stream) override {
 31 |     AL_CHECK_CUDA(AlGpuEventRecord(event, stream));
 32 |   }
 33 |   bool query() override {
 34 |     return AlGpuEventQuery(event) == AlGpuSuccess;
 35 |   }
 36 | private:
 37 |   AlGpuEvent_t event;
 38 | };
 39 | 
 40 | class CustomEvent : public Event {
 41 | public:
 42 |   CustomEvent() {
 43 |     AL_CHECK_CUDA(AlGpuMallocHost(&event, sizeof(int32_t)));
 44 |     __atomic_store_n(event, 1, __ATOMIC_SEQ_CST);
 45 | #if defined AL_HAS_ROCM
 46 |     AL_CHECK_CUDA(hipHostGetDevicePointer(&dev_ptr, event, 0));
 47 | #elif defined AL_HAS_CUDA
 48 |     AL_CHECK_CUDA_DRV(cuMemHostGetDevicePointer(&dev_ptr, event, 0));
 49 | #endif
 50 |   }
 51 |   ~CustomEvent() {
 52 |     AL_IGNORE_NODISCARD(AlGpuFreeHost(event));
 53 |   }
 54 |   void record(AlGpuStream_t stream) override {
 55 |     __atomic_store_n(event, 0, __ATOMIC_SEQ_CST);
 56 | #if defined AL_HAS_ROCM
 57 |     AL_CHECK_CUDA(
 58 |       hipStreamWriteValue32(stream, dev_ptr, 1, 0));
 59 | #elif defined AL_HAS_CUDA
 60 |     AL_CHECK_CUDA_DRV(
 61 |       cuStreamWriteValue32(stream, dev_ptr, 1, CU_STREAM_WRITE_VALUE_DEFAULT));
 62 | #endif
 63 |   }
 64 |   bool query() override {
 65 |     return __atomic_load_n(event, __ATOMIC_SEQ_CST);
 66 |   }
 67 | private:
 68 |   int32_t* event __attribute__((aligned(64)));
 69 | #if defined AL_HAS_ROCM
 70 |   hipDeviceptr_t dev_ptr;
 71 | #elif defined AL_HAS_CUDA
 72 |   CUdeviceptr dev_ptr;
 73 | #endif
 74 | };
 75 | 
 76 | void do_benchmark(AlGpuStream_t stream, Event& event) {
 77 |   const double wait_time = 0.0001;
 78 |   std::vector<double> times, launch_times;
 79 |   for (int i = 0; i < 100000; ++i) {
 80 |     double launch_start = Al::get_time();
 81 |     gpu_wait(wait_time, stream);
 82 |     event.record(stream);
 83 |     double start = Al::get_time();
 84 |     while (!event.query()) {}
 85 |     double end = Al::get_time();
 86 |     launch_times.push_back(start - launch_start);
 87 |     times.push_back(end - start);
 88 |     AL_CHECK_CUDA(AlGpuStreamSynchronize(stream));
 89 |   }
 90 |   std::cout << "Launch: " << SummaryStats(launch_times) << std::endl;
 91 |   std::cout << "Query: " << SummaryStats(times) << std::endl;
 92 | }
 93 | 
 94 | int main(int, char**) {
 95 |   AL_CHECK_CUDA(AlGpuSetDevice(0));
 96 |   AlGpuStream_t stream;
 97 |   AL_CHECK_CUDA(AlGpuStreamCreate(&stream));
 98 |   {
 99 |     CudaEvent cuda_event;
100 |     CustomEvent custom_event;
101 |     std::cout << "Custom event:" << std::endl;
102 |     do_benchmark(stream, custom_event);
103 |     std::cout << "CUDA Event:" << std::endl;
104 |     do_benchmark(stream, cuda_event);
105 |   }
106 |   AL_CHECK_CUDA(AlGpuStreamSynchronize(stream));
107 |   AL_CHECK_CUDA(AlGpuStreamDestroy(stream));
108 |   return 0;
109 | }
110 | 


--------------------------------------------------------------------------------
/include/aluminum/ht/allreduce.hpp:
--------------------------------------------------------------------------------
 1 | ////////////////////////////////////////////////////////////////////////////////
 2 | // Copyright (c) 2018, Lawrence Livermore National Security, LLC.  Produced at the
 3 | // Lawrence Livermore National Laboratory in collaboration with University of
 4 | // Illinois Urbana-Champaign.
 5 | //
 6 | // Written by the LBANN Research Team (N. Dryden, N. Maruyama, et al.) listed in
 7 | // the CONTRIBUTORS file. <lbann-dev@llnl.gov>
 8 | //
 9 | // LLNL-CODE-756777.
10 | // All rights reserved.
11 | //
12 | // This file is part of Aluminum GPU-aware Communication Library. For details, see
13 | // http://software.llnl.gov/Aluminum or https://github.com/LLNL/Aluminum.
14 | //
15 | // Licensed under the Apache License, Version 2.0 (the "Licensee"); you
16 | // may not use this file except in compliance with the License.  You may
17 | // obtain a copy of the License at:
18 | //
19 | // http://www.apache.org/licenses/LICENSE-2.0
20 | //
21 | // Unless required by applicable law or agreed to in writing, software
22 | // distributed under the License is distributed on an "AS IS" BASIS,
23 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
24 | // implied. See the License for the specific language governing
25 | // permissions and limitations under the license.
26 | ////////////////////////////////////////////////////////////////////////////////
27 | 
28 | #pragma once
29 | 
30 | #include "aluminum/cuda/cuda.hpp"
31 | #include "aluminum/ht/communicator.hpp"
32 | #include "aluminum/ht/base_state.hpp"
33 | 
34 | namespace Al {
35 | namespace internal {
36 | namespace ht {
37 | 
38 | template <typename T>
39 | class AllreduceAlState : public HostTransferCollectiveSignalAtEndState {
40 |  public:
41 |   AllreduceAlState(const T* sendbuf, T* recvbuf, size_t count_,
42 |                    ReductionOperator op_, HostTransferCommunicator& comm_,
43 |                    AlGpuStream_t stream_) :
44 |     HostTransferCollectiveSignalAtEndState(stream_),
45 |     host_mem(mempool.allocate<MemoryType::CUDA_PINNED_HOST, T>(count_)),
46 |     count(count_),
47 |     op(mpi::ReductionOperator2MPI_Op<T>(op_)),
48 |     comm(comm_.get_comm()) {
49 |     // Transfer data from device to host.
50 |     if (sendbuf != recvbuf) {
51 |       AL_CHECK_CUDA(AlGpuMemcpyAsync(host_mem, sendbuf, sizeof(T)*count,
52 |                                     AlGpuMemcpyDeviceToHost, stream_));
53 |     } else {
54 |       AL_CHECK_CUDA(AlGpuMemcpyAsync(host_mem, recvbuf, sizeof(T)*count,
55 |                                     AlGpuMemcpyDeviceToHost, stream_));
56 |     }
57 |     start_event.record(stream_);
58 | 
59 |     // Have the device wait on the host.
60 |     gpu_wait.wait(stream_);
61 | 
62 |     // Transfer completed buffer back to device.
63 |     AL_CHECK_CUDA(AlGpuMemcpyAsync(recvbuf, host_mem, sizeof(T)*count,
64 |                                   AlGpuMemcpyHostToDevice, stream_));
65 |     end_event.record(stream_);
66 |   }
67 | 
68 |   ~AllreduceAlState() {
69 |     mempool.release<MemoryType::CUDA_PINNED_HOST>(host_mem);
70 |   }
71 | 
72 |   std::string get_name() const override { return "HTAllreduce"; }
73 | 
74 |  protected:
75 |   void start_mpi_op() override {
76 |     AL_MPI_LARGE_COUNT_CALL(MPI_Iallreduce)(
77 |       MPI_IN_PLACE, host_mem, count, mpi::TypeMap<T>(),
78 |       op, comm, get_mpi_req());
79 |   }
80 | 
81 |  private:
82 |   T* host_mem;
83 |   size_t count;
84 |   MPI_Op op;
85 |   MPI_Comm comm;
86 | };
87 | 
88 | } // namespace ht
89 | } // namespace internal
90 | } // namespace Al
91 | 


--------------------------------------------------------------------------------
/include/aluminum/mpi/reduce_scatter.hpp:
--------------------------------------------------------------------------------
 1 | ////////////////////////////////////////////////////////////////////////////////
 2 | // Copyright (c) 2018, Lawrence Livermore National Security, LLC.  Produced at the
 3 | // Lawrence Livermore National Laboratory in collaboration with University of
 4 | // Illinois Urbana-Champaign.
 5 | //
 6 | // Written by the LBANN Research Team (N. Dryden, N. Maruyama, et al.) listed in
 7 | // the CONTRIBUTORS file. <lbann-dev@llnl.gov>
 8 | //
 9 | // LLNL-CODE-756777.
10 | // All rights reserved.
11 | //
12 | // This file is part of Aluminum GPU-aware Communication Library. For details, see
13 | // http://software.llnl.gov/Aluminum or https://github.com/LLNL/Aluminum.
14 | //
15 | // Licensed under the Apache License, Version 2.0 (the "Licensee"); you
16 | // may not use this file except in compliance with the License.  You may
17 | // obtain a copy of the License at:
18 | //
19 | // http://www.apache.org/licenses/LICENSE-2.0
20 | //
21 | // Unless required by applicable law or agreed to in writing, software
22 | // distributed under the License is distributed on an "AS IS" BASIS,
23 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
24 | // implied. See the License for the specific language governing
25 | // permissions and limitations under the license.
26 | ////////////////////////////////////////////////////////////////////////////////
27 | 
28 | #pragma once
29 | 
30 | #include "aluminum/progress.hpp"
31 | #include "aluminum/mpi/base_state.hpp"
32 | #include "aluminum/mpi/communicator.hpp"
33 | #include "aluminum/mpi/utils.hpp"
34 | 
35 | namespace Al {
36 | namespace internal {
37 | namespace mpi {
38 | 
39 | template <typename T>
40 | void passthrough_reduce_scatter(const T* sendbuf, T* recvbuf, size_t count,
41 |                                 ReductionOperator op, MPICommunicator& comm) {
42 |   AL_MPI_LARGE_COUNT_CALL(MPI_Reduce_scatter_block)(
43 |     buf_or_inplace(sendbuf), recvbuf, count,
44 |     TypeMap<T>(), ReductionOperator2MPI_Op<T>(op),
45 |     comm.get_comm());
46 | }
47 | 
48 | template <typename T>
49 | class ReduceScatterAlState : public MPIState {
50 | public:
51 |   ReduceScatterAlState(const T* sendbuf_, T* recvbuf_, size_t count_,
52 |                        ReductionOperator op_, MPICommunicator& comm_,
53 |                        AlMPIReq req_) :
54 |     MPIState(req_),
55 |     sendbuf(sendbuf_), recvbuf(recvbuf_), count(count_),
56 |     op(ReductionOperator2MPI_Op<T>(op_)),
57 |     comm(comm_.get_comm()) {}
58 | 
59 |   ~ReduceScatterAlState() override {}
60 | 
61 |   std::string get_name() const override { return "MPIReduceScatter"; }
62 | 
63 | protected:
64 |   void start_mpi_op() override {
65 |     AL_MPI_LARGE_COUNT_CALL(MPI_Ireduce_scatter_block)(
66 |       buf_or_inplace(sendbuf), recvbuf, count,
67 |       TypeMap<T>(), op, comm, get_mpi_req());
68 |   }
69 | 
70 | private:
71 |   const T* sendbuf;
72 |   T* recvbuf;
73 |   size_t count;
74 |   MPI_Op op;
75 |   MPI_Comm comm;
76 | };
77 | 
78 | template <typename T>
79 | void passthrough_nb_reduce_scatter(const T* sendbuf, T* recvbuf, size_t count,
80 |                                    ReductionOperator op, MPICommunicator& comm,
81 |                                    AlMPIReq& req) {
82 |   req = get_free_request();
83 |   internal::mpi::ReduceScatterAlState<T>* state =
84 |     new internal::mpi::ReduceScatterAlState<T>(
85 |       sendbuf, recvbuf, count, op, comm, req);
86 |   get_progress_engine()->enqueue(state);
87 | }
88 | 
89 | }  // namespace mpi
90 | }  // namespace internal
91 | }  // namespace Al
92 | 


--------------------------------------------------------------------------------
/include/aluminum/ht/reduce.hpp:
--------------------------------------------------------------------------------
 1 | ////////////////////////////////////////////////////////////////////////////////
 2 | // Copyright (c) 2018, Lawrence Livermore National Security, LLC.  Produced at the
 3 | // Lawrence Livermore National Laboratory in collaboration with University of
 4 | // Illinois Urbana-Champaign.
 5 | //
 6 | // Written by the LBANN Research Team (N. Dryden, N. Maruyama, et al.) listed in
 7 | // the CONTRIBUTORS file. <lbann-dev@llnl.gov>
 8 | //
 9 | // LLNL-CODE-756777.
10 | // All rights reserved.
11 | //
12 | // This file is part of Aluminum GPU-aware Communication Library. For details, see
13 | // http://software.llnl.gov/Aluminum or https://github.com/LLNL/Aluminum.
14 | //
15 | // Licensed under the Apache License, Version 2.0 (the "Licensee"); you
16 | // may not use this file except in compliance with the License.  You may
17 | // obtain a copy of the License at:
18 | //
19 | // http://www.apache.org/licenses/LICENSE-2.0
20 | //
21 | // Unless required by applicable law or agreed to in writing, software
22 | // distributed under the License is distributed on an "AS IS" BASIS,
23 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
24 | // implied. See the License for the specific language governing
25 | // permissions and limitations under the license.
26 | ////////////////////////////////////////////////////////////////////////////////
27 | 
28 | #pragma once
29 | 
30 | #include "aluminum/cuda/cuda.hpp"
31 | #include "aluminum/ht/communicator.hpp"
32 | #include "aluminum/ht/base_state.hpp"
33 | 
34 | namespace Al {
35 | namespace internal {
36 | namespace ht {
37 | 
38 | template <typename T>
39 | class ReduceAlState : public HostTransferCollectiveSignalNonRootEarlyState {
40 | public:
41 |   ReduceAlState(const T* sendbuf, T* recvbuf, size_t count_, ReductionOperator op_,
42 |                 int root_, HostTransferCommunicator& comm_, AlGpuStream_t stream_) :
43 |     HostTransferCollectiveSignalNonRootEarlyState(comm_.rank() == root_, stream_),
44 |     host_mem(mempool.allocate<MemoryType::CUDA_PINNED_HOST, T>(count_)),
45 |     count(count_),
46 |     root(root_),
47 |     op(mpi::ReductionOperator2MPI_Op<T>(op_)),
48 |     comm(comm_.get_comm()) {
49 |     // Transfer data from device to host.
50 |     AL_CHECK_CUDA(AlGpuMemcpyAsync(host_mem, sendbuf, sizeof(T)*count,
51 |                                   AlGpuMemcpyDeviceToHost, stream_));
52 |     start_event.record(stream_);
53 | 
54 |     // Have the device wait on the host.
55 |     gpu_wait.wait(stream_);
56 | 
57 |     if (is_root) {
58 |       // Transfer completed buffer back to device.
59 |       AL_CHECK_CUDA(AlGpuMemcpyAsync(recvbuf, host_mem, sizeof(T)*count,
60 |                                     AlGpuMemcpyHostToDevice, stream_));
61 |     }
62 |     end_event.record(stream_);
63 |   }
64 | 
65 |   ~ReduceAlState() override {
66 |     mempool.release<MemoryType::CUDA_PINNED_HOST>(host_mem);
67 |   }
68 | 
69 |   std::string get_name() const override { return "HTReduce"; }
70 | 
71 | protected:
72 |   void start_mpi_op() override {
73 |     if (is_root) {
74 |       AL_MPI_LARGE_COUNT_CALL(MPI_Ireduce)(
75 |         MPI_IN_PLACE, host_mem, count, mpi::TypeMap<T>(),
76 |         op, root, comm, get_mpi_req());
77 |     } else {
78 |       AL_MPI_LARGE_COUNT_CALL(MPI_Ireduce)(
79 |         host_mem, host_mem, count, mpi::TypeMap<T>(),
80 |         op, root, comm, get_mpi_req());
81 |     }
82 |   }
83 | 
84 | private:
85 |   T* host_mem;
86 |   size_t count;
87 |   int root;
88 |   MPI_Op op;
89 |   MPI_Comm comm;
90 | };
91 | 
92 | }  // namespace ht
93 | }  // namespace internal
94 | }  // namespace Al
95 | 


--------------------------------------------------------------------------------
/benchmark/benchmark_waits.cpp:
--------------------------------------------------------------------------------
  1 | /** Benchmark different wait implementations. */
  2 | 
  3 | #include <iostream>
  4 | #include "Al.hpp"
  5 | #include "aluminum/cuda/helper_kernels.hpp"
  6 | #include "benchmark_utils.hpp"
  7 | #include "wait.hpp"
  8 | 
  9 | #if defined AL_HAS_ROCM
 10 | #include <hip/hip_runtime.h>
 11 | #elif defined AL_HAS_CUDA
 12 | #include <cuda.h>
 13 | #endif
 14 | 
 15 | class Wait {
 16 | public:
 17 |   Wait() {
 18 |     AL_CHECK_CUDA(AlGpuMallocHost(&wait_sync, sizeof(int32_t)));
 19 |     __atomic_store_n(wait_sync, 0, __ATOMIC_SEQ_CST);
 20 |   }
 21 |   ~Wait() {
 22 |     AL_IGNORE_NODISCARD(AlGpuFreeHost(wait_sync));
 23 |   }
 24 |   virtual void wait(AlGpuStream_t stream) = 0;
 25 |   virtual void signal() {
 26 |     __atomic_store_n(wait_sync, 1, __ATOMIC_SEQ_CST);
 27 |   }
 28 | 
 29 |   int32_t* wait_sync __attribute__((aligned(64)));
 30 | };
 31 | 
 32 | #if defined AL_HAS_ROCM
 33 | class StreamOpWait : public Wait {
 34 | public:
 35 |   StreamOpWait() : Wait() {
 36 |     AL_CHECK_CUDA(hipHostGetDevicePointer(&dev_ptr, wait_sync, 0));
 37 |   }
 38 |   ~StreamOpWait() {}
 39 |   void wait(AlGpuStream_t stream) override {
 40 |     Al::internal::cuda::launch_wait_kernel(stream, 1, dev_ptr);
 41 |   }
 42 |   hipDeviceptr_t dev_ptr;
 43 | };
 44 | #elif defined AL_HAS_CUDA
 45 | class StreamOpWait : public Wait {
 46 | public:
 47 |   StreamOpWait() : Wait() {
 48 |     AL_CHECK_CUDA_DRV(cuMemHostGetDevicePointer(&dev_ptr, wait_sync, 0));
 49 |   }
 50 |   ~StreamOpWait() {}
 51 |   void wait(AlGpuStream_t stream) override {
 52 |     Al::internal::cuda::launch_wait_kernel(stream, 1, dev_ptr);
 53 |   }
 54 |   CUdeviceptr dev_ptr;
 55 | };
 56 | #endif
 57 | 
 58 | class KernelWait : public Wait {
 59 | public:
 60 |   KernelWait() : Wait() {
 61 |     AL_CHECK_CUDA(AlGpuHostGetDevicePointer(
 62 |                           reinterpret_cast<void **>(&dev_ptr), wait_sync, 0));
 63 |   }
 64 |   ~KernelWait() {}
 65 |   void wait(AlGpuStream_t stream) override {
 66 |     Al::internal::cuda::launch_wait_kernel(stream, 1, dev_ptr);
 67 |   }
 68 |   int32_t* dev_ptr __attribute__((aligned(64)));
 69 | };
 70 | 
 71 | void do_benchmark(AlGpuStream_t stream, Wait& wait) {
 72 |   AlGpuEvent_t e;
 73 |   AL_CHECK_CUDA(AlGpuEventCreateWithFlags(&e, AlGpuNoTimingEventFlags));
 74 |   std::vector<double> times, launch_times;
 75 |   for (int i = 0; i < 100000; ++i) {
 76 |     double launch_start = Al::get_time();
 77 |     wait.wait(stream);
 78 |     double launch_end = Al::get_time();
 79 |     AL_CHECK_CUDA(AlGpuEventRecord(e, stream));
 80 |     double start = Al::get_time();
 81 |     wait.signal();
 82 |     while (AlGpuEventQuery(e) == AlGpuErrorNotReady) {}
 83 |     double end = Al::get_time();
 84 |     launch_times.push_back(launch_end - launch_start);
 85 |     times.push_back(end - start);
 86 |     AL_CHECK_CUDA(AlGpuStreamSynchronize(stream));
 87 |   }
 88 |   std::cout << "Launch: " << SummaryStats(launch_times) << std::endl;
 89 |   std::cout << "Signal: " << SummaryStats(times) << std::endl;
 90 |   AL_CHECK_CUDA(AlGpuEventDestroy(e));
 91 | }
 92 | 
 93 | int main(int, char**) {
 94 |   AL_CHECK_CUDA(AlGpuSetDevice(0));
 95 |   AlGpuStream_t stream;
 96 |   AL_CHECK_CUDA(AlGpuStreamCreate(&stream));
 97 |   {
 98 |     StreamOpWait stream_op_wait;
 99 |     KernelWait kernel_wait;
100 |     std::cout << "StreamOp wait:" << std::endl;
101 |     do_benchmark(stream, stream_op_wait);
102 |     std::cout << "Kernel wait:" << std::endl;
103 |     do_benchmark(stream, kernel_wait);
104 |   }
105 |   AL_CHECK_CUDA(AlGpuStreamSynchronize(stream));
106 |   AL_CHECK_CUDA(AlGpuStreamDestroy(stream));
107 |   return 0;
108 | }
109 | 


--------------------------------------------------------------------------------
/include/aluminum/ht/reduce_scatterv.hpp:
--------------------------------------------------------------------------------
 1 | ////////////////////////////////////////////////////////////////////////////////
 2 | // Copyright (c) 2018, Lawrence Livermore National Security, LLC.  Produced at the
 3 | // Lawrence Livermore National Laboratory in collaboration with University of
 4 | // Illinois Urbana-Champaign.
 5 | //
 6 | // Written by the LBANN Research Team (N. Dryden, N. Maruyama, et al.) listed in
 7 | // the CONTRIBUTORS file. <lbann-dev@llnl.gov>
 8 | //
 9 | // LLNL-CODE-756777.
10 | // All rights reserved.
11 | //
12 | // This file is part of Aluminum GPU-aware Communication Library. For details, see
13 | // http://software.llnl.gov/Aluminum or https://github.com/LLNL/Aluminum.
14 | //
15 | // Licensed under the Apache License, Version 2.0 (the "Licensee"); you
16 | // may not use this file except in compliance with the License.  You may
17 | // obtain a copy of the License at:
18 | //
19 | // http://www.apache.org/licenses/LICENSE-2.0
20 | //
21 | // Unless required by applicable law or agreed to in writing, software
22 | // distributed under the License is distributed on an "AS IS" BASIS,
23 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
24 | // implied. See the License for the specific language governing
25 | // permissions and limitations under the license.
26 | ////////////////////////////////////////////////////////////////////////////////
27 | 
28 | #pragma once
29 | 
30 | #include "aluminum/cuda/cuda.hpp"
31 | #include "aluminum/ht/communicator.hpp"
32 | #include "aluminum/ht/base_state.hpp"
33 | 
34 | #include <numeric> // std::accumulate
35 | 
36 | namespace Al {
37 | namespace internal {
38 | namespace ht {
39 | 
40 | template <typename T>
41 | class ReduceScattervAlState : public HostTransferCollectiveSignalAtEndState {
42 | public:
43 |   ReduceScattervAlState(const T* sendbuf, T* recvbuf,
44 |                         std::vector<size_t> counts_,
45 |                         ReductionOperator op_, HostTransferCommunicator& comm_,
46 |                         AlGpuStream_t stream_) :
47 |     HostTransferCollectiveSignalAtEndState(stream_),
48 |     total_size(std::accumulate(counts_.begin(), counts_.end(), size_t{0})),
49 |     host_mem(mempool.allocate<MemoryType::CUDA_PINNED_HOST, T>(total_size)),
50 |     counts(mpi::countify_size_t_vector(counts_)),
51 |     op(mpi::ReductionOperator2MPI_Op<T>(op_)),
52 |     comm(comm_.get_comm()) {
53 |     // Transfer data from device to host.
54 |     AL_CHECK_CUDA(AlGpuMemcpyAsync(host_mem, sendbuf, sizeof(T)*total_size,
55 |                                   AlGpuMemcpyDeviceToHost, stream_));
56 |     start_event.record(stream_);
57 | 
58 |     // Have the device wait on the host.
59 |     gpu_wait.wait(stream_);
60 | 
61 |     // Transfer completed buffer back to device.
62 |     AL_CHECK_CUDA(AlGpuMemcpyAsync(recvbuf, host_mem,
63 |                                   sizeof(T)*counts_[comm_.rank()],
64 |                                   AlGpuMemcpyHostToDevice, stream_));
65 |     end_event.record(stream_);
66 |   }
67 | 
68 |   ~ReduceScattervAlState() override {
69 |     mempool.release<MemoryType::CUDA_PINNED_HOST>(host_mem);
70 |   }
71 | 
72 |   std::string get_name() const override { return "HTReduceScatterv"; }
73 | 
74 | protected:
75 |   void start_mpi_op() override {
76 |     AL_MPI_LARGE_COUNT_CALL(MPI_Ireduce_scatter)(
77 |       MPI_IN_PLACE, host_mem, counts.data(),
78 |       mpi::TypeMap<T>(), op, comm, get_mpi_req());
79 |   }
80 | 
81 | private:
82 |   size_t total_size;
83 |   T* host_mem;
84 |   mpi::Al_mpi_count_vector_t counts;
85 |   MPI_Op op;
86 |   MPI_Comm comm;
87 | };
88 | 
89 | }  // namespace ht
90 | }  // namespace internal
91 | }  // namespace Al
92 | 


--------------------------------------------------------------------------------
/include/aluminum/mpi/gather.hpp:
--------------------------------------------------------------------------------
 1 | ////////////////////////////////////////////////////////////////////////////////
 2 | // Copyright (c) 2018, Lawrence Livermore National Security, LLC.  Produced at the
 3 | // Lawrence Livermore National Laboratory in collaboration with University of
 4 | // Illinois Urbana-Champaign.
 5 | //
 6 | // Written by the LBANN Research Team (N. Dryden, N. Maruyama, et al.) listed in
 7 | // the CONTRIBUTORS file. <lbann-dev@llnl.gov>
 8 | //
 9 | // LLNL-CODE-756777.
10 | // All rights reserved.
11 | //
12 | // This file is part of Aluminum GPU-aware Communication Library. For details, see
13 | // http://software.llnl.gov/Aluminum or https://github.com/LLNL/Aluminum.
14 | //
15 | // Licensed under the Apache License, Version 2.0 (the "Licensee"); you
16 | // may not use this file except in compliance with the License.  You may
17 | // obtain a copy of the License at:
18 | //
19 | // http://www.apache.org/licenses/LICENSE-2.0
20 | //
21 | // Unless required by applicable law or agreed to in writing, software
22 | // distributed under the License is distributed on an "AS IS" BASIS,
23 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
24 | // implied. See the License for the specific language governing
25 | // permissions and limitations under the license.
26 | ////////////////////////////////////////////////////////////////////////////////
27 | 
28 | #pragma once
29 | 
30 | #include "aluminum/progress.hpp"
31 | #include "aluminum/mpi/base_state.hpp"
32 | #include "aluminum/mpi/communicator.hpp"
33 | #include "aluminum/mpi/utils.hpp"
34 | 
35 | namespace Al {
36 | namespace internal {
37 | namespace mpi {
38 | 
39 | // Data is passed in recvbuf on non-root processes when in-place.
40 | template <typename T>
41 | void passthrough_gather(const T* sendbuf, T* recvbuf, size_t count, int root,
42 |                         MPICommunicator& comm) {
43 |   if (sendbuf == IN_PLACE<T>() && comm.rank() != root) {
44 |     sendbuf = recvbuf;
45 |   }
46 |   AL_MPI_LARGE_COUNT_CALL(MPI_Gather)(
47 |     buf_or_inplace(sendbuf), count, TypeMap<T>(),
48 |     recvbuf, count, TypeMap<T>(), root, comm.get_comm());
49 | }
50 | 
51 | template <typename T>
52 | class GatherAlState : public MPIState {
53 | public:
54 |   GatherAlState(const T* sendbuf_, T* recvbuf_, size_t count_, int root_,
55 |                 MPICommunicator& comm_, AlMPIReq req_) :
56 |     MPIState(req_),
57 |     sendbuf(sendbuf_), recvbuf(recvbuf_), count(count_), root(root_),
58 |     comm(comm_.get_comm()), rank(comm_.rank()) {}
59 | 
60 |   ~GatherAlState() override {}
61 | 
62 |   std::string get_name() const override { return "MPIGather"; }
63 | 
64 | protected:
65 |   void start_mpi_op() override {
66 |     if (sendbuf == IN_PLACE<T>() && rank != root) {
67 |       sendbuf = recvbuf;
68 |     }
69 |     AL_MPI_LARGE_COUNT_CALL(MPI_Igather)(
70 |       buf_or_inplace(sendbuf), count, TypeMap<T>(),
71 |       recvbuf, count, TypeMap<T>(), root, comm, get_mpi_req());
72 |   }
73 | 
74 | private:
75 |   const T* sendbuf;
76 |   T* recvbuf;
77 |   size_t count;
78 |   int root;
79 |   MPI_Comm comm;
80 |   int rank;
81 | };
82 | 
83 | // Data is passed in recvbuf on non-root processes when in-place.
84 | template <typename T>
85 | void passthrough_nb_gather(const T* sendbuf, T* recvbuf, size_t count, int root,
86 |                            MPICommunicator& comm, AlMPIReq& req) {
87 |   req = get_free_request();
88 |   internal::mpi::GatherAlState<T>* state =
89 |     new internal::mpi::GatherAlState<T>(
90 |       sendbuf, recvbuf, count, root, comm, req);
91 |   get_progress_engine()->enqueue(state);
92 | }
93 | 
94 | }  // namespace mpi
95 | }  // namespace internal
96 | }  // namespace Al
97 | 


--------------------------------------------------------------------------------
/include/aluminum/ht/allgather.hpp:
--------------------------------------------------------------------------------
 1 | ////////////////////////////////////////////////////////////////////////////////
 2 | // Copyright (c) 2018, Lawrence Livermore National Security, LLC.  Produced at the
 3 | // Lawrence Livermore National Laboratory in collaboration with University of
 4 | // Illinois Urbana-Champaign.
 5 | //
 6 | // Written by the LBANN Research Team (N. Dryden, N. Maruyama, et al.) listed in
 7 | // the CONTRIBUTORS file. <lbann-dev@llnl.gov>
 8 | //
 9 | // LLNL-CODE-756777.
10 | // All rights reserved.
11 | //
12 | // This file is part of Aluminum GPU-aware Communication Library. For details, see
13 | // http://software.llnl.gov/Aluminum or https://github.com/LLNL/Aluminum.
14 | //
15 | // Licensed under the Apache License, Version 2.0 (the "Licensee"); you
16 | // may not use this file except in compliance with the License.  You may
17 | // obtain a copy of the License at:
18 | //
19 | // http://www.apache.org/licenses/LICENSE-2.0
20 | //
21 | // Unless required by applicable law or agreed to in writing, software
22 | // distributed under the License is distributed on an "AS IS" BASIS,
23 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
24 | // implied. See the License for the specific language governing
25 | // permissions and limitations under the license.
26 | ////////////////////////////////////////////////////////////////////////////////
27 | 
28 | #pragma once
29 | 
30 | #include "aluminum/cuda/cuda.hpp"
31 | #include "aluminum/ht/communicator.hpp"
32 | #include "aluminum/ht/base_state.hpp"
33 | 
34 | namespace Al {
35 | namespace internal {
36 | namespace ht {
37 | 
38 | template <typename T>
39 | class AllgatherAlState : public HostTransferCollectiveSignalAtEndState {
40 | public:
41 |   AllgatherAlState(const T* sendbuf, T* recvbuf, size_t count_,
42 |                    HostTransferCommunicator& comm_, AlGpuStream_t stream_) :
43 |     HostTransferCollectiveSignalAtEndState(stream_),
44 |     host_mem(mempool.allocate<MemoryType::CUDA_PINNED_HOST, T>(comm_.size()*count_)),
45 |     count(count_),
46 |     comm(comm_.get_comm()) {
47 |     // Transfer data from device to host.
48 |     if (sendbuf == recvbuf) {
49 |       AL_CHECK_CUDA(AlGpuMemcpyAsync(host_mem + comm_.rank()*count,
50 |                                     sendbuf + comm_.rank()*count,
51 |                                     sizeof(T)*count, AlGpuMemcpyDeviceToHost,
52 |                                     stream_));
53 |     } else {
54 |       AL_CHECK_CUDA(AlGpuMemcpyAsync(host_mem + comm_.rank()*count,
55 |                                     sendbuf, sizeof(T)*count,
56 |                                     AlGpuMemcpyDeviceToHost, stream_));
57 |     }
58 |     start_event.record(stream_);
59 | 
60 |     // Have the device wait on the host.
61 |     gpu_wait.wait(stream_);
62 | 
63 |     // Transfer completed buffer back to device.
64 |     AL_CHECK_CUDA(AlGpuMemcpyAsync(recvbuf, host_mem,
65 |                                   sizeof(T)*count_*comm_.size(),
66 |                                   AlGpuMemcpyHostToDevice, stream_));
67 |     end_event.record(stream_);
68 |   }
69 | 
70 |   ~AllgatherAlState() override {
71 |     mempool.release<MemoryType::CUDA_PINNED_HOST>(host_mem);
72 |   }
73 | 
74 |   std::string get_name() const override { return "HTAllgather"; }
75 | 
76 | protected:
77 |   void start_mpi_op() override {
78 |     AL_MPI_LARGE_COUNT_CALL(MPI_Iallgather)(
79 |       MPI_IN_PLACE, count, mpi::TypeMap<T>(),
80 |       host_mem, count, mpi::TypeMap<T>(), comm, get_mpi_req());
81 |   }
82 | 
83 | private:
84 |   T* host_mem;
85 |   size_t count;
86 |   MPI_Comm comm;
87 | };
88 | 
89 | }  // namespace ht
90 | }  // namespace internal
91 | }  // namespace Al
92 | 


--------------------------------------------------------------------------------
/include/aluminum/cuda/streams.hpp:
--------------------------------------------------------------------------------
  1 | ////////////////////////////////////////////////////////////////////////////////
  2 | // Copyright (c) 2018, Lawrence Livermore National Security, LLC.  Produced at the
  3 | // Lawrence Livermore National Laboratory in collaboration with University of
  4 | // Illinois Urbana-Champaign.
  5 | //
  6 | // Written by the LBANN Research Team (N. Dryden, N. Maruyama, et al.) listed in
  7 | // the CONTRIBUTORS file. <lbann-dev@llnl.gov>
  8 | //
  9 | // LLNL-CODE-756777.
 10 | // All rights reserved.
 11 | //
 12 | // This file is part of Aluminum GPU-aware Communication Library. For details, see
 13 | // http://software.llnl.gov/Aluminum or https://github.com/LLNL/Aluminum.
 14 | //
 15 | // Licensed under the Apache License, Version 2.0 (the "Licensee"); you
 16 | // may not use this file except in compliance with the License.  You may
 17 | // obtain a copy of the License at:
 18 | //
 19 | // http://www.apache.org/licenses/LICENSE-2.0
 20 | //
 21 | // Unless required by applicable law or agreed to in writing, software
 22 | // distributed under the License is distributed on an "AS IS" BASIS,
 23 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 24 | // implied. See the License for the specific language governing
 25 | // permissions and limitations under the license.
 26 | ////////////////////////////////////////////////////////////////////////////////
 27 | 
 28 | #pragma once
 29 | 
 30 | #include <Al_config.hpp>
 31 | 
 32 | #include <atomic>
 33 | #include <cstddef>
 34 | #include <cstdint>
 35 | #include <vector>
 36 | #include <functional>
 37 | 
 38 | #if defined AL_HAS_ROCM
 39 | #include <hip/hip_runtime.h>
 40 | #elif defined AL_HAS_CUDA
 41 | #include <cuda_runtime.h>
 42 | #endif
 43 | 
 44 | namespace Al {
 45 | namespace internal {
 46 | namespace cuda {
 47 | 
 48 | /**
 49 |  * Manages a set of CUDA streams, accessed in round-robin order.
 50 |  *
 51 |  * Streams are either default priority or high priority.
 52 |  *
 53 |  * It is safe for multiple threads to call get_stream concurrently.
 54 |  */
 55 | class StreamPool {
 56 | public:
 57 |   /** Create pool with num_streams default and high priority streams. */
 58 |   StreamPool(size_t num_streams = 0);
 59 |   ~StreamPool();
 60 | 
 61 |   /** Explicitly allocate streams. */
 62 |   void allocate(size_t num_streams);
 63 | 
 64 |   /** Delete all streams in the pool. */
 65 |   void clear();
 66 | 
 67 |   /** Return a default-priority CUDA stream. */
 68 |   AlGpuStream_t get_stream();
 69 | 
 70 |   /**
 71 |    * Return a high-priority CUDA stream.
 72 |    *
 73 |    * If high-priority streams are not supported, returns a default-priority
 74 |    * stream.
 75 |    */
 76 |   AlGpuStream_t get_high_priority_stream();
 77 | 
 78 |   /**
 79 |    * Replace all streams in the pool with streams from an external source.
 80 |    *
 81 |    * Streams provided this way will not be freed by Aluminum.
 82 |    *
 83 |    * @param stream_getter Return the next stream to use in the pool. This
 84 |    * may be called an arbitrary number of times. It takes a boolean argument
 85 |    * for whether to return a default (false) or high (true) priority stream.
 86 |    */
 87 |   void replace_streams(std::function<AlGpuStream_t(bool)> stream_getter);
 88 | 
 89 | private:
 90 |   std::vector<AlGpuStream_t> default_streams;
 91 |   std::atomic<uint32_t> default_idx{0};
 92 |   std::vector<AlGpuStream_t> high_priority_streams;
 93 |   std::atomic<uint32_t> high_priority_idx{0};
 94 |   /** Whether streams were replaced; we do not free these streams. */
 95 |   bool external_streams = false;
 96 | };
 97 | 
 98 | /** Default internal stream pool for Aluminum. */
 99 | extern StreamPool stream_pool;
100 | 
101 | }  // namespace cuda
102 | }  // namespace internal
103 | }  // namespace Al
104 | 


--------------------------------------------------------------------------------
/src/trace.cpp:
--------------------------------------------------------------------------------
  1 | ////////////////////////////////////////////////////////////////////////////////
  2 | // Copyright (c) 2018, Lawrence Livermore National Security, LLC.  Produced at the
  3 | // Lawrence Livermore National Laboratory in collaboration with University of
  4 | // Illinois Urbana-Champaign.
  5 | //
  6 | // Written by the LBANN Research Team (N. Dryden, N. Maruyama, et al.) listed in
  7 | // the CONTRIBUTORS file. <lbann-dev@llnl.gov>
  8 | //
  9 | // LLNL-CODE-756777.
 10 | // All rights reserved.
 11 | //
 12 | // This file is part of Aluminum GPU-aware Communication Library. For details, see
 13 | // http://software.llnl.gov/Aluminum or https://github.com/LLNL/Aluminum.
 14 | //
 15 | // Licensed under the Apache License, Version 2.0 (the "Licensee"); you
 16 | // may not use this file except in compliance with the License.  You may
 17 | // obtain a copy of the License at:
 18 | //
 19 | // http://www.apache.org/licenses/LICENSE-2.0
 20 | //
 21 | // Unless required by applicable law or agreed to in writing, software
 22 | // distributed under the License is distributed on an "AS IS" BASIS,
 23 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 24 | // implied. See the License for the specific language governing
 25 | // permissions and limitations under the license.
 26 | ////////////////////////////////////////////////////////////////////////////////
 27 | 
 28 | #include <Al_config.hpp>
 29 | #include "aluminum/trace.hpp"
 30 | 
 31 | #include <unistd.h>
 32 | #include <limits.h>
 33 | 
 34 | #include <fstream>
 35 | #include <vector>
 36 | #ifdef AL_THREAD_MULTIPLE
 37 | #include <mutex>
 38 | #endif
 39 | 
 40 | #include "aluminum/state.hpp"
 41 | 
 42 | namespace Al {
 43 | namespace internal {
 44 | namespace trace {
 45 | 
 46 | namespace {
 47 | #ifdef AL_THREAD_MULTIPLE
 48 | std::mutex log_mutex;
 49 | #endif
 50 | std::vector<std::string> trace_log;
 51 | std::vector<std::string> pe_trace_log;
 52 | }
 53 | 
 54 | void save_trace_entry(std::string entry, bool progress) {
 55 | #ifdef AL_THREAD_MULTIPLE
 56 |   std::lock_guard<std::mutex> lock(log_mutex);
 57 | #endif
 58 |   if (progress) {
 59 |     pe_trace_log.push_back(std::move(entry));
 60 |   } else {
 61 |     trace_log.push_back(std::move(entry));
 62 |   }
 63 | }
 64 | 
 65 | void record_pe_start(const AlState& state) {
 66 | #ifdef AL_TRACE
 67 |   std::stringstream ss;
 68 |   ss << get_time() << ": PE START "
 69 |      << state.get_name() << " "
 70 |      << state.get_desc();
 71 |   save_trace_entry(ss.str(), true);
 72 | #else
 73 |   (void) state;
 74 | #endif
 75 | }
 76 | 
 77 | void record_pe_done(const AlState& state) {
 78 | #ifdef AL_TRACE
 79 |   std::stringstream ss;
 80 |   ss << get_time() << ": PE DONE "
 81 |      << state.get_name() << " "
 82 |      << state.get_desc();
 83 |   save_trace_entry(ss.str(), true);
 84 | #else
 85 |   (void) state;
 86 | #endif
 87 | }
 88 | 
 89 | std::ostream& write_trace_log(std::ostream& os) {
 90 | #ifdef AL_TRACE
 91 | #ifdef AL_THREAD_MULTIPLE
 92 |   std::lock_guard<std::mutex> lock(log_mutex);
 93 | #endif
 94 |   os << "Trace:\n";
 95 |   for (const auto& entry : trace_log) os << entry << "\n";
 96 |   os << "Progress engine trace:\n";
 97 |   for (const auto& entry : pe_trace_log) os << entry << "\n";
 98 |   return os;
 99 | #else
100 |   return os;
101 | #endif
102 | }
103 | 
104 | void write_trace_to_file() {
105 | #ifdef AL_TRACE
106 |   char hostname[HOST_NAME_MAX];
107 |   gethostname(hostname, HOST_NAME_MAX);
108 |   pid_t pid = getpid();
109 |   std::string filename = std::string(hostname) + "." + std::to_string(pid)
110 |     + ".trace.txt";
111 |   std::ofstream trace_file(filename);
112 |   write_trace_log(trace_file);
113 | #endif
114 | }
115 | 
116 | }  // namespace trace
117 | }  // namespace internal
118 | }  // namespace Al
119 | 


--------------------------------------------------------------------------------
/include/aluminum/utils/locked_resource_pool.hpp:
--------------------------------------------------------------------------------
  1 | ////////////////////////////////////////////////////////////////////////////////
  2 | // Copyright (c) 2018, Lawrence Livermore National Security, LLC.  Produced at the
  3 | // Lawrence Livermore National Laboratory in collaboration with University of
  4 | // Illinois Urbana-Champaign.
  5 | //
  6 | // Written by the LBANN Research Team (N. Dryden, N. Maruyama, et al.) listed in
  7 | // the CONTRIBUTORS file. <lbann-dev@llnl.gov>
  8 | //
  9 | // LLNL-CODE-756777.
 10 | // All rights reserved.
 11 | //
 12 | // This file is part of Aluminum GPU-aware Communication Library. For details, see
 13 | // http://software.llnl.gov/Aluminum or https://github.com/LLNL/Aluminum.
 14 | //
 15 | // Licensed under the Apache License, Version 2.0 (the "Licensee"); you
 16 | // may not use this file except in compliance with the License.  You may
 17 | // obtain a copy of the License at:
 18 | //
 19 | // http://www.apache.org/licenses/LICENSE-2.0
 20 | //
 21 | // Unless required by applicable law or agreed to in writing, software
 22 | // distributed under the License is distributed on an "AS IS" BASIS,
 23 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 24 | // implied. See the License for the specific language governing
 25 | // permissions and limitations under the license.
 26 | ////////////////////////////////////////////////////////////////////////////////
 27 | 
 28 | #pragma once
 29 | 
 30 | #include <Al_config.hpp>
 31 | 
 32 | #include <stddef.h>
 33 | #include <mutex>
 34 | #include <vector>
 35 | 
 36 | namespace Al {
 37 | namespace internal {
 38 | 
 39 | /**
 40 |  * Provides thread-safe access to a set of identical resources.
 41 |  *
 42 |  * These resources could be things like a fixed-size chunk of memory.
 43 |  *
 44 |  * ResourceAllocator must provide allocate and deallocate methods to
 45 |  * create and destroy (resp.) the managed resource (T).
 46 |  *
 47 |  * LockedResourcePool will guarantee that each instance of the
 48 |  * ResourceAllocator will be accessed by only a single thread at a
 49 |  * time. If additional locking is necessary for correctness, the
 50 |  * ResourceAllocator must provide it.
 51 |  */
 52 | template <typename T, typename ResourceAllocator>
 53 | class LockedResourcePool {
 54 | public:
 55 |   /** Initialize the resource pool. */
 56 |   LockedResourcePool(){};
 57 | 
 58 |   ~LockedResourcePool() {
 59 |     clear();
 60 |   }
 61 | 
 62 |   /** Preallocate this many instances of the resource. */
 63 |   void preallocate(size_t prealloc) {
 64 |     for (size_t i = 0; i < prealloc; ++i) {
 65 |       resources.push_back(allocator.allocate());
 66 |     }
 67 |   }
 68 | 
 69 |   /** Get one instance of the resource. */
 70 |   T get() {
 71 |     std::lock_guard<std::mutex> lg(lock);
 72 |     if (resources.empty()) {
 73 |       return allocator.allocate();
 74 |     } else {
 75 |       T resource = resources.back();
 76 |       resources.pop_back();
 77 |       return resource;
 78 |     }
 79 |   }
 80 | 
 81 |   /** Return an instance of the resource to the pool. */
 82 |   void release(T resource) {
 83 |     std::lock_guard<std::mutex> lg(lock);
 84 |     resources.push_back(resource);
 85 |   }
 86 | 
 87 |   /** Clear all instances left in the pool. */
 88 |   void clear() {
 89 |     std::lock_guard<std::mutex> lg(lock);
 90 |     for (auto&& resource : resources) {
 91 |       allocator.deallocate(resource);
 92 |     }
 93 |     resources.clear();
 94 |   }
 95 | 
 96 | private:
 97 |   /** Protects access to allocator and resource. */
 98 |   std::mutex lock;
 99 |   /** Allocator for the resource. */
100 |   ResourceAllocator allocator;
101 |   /** Currently available resources. */
102 |   std::vector<T> resources;
103 | };
104 | 
105 | }  // namespace internal
106 | }  // namespace Al
107 | 


--------------------------------------------------------------------------------
/include/aluminum/mpi/communicator.hpp:
--------------------------------------------------------------------------------
  1 | ////////////////////////////////////////////////////////////////////////////////
  2 | // Copyright (c) 2018, Lawrence Livermore National Security, LLC.  Produced at the
  3 | // Lawrence Livermore National Laboratory in collaboration with University of
  4 | // Illinois Urbana-Champaign.
  5 | //
  6 | // Written by the LBANN Research Team (N. Dryden, N. Maruyama, et al.) listed in
  7 | // the CONTRIBUTORS file. <lbann-dev@llnl.gov>
  8 | //
  9 | // LLNL-CODE-756777.
 10 | // All rights reserved.
 11 | //
 12 | // This file is part of Aluminum GPU-aware Communication Library. For details, see
 13 | // http://software.llnl.gov/Aluminum or https://github.com/LLNL/Aluminum.
 14 | //
 15 | // Licensed under the Apache License, Version 2.0 (the "Licensee"); you
 16 | // may not use this file except in compliance with the License.  You may
 17 | // obtain a copy of the License at:
 18 | //
 19 | // http://www.apache.org/licenses/LICENSE-2.0
 20 | //
 21 | // Unless required by applicable law or agreed to in writing, software
 22 | // distributed under the License is distributed on an "AS IS" BASIS,
 23 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 24 | // implied. See the License for the specific language governing
 25 | // permissions and limitations under the license.
 26 | ////////////////////////////////////////////////////////////////////////////////
 27 | 
 28 | #pragma once
 29 | 
 30 | #include <mpi.h>
 31 | #include "aluminum/mpi_comm_and_stream_wrapper.hpp"
 32 | 
 33 | namespace Al {
 34 | namespace internal {
 35 | namespace mpi {
 36 | 
 37 | int get_max_tag();
 38 | 
 39 | // Define the tag that point-to-point operations will use.
 40 | constexpr int pt2pt_tag = 2;
 41 | 
 42 | // Forward-declare.
 43 | class MPICommunicator;
 44 | 
 45 | /** Retrieve Aluminum's world MPI communicator. */
 46 | const MPICommunicator& get_world_comm();
 47 | 
 48 | /** Communicator for MPI-based operations. */
 49 | class MPICommunicator : public MPICommAndStreamWrapper<int> {
 50 |  public:
 51 |   /** Default constructor; use Aluminum's world. */
 52 |   MPICommunicator() : MPICommunicator(get_world_comm().get_comm()) {}
 53 |   /**
 54 |    * Use a particular MPI communicator and stream.
 55 |    *
 56 |    * The MPI backend currently ignores streams.
 57 |    */
 58 |   MPICommunicator(MPI_Comm comm_, int = 0) :
 59 |     MPICommAndStreamWrapper<int>(comm_, 0) {}
 60 |   /** Cannot copy this. */
 61 |   MPICommunicator(const MPICommunicator& other) = delete;
 62 |   /** Default move constructor. */
 63 |   MPICommunicator(MPICommunicator&& other) = default;
 64 |   /** Cannot copy this. */
 65 |   MPICommunicator& operator=(MPICommunicator& other) = delete;
 66 |   /** Default move assignment operator. */
 67 |   MPICommunicator& operator=(MPICommunicator&& other) = default;
 68 |   ~MPICommunicator() {}
 69 | 
 70 |   /** Create a new MPICommunicator with the same processes. */
 71 |   MPICommunicator copy(int stream = 0) const {
 72 |     return MPICommunicator(get_comm(), stream);
 73 |   }
 74 | 
 75 |   /**
 76 |    * Return the next free tag on this communicator.
 77 |    *
 78 |    * TODO: This is meant for internal use and should be moved / eliminted.
 79 |    */
 80 |   int get_free_tag() {
 81 |     int tag = free_tag++;
 82 |     if (free_tag >= internal::mpi::get_max_tag()
 83 |         || free_tag < starting_free_tag) {
 84 |       free_tag = starting_free_tag;
 85 |     }
 86 |     return tag;
 87 |   }
 88 | 
 89 |  private:
 90 |   /**
 91 |    * Starting tag to use for non-blocking operations.
 92 |    * No other operations should use any tag >= to this one.
 93 |    */
 94 |   static constexpr int starting_free_tag = 10;
 95 |   /** Free tag for communication. */
 96 |   int free_tag = starting_free_tag;
 97 | };
 98 | 
 99 | } // namespace mpi
100 | } // namespace internal
101 | } // namespace Al
102 | 


--------------------------------------------------------------------------------
/include/aluminum/mpi/scatter.hpp:
--------------------------------------------------------------------------------
  1 | ////////////////////////////////////////////////////////////////////////////////
  2 | // Copyright (c) 2018, Lawrence Livermore National Security, LLC.  Produced at the
  3 | // Lawrence Livermore National Laboratory in collaboration with University of
  4 | // Illinois Urbana-Champaign.
  5 | //
  6 | // Written by the LBANN Research Team (N. Dryden, N. Maruyama, et al.) listed in
  7 | // the CONTRIBUTORS file. <lbann-dev@llnl.gov>
  8 | //
  9 | // LLNL-CODE-756777.
 10 | // All rights reserved.
 11 | //
 12 | // This file is part of Aluminum GPU-aware Communication Library. For details, see
 13 | // http://software.llnl.gov/Aluminum or https://github.com/LLNL/Aluminum.
 14 | //
 15 | // Licensed under the Apache License, Version 2.0 (the "Licensee"); you
 16 | // may not use this file except in compliance with the License.  You may
 17 | // obtain a copy of the License at:
 18 | //
 19 | // http://www.apache.org/licenses/LICENSE-2.0
 20 | //
 21 | // Unless required by applicable law or agreed to in writing, software
 22 | // distributed under the License is distributed on an "AS IS" BASIS,
 23 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 24 | // implied. See the License for the specific language governing
 25 | // permissions and limitations under the license.
 26 | ////////////////////////////////////////////////////////////////////////////////
 27 | 
 28 | #pragma once
 29 | 
 30 | #include "aluminum/progress.hpp"
 31 | #include "aluminum/mpi/base_state.hpp"
 32 | #include "aluminum/mpi/communicator.hpp"
 33 | #include "aluminum/mpi/utils.hpp"
 34 | 
 35 | namespace Al {
 36 | namespace internal {
 37 | namespace mpi {
 38 | 
 39 | // Data is passed in recvbuf on root processes when in-place.
 40 | template <typename T>
 41 | void passthrough_scatter(const T* sendbuf, T* recvbuf, size_t count, int root,
 42 |                          MPICommunicator& comm) {
 43 |   if (sendbuf == IN_PLACE<T>() && comm.rank() == root) {
 44 |     sendbuf = recvbuf;
 45 |     recvbuf = IN_PLACE<T>();
 46 |   }
 47 |   AL_MPI_LARGE_COUNT_CALL(MPI_Scatter)(
 48 |     sendbuf, count, TypeMap<T>(),
 49 |     buf_or_inplace(recvbuf), count, TypeMap<T>(),
 50 |     root, comm.get_comm());
 51 | }
 52 | 
 53 | template <typename T>
 54 | class ScatterAlState : public MPIState {
 55 | public:
 56 |   ScatterAlState(const T* sendbuf_, T* recvbuf_, size_t count_, int root_,
 57 |                  MPICommunicator& comm_, AlMPIReq req_) :
 58 |     MPIState(req_),
 59 |     sendbuf(sendbuf_), recvbuf(recvbuf_), count(count_), root(root_),
 60 |     comm(comm_.get_comm()), rank(comm_.rank()) {}
 61 | 
 62 |   ~ScatterAlState() override {}
 63 | 
 64 |   std::string get_name() const override { return "MPIScatter"; }
 65 | 
 66 | protected:
 67 |   void start_mpi_op() override {
 68 |     if (sendbuf == IN_PLACE<T>() && rank == root) {
 69 |       sendbuf = recvbuf;
 70 |       recvbuf = IN_PLACE<T>();
 71 |     }
 72 |     AL_MPI_LARGE_COUNT_CALL(MPI_Iscatter)(
 73 |       sendbuf, count, TypeMap<T>(),
 74 |       buf_or_inplace(recvbuf), count, TypeMap<T>(),
 75 |       root, comm, get_mpi_req());
 76 |   }
 77 | 
 78 | private:
 79 |   const T* sendbuf;
 80 |   T* recvbuf;
 81 |   size_t count;
 82 |   int root;
 83 |   MPI_Comm comm;
 84 |   int rank;
 85 | };
 86 | 
 87 | // When in-place, it is recvbuf that uses IN_PLACE.
 88 | template <typename T>
 89 | void passthrough_nb_scatter(const T* sendbuf, T* recvbuf, size_t count,
 90 |                             int root, MPICommunicator& comm, AlMPIReq& req) {
 91 |   req = get_free_request();
 92 |   internal::mpi::ScatterAlState<T>* state =
 93 |     new internal::mpi::ScatterAlState<T>(
 94 |       sendbuf, recvbuf, count, root, comm, req);
 95 |   get_progress_engine()->enqueue(state);
 96 | }
 97 | 
 98 | }  // namespace mpi
 99 | }  // namespace internal
100 | }  // namespace Al
101 | 


--------------------------------------------------------------------------------
/cmake/tuning_params.hpp.in:
--------------------------------------------------------------------------------
 1 | ////////////////////////////////////////////////////////////////////////////////
 2 | // Copyright (c) 2018, Lawrence Livermore National Security, LLC.  Produced at the
 3 | // Lawrence Livermore National Laboratory in collaboration with University of
 4 | // Illinois Urbana-Champaign.
 5 | //
 6 | // Written by the LBANN Research Team (N. Dryden, N. Maruyama, et al.) listed in
 7 | // the CONTRIBUTORS file. <lbann-dev@llnl.gov>
 8 | //
 9 | // LLNL-CODE-756777.
10 | // All rights reserved.
11 | //
12 | // This file is part of Aluminum GPU-aware Communication Library. For details, see
13 | // http://software.llnl.gov/Aluminum or https://github.com/LLNL/Aluminum.
14 | //
15 | // Licensed under the Apache License, Version 2.0 (the "Licensee"); you
16 | // may not use this file except in compliance with the License.  You may
17 | // obtain a copy of the License at:
18 | //
19 | // http://www.apache.org/licenses/LICENSE-2.0
20 | //
21 | // Unless required by applicable law or agreed to in writing, software
22 | // distributed under the License is distributed on an "AS IS" BASIS,
23 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
24 | // implied. See the License for the specific language governing
25 | // permissions and limitations under the license.
26 | ////////////////////////////////////////////////////////////////////////////////
27 | 
28 | /**
29 |  * These are used to tune various algorithmic choices.
30 |  * You should probably choose them based on benchmarks for your particular
31 |  * configuration.
32 |  */
33 | #pragma once
34 | 
35 | /** Number of concurrent operations the progress engine will perform. */
36 | #define AL_PE_NUM_CONCURRENT_OPS @AL_PE_NUM_CONCURRENT_OPS@
37 | /** Max number of streams the progress engine supports. */
38 | #define AL_PE_NUM_STREAMS @AL_PE_NUM_STREAMS@
39 | /** Max number of pipeline stages the progress engine supports. */
40 | #define AL_PE_NUM_PIPELINE_STAGES @AL_PE_NUM_PIPELINE_STAGES@
41 | /** Max number of entries in each stream's input queue. */
42 | #define AL_PE_INPUT_QUEUE_SIZE @AL_PE_INPUT_QUEUE_SIZE@
43 | /**
44 |  * Whether to have a default stream entry for the progress engine
45 |  * added automatically.
46 |  *
47 |  * This makes sense when using MPI, but not so when using the
48 |  * host-transfer backend, which does not use the default stream.
49 |  */
50 | #cmakedefine AL_PE_ADD_DEFAULT_STREAM
51 | /**
52 |  * Whether to use a thread-local cache to map streams to input queues
53 |  * for the progress engine.
54 |  *
55 |  * If you expect to have only a small number of streams, using a cache
56 |  * is unlikely to help, since searching it will take as long as
57 |  * searching the actual list.
58 |  */
59 | #cmakedefine AL_PE_STREAM_QUEUE_CACHE
60 | 
61 | /**
62 |  * Whether to delay starting the progress engine until it is actually
63 |  * needed. This results in a one-time penalty on the first call to an
64 |  * operation that uses the progress engine, but only a quick check
65 |  * thereafter.
66 |  */
67 | #cmakedefine AL_PE_START_ON_DEMAND
68 | 
69 | /** Amount of sync object memory to preallocate in the pool. */
70 | #define AL_SYNC_MEM_PREALLOC @AL_SYNC_MEM_PREALLOC@
71 | 
72 | /**
73 |  * Cache line size in bytes.
74 |  *
75 |  * On x86 this is usually 64. On POWER this is 128. On A64FX this is 256.
76 |  */
77 | #define AL_CACHE_LINE_SIZE @AL_CACHE_LINE_SIZE@
78 | 
79 | /**
80 |  * Minimum size in bytes to avoid destructive interference.
81 |  *
82 |  * This is generally AL_CACHE_LINE_SIZE, except on x86, where it should
83 |  * be twice the cache line size, because Intel processors can fetch
84 |  * two adjacent cache lines (see Intel Optimization Manual, 3.7.3).
85 |  */
86 | #define AL_DESTRUCTIVE_INTERFERENCE_SIZE @AL_DESTRUCTIVE_INTERFERENCE_SIZE@
87 | 
88 | /** Number of CUDA streams in the default stream pool. */
89 | #define AL_CUDA_STREAM_POOL_SIZE @AL_CUDA_STREAM_POOL_SIZE@
90 | 


--------------------------------------------------------------------------------
/include/aluminum/base.hpp:
--------------------------------------------------------------------------------
  1 | ////////////////////////////////////////////////////////////////////////////////
  2 | // Copyright (c) 2018, Lawrence Livermore National Security, LLC.  Produced at the
  3 | // Lawrence Livermore National Laboratory in collaboration with University of
  4 | // Illinois Urbana-Champaign.
  5 | //
  6 | // Written by the LBANN Research Team (N. Dryden, N. Maruyama, et al.) listed in
  7 | // the CONTRIBUTORS file. <lbann-dev@llnl.gov>
  8 | //
  9 | // LLNL-CODE-756777.
 10 | // All rights reserved.
 11 | //
 12 | // This file is part of Aluminum GPU-aware Communication Library. For details, see
 13 | // http://software.llnl.gov/Aluminum or https://github.com/LLNL/Aluminum.
 14 | //
 15 | // Licensed under the Apache License, Version 2.0 (the "Licensee"); you
 16 | // may not use this file except in compliance with the License.  You may
 17 | // obtain a copy of the License at:
 18 | //
 19 | // http://www.apache.org/licenses/LICENSE-2.0
 20 | //
 21 | // Unless required by applicable law or agreed to in writing, software
 22 | // distributed under the License is distributed on an "AS IS" BASIS,
 23 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 24 | // implied. See the License for the specific language governing
 25 | // permissions and limitations under the license.
 26 | ////////////////////////////////////////////////////////////////////////////////
 27 | 
 28 | #pragma once
 29 | 
 30 | #include <exception>
 31 | #include <iostream>
 32 | #include <sstream>
 33 | #include <string>
 34 | 
 35 | /** HOST_NAME_MAX is a linux only define */
 36 | #ifndef HOST_NAME_MAX
 37 | # if defined(_POSIX_HOST_NAME_MAX)
 38 | #  define HOST_NAME_MAX _POSIX_HOST_NAME_MAX
 39 | # elif defined(MAXHOSTNAMELEN)
 40 | #  define HOST_NAME_MAX MAXHOSTNAMELEN
 41 | # endif
 42 | #endif /* HOST_NAME_MAX */
 43 | 
 44 | /** Intentionally ignore results of [[nodiscard]] functions. */
 45 | #define AL_IGNORE_NODISCARD(fcall) static_cast<void>((fcall))
 46 | 
 47 | namespace Al {
 48 | 
 49 | /**
 50 |  * Base Aluminum exception class.
 51 |  */
 52 | class al_exception : public std::exception {
 53 |  public:
 54 |   al_exception(const std::string m, const std::string f, const int l) :
 55 |     msg(m), file(f), line(l) {
 56 |     err = file + ":" + std::to_string(line) + " - " + msg;
 57 |   }
 58 |   const char* what() const noexcept override {
 59 |     return err.c_str();
 60 |   }
 61 | private:
 62 |   /** Exception message. */
 63 |   const std::string msg;
 64 |   /** File exception occurred in. */
 65 |   const std::string file;
 66 |   /** Line exception occurred at. */
 67 |   const int line;
 68 |   /** Constructed error message. */
 69 |   std::string err;
 70 | };
 71 | 
 72 | /**
 73 |  * Construct a single string from concatenating all arguments.
 74 |  *
 75 |  * Arguments must support operator<<.
 76 |  */
 77 | template <typename... Args>
 78 | std::string build_string(Args&&... args) {
 79 |   std::ostringstream oss;
 80 |   (oss << ... << args);
 81 |   return oss.str();
 82 | }
 83 | 
 84 | /** Throw an Aluminum excpetion. */
 85 | #define throw_al_exception(...) \
 86 |   throw Al::al_exception(Al::build_string(__VA_ARGS__), __FILE__, __LINE__)
 87 | 
 88 | /**
 89 |  * Output an error and then terminate Aluminum.
 90 |  *
 91 |  * This is primarily useful for handling errors in destructors.
 92 |  */
 93 | #define terminate_al(...)                       \
 94 |   do {                                          \
 95 |     std::cerr << __FILE__ << ":"                \
 96 |               << __LINE__ << " - "              \
 97 |               << Al::build_string(__VA_ARGS__)  \
 98 |               << std::endl;                     \
 99 |     std::terminate();                           \
100 |   } while (0)
101 | 
102 | /** Predefined reduction operations. */
103 | enum class ReductionOperator {
104 |   sum, prod, min, max, lor, land, lxor, bor, band, bxor, avg
105 | };
106 | 
107 | } // namespace Al
108 | 


--------------------------------------------------------------------------------
/include/aluminum/mpi/reduce.hpp:
--------------------------------------------------------------------------------
  1 | ////////////////////////////////////////////////////////////////////////////////
  2 | // Copyright (c) 2018, Lawrence Livermore National Security, LLC.  Produced at the
  3 | // Lawrence Livermore National Laboratory in collaboration with University of
  4 | // Illinois Urbana-Champaign.
  5 | //
  6 | // Written by the LBANN Research Team (N. Dryden, N. Maruyama, et al.) listed in
  7 | // the CONTRIBUTORS file. <lbann-dev@llnl.gov>
  8 | //
  9 | // LLNL-CODE-756777.
 10 | // All rights reserved.
 11 | //
 12 | // This file is part of Aluminum GPU-aware Communication Library. For details, see
 13 | // http://software.llnl.gov/Aluminum or https://github.com/LLNL/Aluminum.
 14 | //
 15 | // Licensed under the Apache License, Version 2.0 (the "Licensee"); you
 16 | // may not use this file except in compliance with the License.  You may
 17 | // obtain a copy of the License at:
 18 | //
 19 | // http://www.apache.org/licenses/LICENSE-2.0
 20 | //
 21 | // Unless required by applicable law or agreed to in writing, software
 22 | // distributed under the License is distributed on an "AS IS" BASIS,
 23 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 24 | // implied. See the License for the specific language governing
 25 | // permissions and limitations under the license.
 26 | ////////////////////////////////////////////////////////////////////////////////
 27 | 
 28 | #pragma once
 29 | 
 30 | #include "aluminum/progress.hpp"
 31 | #include "aluminum/mpi/base_state.hpp"
 32 | #include "aluminum/mpi/communicator.hpp"
 33 | #include "aluminum/mpi/utils.hpp"
 34 | 
 35 | namespace Al {
 36 | namespace internal {
 37 | namespace mpi {
 38 | 
 39 | // Data is passed on recvbuf on non-root processes when in-place.
 40 | template <typename T>
 41 | void passthrough_reduce(const T* sendbuf, T* recvbuf, size_t count,
 42 |                         ReductionOperator op, int root, MPICommunicator& comm) {
 43 |   if (sendbuf == IN_PLACE<T>() && comm.rank() != root) {
 44 |     sendbuf = recvbuf;
 45 |   }
 46 |   AL_MPI_LARGE_COUNT_CALL(MPI_Reduce)(
 47 |     buf_or_inplace(sendbuf), recvbuf, count, TypeMap<T>(),
 48 |     ReductionOperator2MPI_Op<T>(op), root, comm.get_comm());
 49 | }
 50 | 
 51 | template <typename T>
 52 | class ReduceAlState : public MPIState {
 53 | public:
 54 |   ReduceAlState(const T* sendbuf_, T* recvbuf_, size_t count_, ReductionOperator op_,
 55 |                 int root_, MPICommunicator& comm_, AlMPIReq req_) :
 56 |     MPIState(req_),
 57 |     sendbuf(sendbuf_), recvbuf(recvbuf_), count(count_),
 58 |     op(ReductionOperator2MPI_Op<T>(op_)), root(root_),
 59 |     comm(comm_.get_comm()), rank(comm_.rank()) {}
 60 | 
 61 |   ~ReduceAlState() override {}
 62 | 
 63 |   std::string get_name() const override { return "MPIReduce"; }
 64 | 
 65 | protected:
 66 |   void start_mpi_op() override {
 67 |     if (sendbuf == IN_PLACE<T>() && rank != root) {
 68 |       sendbuf = recvbuf;
 69 |     }
 70 |     AL_MPI_LARGE_COUNT_CALL(MPI_Ireduce)(
 71 |       buf_or_inplace(sendbuf), recvbuf, count, TypeMap<T>(),
 72 |       op, root, comm, get_mpi_req());
 73 |   }
 74 | 
 75 | private:
 76 |   const T* sendbuf;
 77 |   T* recvbuf;
 78 |   size_t count;
 79 |   MPI_Op op;
 80 |   int root;
 81 |   MPI_Comm comm;
 82 |   int rank;
 83 | };
 84 | 
 85 | // Data is passed in recvbuf on non-root processes when in-place.
 86 | template <typename T>
 87 | void passthrough_nb_reduce(const T* sendbuf, T* recvbuf, size_t count,
 88 |                            ReductionOperator op, int root,
 89 |                            MPICommunicator& comm, AlMPIReq& req) {
 90 |   req = get_free_request();
 91 |   internal::mpi::ReduceAlState<T>* state =
 92 |     new internal::mpi::ReduceAlState<T>(
 93 |       sendbuf, recvbuf, count, op, root, comm, req);
 94 |   get_progress_engine()->enqueue(state);
 95 | }
 96 | 
 97 | }  // namespace mpi
 98 | }  // namespace internal
 99 | }  // namespace Al
100 | 


--------------------------------------------------------------------------------
/src/cuda/gpu_status_flag.cpp:
--------------------------------------------------------------------------------
  1 | ////////////////////////////////////////////////////////////////////////////////
  2 | // Copyright (c) 2018, Lawrence Livermore National Security, LLC.  Produced at the
  3 | // Lawrence Livermore National Laboratory in collaboration with University of
  4 | // Illinois Urbana-Champaign.
  5 | //
  6 | // Written by the LBANN Research Team (N. Dryden, N. Maruyama, et al.) listed in
  7 | // the CONTRIBUTORS file. <lbann-dev@llnl.gov>
  8 | //
  9 | // LLNL-CODE-756777.
 10 | // All rights reserved.
 11 | //
 12 | // This file is part of Aluminum GPU-aware Communication Library. For details, see
 13 | // http://software.llnl.gov/Aluminum or https://github.com/LLNL/Aluminum.
 14 | //
 15 | // Licensed under the Apache License, Version 2.0 (the "Licensee"); you
 16 | // may not use this file except in compliance with the License.  You may
 17 | // obtain a copy of the License at:
 18 | //
 19 | // http://www.apache.org/licenses/LICENSE-2.0
 20 | //
 21 | // Unless required by applicable law or agreed to in writing, software
 22 | // distributed under the License is distributed on an "AS IS" BASIS,
 23 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 24 | // implied. See the License for the specific language governing
 25 | // permissions and limitations under the license.
 26 | ////////////////////////////////////////////////////////////////////////////////
 27 | 
 28 | #include "aluminum/cuda/gpu_status_flag.hpp"
 29 | 
 30 | #include "aluminum/cuda/cuda.hpp"
 31 | #include "aluminum/cuda/sync_memory.hpp"
 32 | #include "aluminum/cuda/events.hpp"
 33 | 
 34 | namespace Al {
 35 | namespace internal {
 36 | namespace cuda {
 37 | 
 38 | GPUStatusFlag::GPUStatusFlag() {
 39 |   if (stream_memory_operations_supported()) {
 40 |     stream_mem.sync_event = sync_pool.get();
 41 |     // Initialize to completed to match CUDA event semantics.
 42 |     __atomic_store_n(stream_mem.sync_event, 1, __ATOMIC_SEQ_CST);
 43 | #if defined AL_HAS_ROCM
 44 |     AL_CHECK_CUDA(hipHostGetDevicePointer(
 45 |                     &stream_mem.sync_event_dev_ptr,
 46 |                     stream_mem.sync_event, 0));
 47 | #elif defined AL_HAS_CUDA
 48 |     AL_CHECK_CUDA_DRV(cuMemHostGetDevicePointer(
 49 |                         &stream_mem.sync_event_dev_ptr,
 50 |                         stream_mem.sync_event, 0));
 51 | #endif
 52 |   } else {
 53 |     plain_event = event_pool.get();
 54 |   }
 55 | }
 56 | 
 57 | GPUStatusFlag::~GPUStatusFlag() {
 58 |   if (stream_memory_operations_supported()) {
 59 |     sync_pool.release(stream_mem.sync_event);
 60 |   } else {
 61 |     event_pool.release(plain_event);
 62 |   }
 63 | }
 64 | 
 65 | void GPUStatusFlag::record(AlGpuStream_t stream) {
 66 |   if (stream_memory_operations_supported()) {
 67 |     // We cannot use std::atomic because we need the actual address of
 68 |     // the memory.
 69 |     __atomic_store_n(stream_mem.sync_event, 0, __ATOMIC_SEQ_CST);
 70 | #if defined AL_HAS_ROCM
 71 |     AL_CHECK_CUDA(hipStreamWriteValue32(
 72 |                     stream, stream_mem.sync_event_dev_ptr, 1,
 73 |                     0));
 74 | #elif defined AL_HAS_CUDA
 75 |     AL_CHECK_CUDA_DRV(cuStreamWriteValue32(
 76 |                         stream, stream_mem.sync_event_dev_ptr, 1,
 77 |                         CU_STREAM_WRITE_VALUE_DEFAULT));
 78 | #endif
 79 |   } else {
 80 |     AL_CHECK_CUDA(AlGpuEventRecord(plain_event, stream));
 81 |   }
 82 | }
 83 | 
 84 | bool GPUStatusFlag::query() {
 85 |   if (stream_memory_operations_supported()) {
 86 |     return __atomic_load_n(stream_mem.sync_event, __ATOMIC_SEQ_CST);
 87 |   } else {
 88 |     auto r = AlGpuEventQuery(plain_event);
 89 |     if (r == AlGpuSuccess) {
 90 |       return true;
 91 |     } else if (r != AlGpuErrorNotReady) {
 92 |       AL_CHECK_CUDA(r);
 93 |       return false;  // Never reached.
 94 |     } else {
 95 |       return false;
 96 |     }
 97 |   }
 98 | }
 99 | 
100 | }  // namespace cuda
101 | }  // namespace internal
102 | }  // namespace Al
103 | 


--------------------------------------------------------------------------------
/include/aluminum/mpi/reduce_scatterv.hpp:
--------------------------------------------------------------------------------
  1 | ////////////////////////////////////////////////////////////////////////////////
  2 | // Copyright (c) 2018, Lawrence Livermore National Security, LLC.  Produced at the
  3 | // Lawrence Livermore National Laboratory in collaboration with University of
  4 | // Illinois Urbana-Champaign.
  5 | //
  6 | // Written by the LBANN Research Team (N. Dryden, N. Maruyama, et al.) listed in
  7 | // the CONTRIBUTORS file. <lbann-dev@llnl.gov>
  8 | //
  9 | // LLNL-CODE-756777.
 10 | // All rights reserved.
 11 | //
 12 | // This file is part of Aluminum GPU-aware Communication Library. For details, see
 13 | // http://software.llnl.gov/Aluminum or https://github.com/LLNL/Aluminum.
 14 | //
 15 | // Licensed under the Apache License, Version 2.0 (the "Licensee"); you
 16 | // may not use this file except in compliance with the License.  You may
 17 | // obtain a copy of the License at:
 18 | //
 19 | // http://www.apache.org/licenses/LICENSE-2.0
 20 | //
 21 | // Unless required by applicable law or agreed to in writing, software
 22 | // distributed under the License is distributed on an "AS IS" BASIS,
 23 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 24 | // implied. See the License for the specific language governing
 25 | // permissions and limitations under the license.
 26 | ////////////////////////////////////////////////////////////////////////////////
 27 | 
 28 | #pragma once
 29 | 
 30 | #include "aluminum/progress.hpp"
 31 | #include "aluminum/mpi/base_state.hpp"
 32 | #include "aluminum/mpi/communicator.hpp"
 33 | #include "aluminum/mpi/utils.hpp"
 34 | 
 35 | namespace Al {
 36 | namespace internal {
 37 | namespace mpi {
 38 | 
 39 | template <typename T>
 40 | void passthrough_reduce_scatterv(const T* sendbuf, T* recvbuf,
 41 |                                  std::vector<size_t> counts,
 42 |                                  ReductionOperator op,
 43 |                                  MPICommunicator& comm) {
 44 |   auto counts_ = countify_size_t_vector(counts);
 45 |   AL_MPI_LARGE_COUNT_CALL(MPI_Reduce_scatter)(
 46 |     buf_or_inplace(sendbuf), recvbuf,
 47 |     counts_.data(), TypeMap<T>(),
 48 |     ReductionOperator2MPI_Op<T>(op), comm.get_comm());
 49 | }
 50 | 
 51 | template <typename T>
 52 | class ReduceScattervAlState : public MPIState {
 53 | public:
 54 |   ReduceScattervAlState(const T* sendbuf_, T* recvbuf_,
 55 |                         std::vector<size_t> counts_,
 56 |                         ReductionOperator op_, MPICommunicator& comm_,
 57 |                         AlMPIReq req_) :
 58 |     MPIState(req_),
 59 |     sendbuf(sendbuf_), recvbuf(recvbuf_),
 60 |     counts(countify_size_t_vector(counts_)),
 61 |     op(ReductionOperator2MPI_Op<T>(op_)),
 62 |     comm(comm_.get_comm()) {}
 63 | 
 64 |   ~ReduceScattervAlState() override {}
 65 | 
 66 |   std::string get_name() const override { return "MPIReduceScatterv"; }
 67 | 
 68 | protected:
 69 |   void start_mpi_op() override {
 70 |     AL_MPI_LARGE_COUNT_CALL(MPI_Ireduce_scatter)(
 71 |       buf_or_inplace(sendbuf), recvbuf,
 72 |       counts.data(), TypeMap<T>(), op, comm,
 73 |       get_mpi_req());
 74 |   }
 75 | 
 76 | private:
 77 |   const T* sendbuf;
 78 |   T* recvbuf;
 79 |   Al_mpi_count_vector_t counts;
 80 |   MPI_Op op;
 81 |   MPI_Comm comm;
 82 | };
 83 | 
 84 | template <typename T>
 85 | void passthrough_nb_reduce_scatterv(const T* sendbuf, T* recvbuf,
 86 |                                     std::vector<size_t> counts,
 87 |                                     ReductionOperator op,
 88 |                                     MPICommunicator& comm,
 89 |                                     AlMPIReq& req) {
 90 |   req = get_free_request();
 91 |   internal::mpi::ReduceScattervAlState<T>* state =
 92 |     new internal::mpi::ReduceScattervAlState<T>(
 93 |       sendbuf, recvbuf, counts, op, comm, req);
 94 |   get_progress_engine()->enqueue(state);
 95 | }
 96 | 
 97 | }  // namespace mpi
 98 | }  // namespace internal
 99 | }  // namespace Al
100 | 


--------------------------------------------------------------------------------
/include/aluminum/trace.hpp:
--------------------------------------------------------------------------------
  1 | ////////////////////////////////////////////////////////////////////////////////
  2 | // Copyright (c) 2018, Lawrence Livermore National Security, LLC.  Produced at the
  3 | // Lawrence Livermore National Laboratory in collaboration with University of
  4 | // Illinois Urbana-Champaign.
  5 | //
  6 | // Written by the LBANN Research Team (N. Dryden, N. Maruyama, et al.) listed in
  7 | // the CONTRIBUTORS file. <lbann-dev@llnl.gov>
  8 | //
  9 | // LLNL-CODE-756777.
 10 | // All rights reserved.
 11 | //
 12 | // This file is part of Aluminum GPU-aware Communication Library. For details, see
 13 | // http://software.llnl.gov/Aluminum or https://github.com/LLNL/Aluminum.
 14 | //
 15 | // Licensed under the Apache License, Version 2.0 (the "Licensee"); you
 16 | // may not use this file except in compliance with the License.  You may
 17 | // obtain a copy of the License at:
 18 | //
 19 | // http://www.apache.org/licenses/LICENSE-2.0
 20 | //
 21 | // Unless required by applicable law or agreed to in writing, software
 22 | // distributed under the License is distributed on an "AS IS" BASIS,
 23 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 24 | // implied. See the License for the specific language governing
 25 | // permissions and limitations under the license.
 26 | ////////////////////////////////////////////////////////////////////////////////
 27 | 
 28 | #pragma once
 29 | 
 30 | #include <cstddef>
 31 | #include <string>
 32 | #include <sstream>
 33 | #include <typeinfo>
 34 | #include <utility>
 35 | #include <vector>
 36 | 
 37 | #include <Al_config.hpp>
 38 | #include "aluminum/utils/utils.hpp"
 39 | 
 40 | namespace Al {
 41 | namespace internal {
 42 | // Forward declaration.
 43 | class AlState;
 44 | namespace trace {
 45 | 
 46 | #ifdef AL_TRACE
 47 | // Need to be able to print vectors.
 48 | template <typename T>
 49 | std::ostream& operator<<(std::ostream& os, const std::vector<T>& v) {
 50 |   os << "[";
 51 |   if (!v.empty()) {
 52 |     for (size_t i = 0; i < v.size() - 1; ++i) {
 53 |       os << v[i] << ", ";
 54 |     }
 55 |     os << v[v.size() - 1];
 56 |   }
 57 |   os << "]";
 58 |   return os;
 59 | }
 60 | #endif
 61 | 
 62 | /**
 63 |  * Save entry to the trace log.
 64 |  * progress is whether this comes from the progress engine, which is recorded
 65 |  * separately.
 66 |  */
 67 | void save_trace_entry(std::string entry, bool progress = false);
 68 | 
 69 | /** Record an operation to the trace log. */
 70 | template <typename Backend, typename T, typename... Args>
 71 | #ifdef AL_TRACE
 72 | void record_op(std::string const& op,
 73 |                typename Backend::comm_type const& comm,
 74 |                Args&&... args) {
 75 |   std::ostringstream ss;
 76 |   ss << static_cast<size_t>(get_time()) << ": "
 77 |      << Backend::Name() << " "
 78 |      << comm.get_stream() << " "
 79 |      << typeid(T).name() << " "
 80 |      << op << " "
 81 |      << comm.rank() << " " << comm.size() << " ";
 82 | 
 83 |   // See:
 84 |   // https://stackoverflow.com/questions/27375089/what-is-the-easiest-way-to-print-a-variadic-parameter-pack-using-stdostream
 85 |   using expander = int[];
 86 |   (void) expander{0, (void(ss << " " << std::forward<Args>(args)), 0)...};
 87 |   save_trace_entry(ss.str(), false);
 88 | }
 89 | #else  // AL_TRACE
 90 | void record_op(std::string const&,
 91 |                typename Backend::comm_type const&,
 92 |                Args&&...) {
 93 | }
 94 | #endif  // AL_TRACE
 95 | 
 96 | /** Record a progress engine operation start to the trace log. */
 97 | void record_pe_start(const AlState& state);
 98 | /** Record a progress engine operation completion to the trace log. */
 99 | void record_pe_done(const AlState& state);
100 | 
101 | /** Write trace logs to os. */
102 | std::ostream& write_trace_log(std::ostream& os);
103 | /** Write trace logs to hostname.pid.trace.txt. */
104 | void write_trace_to_file();
105 | 
106 | }  // namespace trace
107 | }  // namespace internal
108 | }  // namespace Al
109 | 


--------------------------------------------------------------------------------