├── .gitmodules
├── CMakeLists.txt
├── CPPLINT.cfg
├── LICENSE
├── README.md
├── examples
    └── allreduce.cpp
├── ibcomm
    ├── allreduce_cpu_impl.h
    ├── allreduce_cuda_impl.h
    ├── allreduce_tester.cpp
    ├── ibverbs_communicator.cpp
    ├── ibverbs_communicator.h
    ├── ibverbs_communicator_cuda.cpp
    ├── memory_pool.cpp
    ├── memory_pool.h
    └── util.h
├── mpinvcc.sh
└── tests
    ├── allreduce_test.py
    ├── sendrecv_test.cpp
    └── unittest.cpp


/.gitmodules:
--------------------------------------------------------------------------------
1 | [submodule "grumpi"]
2 | 	path = grumpi
3 | 	url = https://github.com/keisukefukuda/grumpi.git
4 | [submodule "tinyexpr"]
5 | 	path = tinyexpr
6 | 	url = https://github.com/codeplea/tinyexpr.git
7 | 


--------------------------------------------------------------------------------
/CMakeLists.txt:
--------------------------------------------------------------------------------
  1 | project("ibcomm")
  2 | 
  3 | # CMake version
  4 | cmake_minimum_required(VERSION 2.8 FATAL_ERROR)
  5 | 
  6 | set(CMAKE_CXX_FLAGS "-Wno-missing-field-initializers -Wno-format-security -Wno-sign-compare")
  7 | set(CMAKE_CXX_FLAGS_RELEASE "-O2")
  8 | set(CMAKE_CXX_FLAGS_DEBUG "-g")
  9 | 
 10 | if (NOT CMAKE_BUILD_TYPE)
 11 |   message(STATUS "Setting CMAKE_BUILD_TYPE=Release (default)")
 12 |   set(CMAKE_BUILD_TYPE Release)
 13 | endif()
 14 | 
 15 | find_package(MPI)
 16 | find_package(CUDA)
 17 | 
 18 | # ibverbs
 19 | find_library(IBVERBS_LIBRARY
 20 |   NAMES ibverbs libibverbs libibverbs.so
 21 |   HINTS ENV LD_LIBRARY_PATH)
 22 | 
 23 | find_path(IBVERBS_INCLUDE_PATH
 24 |   NAMES "infiniband/verbs.h"
 25 |   HINTS ENV CPATH)
 26 | 
 27 | # google test
 28 | find_library(GOOGLETEST_MAIN_LIBRARY
 29 |   NAMES gtest_main libgtest_main libgtest_main.a
 30 |   HINTS
 31 |   ${GOOGLETEST_ROOT}
 32 |   ${GOOGLETEST_ROOT}/build
 33 |   ${GOOGLETEST_ROOT}/build/googlemock/gtest
 34 |   )
 35 | 
 36 | find_library(GOOGLETEST_LIBRARY
 37 |   NAMES gtest libgtest libgtest.a
 38 |   HINTS
 39 |   ${GOOGLETEST_ROOT}
 40 |   ${GOOGLETEST_ROOT}/build
 41 |   ${GOOGLETEST_ROOT}/build/googlemock/gtest
 42 |   )
 43 | 
 44 | find_path(GOOGLETEST_INCLUDE_PATH
 45 |   NAMES "gtest/gtest.h"
 46 |   HINTS
 47 |   ${GOOGLETEST_ROOT}
 48 |   ${GOOGLETEST_ROOT}/include
 49 |   ${GOOGLETEST_ROOT}/googletest/include)
 50 | 
 51 | message(STATUS "GOOGLETEST_ROOT=${GOOGLETEST_ROOT}")
 52 | message(STATUS "GOOGLETEST_LIBRARY=${GOOGLETEST_LIBRARY}")
 53 | message(STATUS "GOOGLETEST_INCLUDE_PATH=${GOOGLETEST_INCLUDE_PATH}")
 54 | 
 55 | # Run mpicxx -show to get compile flags for MPI
 56 | set(CMAKE_CXX_COMPILER "${CMAKE_SOURCE_DIR}/mpinvcc.sh")
 57 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -Wall -Wextra -Wno-unused-variable -Wno-unused-parameter")
 58 | 
 59 | include_directories(".")
 60 | 
 61 | # for libraries
 62 | if(CUDA_FOUND)
 63 |     add_library(ibcomm_cuda SHARED
 64 |         ibcomm/ibverbs_communicator.cpp
 65 |         ibcomm/ibverbs_communicator_cuda.cpp
 66 |         ibcomm/memory_pool.cpp
 67 |         )
 68 |     if(USE_TRACE)
 69 |         target_compile_definitions(ibcomm_cuda
 70 |             PUBLIC "-DUSE_CUDA"
 71 |             PUBLIC "-DUSE_TRACE"
 72 |             )
 73 |     else()
 74 |         target_compile_definitions(ibcomm_cuda
 75 |             PUBLIC "-DUSE_CUDA"
 76 |             )
 77 |     endif()
 78 | 
 79 |     target_include_directories(ibcomm_cuda
 80 |         PUBLIC ${IBVERBS_INCLUDE_PATH}
 81 |         PUBLIC ${CUDA_INCLUDE_DIRS}
 82 |         )
 83 |     target_link_libraries(ibcomm_cuda
 84 |         ${IBVERBS_LIBRARY}
 85 |         ${CUDA_LIBRARIES}
 86 |         )
 87 | endif()
 88 | 
 89 | add_library(ibcomm SHARED ibcomm/ibverbs_communicator.cpp)
 90 | if(USE_TRACE)
 91 |     target_compile_definitions(ibcomm
 92 |         PUBLIC "-DUSE_TRACE"
 93 |         )
 94 | endif()
 95 | target_include_directories(ibcomm PUBLIC ${IBVERBS_INCLUDE_PATH})
 96 | target_link_libraries(ibcomm ${IBVERBS_LIBRARY})
 97 | 
 98 | # for tests
 99 | add_executable(sendrecv tests/sendrecv_test.cpp)
100 | target_link_libraries(sendrecv ibcomm)
101 | 
102 | if(GOOGLETEST_INCLUDE_PATH)
103 |   add_executable(unittest tests/unittest.cpp)
104 |   target_link_libraries(unittest ${GOOGLETEST_LIBRARY} ${GOOGLETEST_MAIN_LIBRARY})
105 |   target_include_directories(unittest PUBLIC ${GOOGLETEST_INCLUDE_PATH})
106 |   target_include_directories(unittest PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
107 | endif()
108 | 
109 | add_library(tinyexpr tinyexpr/tinyexpr.c)
110 | 
111 | if(CUDA_FOUND)
112 |     add_executable(allreduce_tester ibcomm/allreduce_tester.cpp)
113 |     target_include_directories(allreduce_tester PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/tinyexpr)
114 |     target_include_directories(allreduce_tester PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/grumpi)
115 |     target_link_libraries(allreduce_tester ibcomm_cuda)
116 |     target_link_libraries(allreduce_tester tinyexpr)
117 | endif()
118 | 
119 | # for allreduce examples
120 | add_executable(allreduce examples/allreduce.cpp)
121 | target_link_libraries(allreduce ibcomm)
122 | set_target_properties(allreduce PROPERTIES
123 |     RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/examples"
124 | )
125 | 
126 | if(CUDA_FOUND)
127 |     add_executable(allreduce_cuda examples/allreduce.cpp)
128 |     target_compile_definitions(allreduce_cuda
129 |         PUBLIC "-DUSE_CUDA"
130 |     )
131 |     target_link_libraries(allreduce_cuda ibcomm_cuda)
132 |     set_target_properties(allreduce_cuda PROPERTIES
133 |         RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/examples"
134 |     )
135 | endif()
136 | 


--------------------------------------------------------------------------------
/CPPLINT.cfg:
--------------------------------------------------------------------------------
1 | exclude_files=tinyexpr/*
2 | exclude_files=build/*
3 | exclude_files=grumpi/*
4 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | Copyright (c) 2017-2018 Preferred Networks, Inc.
 2 | 
 3 | Permission is hereby granted, free of charge, to any person obtaining a copy
 4 | of this software and associated documentation files (the "Software"), to deal
 5 | in the Software without restriction, including without limitation the rights
 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 7 | copies of the Software, and to permit persons to whom the Software is
 8 | furnished to do so, subject to the following conditions:
 9 | 
10 | The above copyright notice and this permission notice shall be included in
11 | all copies or substantial portions of the Software.
12 | 
13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19 | THE SOFTWARE.


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # PFNProto: AllReduce prototype implementation for NVIDIA GPUs and InfiniBand
  2 | PFNProto is a prototype library of the AllReduce collective operation.
  3 | The library is highly optimized for widely-used deep learning clusters equipped with NVIDIA GPUs and InfiniBand interconnect. It can achieve competitive performance to the fastest libraries in the world including NVIDIA NCCL.
  4 | 
  5 | PFNProto implements the following algorithms.
  6 | - Ring-AllReduce for CPU / CUDA
  7 | - Rabenseifner's Algorithm for CPU / CUDA
  8 | 
  9 | For more details, please refer to our blog ([English](https://preferredresearch.jp/2018/07/10/technologies-behind-distributed-deep-learning-allreduce/), [Japanese](https://research.preferred.jp/2018/07/prototype-allreduce-library/))
 10 | 
 11 | # How to build
 12 | ## Dependencies
 13 | - Infiniband Verbs
 14 | - CMake 2.8+
 15 | - MPI (to build examples and tests)
 16 | 
 17 | ## Build
 18 | ```sh
 19 | mkdir build
 20 | cd build
 21 | cmake ..
 22 | make
 23 | ```
 24 | 
 25 | # How to try the example
 26 | - Requirements: multi-node computing cluster equipped with NVIDIA GPUs and InfiniBand.
 27 | - Prepare hostfile
 28 | - Execute `examples/allreduce_cuda`
 29 |   - When PPN is not 1, this implementation isn't optimized. Therefore, selecting PPN=1 is needed.
 30 |   - (for Open MPI users) : `mpiexec -N 1 --hostfile "path_to_hostfile" examples/allreduce_cuda`
 31 |   - (for MPICH / MVAPICH2 users) : `mpiexec --ppn 1 --hostfile "path_to_hostfile" examples/allreduce_cuda`
 32 | - You'll see below result
 33 | ```
 34 | $ cd build
 35 | $ (prepare your hostfile)
 36 | $ cat hosts
 37 | node01
 38 | node02
 39 | ...
 40 | node08
 41 | $ mpiexec -N 1 --hostfile hosts examples/allreduce_cuda
 42 | rank: 3 OK
 43 | rank: 2 OK
 44 | rank: 4 OK
 45 | rank: 5 OK
 46 | rank: 6 OK
 47 | rank: 7 OK
 48 | rank: 1 OK
 49 | rank: 0 OK
 50 | elapsed time : 9.750750e-02 [s]
 51 | ```
 52 | 
 53 | # Contribution
 54 | Any contributions to this prototype are welcome.
 55 | Please feel free to report an issue or send a pull request!
 56 | 
 57 | # Limitations
 58 | This library is a prototype for algorithm and performance demonstration purpose.
 59 | It is not intended to be used in a production environment.
 60 | 
 61 | In particular, there are several limitations including:
 62 |  - Python binding is not provided.
 63 |  - Not being designed to be used together with ChainerMN.
 64 |  - Supported reduction operation is only plus(+).
 65 |  - Non-power-of-two extension of Rabenseifner's algorithm is not implemented.
 66 |  - It currently focuses on inter-node communication. Intra-node communication is not efficient because shared memory or GPU-to-GPU DMA data transfer is not implemented.
 67 | 
 68 | # Tuning Knobs
 69 | You can control runtime behaviours of PFNProto through the following environment variables.
 70 | Memory or buffer size are all in [byte] and SI prefix is not supported.
 71 | 
 72 | ## IBCOMM_CHUNKSIZE
 73 | - Chunk size in bytes for the AllReduce algorithm (both Ring-AllReduce and Rabenseifner's). PFNProto uses this size as a unit to execute every pipeline operation such as send, recv and reduction.
 74 | - Default value: (len(send/recvbuf) in bytes) / (4 * N_OF_PROCESSES)
 75 | - Supported range: (IBCOMM_CHUNKSIZE) <= (len(send/recvbuf) in bytes),  / (2 * N_OF_PROCESSES).
 76 | 
 77 | ## IBCOMM_MEMORY_POOL_PRE_ALLOC
 78 | - PFNProto allocates this size of MemoryPool to hide the latency of memory allocation.
 79 | - Default value: 67108864 (64 MiB).
 80 | 
 81 | ## IBCOMM_WORK_GPU_MEMORY_SIZE
 82 | - Initial Working GPU Memory size. PFNProto needs Working GPU Memory in rabenseifner's algorithm to save a reduction result. If this size is smaller than `IBCOMM_CHUNKSIZE`, runtime memory reallocation occurs.
 83 | - Default value: 33554432 (32 MiB).
 84 | 
 85 | ## IBCOMM_NUM_CUDA_STREAM
 86 | - Total number of CUDA streams used.
 87 | - Default value: 64
 88 | 
 89 | ## Number of pre-allocated chunks
 90 | - Number of pre-allocated chunks is computed from `IBCOMM_CHUNKSIZE` and `IBCOMM_MEMORY_POOL_PRE_ALLOC` by following equation:
 91 | - (IBCOMM_MEMORY_POOL_PRE_ALLOC) / (IBCOMM_CHUNKSIZE)
 92 | 
 93 | - Example: Let IBCOMM_CHUNKSIZE be 4 [MiB], and IBCOMM_MEMORY_POOL_PRE_ALLOC be 64 [MiB]. (IBCOMM_MEMORY_POOL_PRE_ALLOC) / (IBCOMM_CHUNKSIZE) = 64 / 4 = 16. Therefore, 16 chunks will be allocated and size of chunk is 4 [MiB].
 94 | 
 95 | # APIs
 96 | ```c++
 97 | class IBVerbsCommunicator {
 98 |  public:
 99 |   explicit IBVerbsCommunicator(int world_size);
100 | 
101 |   // Manages infiniband-related resources thus we need to delete copy and move ctors.
102 |   IBVerbsCommunicator(const IBVerbsCommunicator&) noexcept = delete;
103 |   IBVerbsCommunicator& operator=(const IBVerbsCommunicator&) noexcept = delete;
104 | 
105 |   // move
106 |   IBVerbsCommunicator(IBVerbsCommunicator&&) noexcept = delete;
107 |   IBVerbsCommunicator& operator=(IBVerbsCommunicator&&) noexcept = delete;
108 | 
109 |   // connection management
110 |   struct ProcessInfo RegisterProcess(int dest_rank, struct ProcessInfo pinfo);
111 |   struct ProcessInfo CreateQueuePair(int dest_rank);
112 |   void RegisterQueuePair(int dest_rank, struct ProcessInfo pinfo);
113 |   void RegisterMyself(int my_rank);
114 | 
115 |   void Send(int dest_rank, const void* buf, size_t len, bool blocking = true);
116 |   void Recv(int src_rank, void* buf, size_t len, bool blocking = true);
117 | 
118 |   // wait ( for non-blocking io )
119 |   bool SendPoll(int dest_rank);
120 |   bool RecvPoll(int src_rank);
121 |   void SendWait(int dest_rank);
122 |   void RecvWait(int src_rank);
123 | 
124 |   // allreduce
125 |   template <typename T>
126 |   void AllreduceRing(const T* sendbuf, T* recvbuf, size_t len_elements);
127 | 
128 |   template <typename T>
129 |   void AllreduceRabenseifner(const T* sendbuf, T* recvbuf, size_t len_elements);
130 | 
131 |   template <typename T>
132 |   void AllreduceRingCuda(const T* sendbuf, T* recvbuf, size_t len_elements);
133 | 
134 |   template <typename T>
135 |   void AllreduceRabenseifnerCuda(const T* sendbuf, T* recvbuf, size_t len_elements);
136 | 
137 |   int my_rank_;
138 |   size_t world_size_;
139 | };
140 | ```
141 | 
142 | ## Error Codes
143 | Error codes are defined at `ibcomm/util.h`.
144 | When error occurs, ibcomm returns these values as an exit code.
145 | 
146 | ```cpp
147 | enum class IBCOMM_ERROR_CODE : int {
148 |     INVALID_ARGUMENT = 1,
149 | 
150 |     // Error occured in InfiniBand Verbs call.
151 |     IBVERBS_ERROR = 2,
152 | 
153 |     // Error occured in CUDA call.
154 |     CUDA_ERROR = 3,
155 | 
156 |     NOT_SUPPORTED = 4
157 | };
158 | ```
159 | 
160 | # How to run unit tests
161 | ## Setup allreduce integration tests
162 | Integration tests of allreduce routines are implemented using `pytest` module.
163 | 
164 | ```
165 | $ pip install pytest
166 | ```
167 | 
168 | Unit tests depend on a few external libraries.
169 | 
170 | ```
171 | $ cd `Your cloned directory`
172 | $ git submodule init
173 | $ git submodule update
174 | $ mkdir -p build
175 | $ cd build
176 | $ cmake ..
177 | $ make -j
178 | 
179 | # Make sure `allreduce_tester` is generated
180 | ```
181 | 
182 | ## Run allreduce integration tests
183 | ```
184 | $ cd `Your cloned directory`
185 | $ pytest
186 | $ export HOSTFILE=hostfile          # Optional
187 | $ pytest --capture=no               # For more info
188 | $ pytest --capture=no -m "not slow" # Skip aging test
189 | ```
190 | 
191 | ## Setup and run C++ unit tests
192 | C++ unit tests depend on Google test (https://github.com/google/googletest).
193 | 
194 | First, download and build Google test.
195 | 
196 | ```
197 | $ WORKING_DIR=/tmp  # Your favorite directory
198 | $ cd ${WORKING_DIR}
199 | $ git clone https://github.com/google/googletest.git
200 | $ cd googletest
201 | $ mkdir build
202 | $ cmake ..
203 | 
204 | $ cd `Your cloned directory`
205 | $ cd build
206 | $ cmake -D GOOGLETEST_ROOT=${WORKING_DIR}/googletest ..
207 | $ make
208 | $ ./unittest
209 | 
210 | Running main() from 
211 | [==========] Running 4 tests from 1 test case.
212 | [----------] Global test environment set-up.
213 | [----------] 4 tests from IBCommUtilTest
214 | [ RUN      ] IBCommUtilTest.ParseNumberZero
215 | [       OK ] IBCommUtilTest.ParseNumberZero (0 ms)
216 | [ RUN      ] IBCommUtilTest.ParseNumberPositive
217 | [       OK ] IBCommUtilTest.ParseNumberPositive (0 ms)
218 | [ RUN      ] IBCommUtilTest.ParseNumberMalformed
219 | [       OK ] IBCommUtilTest.ParseNumberMalformed (0 ms)
220 | [ RUN      ] IBCommUtilTest.get_exp_of_two
221 | [       OK ] IBCommUtilTest.get_exp_of_two (0 ms)
222 | [----------] 4 tests from IBCommUtilTest (0 ms total)
223 | 
224 | [----------] Global test environment tear-down
225 | [==========] 4 tests from 1 test case ran. (0 ms total)
226 | [  PASSED  ] 4 tests.
227 | ```
228 | 
229 | # Coding guideline
230 | We adopt Google C++ Style Guide ( https://google.github.io/styleguide/cppguide.html ).
231 | 
232 | ```
233 | $ pip install cpplint
234 | $ cpplint --recursive .
235 | ```
236 | 
237 | # Acknowledgements
238 | We would like to thank Mr. Minoru Nakamura for his comprehensive document 
239 | on Infiniband Verbs API. (http://www.nminoru.jp/~nminoru/network/infiniband/) (In Japanese)
240 | 
241 | # LICENSE
242 | MIT License
243 | 


--------------------------------------------------------------------------------
/examples/allreduce.cpp:
--------------------------------------------------------------------------------
  1 | // Copyright (C) 2017-2018 by Preferred Networks, Inc. All right reserved.
  2 | 
  3 | #include <mpi.h>
  4 | 
  5 | #include <climits>
  6 | #include <iostream>
  7 | #include <random>
  8 | #include <vector>
  9 | 
 10 | #ifdef USE_CUDA
 11 | #include <cuda.h>
 12 | #include <cuda_runtime.h>
 13 | #include <thrust/device_vector.h>
 14 | #include <thrust/host_vector.h>
 15 | #endif
 16 | 
 17 | #include "ibcomm/ibverbs_communicator.h"
 18 | #include "ibcomm/util.h"  // CUDACHECK
 19 | 
 20 | // 256 [MiB] of allreduce float32 vector
 21 | #define ARRAY_LENGTH 67108864
 22 | 
 23 | // processes per node
 24 | // PPN must be 1 in this prototype implementation
 25 | // When PPN is not 1, this implementation isn't optimized.
 26 | #define PPN 1
 27 | 
 28 | // warmup times
 29 | #define WARMUP 3
 30 | 
 31 | double inline GetTime() {
 32 |   struct timespec ts;
 33 |   clock_gettime(CLOCK_MONOTONIC_RAW, &ts);
 34 | 
 35 |   return ts.tv_sec + 1e-9 * ts.tv_nsec;
 36 | }
 37 | 
 38 | int main(int argc, char* argv[]) {
 39 |   MPI_Init(&argc, &argv);
 40 | 
 41 |   int mpi_rank, mpi_size;
 42 |   MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank);
 43 |   MPI_Comm_size(MPI_COMM_WORLD, &mpi_size);
 44 | 
 45 | #ifdef USE_CUDA
 46 |   int intra_rank = mpi_rank % PPN;
 47 |   CUDACHECK(cudaSetDevice(intra_rank));
 48 | #endif
 49 | 
 50 |   IBVerbsCommunicator comm(mpi_size);
 51 | 
 52 |   std::vector<uint32_t> qps(mpi_size * 3);
 53 | 
 54 |   for (int i = 0; i < mpi_size; i++) {
 55 |     if (i == mpi_rank) {
 56 |       continue;
 57 |     }
 58 |     ProcessInfo pinfo = comm.CreateQueuePair(i);
 59 |     qps[i * 3 + 0] = pinfo.lid;
 60 |     qps[i * 3 + 1] = pinfo.qp_n;
 61 |     qps[i * 3 + 2] = pinfo.psn;
 62 |   }
 63 | 
 64 |   MPI_Alltoall(MPI_IN_PLACE, 3, MPI_UINT32_T, qps.data(), 3, MPI_UINT32_T,
 65 |                MPI_COMM_WORLD);
 66 | 
 67 |   for (int i = 0; i < mpi_size; i++) {
 68 |     if (i == mpi_rank) {
 69 |       comm.RegisterMyself(i);
 70 |     } else {
 71 |       ProcessInfo pinfo;
 72 |       pinfo.lid = qps[i * 3 + 0];
 73 |       pinfo.qp_n = qps[i * 3 + 1];
 74 |       pinfo.psn = qps[i * 3 + 2];
 75 |       comm.RegisterQueuePair(i, pinfo);
 76 |     }
 77 |   }
 78 | 
 79 | #ifdef USE_CUDA
 80 |   thrust::host_vector<float> sendbuf(ARRAY_LENGTH);
 81 |   thrust::host_vector<float> recvbuf(ARRAY_LENGTH);
 82 | #else
 83 |   std::vector<float> sendbuf(ARRAY_LENGTH);
 84 |   std::vector<float> recvbuf(ARRAY_LENGTH);
 85 | #endif
 86 | 
 87 |   // fixed seed
 88 |   std::mt19937 mt(0);
 89 |   // To avoid overflow, use short range of number.
 90 |   std::uniform_int_distribution<int> rand(-1000, 1000);
 91 | 
 92 |   for (int i = 0; i < ARRAY_LENGTH; i++) {
 93 |     sendbuf[i] = rand(mt);
 94 |   }
 95 | 
 96 |   std::vector<float> answer(ARRAY_LENGTH, 0);
 97 |   MPI_Allreduce(sendbuf.data(), answer.data(), ARRAY_LENGTH, MPI_FLOAT, MPI_SUM,
 98 |                 MPI_COMM_WORLD);
 99 | 
100 |   double start, end;
101 | 
102 | #ifdef USE_CUDA
103 |   thrust::device_vector<float> gpu_sendbuf(ARRAY_LENGTH);
104 |   thrust::device_vector<float> gpu_recvbuf(ARRAY_LENGTH);
105 |   gpu_sendbuf = sendbuf;
106 | 
107 |   for (int i = 0; i < WARMUP + 1; i++) {
108 |     start = GetTime();
109 |     comm.AllreduceRingCuda(thrust::raw_pointer_cast(gpu_sendbuf.data()),
110 |                            thrust::raw_pointer_cast(gpu_recvbuf.data()),
111 |                            ARRAY_LENGTH);
112 |     /*
113 |      comm.AllreduceRabenseifnerCuda(thrust::raw_pointer_cast(gpu_sendbuf.data()),
114 |                                    thrust::raw_pointer_cast(gpu_recvbuf.data()),
115 |                                    ARRAY_LENGTH);
116 |                                    */
117 |     end = GetTime();
118 |   }
119 | 
120 |   recvbuf = gpu_recvbuf;
121 | #else
122 |   for (int i = 0; i < WARMUP + 1; i++) {
123 |     start = GetTime();
124 |     comm.AllreduceRing(sendbuf.data(), recvbuf.data(), ARRAY_LENGTH);
125 |     // comm.AllreduceRabenseifner(sendbuf.data(), recvbuf.data(), ARRAY_LENGTH);
126 |     end = GetTime();
127 |   }
128 | #endif
129 | 
130 |   bool ok = true;
131 |   for (int i = 0; i < ARRAY_LENGTH; i++) {
132 |     if (fabs(answer[i] - recvbuf[i]) > 1e-6) {
133 |       ok = false;
134 |       std::cout << "wrong at " << mpi_rank << " rank " << i << " element "
135 |                 << answer[i] << ":" << recvbuf[i] << std::endl;
136 |     } else {
137 |       //            std::cerr << "ok at " << mpi_rank << " rank " << i << "
138 |       //            element " << answer[i] << ":" << recvbuf[i] << std::endl;
139 |     }
140 |   }
141 | 
142 |   std::cout << "rank: " << mpi_rank << (ok ? " OK" : " FAIL") << std::endl;
143 | 
144 |   MPI_Finalize();
145 | 
146 |   if (mpi_rank == 0) printf("elapsed time : %e [s]\n", end - start);
147 | 
148 |   return 0;
149 | }
150 | 


--------------------------------------------------------------------------------
/ibcomm/allreduce_cpu_impl.h:
--------------------------------------------------------------------------------
  1 | // Copyright (C) 2017-2018 by Preferred Networks, Inc. All right reserved.
  2 | 
  3 | #pragma once
  4 | 
  5 | #include <cassert>
  6 | 
  7 | #include <algorithm>
  8 | #include <iostream>
  9 | #include <queue>
 10 | #include <utility>
 11 | #include <vector>
 12 | 
 13 | #include "ibcomm/ibverbs_communicator.h"
 14 | #include "ibcomm/util.h"
 15 | 
 16 | template <typename T>
 17 | void _reduce_inplace(T* result, const T* value, size_t len_elements) {
 18 |   for (size_t i = 0; i < len_elements; i++) {
 19 |     result[i] += value[i];
 20 |   }
 21 | }
 22 | 
 23 | template <typename T>
 24 | void IBVerbsCommunicator::AllreduceRing(const T* sendbuf, T* recvbuf,
 25 |                                         size_t len_elements) {
 26 |   if (world_size_ == 1) {
 27 |     memcpy(recvbuf, sendbuf, sizeof(T) * len_elements);
 28 |     return;
 29 |   }
 30 | 
 31 |   auto ranges = SplitBuffer(len_elements, sizeof(T));
 32 |   auto rank_to_chunk = GetRankToChunk(ranges);
 33 | 
 34 |   auto chunks = ranges.size();
 35 | 
 36 |   int from_rank = (my_rank_ - 1 + world_size_) % world_size_;
 37 |   int to_rank = (my_rank_ + 1) % world_size_;
 38 | 
 39 |   std::queue<int> reg_q;
 40 |   for (int i = 0; i < world_size_ - 1; i++) {
 41 |     // [my_rank_ - 1, my_rank_)
 42 |     int rank = (my_rank_ - i - 1 + world_size_) % world_size_;
 43 | 
 44 |     for (auto it = rank_to_chunk[rank].rbegin();
 45 |          it != rank_to_chunk[rank].rend(); ++it) {
 46 |       reg_q.push(*it);
 47 |     }
 48 |   }
 49 | 
 50 |   // send_mrs' mr needs SendPoll before deregistration
 51 |   // (used in ReduceScatter, AllGather).
 52 |   // However, recv_mrs' mr can deregistration immediately (used in AllGather).
 53 |   std::queue<struct ibv_mr*> send_mrs;
 54 |   std::queue<struct ibv_mr*> recv_mrs;
 55 | 
 56 |   // cached mrs
 57 |   std::vector<struct ibv_mr*> mrs(chunks, NULL);
 58 | 
 59 |   // HCA's Q length
 60 |   int send_q_elements = 0;
 61 |   int recv_q_elements = 0;
 62 | 
 63 |   std::queue<int> first_send_q;
 64 |   std::queue<int> first_send_q_buffering;
 65 | 
 66 |   for (auto it = rank_to_chunk[my_rank_].rbegin();
 67 |        it != rank_to_chunk[my_rank_].rend(); ++it) {
 68 |     first_send_q.push(*it);
 69 |   }
 70 | 
 71 |   int current_recv_i = reg_q.front();
 72 |   bool reduce_scatter_done = false;
 73 | 
 74 |   // ReduceScatter
 75 |   while (!reduce_scatter_done) {
 76 |     while ((reg_q.empty() || recv_q_elements > 0) && RecvPoll(from_rank)) {
 77 |       recv_q_elements--;
 78 | 
 79 |       auto range = ranges[current_recv_i];
 80 |       size_t offset_elements = range.first;
 81 |       size_t elements = (range.second - range.first);
 82 |       size_t bytes = elements * sizeof(T);
 83 | 
 84 |       _reduce_inplace(recvbuf + offset_elements, sendbuf + offset_elements,
 85 |                       elements);
 86 | 
 87 |       if (current_recv_i == rank_to_chunk[to_rank].front()) {
 88 |         reduce_scatter_done = true;
 89 |       }
 90 | 
 91 |       if (!(rank_to_chunk[to_rank].front() <= current_recv_i &&
 92 |             current_recv_i <= rank_to_chunk[to_rank].back())) {
 93 |         if (first_send_q.empty() && first_send_q_buffering.empty()) {
 94 |           SendRegistered(to_rank, recvbuf + offset_elements,
 95 |                          mrs[current_recv_i], bytes, false);
 96 |           send_q_elements++;
 97 |         } else {
 98 |           first_send_q_buffering.push(current_recv_i);
 99 |         }
100 |       }
101 | 
102 |       current_recv_i = (current_recv_i - 1 + chunks) % chunks;
103 |     }
104 | 
105 |     if (!reg_q.empty()) {
106 |       int recv_key = reg_q.front();
107 |       reg_q.pop();
108 | 
109 |       auto range = ranges[recv_key];
110 |       size_t offset_elements = range.first;
111 |       size_t elements = (range.second - range.first);
112 |       size_t bytes = elements * sizeof(T);
113 | 
114 |       mrs[recv_key] = RegisterRecvBuf(recvbuf + offset_elements, bytes);
115 | 
116 |       RecvRegistered(from_rank, recvbuf + offset_elements, mrs[recv_key], bytes,
117 |                      false);
118 |       recv_q_elements++;
119 |     }
120 | 
121 |     if (!first_send_q.empty()) {
122 |       int i = first_send_q.front();
123 |       first_send_q.pop();
124 | 
125 |       auto range = ranges[i];
126 | 
127 |       size_t offset_elements = range.first;
128 |       size_t elements = (range.second - range.first);
129 |       size_t bytes = elements * sizeof(T);
130 | 
131 |       auto mr = RegisterSendBuf(sendbuf + offset_elements, bytes);
132 | 
133 |       SendRegistered(to_rank, sendbuf + offset_elements, mr, bytes, false);
134 |       send_q_elements++;
135 |       send_mrs.push(mr);
136 |     } else {
137 |       while (!first_send_q_buffering.empty()) {
138 |         int i = first_send_q_buffering.front();
139 |         first_send_q_buffering.pop();
140 | 
141 |         auto range = ranges[i];
142 |         size_t offset_elements = range.first;
143 |         size_t elements = (range.second - range.first);
144 |         size_t bytes = elements * sizeof(T);
145 | 
146 |         SendRegistered(to_rank, recvbuf + offset_elements, mrs[i], bytes,
147 |                        false);
148 |         send_q_elements++;
149 |       }
150 |     }
151 | 
152 |     while (SendPoll(to_rank)) {
153 |       send_q_elements--;
154 |       if (!send_mrs.empty()) {
155 |         PopMrAndDereg(&send_mrs);
156 |       }
157 |     }
158 | 
159 |     for (auto it = rank_to_chunk[my_rank_].begin();
160 |          it != rank_to_chunk[my_rank_].end(); ++it) {
161 |       auto range = ranges[*it];
162 |       size_t offset_elements = range.first;
163 |       size_t elements = (range.second - range.first);
164 |       size_t bytes = elements * sizeof(T);
165 | 
166 |       if (mrs[*it] == NULL) {
167 |         mrs[*it] = RegisterRecvBuf(recvbuf + offset_elements, bytes);
168 | 
169 |         // when 1 chunk is registered, exit this loop to recv early.
170 |         break;
171 |       }
172 |     }
173 |   }
174 | 
175 |   // need sync before AllGather
176 |   assert(recv_q_elements == 0);
177 |   while (send_q_elements != 0) {
178 |     SendWait(to_rank);
179 |     send_q_elements--;
180 | 
181 |     if (!send_mrs.empty()) {
182 |       PopMrAndDereg(&send_mrs);
183 |     }
184 |   }
185 | 
186 |   // AllGather
187 |   for (int i = 0; i < world_size_; i++) {
188 |     int rank = (1 + my_rank_ - i + world_size_) % world_size_;
189 | 
190 |     for (auto it = rank_to_chunk[rank].rbegin();
191 |          it != rank_to_chunk[rank].rend(); ++it) {
192 |       auto range = ranges[*it];
193 |       size_t offset_elements = range.first;
194 |       size_t elements = (range.second - range.first);
195 |       size_t bytes = elements * sizeof(T);
196 | 
197 |       if (rank != (my_rank_ + 1) % world_size_) {
198 |         RecvRegistered(from_rank, recvbuf + offset_elements, mrs[*it], bytes,
199 |                        false);
200 | 
201 |         while (!RecvPoll(from_rank)) {
202 |           if (SendPoll(to_rank)) {
203 |             send_q_elements--;
204 | 
205 |             assert(!send_mrs.empty());
206 | 
207 |             PopMrAndDereg(&send_mrs);
208 |           } else if (!recv_mrs.empty()) {
209 |             PopMrAndDereg(&recv_mrs);
210 |           }
211 |         }
212 |       }
213 | 
214 |       if (rank != (my_rank_ + 2) % world_size_) {
215 |         SendRegistered(to_rank, recvbuf + offset_elements, mrs[*it], bytes,
216 |                        false);
217 |         send_mrs.push(mrs[*it]);
218 |         send_q_elements++;
219 |       } else {
220 |         recv_mrs.push(mrs[*it]);
221 |       }
222 | 
223 |       mrs[*it] = NULL;
224 |     }
225 |   }
226 | 
227 |   while (send_q_elements != 0) {
228 |     SendWait(to_rank);
229 |     send_q_elements--;
230 | 
231 |     if (!send_mrs.empty()) {
232 |       PopMrAndDereg(&send_mrs);
233 |     }
234 |   }
235 |   assert(send_mrs.empty());
236 | 
237 |   assert(recv_q_elements == 0);
238 | 
239 |   while (!recv_mrs.empty()) {
240 |     PopMrAndDereg(&recv_mrs);
241 |   }
242 | 
243 |   return;
244 | }
245 | 
246 | template <typename T>
247 | void IBVerbsCommunicator::AllreduceRabenseifner(const T* sendbuf, T* recvbuf,
248 |                                                 size_t len_elements) {
249 |   if (world_size_ == 1) {
250 |     memcpy(recvbuf, sendbuf, sizeof(T) * len_elements);
251 |     return;
252 |   }
253 | 
254 |   int world_size_exp = util::GetExpOfTwo(world_size_);
255 | 
256 |   // check world_size is power-of-2 or not
257 |   if (world_size_exp == 0) {
258 |     util::IbcommError(__FILE__, __LINE__,
259 |                       util::IBCOMM_ERROR_CODE::NOT_SUPPORTED,
260 |                       "Currently, rabenseifner's algorithm doesn't support "
261 |                       "non-power-of-2 processes.");
262 |   }
263 | 
264 |   std::vector<std::pair<size_t, size_t>> ranges;
265 |   for (int i = 0; i < world_size_; i++) {
266 |     int range_length = util::ceilDiv(len_elements, world_size_);
267 | 
268 |     ranges.emplace_back(
269 |         range_length * i,
270 |         std::min(range_length * (i + 1), static_cast<int>(len_elements)));
271 |   }
272 | 
273 |   T* tmp_buffer = static_cast<T*>(malloc(sizeof(T) * len_elements));
274 |   if (tmp_buffer == NULL) {
275 |     std::cerr << "Allocation of tmp-buffer failed" << std::endl;
276 |     return;
277 |   }
278 | 
279 |   memcpy(recvbuf, sendbuf, sizeof(T) * len_elements);
280 | 
281 |   // process maintains current chunk_range [start_chunk, end_chunk).
282 |   int start_chunk = 0;
283 |   int end_chunk = ranges.size();
284 |   // Reduce-Scatter (recursive halving)
285 |   for (int step = 0; step < world_size_exp; step++) {
286 |     int to_rank = my_rank_ ^ (1 << step);
287 | 
288 |     int send_chunk_start, send_chunk_end, recv_chunk_start, recv_chunk_end;
289 |     if (my_rank_ < to_rank) {
290 |       // I send front chunk
291 |       send_chunk_start = start_chunk;
292 |       send_chunk_end = end_chunk - (end_chunk - start_chunk) / 2;
293 |       recv_chunk_start = send_chunk_end;
294 |       recv_chunk_end = end_chunk;
295 |     } else {
296 |       // I send back chunk
297 |       recv_chunk_start = start_chunk;
298 |       recv_chunk_end = end_chunk - (end_chunk - start_chunk) / 2;
299 |       send_chunk_start = recv_chunk_end;
300 |       send_chunk_end = end_chunk;
301 |     }
302 | 
303 |     Send(to_rank, recvbuf + ranges[send_chunk_start].first,
304 |          sizeof(T) *
305 |              (ranges[(send_chunk_end - 1 + ranges.size()) % ranges.size()]
306 |                   .second -
307 |               ranges[send_chunk_start].first),
308 |          false);
309 | 
310 |     Recv(to_rank, tmp_buffer + ranges[recv_chunk_start].first,
311 |          sizeof(T) *
312 |              (ranges[(recv_chunk_end - 1 + ranges.size()) % ranges.size()]
313 |                   .second -
314 |               ranges[recv_chunk_start].first),
315 |          false);
316 | 
317 |     RecvWait(to_rank);
318 | 
319 |     _reduce_inplace(
320 |         recvbuf + ranges[recv_chunk_start].first,
321 |         tmp_buffer + ranges[recv_chunk_start].first,
322 |         ranges[(recv_chunk_end - 1 + ranges.size()) % ranges.size()].second -
323 |             ranges[recv_chunk_start].first);
324 | 
325 |     SendWait(to_rank);
326 | 
327 |     start_chunk = recv_chunk_start;
328 |     end_chunk = recv_chunk_end;
329 |   }
330 | 
331 |   // AllGather (recursive doubling)
332 |   for (int step = 0; step < world_size_exp; step++) {
333 |     int to_rank = my_rank_ ^ (1 << (world_size_exp - step - 1));
334 | 
335 |     int send_chunk_start, send_chunk_end, recv_chunk_start, recv_chunk_end;
336 |     if (my_rank_ > to_rank) {
337 |       // I send front chunk
338 |       send_chunk_start = start_chunk;
339 |       send_chunk_end = end_chunk;
340 |       recv_chunk_start = send_chunk_end;
341 |       recv_chunk_end = recv_chunk_start + end_chunk - start_chunk;
342 |     } else {
343 |       // I send back chunk
344 |       send_chunk_start = start_chunk;
345 |       send_chunk_end = end_chunk;
346 |       recv_chunk_end = send_chunk_start;
347 |       recv_chunk_start = recv_chunk_end - (end_chunk - start_chunk);
348 |     }
349 | 
350 |     Send(to_rank, recvbuf + ranges[send_chunk_start].first,
351 |          sizeof(T) *
352 |              (ranges[(send_chunk_end - 1 + ranges.size()) % ranges.size()]
353 |                   .second -
354 |               ranges[send_chunk_start].first),
355 |          false);
356 | 
357 |     Recv(to_rank, recvbuf + ranges[recv_chunk_start].first,
358 |          sizeof(T) *
359 |              (ranges[(recv_chunk_end - 1 + ranges.size()) % ranges.size()]
360 |                   .second -
361 |               ranges[recv_chunk_start].first),
362 |          false);
363 | 
364 |     RecvWait(to_rank);
365 |     SendWait(to_rank);
366 | 
367 |     start_chunk = std::min(send_chunk_start, recv_chunk_start);
368 |     end_chunk = std::max(send_chunk_end, recv_chunk_end);
369 |   }
370 | 
371 |   free(tmp_buffer);
372 | }
373 | 


--------------------------------------------------------------------------------
/ibcomm/allreduce_cuda_impl.h:
--------------------------------------------------------------------------------
  1 | // Copyright (C) 2017-2018 by Preferred Networks, Inc. All right reserved.
  2 | 
  3 | #pragma once
  4 | 
  5 | #ifdef USE_CUDA
  6 | 
  7 | #include <cuda.h>
  8 | #include <cuda_runtime.h>
  9 | 
 10 | #include <algorithm>
 11 | #include <queue>
 12 | #include <vector>
 13 | 
 14 | #include "ibcomm/ibverbs_communicator.h"
 15 | #include "ibcomm/memory_pool.h"
 16 | #include "ibcomm/util.h"
 17 | 
 18 | #ifdef USE_TRACE
 19 | #define TRACE(NAME) util::trace(&NAME);
 20 | #else
 21 | #define TRACE(NAME)
 22 | #endif
 23 | 
 24 | #define THREADS 512
 25 | 
 26 | template <typename T>
 27 | __global__ void _reduce_inplace_cuda(T* result, const T* value,
 28 |                                      size_t len_elements) {
 29 | #pragma unroll
 30 |   for (auto index = blockDim.x * blockIdx.x + threadIdx.x; index < len_elements;
 31 |        index += blockDim.x * gridDim.x)
 32 |     result[index] += value[index];
 33 | }
 34 | 
 35 | template <typename T>
 36 | void IBVerbsCommunicator::AllreduceRingCuda(const T* sendbuf, T* recvbuf,
 37 |                                             size_t len_elements) {
 38 |   TRACE(trace_other_);
 39 | 
 40 |   if (world_size_ == 1) {
 41 |     CUDACHECK(cudaMemcpy(recvbuf, sendbuf, sizeof(T) * len_elements,
 42 |                          cudaMemcpyDefault));
 43 |     TRACE(trace_other_);
 44 | 
 45 |     return;
 46 |   }
 47 | 
 48 |   auto ranges = SplitBuffer(len_elements, sizeof(T));
 49 |   auto rank_to_chunk = GetRankToChunk(ranges);
 50 | 
 51 |   auto chunks = ranges.size();
 52 | 
 53 |   int from_rank = (my_rank_ - 1 + world_size_) % world_size_;
 54 |   int to_rank = (my_rank_ + 1) % world_size_;
 55 | 
 56 |   auto controller =
 57 |       pool_->GetController((ranges[0].second - ranges[0].first) * sizeof(T));
 58 | 
 59 |   std::vector<Memory*> chunk_to_memory(chunks, NULL);
 60 | 
 61 |   std::queue<int> reg_q;
 62 |   // Reduce-Scatter's recv
 63 |   for (int i = 0; i < world_size_ - 1; i++) {
 64 |     // [my_rank_ - 1, my_rank_)
 65 |     int rank = (my_rank_ - i - 1 + world_size_) % world_size_;
 66 | 
 67 |     for (auto it = rank_to_chunk[rank].rbegin();
 68 |          it != rank_to_chunk[rank].rend(); ++it) {
 69 |       reg_q.push(*it);
 70 |     }
 71 |   }
 72 | 
 73 |   // AllGather's recv
 74 |   for (int i = 0; i < world_size_ - 1; i++) {
 75 |     // [my_rank_, my_rank_ -1, ...,  my_rank_ + 1)
 76 |     int rank = (my_rank_ - i + world_size_) % world_size_;
 77 |     for (auto it = rank_to_chunk[rank].rbegin();
 78 |          it != rank_to_chunk[rank].rend(); ++it) {
 79 |       reg_q.push(*it);
 80 |     }
 81 |   }
 82 | 
 83 |   std::queue<int> first_send_q;
 84 |   std::queue<int> first_send_q_buffering;
 85 | 
 86 |   for (auto it = rank_to_chunk[my_rank_].rbegin();
 87 |        it != rank_to_chunk[my_rank_].rend(); ++it) {
 88 |     first_send_q.push(*it);
 89 |   }
 90 | 
 91 |   int current_recv_i = reg_q.front();
 92 | 
 93 |   std::queue<int> wait_send_q;
 94 |   std::queue<int> wait_reduction_q;
 95 |   std::queue<int> wait_send_completion_q;
 96 |   int remaining_recv_q_length = 0;
 97 |   bool reduce_scatter_phase = true;
 98 |   // last rank (end of allgather)
 99 |   const int final_rank = (my_rank_ + 2) % world_size_;
100 | 
101 |   TRACE(trace_other_);
102 | 
103 |   while (true) {
104 |     while ((reduce_scatter_phase || wait_reduction_q.empty()) &&
105 |            RecvPoll(from_rank)) {
106 |       TRACE(trace_received_);
107 | 
108 |       remaining_recv_q_length--;
109 |       auto range = ranges[current_recv_i];
110 |       size_t offset_elements = range.first;
111 |       size_t elements = (range.second - range.first);
112 |       size_t bytes = elements * sizeof(T);
113 | 
114 |       using util::ceilDiv;
115 | 
116 |       const auto blocks =
117 |           std::min(ceilDiv(elements, (size_t)THREADS), (size_t)(65535));
118 | 
119 |       auto& mem = chunk_to_memory[current_recv_i];
120 | 
121 |       TRACE(trace_received_);
122 | 
123 |       if (reduce_scatter_phase) {
124 |         TRACE(trace_issue_redu_kernel_);
125 |       } else {
126 |         TRACE(trace_issue_copy_kernel_);
127 |       }
128 | 
129 |       CUDACHECK(cudaMemcpyAsync(recvbuf + offset_elements, mem->ptr(), bytes,
130 |                                 cudaMemcpyDefault, mem->stream()));
131 |       if (reduce_scatter_phase) {
132 |         _reduce_inplace_cuda<<<blocks, THREADS, 0, mem->stream()>>>(
133 |             recvbuf + offset_elements, sendbuf + offset_elements, elements);
134 |         CUDACHECK(cudaMemcpyAsync(mem->ptr(), recvbuf + offset_elements, bytes,
135 |                                   cudaMemcpyDefault, mem->stream()));
136 |         wait_reduction_q.push(current_recv_i);
137 |         if (current_recv_i == rank_to_chunk[to_rank].front()) {
138 |           reduce_scatter_phase = false;
139 |         }
140 | 
141 |         TRACE(trace_issue_redu_kernel_);
142 |       } else {
143 |         TRACE(trace_issue_copy_kernel_);
144 |         if (current_recv_i < rank_to_chunk[final_rank].front() ||
145 |             rank_to_chunk[final_rank].back() < current_recv_i) {
146 |           TRACE(trace_issue_send_);
147 | 
148 |           SendRegistered(to_rank, mem->ptr(), mem->mr(), bytes, false);
149 |           wait_send_completion_q.push(current_recv_i);
150 | 
151 |           TRACE(trace_issue_send_);
152 |         } else {
153 |           // NO NEED SEND because this is last allgather step.
154 |           CUDACHECK(cudaStreamSynchronize(mem->stream()));
155 |           controller.returnMemory(mem);
156 |           mem = NULL;
157 |         }
158 |       }
159 |       current_recv_i = (current_recv_i - 1 + chunks) % chunks;
160 |     }
161 | 
162 |     // This means rank_to_chunk[final_rank].front() == current_recv_i in
163 |     // RecvPoll loop (current_recv_i is already decremented)
164 |     if (!reduce_scatter_phase && wait_reduction_q.empty() &&
165 |         current_recv_i ==
166 |             (rank_to_chunk[final_rank].front() - 1 + chunks) % chunks)
167 |       break;  // DONE!
168 | 
169 |     if (!wait_reduction_q.empty() &&
170 |         cudaStreamQuery(chunk_to_memory[wait_reduction_q.front()]->stream()) ==
171 |             cudaSuccess) {
172 |       TRACE(trace_reduced_);
173 | 
174 |       int i = wait_reduction_q.front();
175 |       wait_reduction_q.pop();
176 | 
177 |       auto range = ranges[i];
178 |       size_t elements = (range.second - range.first);
179 |       size_t bytes = elements * sizeof(T);
180 | 
181 |       TRACE(trace_reduced_);
182 |       // This send is reduce-scatter phase send and allgather phase first send,
183 |       // thus we can send all chunk.
184 |       if (first_send_q.empty() && wait_send_q.empty() &&
185 |           first_send_q_buffering.empty()) {
186 |         TRACE(trace_issue_send_);
187 | 
188 |         auto mem = chunk_to_memory[i];
189 |         SendRegistered(to_rank, mem->ptr(), mem->mr(), bytes, false);
190 |         wait_send_completion_q.push(i);
191 | 
192 |         TRACE(trace_issue_send_);
193 |       } else {
194 |         first_send_q_buffering.push(i);
195 |       }
196 |     }
197 | 
198 |     // When first_send is not completed, We cannot issue first-send's recv.
199 |     if (remaining_recv_q_length <= 2 && !reg_q.empty() &&
200 |         chunk_to_memory[reg_q.front()] == NULL) {
201 |       TRACE(trace_issue_recv_);
202 | 
203 |       int recv_key = reg_q.front();
204 |       reg_q.pop();
205 | 
206 |       auto range = ranges[recv_key];
207 |       size_t elements = (range.second - range.first);
208 |       size_t bytes = elements * sizeof(T);
209 | 
210 |       auto mem = chunk_to_memory[recv_key] = controller.getMemory();
211 | 
212 |       remaining_recv_q_length++;
213 |       RecvRegistered(from_rank, mem->ptr(), mem->mr(), bytes, false);
214 | 
215 |       TRACE(trace_issue_recv_);
216 |     }
217 | 
218 |     if (!first_send_q.empty()) {
219 |       TRACE(trace_issue_copy_kernel_);
220 | 
221 |       int i = first_send_q.front();
222 |       first_send_q.pop();
223 | 
224 |       auto range = ranges[i];
225 |       size_t offset_elements = range.first;
226 |       size_t elements = (range.second - range.first);
227 |       size_t bytes = elements * sizeof(T);
228 | 
229 |       auto mem = chunk_to_memory[i] = controller.getMemory();
230 |       CUDACHECK(cudaMemcpyAsync(mem->ptr(), sendbuf + offset_elements, bytes,
231 |                                 cudaMemcpyDefault, mem->stream()));
232 |       wait_send_q.push(i);
233 | 
234 |       TRACE(trace_issue_copy_kernel_);
235 |     }
236 | 
237 |     if (!wait_send_q.empty() &&
238 |         cudaStreamQuery(chunk_to_memory[wait_send_q.front()]->stream()) ==
239 |             cudaSuccess) {
240 |       TRACE(trace_issue_send_);
241 | 
242 |       int i = wait_send_q.front();
243 |       wait_send_q.pop();
244 | 
245 |       auto range = ranges[i];
246 |       size_t elements = (range.second - range.first);
247 |       size_t bytes = elements * sizeof(T);
248 | 
249 |       auto mem = chunk_to_memory[i];
250 | 
251 |       SendRegistered(to_rank, mem->ptr(), mem->mr(), bytes, false);
252 | 
253 |       wait_send_completion_q.push(i);
254 | 
255 |       TRACE(trace_issue_send_);
256 |     } else if (first_send_q.empty() && wait_send_q.empty() &&
257 |                !first_send_q_buffering.empty()) {
258 |       TRACE(trace_issue_send_);
259 |       while (!first_send_q_buffering.empty()) {
260 |         int i = first_send_q_buffering.front();
261 |         first_send_q_buffering.pop();
262 | 
263 |         auto range = ranges[i];
264 |         size_t elements = (range.second - range.first);
265 |         size_t bytes = elements * sizeof(T);
266 | 
267 |         auto mem = chunk_to_memory[i];
268 | 
269 |         SendRegistered(to_rank, mem->ptr(), mem->mr(), bytes, false);
270 |         wait_send_completion_q.push(i);
271 |       }
272 |       TRACE(trace_issue_send_);
273 |     }
274 | 
275 |     while (SendPoll(to_rank)) {
276 |       TRACE(trace_other_);
277 | 
278 |       int complete_send_chunk_id = wait_send_completion_q.front();
279 |       wait_send_completion_q.pop();
280 |       CUDACHECK(cudaStreamSynchronize(
281 |           chunk_to_memory[complete_send_chunk_id]->stream()));
282 |       controller.returnMemory(chunk_to_memory[complete_send_chunk_id]);
283 |       chunk_to_memory[complete_send_chunk_id] = NULL;
284 | 
285 |       TRACE(trace_other_);
286 |     }
287 |   }
288 | 
289 |   TRACE(trace_other_);
290 |   while (!wait_send_completion_q.empty()) {
291 |     SendWait(to_rank);
292 |     int complete_send_chunk_id = wait_send_completion_q.front();
293 |     wait_send_completion_q.pop();
294 |     // We need sync because memcpy is issued.
295 |     CUDACHECK(cudaStreamSynchronize(
296 |         chunk_to_memory[complete_send_chunk_id]->stream()));
297 |     controller.returnMemory(chunk_to_memory[complete_send_chunk_id]);
298 |     chunk_to_memory[complete_send_chunk_id] = NULL;
299 |   }
300 |   TRACE(trace_other_);
301 | }
302 | 
303 | class Chunk {
304 |  public:
305 |   int range_id_;
306 | 
307 |   int depth_;
308 | 
309 |   // pair_rank_ is no meaning in some context.
310 |   int pair_rank_;
311 | 
312 |   // reduce_ is no meaning in some context.
313 |   bool reduce_;
314 | 
315 |   bool last_;
316 | 
317 |   Chunk(int range_id, int depth, int pair_rank = -1, bool reduce = false,
318 |         bool last = false)
319 |       : range_id_(range_id),
320 |         depth_(depth),
321 |         pair_rank_(pair_rank),
322 |         reduce_(reduce),
323 |         last_(last) {}
324 | };
325 | 
326 | template <typename T>
327 | void IBVerbsCommunicator::AllreduceRabenseifnerCuda(const T* sendbuf,
328 |                                                     T* recvbuf,
329 |                                                     size_t len_elements) {
330 |   TRACE(trace_other_);
331 |   bool first_memcpy_done = false;
332 |   CUDACHECK(cudaMemcpyAsync(recvbuf, sendbuf, sizeof(T) * len_elements,
333 |                             cudaMemcpyDefault));
334 | 
335 |   if (world_size_ == 1) {
336 |     CUDACHECK(cudaStreamSynchronize(0));
337 | 
338 |     TRACE(trace_other_);
339 |     return;
340 |   }
341 | 
342 |   int world_size_exp = util::GetExpOfTwo(world_size_);
343 | 
344 |   // check world_size is power-of-2 or not
345 |   if (world_size_exp == 0) {
346 |     TRACE(trace_other_);
347 |     util::IbcommError(__FILE__, __LINE__,
348 |                       util::IBCOMM_ERROR_CODE::NOT_SUPPORTED,
349 |                       "Currently, rabenseifner's algorithm doesn't support "
350 |                       "non-power-of-2 processes.");
351 |   }
352 | 
353 |   auto ranges = SplitBuffer(len_elements, sizeof(T));
354 |   auto rank_to_chunk = GetRankToChunk(ranges);
355 | 
356 |   auto chunks = ranges.size();
357 | 
358 |   auto controller =
359 |       pool_->GetController((ranges[0].second - ranges[0].first) * sizeof(T));
360 |   std::vector<Memory*> chunk_to_memory(chunks, NULL);
361 | 
362 |   std::vector<bool> reduced_chunk(chunks, false);
363 |   std::queue<Chunk> recv_q;
364 |   std::queue<Chunk> wait_recv_q;
365 |   std::queue<Chunk> send_q;
366 |   std::queue<Chunk> wait_send_q;
367 |   std::queue<Chunk> wait_send_copy_q;
368 |   std::queue<Chunk> wait_reduction_q;
369 |   std::queue<Chunk> first_send_q;
370 |   std::queue<Chunk> first_send_q_buffering;
371 | 
372 |   // GPU working memory size check and realloc if need
373 |   if (tmp_gpu_buffer_size_ < sizeof(T) * (ranges[0].second - ranges[0].first)) {
374 |     // dealloc
375 |     util::IbcommWarning(
376 |         __FILE__, __LINE__,
377 |         "IBCOMM_GPU_WORK_MEMORY_SIZE is smaller than chunk size.\n"
378 |         "runtime-reallocation is occured.");
379 |     CUDACHECK(cudaFree(tmp_gpu_buffer_));
380 | 
381 |     // alloc
382 |     tmp_gpu_buffer_size_ = sizeof(T) * (ranges[0].second - ranges[0].first);
383 |     CUDACHECK(cudaMalloc(static_cast<void**>(&tmp_gpu_buffer_),
384 |                          tmp_gpu_buffer_size_));
385 |   }
386 | 
387 |   int start_rank = 0;
388 |   int end_rank = world_size_;
389 | 
390 |   // Reduce-Scatter (recursive halving)
391 |   for (int step = 0; step < world_size_exp; step++) {
392 |     int to_rank = my_rank_ ^ (1 << step);
393 | 
394 |     int send_rank_start, send_rank_end, recv_rank_start, recv_rank_end;
395 |     if (my_rank_ < to_rank) {
396 |       // I send front rank
397 |       send_rank_start = start_rank;
398 |       send_rank_end = end_rank - (end_rank - start_rank) / 2;
399 |       recv_rank_start = send_rank_end;
400 |       recv_rank_end = end_rank;
401 |     } else {
402 |       // I send back rank
403 |       recv_rank_start = start_rank;
404 |       recv_rank_end = end_rank - (end_rank - start_rank) / 2;
405 |       send_rank_start = recv_rank_end;
406 |       send_rank_end = end_rank;
407 |     }
408 | 
409 |     for (int recv_rank = recv_rank_start; recv_rank < recv_rank_end;
410 |          recv_rank++) {
411 |       for (auto chunk : rank_to_chunk[recv_rank]) {
412 |         recv_q.emplace(chunk, step, to_rank, true);
413 |       }
414 |     }
415 |     for (int send_rank = send_rank_start; send_rank < send_rank_end;
416 |          send_rank++) {
417 |       for (auto chunk : rank_to_chunk[send_rank]) {
418 |         if (step == 0) {
419 |           first_send_q.emplace(chunk, step, to_rank);
420 |         } else {
421 |           send_q.emplace(chunk, step, to_rank);
422 |         }
423 |       }
424 |     }
425 | 
426 |     start_rank = recv_rank_start;
427 |     end_rank = recv_rank_end;
428 |   }
429 | 
430 |   // AllGather (recursive doubling)
431 |   for (int step = 0; step < world_size_exp; step++) {
432 |     int to_rank = my_rank_ ^ (1 << (world_size_exp - step - 1));
433 | 
434 |     int send_rank_start, send_rank_end, recv_rank_start, recv_rank_end;
435 |     if (my_rank_ > to_rank) {
436 |       // I send front rank
437 |       send_rank_start = start_rank;
438 |       send_rank_end = end_rank;
439 |       recv_rank_start = send_rank_end;
440 |       recv_rank_end = recv_rank_start + end_rank - start_rank;
441 |     } else {
442 |       // I send back rank
443 |       send_rank_start = start_rank;
444 |       send_rank_end = end_rank;
445 |       recv_rank_end = send_rank_start;
446 |       recv_rank_start = recv_rank_end - (end_rank - start_rank);
447 |     }
448 | 
449 |     for (int recv_rank = recv_rank_start; recv_rank < recv_rank_end;
450 |          recv_rank++) {
451 |       for (auto chunk : rank_to_chunk[recv_rank]) {
452 |         recv_q.emplace(chunk, step + world_size_exp, to_rank);
453 |       }
454 |     }
455 | 
456 |     for (int send_rank = send_rank_start; send_rank < send_rank_end;
457 |          send_rank++) {
458 |       for (auto chunk : rank_to_chunk[send_rank]) {
459 |         send_q.emplace(chunk, step + world_size_exp, to_rank, true,
460 |                        step == (world_size_exp - 1));
461 |       }
462 |     }
463 | 
464 |     start_rank = std::min(send_rank_start, recv_rank_start);
465 |     end_rank = std::max(send_rank_end, recv_rank_end);
466 |   }
467 | 
468 |   TRACE(trace_other_);
469 | 
470 |   while (!recv_q.empty() || !wait_recv_q.empty() || !send_q.empty() ||
471 |          !wait_send_q.empty() || !wait_send_copy_q.empty() ||
472 |          !wait_reduction_q.empty() || !first_send_q.empty() ||
473 |          !first_send_q_buffering.empty()) {
474 |     while (wait_reduction_q.empty() && !wait_recv_q.empty() &&
475 |            RecvPoll(wait_recv_q.front().pair_rank_)) {
476 |       TRACE(trace_received_);
477 | 
478 |       auto received = wait_recv_q.front();
479 |       wait_recv_q.pop();
480 | 
481 |       auto range = ranges[received.range_id_];
482 |       size_t offset_elements = range.first;
483 |       size_t elements = (range.second - range.first);
484 |       size_t bytes = elements * sizeof(T);
485 | 
486 |       auto& mem = chunk_to_memory[received.range_id_];
487 | 
488 |       TRACE(trace_received_);
489 | 
490 |       if (received.reduce_) {
491 |         // Reduce-Scatter phase
492 | 
493 |         if (!first_memcpy_done) {
494 |           TRACE(trace_other_);
495 |           CUDACHECK(cudaStreamSynchronize(0));
496 |           first_memcpy_done = true;
497 |           TRACE(trace_other_);
498 |         }
499 | 
500 |         TRACE(trace_issue_redu_kernel_);
501 | 
502 |         // tmp_gpu_buffer <- mem
503 |         CUDACHECK(cudaMemcpyAsync(tmp_gpu_buffer_, mem->ptr(), bytes,
504 |                                   cudaMemcpyDefault, mem->stream()));
505 | 
506 |         const auto blocks =
507 |             std::min(util::ceilDiv(elements, (size_t)THREADS), (size_t)(65535));
508 | 
509 |         // recvbuf += tmp_gpu_buffer ( on GPU )
510 |         _reduce_inplace_cuda<<<blocks, THREADS, 0, mem->stream()>>>(
511 |             recvbuf + offset_elements, static_cast<T*>(tmp_gpu_buffer_),
512 |             elements);
513 | 
514 |         // mem <- recvbuf
515 |         CUDACHECK(cudaMemcpyAsync(mem->ptr(), recvbuf + offset_elements, bytes,
516 |                                   cudaMemcpyDefault, mem->stream()));
517 | 
518 |         received.depth_++;
519 |         wait_reduction_q.push(received);
520 | 
521 |         TRACE(trace_issue_redu_kernel_);
522 |       } else {
523 |         // AllGather phase
524 | 
525 |         TRACE(trace_issue_copy_kernel_);
526 | 
527 |         // recvbuf <- mem
528 |         CUDACHECK(cudaMemcpyAsync(recvbuf + offset_elements, mem->ptr(), bytes,
529 |                                   cudaMemcpyDefault, mem->stream()));
530 | 
531 |         reduced_chunk[received.range_id_] = true;
532 | 
533 |         TRACE(trace_issue_copy_kernel_);
534 |       }
535 |     }
536 | 
537 |     if (!wait_reduction_q.empty() &&
538 |         cudaStreamQuery(
539 |             chunk_to_memory[wait_reduction_q.front().range_id_]->stream()) ==
540 |             cudaSuccess) {
541 |       TRACE(trace_reduced_);
542 | 
543 |       auto reduced = wait_reduction_q.front();
544 |       wait_reduction_q.pop();
545 | 
546 |       auto range = ranges[reduced.range_id_];
547 |       size_t elements = (range.second - range.first);
548 |       size_t bytes = elements * sizeof(T);
549 | 
550 |       auto& mem = chunk_to_memory[reduced.range_id_];
551 | 
552 |       TRACE(trace_reduced_);
553 | 
554 |       if (!send_q.empty() && send_q.front().range_id_ == reduced.range_id_ &&
555 |           send_q.front().depth_ == reduced.depth_) {
556 |         auto send_range = send_q.front();
557 |         send_q.pop();
558 | 
559 |         if (first_send_q.empty() && wait_send_copy_q.empty() &&
560 |             first_send_q_buffering.empty()) {
561 |           TRACE(trace_issue_send_);
562 | 
563 |           SendRegistered(send_range.pair_rank_, mem->ptr(), mem->mr(), bytes,
564 |                          false);
565 |           wait_send_q.push(send_range);
566 | 
567 |           TRACE(trace_issue_send_);
568 |         } else {
569 |           first_send_q_buffering.push(send_range);
570 |         }
571 | 
572 |         if (send_range.reduce_) {
573 |           // AllGather phase
574 |           reduced_chunk[send_range.range_id_] = true;
575 |         }
576 |       } else {
577 |         CUDACHECK(cudaStreamSynchronize(mem->stream()));
578 |         controller.returnMemory(mem);
579 |         mem = NULL;
580 |       }
581 |     }
582 | 
583 |     if (wait_recv_q.size() <= 2 && !recv_q.empty() &&
584 |         chunk_to_memory[recv_q.front().range_id_] == NULL) {
585 |       TRACE(trace_issue_recv_);
586 | 
587 |       auto recv_range = recv_q.front();
588 |       recv_q.pop();
589 | 
590 |       auto range = ranges[recv_range.range_id_];
591 |       size_t elements = (range.second - range.first);
592 |       size_t bytes = elements * sizeof(T);
593 | 
594 |       auto mem = chunk_to_memory[recv_range.range_id_] = controller.getMemory();
595 | 
596 |       RecvRegistered(recv_range.pair_rank_, mem->ptr(), mem->mr(), bytes,
597 |                      false);
598 |       wait_recv_q.push(recv_range);
599 | 
600 |       TRACE(trace_issue_recv_);
601 |     }
602 | 
603 |     while (first_send_q.empty() && wait_send_copy_q.empty() &&
604 |            first_send_q_buffering.empty() && !send_q.empty() &&
605 |            reduced_chunk[send_q.front().range_id_]) {
606 |       TRACE(trace_issue_send_);
607 | 
608 |       auto send_range = send_q.front();
609 |       send_q.pop();
610 | 
611 |       auto range = ranges[send_range.range_id_];
612 |       size_t elements = (range.second - range.first);
613 |       size_t bytes = elements * sizeof(T);
614 | 
615 |       auto mem = chunk_to_memory[send_range.range_id_];
616 | 
617 |       SendRegistered(send_range.pair_rank_, mem->ptr(), mem->mr(), bytes,
618 |                      false);
619 |       wait_send_q.push(send_range);
620 | 
621 |       TRACE(trace_issue_send_);
622 |     }
623 | 
624 |     if (!first_send_q.empty()) {
625 |       TRACE(trace_issue_copy_kernel_);
626 | 
627 |       auto send_range = first_send_q.front();
628 |       first_send_q.pop();
629 | 
630 |       auto range = ranges[send_range.range_id_];
631 |       size_t offset_elements = range.first;
632 |       size_t elements = (range.second - range.first);
633 |       size_t bytes = elements * sizeof(T);
634 | 
635 |       auto mem = chunk_to_memory[send_range.range_id_] = controller.getMemory();
636 |       CUDACHECK(cudaMemcpyAsync(mem->ptr(), sendbuf + offset_elements, bytes,
637 |                                 cudaMemcpyDefault, mem->stream()));
638 | 
639 |       wait_send_copy_q.push(send_range);
640 | 
641 |       TRACE(trace_issue_copy_kernel_);
642 |     }
643 | 
644 |     if (!wait_send_copy_q.empty() &&
645 |         cudaStreamQuery(
646 |             chunk_to_memory[wait_send_copy_q.front().range_id_]->stream()) ==
647 |             cudaSuccess) {
648 |       TRACE(trace_issue_send_);
649 | 
650 |       auto send_range = wait_send_copy_q.front();
651 |       wait_send_copy_q.pop();
652 | 
653 |       auto range = ranges[send_range.range_id_];
654 |       size_t elements = (range.second - range.first);
655 |       size_t bytes = elements * sizeof(T);
656 | 
657 |       auto mem = chunk_to_memory[send_range.range_id_];
658 | 
659 |       SendRegistered(send_range.pair_rank_, mem->ptr(), mem->mr(), bytes,
660 |                      false);
661 |       wait_send_q.push(send_range);
662 | 
663 |       TRACE(trace_issue_send_);
664 |     }
665 | 
666 |     while (first_send_q.empty() && wait_send_copy_q.empty() &&
667 |            !first_send_q_buffering.empty()) {
668 |       TRACE(trace_issue_send_);
669 | 
670 |       auto send_range = first_send_q_buffering.front();
671 |       first_send_q_buffering.pop();
672 | 
673 |       auto range = ranges[send_range.range_id_];
674 |       size_t elements = (range.second - range.first);
675 |       size_t bytes = elements * sizeof(T);
676 | 
677 |       auto mem = chunk_to_memory[send_range.range_id_];
678 | 
679 |       SendRegistered(send_range.pair_rank_, mem->ptr(), mem->mr(), bytes,
680 |                      false);
681 |       wait_send_q.push(send_range);
682 | 
683 |       TRACE(trace_issue_send_);
684 |     }
685 | 
686 |     while (!wait_send_q.empty() && SendPoll(wait_send_q.front().pair_rank_)) {
687 |       TRACE(trace_other_);
688 | 
689 |       auto send_range = wait_send_q.front();
690 |       wait_send_q.pop();
691 | 
692 |       if (send_range.reduce_ && !send_range.last_) {
693 |         // AllGather phase and non-last AllGather send
694 |         // We need to send a chunk which is already sent,
695 |         // so we still hold data on CPU-memory.
696 |       } else {
697 |         auto& mem = chunk_to_memory[send_range.range_id_];
698 | 
699 |         CUDACHECK(cudaStreamSynchronize(mem->stream()));
700 |         controller.returnMemory(mem);
701 |         mem = NULL;
702 |       }
703 | 
704 |       TRACE(trace_other_);
705 |     }
706 |   }
707 | 
708 |   TRACE(trace_other_);
709 |   for (auto& mem : chunk_to_memory) {
710 |     if (mem != NULL) {
711 |       CUDACHECK(cudaStreamSynchronize(mem->stream()));
712 |       controller.returnMemory(mem);
713 |       mem = NULL;
714 |     }
715 |   }
716 |   TRACE(trace_other_);
717 | }
718 | 
719 | #endif
720 | 


--------------------------------------------------------------------------------
/ibcomm/allreduce_tester.cpp:
--------------------------------------------------------------------------------
  1 | // Copyright (C) 2017-2018 by Preferred Networks, Inc. All right reserved.
  2 | // allreduce_tester.cpp
  3 | //
  4 | // A helper program for allreduce integration test.
  5 | //
  6 | // Usage:
  7 | //   $ mpiexec -n ${NP} ./allreduce_tester [algorithm] [buffer size] [init expr]
  8 | //   [check expr]
  9 | //
 10 | //      * [algorithm]    : Name of Allreduce Algorithm. "ring" and
 11 | //      "rabenseifner" is supported.
 12 | //
 13 | //      * [buffer size]  : Size of the target buffer. Suffix "k", "m", "g" are
 14 | //      allowed.
 15 | //                         ex.) 1024, 128M, 10k
 16 | //      * [init expr]    : Target bufffer is initialized with this expression in
 17 | //      an elementwise manner.
 18 | //                         For details of expressions, see
 19 | //                         https://github.com/codeplea/tinyexpr Additional
 20 | //                         variables are supported:
 21 | //                           - p  : Process rank
 22 | //                           - np : Number of processes (e.g. mpi_size)
 23 | //                           - n  : Number of elementso of the target buffer
 24 | //                           (NOT size in bytes)
 25 | //                           - nb : Size of the target buffer in bytes
 26 | //                           - i  : Index of the element in buffer
 27 | //                         ex.)
 28 | //                           (1) init_expr = "1", n = 4, np = 2
 29 | //                             Rank 0:  [1, 1, 1, 1]
 30 | //                             Rank 1:  [1, 1, 1, 1]
 31 | //                           (2) init_expr = "1/np*p+i", n = 4, np = 2
 32 | //                             Rank 0: [0.0, 1.0, 2.0, 3.0]   # [1/2*0+0,
 33 | //                             1/2*0+1, ...] Rank 1: [0.5, 1.5, 2.5, 3.5]   #
 34 | //                             [1/2*1+0, 1/2*1+1, ... ]
 35 | //
 36 | //      * [check expr]   : Target buffer is checked after Allreduce operation
 37 | //      using check expr.
 38 | //                         The grammar of expressions is identical to [init
 39 | //                         expr]
 40 | //
 41 | 
 42 | #include <cuda_runtime.h>
 43 | #include <mpi.h>
 44 | #include <thrust/device_vector.h>
 45 | #include <thrust/host_vector.h>
 46 | 
 47 | #include <algorithm>
 48 | #include <cassert>
 49 | #include <cctype>
 50 | #include <iostream>
 51 | #include <memory>
 52 | #include <sstream>
 53 | #include <tuple>
 54 | #include <vector>
 55 | 
 56 | #include "grumpi/grumpi.hpp"
 57 | #include "ibcomm/ibverbs_communicator.h"
 58 | #include "ibcomm/util.h"
 59 | #include "tinyexpr/tinyexpr.h"
 60 | 
 61 | class TinyExpr {
 62 |   std::vector<te_variable> vars_;
 63 |   te_expr *expr_;
 64 | 
 65 |  public:
 66 |   TinyExpr() : vars_(), expr_(nullptr) {}
 67 | 
 68 |   void set_variable(const char *name, const void *address, int type,
 69 |                     void *context) {
 70 |     te_variable va;
 71 |     va.name = name;
 72 |     va.address = address;
 73 |     va.type = type;
 74 |     va.context = context;
 75 |     vars_.push_back(va);
 76 |   }
 77 | 
 78 |   void compile(const std::string &expr) { compile(expr.c_str()); }
 79 | 
 80 |   void compile(const char *expr) {
 81 |     int err;
 82 |     expr_ =
 83 |         te_compile(expr, vars_.data(), static_cast<int>(vars_.size()), &err);
 84 | 
 85 |     if (!expr_) {
 86 |       std::stringstream ss;
 87 |       ss << "Invalid expression: '" << expr << "'";
 88 |       throw std::runtime_error(ss.str());
 89 |     }
 90 |   }
 91 | 
 92 |   double eval() {
 93 |     if (!expr_) {
 94 |       throw std::runtime_error("Expression must be compiled before eval()");
 95 |     }
 96 |     return te_eval(expr_);
 97 |   }
 98 | };
 99 | 
100 | class Communicator {
101 |   MPI_Comm mpi_comm_;
102 |   int size_;
103 |   int rank_;
104 |   std::unique_ptr<IBVerbsCommunicator> ibcomm_;
105 | 
106 |  public:
107 |   explicit Communicator(MPI_Comm comm = MPI_COMM_WORLD) : mpi_comm_(comm) {
108 |     MPI_Comm_rank(mpi_comm_, &rank_);
109 |     MPI_Comm_size(mpi_comm_, &size_);
110 |     ibcomm_.reset(new IBVerbsCommunicator(size_));
111 | 
112 |     std::vector<uint32_t> qps(size_ * 3);
113 | 
114 |     for (int i = 0; i < size_; i++) {
115 |       if (i == rank_) {
116 |         continue;
117 |       }
118 |       ProcessInfo pinfo = ibcomm_->CreateQueuePair(i);
119 |       qps[i * 3 + 0] = pinfo.lid;
120 |       qps[i * 3 + 1] = pinfo.qp_n;
121 |       qps[i * 3 + 2] = pinfo.psn;
122 |     }
123 | 
124 |     MPI_Alltoall(MPI_IN_PLACE, 3, MPI_UINT32_T, qps.data(), 3, MPI_UINT32_T,
125 |                  comm);
126 | 
127 |     for (int i = 0; i < size_; i++) {
128 |       if (i == rank_) {
129 |         ibcomm_->RegisterMyself(i);
130 |       } else {
131 |         ProcessInfo pinfo;
132 |         pinfo.lid = qps[i * 3 + 0];
133 |         pinfo.qp_n = qps[i * 3 + 1];
134 |         pinfo.psn = qps[i * 3 + 2];
135 |         ibcomm_->RegisterQueuePair(i, pinfo);
136 |       }
137 |     }
138 |   }
139 | 
140 |   void die(const std::string &errmsg, int retcode = 1) {
141 |     if (rank_ == 0) {
142 |       std::cerr << errmsg << std::endl;
143 |     }
144 |     exit(retcode);
145 |   }
146 | 
147 |   template <class T>
148 |   void allreduce(const std::string &algorithm_type,
149 |                  const thrust::device_vector<T> &sendbuf_d,
150 |                  thrust::device_vector<T> *recvbuf_d) {
151 |     if (recvbuf_d == nullptr)
152 |       util::IbcommError(__FILE__, __LINE__,
153 |                         util::IBCOMM_ERROR_CODE::INVALID_ARGUMENT,
154 |                         "recvbuf_d is nullptr.");
155 | 
156 |     if (algorithm_type == "ring") {
157 |       ibcomm_->AllreduceRingCuda(sendbuf_d.data().get(),
158 |                                  recvbuf_d->data().get(), sendbuf_d.size());
159 |     } else if (algorithm_type == "rabenseifner") {
160 |       ibcomm_->AllreduceRabenseifnerCuda(
161 |           sendbuf_d.data().get(), recvbuf_d->data().get(), sendbuf_d.size());
162 |     } else {
163 |       die("Error: Unsupported algorithm");
164 |     }
165 |   }
166 | };
167 | 
168 | /**
169 |  * Application main class
170 |  */
171 | template <class T>
172 | class AllreduceTester {
173 |  public:
174 |   using ElemType = T;
175 | 
176 |  private:
177 |   MPI_Comm comm_;
178 |   int mpi_rank_;
179 |   int mpi_size_;
180 | 
181 |   Communicator ibcomm_;
182 | 
183 |   // target array size
184 |   size_t array_nbytes_;  // array size in bytes
185 |   size_t num_elems_;     // array length
186 | 
187 |   double var_p;   // "p"  variable in expressions (process rank)
188 |   double var_np;  // "np" variable in expressions (number of processes, i.e.
189 |                   // mpi_size)
190 |   double var_n;   // "n"  variable in expressions (number of elements in buffer)
191 |   double var_nb;  // "nb" variable in expressions (size of buffer in bytes)
192 | 
193 |   std::unique_ptr<TinyExpr> init_expr_;
194 |   std::unique_ptr<TinyExpr> check_expr_;
195 | 
196 |   void usage(int argc, char **argv) {
197 |     if (mpi_rank_ == 0) {
198 |       std::cerr << "Usage: " << argv[0] << " "
199 |                 << "[algorithm] "
200 |                 << "[array size (Bytes)] "
201 |                 << "[init expr] "
202 |                 << "[check expr]" << std::endl;
203 |     }
204 |     exit(-1);
205 |   }
206 | 
207 |   void die(const std::string &errmsg, int retcode = 1) {
208 |     if (mpi_rank_ == 0) {
209 |       std::cerr << errmsg << std::endl;
210 |     }
211 |     exit(retcode);
212 |   }
213 | 
214 |   size_t parse_nbytes(const char *src) {
215 |     size_t i = 0;
216 |     size_t n = 0;
217 | 
218 |     while (std::isdigit(src[i])) {
219 |       n = n * 10 + (src[i] - '0');
220 |       i++;
221 |     }
222 | 
223 |     if (src[i] != 0) {
224 |       switch (src[i]) {
225 |         case 'k':
226 |         case 'K':
227 |           n *= 1024;
228 |           break;
229 |         case 'm':
230 |         case 'M':
231 |           n *= 1024 * 1024;
232 |           break;
233 |         case 'g':
234 |         case 'G':
235 |           n *= 1024 * 1024 * 1024;
236 |           break;
237 |         default:
238 |           std::stringstream ss;
239 |           ss << "Cannot parse an array size: '" << src << "'" << std::endl;
240 |           die(ss.str());
241 |       }
242 |     }
243 | 
244 |     i++;
245 | 
246 |     return n;
247 |   }
248 | 
249 |   std::tuple<std::string, size_t, std::string, std::string> parse_args(
250 |       int argc, char **argv) {
251 |     if (argc != 5) {
252 |       usage(argc, argv);
253 |     }
254 |     // Parse argument 1
255 |     std::string algorithm = argv[1];
256 | 
257 |     // Parse argument 2 (array length (bytes))
258 |     size_t nbytes = parse_nbytes(argv[2]);
259 | 
260 |     // Parse argument 3 (initializing expression)
261 |     std::string init_expr = argv[3];
262 | 
263 |     std::string check_expr = argv[4];
264 | 
265 |     return std::make_tuple(algorithm, nbytes, init_expr, check_expr);
266 |   }
267 | 
268 |   void setup_sendbuf(thrust::host_vector<T> *buf,
269 |                      const std::string &init_expr_str) {
270 |     if (buf == nullptr)
271 |       util::IbcommError(__FILE__, __LINE__,
272 |                         util::IBCOMM_ERROR_CODE::INVALID_ARGUMENT,
273 |                         "buf is nullptr.");
274 | 
275 |     TinyExpr expr;
276 | 
277 |     expr.set_variable("p", &var_p, TE_VARIABLE, nullptr);
278 |     expr.set_variable("np", &var_np, TE_VARIABLE, nullptr);
279 |     expr.set_variable("n", &var_n, TE_VARIABLE, nullptr);
280 |     expr.set_variable("nb", &var_nb, TE_VARIABLE, nullptr);
281 | 
282 |     for (size_t i = 0; i < buf->size(); i++) {
283 |       double var_i = i;
284 |       expr.set_variable("i", &var_i, TE_VARIABLE, nullptr);
285 |       expr.compile(init_expr_str);
286 |       (*buf)[i] = expr.eval();
287 |     }
288 |   }
289 | 
290 |   std::tuple<bool, std::vector<std::string>> check_recvbuf(
291 |       const thrust::host_vector<T> &buf, const std::string &check_expr_str) {
292 |     constexpr double eps = 1e-12;
293 | 
294 |     std::vector<std::string> msgs;
295 |     bool check_ok = true;
296 | 
297 |     TinyExpr expr;
298 | 
299 |     expr.set_variable("p", &var_p, TE_VARIABLE, nullptr);
300 |     expr.set_variable("np", &var_np, TE_VARIABLE, nullptr);
301 |     expr.set_variable("n", &var_n, TE_VARIABLE, nullptr);
302 |     expr.set_variable("nb", &var_nb, TE_VARIABLE, nullptr);
303 | 
304 |     for (size_t i = 0; i < buf.size(); i++) {
305 |       double var_i = i;
306 |       expr.set_variable("i", &var_i, TE_VARIABLE, nullptr);
307 |       expr.compile(check_expr_str);
308 | 
309 |       double res = buf[i];
310 |       double ans = expr.eval();
311 | 
312 |       if (std::abs(res - ans) > eps) {
313 |         std::stringstream ss;
314 |         ss << "Error: Element [" << i << "] must be " << ans
315 |            << " but actually is " << res;
316 |         msgs.push_back(ss.str());
317 |         check_ok = false;
318 |       }
319 |     }
320 | 
321 |     return std::make_tuple(check_ok, msgs);
322 |   }
323 | 
324 |   void report_errors(bool check_ok, const std::vector<std::string> &msgs) {
325 |     for (int i = 0; i < mpi_size_; i++) {
326 |       if (i == mpi_rank_) {
327 |         if (!check_ok) {
328 |           size_t report_num = std::min(msgs.size(), (size_t)1000);
329 |           for (size_t i = 0; i < report_num; i++) {
330 |             std::cerr << "[Rank " << mpi_rank_ << "] " << msgs[i] << std::endl;
331 |           }
332 |         }
333 |       }
334 |       MPI_Barrier(MPI_COMM_WORLD);
335 |     }
336 |   }
337 | 
338 |   thrust::host_vector<ElemType> run_allreduce(
339 |       const std::string &algorithm_type,
340 |       const thrust::host_vector<ElemType> &sendbuf) {
341 |     thrust::host_vector<ElemType> recvbuf(num_elems_);
342 |     thrust::device_vector<ElemType> recvbuf_d(num_elems_);
343 |     thrust::device_vector<ElemType> sendbuf_d(num_elems_);
344 | 
345 |     sendbuf_d = sendbuf;
346 | 
347 |     ibcomm_.allreduce(algorithm_type, sendbuf_d, &recvbuf_d);
348 | 
349 |     recvbuf = recvbuf_d;
350 |     return recvbuf;
351 |   }
352 | 
353 |  public:
354 |   explicit AllreduceTester(MPI_Comm comm) : comm_(comm), ibcomm_(comm_) {
355 |     MPI_Comm_size(comm_, &mpi_size_);
356 |     MPI_Comm_rank(comm_, &mpi_rank_);
357 |   }
358 | 
359 |   // Check allreduce
360 |   int run(int argc, char **argv) {
361 |     std::string algorithm_type, init_expr_str, check_expr_str;
362 |     std::tie(algorithm_type, array_nbytes_, init_expr_str, check_expr_str) =
363 |         parse_args(argc, argv);
364 | 
365 |     std::vector<std::string> supported_algorithms = {"ring", "rabenseifner"};
366 | 
367 |     if (std::find(supported_algorithms.begin(), supported_algorithms.end(),
368 |                   algorithm_type) == supported_algorithms.end()) {
369 |       std::stringstream ss;
370 |       ss << "Error: Unsupported algorithm " << algorithm_type << std::endl;
371 | 
372 |       ss << "Supported algorithms: ";
373 |       for (auto algo : supported_algorithms) {
374 |         ss << algo << ", ";
375 |       }
376 |       die(ss.str());
377 |     }
378 | 
379 |     if (array_nbytes_ < sizeof(ElemType)) {
380 |       if (mpi_rank_ == 0) {
381 |         std::cerr << "Warning: specified array size is "
382 |                   << "smaller than the element size(" << sizeof(ElemType)
383 |                   << "). "
384 |                   << "Ceiling it up to " << sizeof(ElemType) << " [bytes]"
385 |                   << std::endl;
386 |       }
387 |       array_nbytes_ = sizeof(ElemType);
388 |     }
389 | 
390 |     num_elems_ = array_nbytes_ / sizeof(ElemType);
391 |     var_np = mpi_size_;
392 |     var_p = mpi_rank_;
393 |     var_n = array_nbytes_ / sizeof(ElemType);
394 |     var_nb = array_nbytes_;
395 | 
396 |     thrust::host_vector<ElemType> sendbuf(num_elems_);
397 |     setup_sendbuf(&sendbuf, init_expr_str);
398 | 
399 |     auto recvbuf = run_allreduce(algorithm_type, sendbuf);
400 | 
401 |     bool check_ok;
402 |     std::vector<std::string> msgs;
403 |     std::tie(check_ok, msgs) = check_recvbuf(recvbuf, check_expr_str);
404 | 
405 |     report_errors(check_ok, msgs);
406 | 
407 |     int status_all = (check_ok ? 0 : 1);
408 |     MPI_Allreduce(&status_all, &status_all, 1, MPI_INT, MPI_SUM,
409 |                   MPI_COMM_WORLD);
410 | 
411 |     return status_all;
412 |   }
413 | };
414 | 
415 | int main(int argc, char **argv) {
416 |   using ElemType = int;
417 | 
418 |   MPI_Init(&argc, &argv);
419 | 
420 |   int ngpus = -1;
421 |   CUDACHECK(cudaGetDeviceCount(&ngpus));
422 | 
423 |   int intra_rank;
424 |   grumpi::Comm_local_rank(MPI_COMM_WORLD, &intra_rank);
425 | 
426 |   CUDACHECK(cudaSetDevice(intra_rank % ngpus));
427 | 
428 |   AllreduceTester<ElemType> tester(MPI_COMM_WORLD);
429 | 
430 |   int status = tester.run(argc, argv);
431 | 
432 |   MPI_Finalize();
433 | 
434 |   return status;
435 | }
436 | 


--------------------------------------------------------------------------------
/ibcomm/ibverbs_communicator.cpp:
--------------------------------------------------------------------------------
  1 | // Copyright (C) 2017-2018 by Preferred Networks, Inc. All right reserved.
  2 | 
  3 | #include "ibcomm/ibverbs_communicator.h"
  4 | 
  5 | #include <sys/time.h>
  6 | 
  7 | #include <cassert>
  8 | #include <cerrno>
  9 | #include <cstdlib>
 10 | 
 11 | #include <algorithm>
 12 | #include <fstream>
 13 | #include <iostream>
 14 | #include <sstream>
 15 | #include <utility>
 16 | 
 17 | #include "ibcomm/util.h"
 18 | 
 19 | #ifdef USE_CUDA
 20 | #include <cuda.h>
 21 | #include <cuda_runtime.h>
 22 | #endif
 23 | 
 24 | IBVerbsCommunicator::IBVerbsCommunicator() {}
 25 | IBVerbsCommunicator::IBVerbsCommunicator(int world_size) { Init(world_size); }
 26 | 
 27 | void IBVerbsCommunicator::Init(int world_size) {
 28 |   if (initialized_) {
 29 |     util::IbcommWarning(__FILE__, __LINE__,
 30 |                         "IBVerbsCommunicator is already initialized.");
 31 |     return;
 32 |   }
 33 | 
 34 |   int ret = ibv_fork_init();
 35 |   if (ret) {
 36 |     int errno_backup = errno;
 37 |     util::IbcommWarning(__FILE__, __LINE__, "Failure: ibv_fork_init (errno=%d)",
 38 |                         errno_backup);
 39 |   }
 40 | 
 41 |   int devices;
 42 |   dev_list_ = ibv_get_device_list(&devices);
 43 | 
 44 |   if (!dev_list_) {
 45 |     int errno_backup = errno;
 46 |     util::IbcommError(__FILE__, __LINE__,
 47 |                       util::IBCOMM_ERROR_CODE::IBVERBS_ERROR,
 48 |                       "Failure: ibv_get_device_list (errno=%d)", errno_backup);
 49 |   }
 50 | 
 51 |   for (int i = 0; i < devices; i++) {
 52 |     ibv_device* device = dev_list_[i];
 53 | 
 54 |     if (!device) {
 55 |       continue;
 56 |     }
 57 | 
 58 |     context_ = ibv_open_device(device);
 59 | 
 60 |     if (!context_) {
 61 |       continue;
 62 |     }
 63 |   }
 64 | 
 65 |   if (!context_) {
 66 |     util::IbcommError(__FILE__, __LINE__,
 67 |                       util::IBCOMM_ERROR_CODE::IBVERBS_ERROR,
 68 |                       "Failure: No HCA can use");
 69 |   }
 70 | 
 71 |   ret = ibv_query_port(context_, 1, &port_attr_);
 72 | 
 73 |   if (ret != 0 || port_attr_.lid == 0) {
 74 |     // error handling
 75 |     util::IbcommError(__FILE__, __LINE__,
 76 |                       util::IBCOMM_ERROR_CODE::IBVERBS_ERROR,
 77 |                       "Failure: ibv_query_port");
 78 |   }
 79 | 
 80 |   pd_ = ibv_alloc_pd(context_);
 81 | 
 82 |   if (!pd_) {
 83 |     util::IbcommError(__FILE__, __LINE__,
 84 |                       util::IBCOMM_ERROR_CODE::IBVERBS_ERROR,
 85 |                       "Failure: ibv_alloc_pd");
 86 |   }
 87 | 
 88 |   world_size_ = world_size;
 89 |   pq_world_ = std::vector<ProcessQueue>(world_size_);
 90 |   psn_world_ = std::vector<uint32_t>(world_size_);
 91 |   mr_world_ = std::vector<std::pair<struct ibv_mr*, struct ibv_mr*>>(
 92 |       world_size_, std::pair<struct ibv_mr*, struct ibv_mr*>(NULL, NULL));
 93 | 
 94 | #ifdef USE_CUDA
 95 |   PrepareMemoryPool();
 96 | #endif
 97 | 
 98 |   initialized_ = true;
 99 | }
100 | 
101 | IBVerbsCommunicator::~IBVerbsCommunicator() {
102 |   // release queues
103 |   for (size_t i = 0; i < pq_world_.size(); i++) {
104 |     if (pq_world_[i].queue_pair != NULL)
105 |       ibv_destroy_qp(pq_world_[i].queue_pair);
106 | 
107 |     if (pq_world_[i].recv_complete_queue != NULL)
108 |       ibv_destroy_cq(pq_world_[i].recv_complete_queue);
109 | 
110 |     if (pq_world_[i].send_complete_queue != NULL)
111 |       ibv_destroy_cq(pq_world_[i].send_complete_queue);
112 |   }
113 | 
114 |   // release memory region which is nonblocking-io but not freed.
115 |   for (size_t i = 0; i < mr_world_.size(); i++) {
116 |     // send
117 |     if (mr_world_[i].first != NULL) {
118 |       ibv_dereg_mr(mr_world_[i].first);
119 |       mr_world_[i].first = NULL;
120 |     }
121 | 
122 |     // recv
123 |     if (mr_world_[i].second != NULL) {
124 |       ibv_dereg_mr(mr_world_[i].second);
125 |       mr_world_[i].second = NULL;
126 |     }
127 |   }
128 | 
129 | #ifdef USE_CUDA
130 |   pool_.reset();
131 | #endif
132 | 
133 |   if (pd_ != NULL) {
134 |     ibv_dealloc_pd(pd_);
135 |   }
136 | 
137 |   if (context_ != NULL) {
138 |     ibv_close_device(context_);
139 |   }
140 | 
141 |   if (dev_list_ != NULL) {
142 |     ibv_free_device_list(dev_list_);
143 |   }
144 | 
145 | #ifdef USE_TRACE
146 |   DumpTrace();
147 | #endif
148 | 
149 | #ifdef USE_CUDA
150 |   if (tmp_gpu_buffer_ != NULL) {
151 |     CUDACHECK(cudaFree(tmp_gpu_buffer_));
152 |     tmp_gpu_buffer_ = NULL;
153 |   }
154 | #endif
155 | }
156 | 
157 | namespace {
158 | double timeDiffMillis(const struct timespec& t1, const struct timespec& t2) {
159 |   return (t2.tv_sec - t1.tv_sec) * 1e3 + (t2.tv_nsec - t1.tv_nsec) * 1e-6;
160 | }
161 | 
162 | void DumpTraceFromVector(std::ofstream& stream, struct timespec origin,
163 |                          const std::vector<struct timespec>& vector) {
164 |   for (int i = 0; i < vector.size(); i += 2) {
165 |     stream << timeDiffMillis(origin, vector[i]) << ",";
166 |     stream << timeDiffMillis(vector[i], vector[i + 1]) << ",";
167 |   }
168 | }
169 | }  // namespace
170 | 
171 | void IBVerbsCommunicator::DumpTrace() const {
172 |   std::stringstream ss;
173 |   const char* base = getenv("IBCOMM_TRACE_FILE");
174 |   base = base ? base : "ibcomm_trace";
175 |   ss << base << "_" << my_rank_ << ".dat";
176 |   std::ofstream trace_log;
177 |   trace_log.open(ss.str().c_str());
178 | 
179 |   if (!trace_log.good()) {
180 |     std::cerr << "ERROR: ofstream open failed" << std::endl;
181 |   } else {
182 |     trace_log << std::scientific;
183 | 
184 |     trace_log << "received,";
185 |     DumpTraceFromVector(trace_log, trace_start_, trace_received_);
186 |     trace_log << std::endl;
187 | 
188 |     trace_log << "reduced,";
189 |     DumpTraceFromVector(trace_log, trace_start_, trace_reduced_);
190 |     trace_log << std::endl;
191 | 
192 |     trace_log << "issue-send,";
193 |     DumpTraceFromVector(trace_log, trace_start_, trace_issue_send_);
194 |     trace_log << std::endl;
195 | 
196 |     trace_log << "issue-copy-kernel,";
197 |     DumpTraceFromVector(trace_log, trace_start_, trace_issue_copy_kernel_);
198 |     trace_log << std::endl;
199 | 
200 |     trace_log << "issue-redu-kernel,";
201 |     DumpTraceFromVector(trace_log, trace_start_, trace_issue_redu_kernel_);
202 |     trace_log << std::endl;
203 | 
204 |     trace_log << "issue-recv,";
205 |     DumpTraceFromVector(trace_log, trace_start_, trace_issue_recv_);
206 |     trace_log << std::endl;
207 | 
208 |     trace_log << "other,";
209 |     DumpTraceFromVector(trace_log, trace_start_, trace_other_);
210 |     trace_log << std::endl;
211 |   }
212 | }
213 | 
214 | void IBVerbsCommunicator::SetTimerBase() {
215 |   clock_gettime(CLOCK_MONOTONIC_RAW, &trace_start_);
216 | }
217 | 
218 | namespace {
219 | void modify_qp(struct ibv_qp* qp, uint32_t src_psn, uint16_t dest_lid,
220 |                uint32_t dest_pqn, uint32_t dest_psn) {
221 |   int ret;
222 | 
223 |   struct ibv_qp_attr init_attr = {};
224 |   init_attr.qp_state = IBV_QPS_INIT;
225 |   init_attr.port_num = 1;
226 |   init_attr.qp_access_flags = IBV_ACCESS_LOCAL_WRITE;
227 | 
228 |   ret = ibv_modify_qp(
229 |       qp, &init_attr,
230 |       IBV_QP_STATE | IBV_QP_PKEY_INDEX | IBV_QP_PORT | IBV_QP_ACCESS_FLAGS);
231 |   if (ret != 0) {
232 |     util::IbcommError(__FILE__, __LINE__,
233 |                       util::IBCOMM_ERROR_CODE::IBVERBS_ERROR,
234 |                       "Failure: ibv_modify_qp(1)");
235 |   }
236 | 
237 |   struct ibv_qp_attr rtr_attr = {};
238 |   rtr_attr.qp_state = IBV_QPS_RTR;
239 |   rtr_attr.path_mtu = IBV_MTU_4096;
240 |   rtr_attr.dest_qp_num = dest_pqn;
241 |   rtr_attr.rq_psn = dest_psn;
242 |   rtr_attr.max_dest_rd_atomic = 0;
243 | 
244 |   // retry_speed faster
245 |   rtr_attr.min_rnr_timer = 1;
246 | 
247 |   // retry_speed slower
248 |   // rtr_attr.min_rnr_timer = 0;
249 | 
250 |   rtr_attr.ah_attr.is_global = 0;
251 |   rtr_attr.ah_attr.dlid = dest_lid;
252 |   rtr_attr.ah_attr.sl = 0;
253 |   rtr_attr.ah_attr.src_path_bits = 0;
254 |   rtr_attr.ah_attr.port_num = 1;
255 | 
256 |   ret = ibv_modify_qp(qp, &rtr_attr,
257 |                       IBV_QP_STATE | IBV_QP_AV | IBV_QP_PATH_MTU |
258 |                           IBV_QP_DEST_QPN | IBV_QP_RQ_PSN |
259 |                           IBV_QP_MAX_DEST_RD_ATOMIC | IBV_QP_MIN_RNR_TIMER);
260 |   if (ret != 0) {
261 |     util::IbcommError(__FILE__, __LINE__,
262 |                       util::IBCOMM_ERROR_CODE::IBVERBS_ERROR,
263 |                       "Failure: ibv_modify_qp(2)");
264 |   }
265 | 
266 |   struct ibv_qp_attr rts_attr = {};
267 |   rts_attr.qp_state = IBV_QPS_RTS;
268 |   rts_attr.timeout = 1;
269 |   rts_attr.retry_cnt = 7;
270 |   rts_attr.rnr_retry = 7;
271 |   rts_attr.sq_psn = src_psn;
272 |   rts_attr.max_rd_atomic = 0;
273 | 
274 |   ret = ibv_modify_qp(qp, &rts_attr,
275 |                       IBV_QP_STATE | IBV_QP_TIMEOUT | IBV_QP_RETRY_CNT |
276 |                           IBV_QP_RNR_RETRY | IBV_QP_SQ_PSN |
277 |                           IBV_QP_MAX_QP_RD_ATOMIC);
278 |   if (ret != 0) {
279 |     util::IbcommError(__FILE__, __LINE__,
280 |                       util::IBCOMM_ERROR_CODE::IBVERBS_ERROR,
281 |                       "Failure: ibv_modify_qp(3)");
282 |   }
283 | }
284 | }  // namespace
285 | 
286 | struct ProcessInfo IBVerbsCommunicator::RegisterProcess(
287 |     int dest_rank, struct ProcessInfo pinfo) {
288 |   struct ProcessInfo my_pinfo = CreateQueuePair(dest_rank);
289 | 
290 |   modify_qp(pq_world_[dest_rank].queue_pair, psn_world_[dest_rank], pinfo.lid,
291 |             pinfo.qp_n, pinfo.psn);
292 | 
293 |   return my_pinfo;
294 | }
295 | 
296 | struct ProcessInfo IBVerbsCommunicator::CreateQueuePair(int dest_rank) {
297 |   ibv_cq *send_complete_queue, *recv_complete_queue;
298 | 
299 |   send_complete_queue = ibv_create_cq(context_, 1024 * 1024, NULL, NULL, 0);
300 |   recv_complete_queue = ibv_create_cq(context_, 1024 * 1024, NULL, NULL, 0);
301 | 
302 |   if (!send_complete_queue) {
303 |     util::IbcommError(__FILE__, __LINE__,
304 |                       util::IBCOMM_ERROR_CODE::IBVERBS_ERROR,
305 |                       "Failure: ibv_create_cq of send cq");
306 |   }
307 | 
308 |   if (!recv_complete_queue) {
309 |     util::IbcommError(__FILE__, __LINE__,
310 |                       util::IBCOMM_ERROR_CODE::IBVERBS_ERROR,
311 |                       "Failure: ibv_create_cq of recv cq");
312 |   }
313 | 
314 |   uint32_t my_psn = random() % 0xFFFFFF;
315 |   psn_world_[dest_rank] = my_psn;
316 | 
317 |   struct ibv_qp_init_attr qp_init_attr = {};
318 |   qp_init_attr.qp_type = IBV_QPT_RC;
319 |   qp_init_attr.send_cq = send_complete_queue;
320 |   qp_init_attr.recv_cq = recv_complete_queue;
321 |   qp_init_attr.cap.max_send_wr = 8192;
322 |   qp_init_attr.cap.max_recv_wr = 8192;
323 |   qp_init_attr.cap.max_send_sge = 1;
324 |   qp_init_attr.cap.max_recv_sge = 1;
325 |   qp_init_attr.sq_sig_all = 1;
326 | 
327 |   struct ibv_qp* queue_pair;
328 |   queue_pair = ibv_create_qp(pd_, &qp_init_attr);
329 | 
330 |   if (!queue_pair) {
331 |     util::IbcommError(__FILE__, __LINE__,
332 |                       util::IBCOMM_ERROR_CODE::IBVERBS_ERROR,
333 |                       "Failure: ibv_create_cq");
334 |   }
335 | 
336 |   pq_world_[dest_rank] =
337 |       ProcessQueue(send_complete_queue, recv_complete_queue, queue_pair);
338 | 
339 |   struct ProcessInfo my_pinfo = {};
340 |   my_pinfo.lid = port_attr_.lid;
341 |   my_pinfo.psn = my_psn;
342 |   my_pinfo.qp_n = queue_pair->qp_num;
343 | 
344 |   return my_pinfo;
345 | }
346 | 
347 | void IBVerbsCommunicator::RegisterQueuePair(int dest_rank,
348 |                                             struct ProcessInfo pinfo) {
349 |   const auto& pqueue = pq_world_[dest_rank];
350 | 
351 |   modify_qp(pqueue.queue_pair, psn_world_[dest_rank], pinfo.lid, pinfo.qp_n,
352 |             pinfo.psn);
353 | }
354 | 
355 | void IBVerbsCommunicator::RegisterMyself(int my_rank) {
356 |   this->my_rank_ = my_rank;
357 | }
358 | 
359 | struct ibv_mr* IBVerbsCommunicator::RegisterSendBuf(const void* buf,
360 |                                                     size_t len) {
361 |   struct ibv_mr* mr_buf = ibv_reg_mr(pd_, const_cast<void*>(buf), len, 0);
362 |   if (mr_buf == 0) {
363 |     util::IbcommError(__FILE__, __LINE__,
364 |                       util::IBCOMM_ERROR_CODE::IBVERBS_ERROR,
365 |                       "Failure: ibv_reg_mr on send");
366 |   }
367 | 
368 |   return mr_buf;
369 | }
370 | 
371 | void IBVerbsCommunicator::Send(int dest_rank, const void* buf, size_t len,
372 |                                bool blocking) {
373 |   auto& save = mr_world_[dest_rank].first;
374 | 
375 |   if (save != NULL) {
376 |     util::IbcommError(__FILE__, __LINE__,
377 |                       util::IBCOMM_ERROR_CODE::NOT_SUPPORTED,
378 |                       "SendWait must be called before next non-blocking send.");
379 |   }
380 | 
381 |   save = RegisterSendBuf(buf, len);
382 | 
383 |   SendRegistered(dest_rank, buf, save, len, blocking);
384 | }
385 | 
386 | void IBVerbsCommunicator::SendRegistered(int dest_rank, const void* buf,
387 |                                          struct ibv_mr* mr_buf, size_t len,
388 |                                          bool blocking) {
389 |   int ret;
390 |   struct ibv_sge sge = {};
391 |   sge.addr = (uint64_t)(uintptr_t)buf;
392 |   sge.length = len;
393 |   sge.lkey = mr_buf->lkey;
394 | 
395 |   struct ibv_send_wr send_wr = {};
396 |   send_wr.wr_id = (uint64_t)(uintptr_t)buf;
397 |   send_wr.sg_list = &sge;
398 |   send_wr.num_sge = 1;
399 |   send_wr.opcode = IBV_WR_SEND;
400 | 
401 |   const auto& pq = pq_world_[dest_rank];
402 | 
403 |   struct ibv_send_wr* bad_wr;
404 |   ret = ibv_post_send(pq.queue_pair, &send_wr, &bad_wr);
405 |   if (ret != 0) {
406 |     util::IbcommError(__FILE__, __LINE__,
407 |                       util::IBCOMM_ERROR_CODE::IBVERBS_ERROR,
408 |                       "Failure: ibv_post_send");
409 |   }
410 | 
411 |   if (blocking) {
412 |     SendWait(dest_rank);
413 |   }
414 | }
415 | 
416 | bool IBVerbsCommunicator::SendPoll(int dest_rank) {
417 |   int ret;
418 |   const auto& pq = pq_world_[dest_rank];
419 |   struct ibv_wc wc = {};
420 |   bool ok = false;
421 | 
422 |   ret = ibv_poll_cq(pq.send_complete_queue, 1, &wc);
423 |   if (ret == 0) return false;
424 | 
425 |   if (ret < 0) {
426 |     util::IbcommError(__FILE__, __LINE__,
427 |                       util::IBCOMM_ERROR_CODE::IBVERBS_ERROR,
428 |                       "Failure: ibv_poll_cq");
429 |   }
430 | 
431 |   if (wc.status != IBV_WC_SUCCESS && wc.status == IBV_WC_LOC_PROT_ERR) {
432 |     util::IbcommError(__FILE__, __LINE__,
433 |                       util::IBCOMM_ERROR_CODE::IBVERBS_ERROR,
434 |                       "Failure: send completion error %d", wc.status);
435 |   }
436 | 
437 |   switch (wc.opcode) {
438 |     case IBV_WC_SEND:
439 |       ok = true;
440 | 
441 |       break;
442 | 
443 |     default:
444 |       util::IbcommError(__FILE__, __LINE__,
445 |                         util::IBCOMM_ERROR_CODE::IBVERBS_ERROR,
446 |                         "Failure: SendPoll %d", wc.opcode);
447 |   }
448 | 
449 |   if (ok) {
450 |     // unregister memory region from non-blocking io wait list
451 |     auto& save = mr_world_[dest_rank].first;
452 |     if (save != NULL) {
453 |       ibv_dereg_mr(save);
454 |       save = NULL;
455 |     }
456 | 
457 |     return true;
458 |   }
459 | 
460 |   return false;
461 | }
462 | 
463 | void IBVerbsCommunicator::SendWait(int dest_rank) {
464 |   while (!SendPoll(dest_rank)) {
465 |   }
466 | 
467 |   // unregister memory region from non-blocking io wait list
468 |   auto& save = mr_world_[dest_rank].first;
469 |   if (save != NULL) {
470 |     ibv_dereg_mr(save);
471 |     save = NULL;
472 |   }
473 | }
474 | 
475 | struct ibv_mr* IBVerbsCommunicator::RegisterRecvBuf(void* buf, size_t len) {
476 |   struct ibv_mr* mr_buf = ibv_reg_mr(pd_, buf, len, IBV_ACCESS_LOCAL_WRITE);
477 |   if (mr_buf == 0) {
478 |     util::IbcommError(__FILE__, __LINE__,
479 |                       util::IBCOMM_ERROR_CODE::IBVERBS_ERROR,
480 |                       "Failure: ibv_reg_mr on recv");
481 |   }
482 | 
483 |   return mr_buf;
484 | }
485 | 
486 | void IBVerbsCommunicator::Recv(int src_rank, void* buf, size_t len,
487 |                                bool blocking) {
488 |   auto& save = mr_world_[src_rank].second;
489 | 
490 |   if (save != NULL) {
491 |     util::IbcommError(__FILE__, __LINE__,
492 |                       util::IBCOMM_ERROR_CODE::NOT_SUPPORTED,
493 |                       "RecvWait must be called before next non-blocking send.");
494 |   }
495 | 
496 |   save = RegisterRecvBuf(buf, len);
497 | 
498 |   RecvRegistered(src_rank, buf, save, len, blocking);
499 | }
500 | 
501 | void IBVerbsCommunicator::RecvRegistered(int src_rank, const void* buf,
502 |                                          struct ibv_mr* mr_buf, size_t len,
503 |                                          bool blocking) {
504 |   struct ibv_sge sge = {};
505 |   sge.addr = (uint64_t)(uintptr_t)buf;
506 |   sge.length = len;
507 |   sge.lkey = mr_buf->lkey;
508 | 
509 |   struct ibv_recv_wr recv_wr = {};
510 |   recv_wr.wr_id = (uint64_t)(uintptr_t)buf;
511 |   recv_wr.sg_list = &sge;
512 |   recv_wr.num_sge = 1;
513 | 
514 |   const auto& pq = pq_world_[src_rank];
515 | 
516 |   struct ibv_recv_wr* bad_wr;
517 |   int ret = ibv_post_recv(pq.queue_pair, &recv_wr, &bad_wr);
518 |   if (ret != 0) {
519 |     util::IbcommError(__FILE__, __LINE__,
520 |                       util::IBCOMM_ERROR_CODE::IBVERBS_ERROR,
521 |                       "Failure: ibv_post_recv");
522 |   }
523 | 
524 |   if (blocking) {
525 |     RecvWait(src_rank);
526 |   }
527 | }
528 | 
529 | bool IBVerbsCommunicator::RecvPoll(int src_rank) {
530 |   int ret;
531 |   const auto& pq = pq_world_[src_rank];
532 |   struct ibv_wc wc = {};
533 |   bool ok = false;
534 | 
535 |   ret = ibv_poll_cq(pq.recv_complete_queue, 1, &wc);
536 |   if (ret == 0) return false;
537 | 
538 |   if (ret < 0) {
539 |     util::IbcommError(__FILE__, __LINE__,
540 |                       util::IBCOMM_ERROR_CODE::IBVERBS_ERROR,
541 |                       "Failure: ibv_poll_cq");
542 |   }
543 | 
544 |   if (wc.status != IBV_WC_SUCCESS && wc.status == IBV_WC_LOC_PROT_ERR) {
545 |     util::IbcommError(__FILE__, __LINE__,
546 |                       util::IBCOMM_ERROR_CODE::IBVERBS_ERROR,
547 |                       "Failure: recv completion error %d", wc.status);
548 |   }
549 | 
550 |   switch (wc.opcode) {
551 |     case IBV_WC_RECV:
552 |       ok = true;
553 | 
554 |       break;
555 | 
556 |     default:
557 |       util::IbcommError(__FILE__, __LINE__,
558 |                         util::IBCOMM_ERROR_CODE::IBVERBS_ERROR,
559 |                         "Failure: RecvPoll %d", wc.opcode);
560 |   }
561 | 
562 |   if (ok) {
563 |     // unregister memory region from non-blocking io wait list
564 |     auto& save = mr_world_[src_rank].second;
565 |     if (save != NULL) {
566 |       ibv_dereg_mr(save);
567 |       save = NULL;
568 |     }
569 |     return true;
570 |   }
571 |   return false;
572 | }
573 | 
574 | void IBVerbsCommunicator::RecvWait(int src_rank) {
575 |   while (!RecvPoll(src_rank)) {
576 |   }
577 | 
578 |   // unregister memory region from non-blocking io wait list
579 |   auto& save = mr_world_[src_rank].second;
580 |   if (save != NULL) {
581 |     ibv_dereg_mr(save);
582 |     save = NULL;
583 |   }
584 | }
585 | 
586 | void IBVerbsCommunicator::Bcast(void* buf, size_t len, int root) {
587 |   // This function provides naive Bcast;
588 | 
589 |   if (my_rank_ == root) {
590 |     // Bcast root
591 |     for (size_t i = 0; i < pq_world_.size(); i++) {
592 |       if (static_cast<int>(i) == my_rank_) continue;
593 | 
594 |       Send(i, buf, len, false);
595 |     }
596 | 
597 |     for (size_t i = 0; i < pq_world_.size(); i++) {
598 |       if (static_cast<int>(i) == my_rank_) continue;
599 | 
600 |       SendWait(i);
601 |     }
602 |   } else {
603 |     // Bcast non-root
604 |     Recv(root, buf, len);
605 |   }
606 | }
607 | 
608 | void IBVerbsCommunicator::PopMrAndDereg(std::queue<struct ibv_mr*>* q) {
609 |   if (q == nullptr)
610 |     util::IbcommError(__FILE__, __LINE__,
611 |                       util::IBCOMM_ERROR_CODE::INVALID_ARGUMENT,
612 |                       "q is nullptr.");
613 | 
614 |   ibv_dereg_mr(q->front());
615 |   q->pop();
616 | }
617 | 
618 | namespace {
619 | int ReadChunkSize() {
620 |   const char* size = getenv("IBCOMM_CHUNKSIZE");
621 | 
622 |   if (size != NULL) {
623 |     int size_int = atoi(size);
624 | 
625 |     if (size_int <= 0)
626 |       util::IbcommError(__FILE__, __LINE__,
627 |                         util::IBCOMM_ERROR_CODE::INVALID_ARGUMENT,
628 |                         "IBCOMM_CHUNKSIZE must be greater than 1");
629 | 
630 |     return size_int;
631 |   }
632 | 
633 |   return -1;  // use default size
634 | }
635 | };  // namespace
636 | 
637 | std::vector<std::pair<size_t, size_t>> IBVerbsCommunicator::SplitBuffer(
638 |     size_t len_elements, size_t sizeof_element) {
639 |   int chunks;
640 |   size_t elements_per_chunk;
641 | 
642 |   if (len_elements < world_size_) {
643 |     util::IbcommError(
644 |         __FILE__, __LINE__, util::IBCOMM_ERROR_CODE::NOT_SUPPORTED,
645 |         "Input vector is too short for current Allreduce algorithm.\n"
646 |         "Incrase the number of the input vector to be larger than number of "
647 |         "processes.\n");
648 |   }
649 | 
650 |   int env_chunk_bytes = ReadChunkSize();
651 |   if (env_chunk_bytes != -1) {
652 |     // chunk_size is selected manually.
653 |     if (env_chunk_bytes % sizeof_element != 0) {
654 |       util::IbcommError(
655 |           __FILE__, __LINE__, util::IBCOMM_ERROR_CODE::INVALID_ARGUMENT,
656 |           "Selected `IBCOMM_CHUNKSIZE` is not divisible by `sizeof(T)`.");
657 |     }
658 | 
659 |     elements_per_chunk = env_chunk_bytes / sizeof_element;
660 |     chunks = util::ceilDiv((size_t)len_elements, (size_t)elements_per_chunk);
661 | 
662 |     if (chunks < 2 * world_size_) {
663 |       util::IbcommError(__FILE__, __LINE__,
664 |                         util::IBCOMM_ERROR_CODE::INVALID_ARGUMENT,
665 |                         "Selected `IBCOMM_CHUNKSIZE` is too large.\n"
666 |                         "Satisfy 2 * `world_size` <= `allreduce_bufsize` / "
667 |                         "`IBCOMM_CHUNKSIZE`.");
668 |     }
669 |   } else {
670 |     chunks = 4 * world_size_;
671 |     elements_per_chunk = util::ceilDiv((size_t)len_elements, (size_t)chunks);
672 |   }
673 | 
674 |   std::vector<std::pair<size_t, size_t>> ranges;
675 |   for (auto i = 0; i < chunks; i++) {
676 |     int start_index = elements_per_chunk * i;
677 |     int end_index = std::min(len_elements, elements_per_chunk * (i + 1));
678 | 
679 |     if (start_index < end_index) ranges.emplace_back(start_index, end_index);
680 |   }
681 | 
682 |   return ranges;
683 | }
684 | 
685 | std::vector<std::vector<int>> IBVerbsCommunicator::GetRankToChunk(
686 |     const std::vector<std::pair<size_t, size_t>>& ranges) {
687 |   std::vector<std::vector<int>> rank_to_chunk(world_size_);
688 |   size_t chunks_per_rank = ranges.size() / world_size_;
689 |   size_t chunks_per_rank_remainer = ranges.size() % world_size_;
690 |   int chunk_id = 0;
691 | 
692 |   for (int i = 0; i < world_size_; i++) {
693 |     for (int j = 0; j < chunks_per_rank; j++) {
694 |       rank_to_chunk[i].push_back(chunk_id);
695 |       chunk_id++;
696 |     }
697 | 
698 |     if (i < chunks_per_rank_remainer) {
699 |       rank_to_chunk[i].push_back(chunk_id);
700 |       chunk_id++;
701 |     }
702 |   }
703 | 
704 |   assert(chunk_id == ranges.size());
705 | 
706 |   return rank_to_chunk;
707 | }
708 | 


--------------------------------------------------------------------------------
/ibcomm/ibverbs_communicator.h:
--------------------------------------------------------------------------------
  1 | // Copyright (C) 2017-2018 by Preferred Networks, Inc. All right reserved.
  2 | 
  3 | #pragma once
  4 | 
  5 | #include <infiniband/verbs.h>
  6 | 
  7 | #include <cstdint>
  8 | #include <ctime>
  9 | 
 10 | #include <queue>
 11 | #include <utility>
 12 | #include <vector>
 13 | 
 14 | #ifdef USE_CUDA
 15 | #include <cuda.h>
 16 | #include <cuda_runtime.h>
 17 | 
 18 | #include "ibcomm/memory_pool.h"
 19 | 
 20 | template <class MemoryAllocator>
 21 | class MemoryPool;
 22 | #endif
 23 | 
 24 | struct ProcessInfo {
 25 |   uint16_t lid;
 26 |   uint32_t qp_n;
 27 |   uint32_t psn;
 28 | };
 29 | 
 30 | struct ProcessQueue {
 31 |   struct ibv_cq* send_complete_queue;
 32 |   struct ibv_cq* recv_complete_queue;
 33 |   struct ibv_qp* queue_pair;
 34 | 
 35 |   ProcessQueue() {
 36 |     send_complete_queue = NULL;
 37 |     recv_complete_queue = NULL;
 38 |     queue_pair = NULL;
 39 |   }
 40 | 
 41 |   ProcessQueue(struct ibv_cq* scq, struct ibv_cq* rcq, struct ibv_qp* qp) {
 42 |     send_complete_queue = scq;
 43 |     recv_complete_queue = rcq;
 44 |     queue_pair = qp;
 45 |   }
 46 | 
 47 |   // copy
 48 |   ProcessQueue(const ProcessQueue&) = delete;
 49 |   ProcessQueue& operator=(const ProcessQueue&) = delete;
 50 | 
 51 |   // move
 52 |   // queues' are managed by IBVerbsCommunicator.
 53 |   ProcessQueue(ProcessQueue&&) noexcept = default;
 54 |   ProcessQueue& operator=(ProcessQueue&&) noexcept = default;
 55 | };
 56 | 
 57 | class Memory;
 58 | 
 59 | class IBVerbsCommunicator {
 60 |   // to export registerSendBuf, registerRecvBuf
 61 |   friend class MemoryBlock;
 62 | 
 63 |  public:
 64 |   // ctor
 65 |   IBVerbsCommunicator();
 66 |   explicit IBVerbsCommunicator(int world_size);
 67 | 
 68 |   // Manages infiniband-related resources thus we need to delete copy and move
 69 |   // ctors. copy
 70 |   IBVerbsCommunicator(const IBVerbsCommunicator&) noexcept = delete;
 71 |   IBVerbsCommunicator& operator=(const IBVerbsCommunicator&) noexcept = delete;
 72 | 
 73 |   // move
 74 |   IBVerbsCommunicator(IBVerbsCommunicator&&) noexcept = delete;
 75 |   IBVerbsCommunicator& operator=(IBVerbsCommunicator&&) noexcept = delete;
 76 | 
 77 |   // dtor
 78 |   ~IBVerbsCommunicator();
 79 | 
 80 |   // init
 81 |   void Init(int world_size);
 82 | 
 83 |   // connection management
 84 |   struct ProcessInfo RegisterProcess(int dest_rank, struct ProcessInfo pinfo);
 85 |   struct ProcessInfo CreateQueuePair(int dest_rank);
 86 |   void RegisterQueuePair(int dest_rank, struct ProcessInfo pinfo);
 87 |   void RegisterMyself(int my_rank);
 88 | 
 89 |   // send
 90 |   void Send(int dest_rank, const void* buf, size_t len, bool blocking = true);
 91 | 
 92 |   // recv
 93 |   void Recv(int src_rank, void* buf, size_t len, bool blocking = true);
 94 | 
 95 |   // wait ( for non-blocking io )
 96 |   bool SendPoll(int dest_rank);
 97 |   bool RecvPoll(int src_rank);
 98 |   void SendWait(int dest_rank);
 99 |   void RecvWait(int src_rank);
100 | 
101 |   // allreduce
102 |   template <typename T>
103 |   void AllreduceRing(const T* sendbuf, T* recvbuf, size_t len_elements);
104 | 
105 |   template <typename T>
106 |   void AllreduceRabenseifner(const T* sendbuf, T* recvbuf, size_t len_elements);
107 | 
108 | #ifdef USE_CUDA
109 |   template <typename T>
110 |   void AllreduceRingCuda(const T* sendbuf, T* recvbuf, size_t len_elements);
111 | 
112 |   template <typename T>
113 |   void AllreduceRabenseifnerCuda(const T* sendbuf, T* recvbuf,
114 |                                  size_t len_elements);
115 | 
116 |   void PrepareMemoryPool();
117 | #endif
118 | 
119 |   // bcast
120 |   void Bcast(void* buf, size_t len, int root);
121 | 
122 |   void SetTimerBase();
123 |   void DumpTrace() const;
124 | 
125 |  private:
126 |   bool initialized_ = false;
127 |   struct ibv_port_attr port_attr_ = {};
128 |   std::vector<ProcessQueue> pq_world_;
129 |   std::vector<uint32_t> psn_world_;
130 |   std::vector<std::pair<struct ibv_mr*, struct ibv_mr*>> mr_world_;
131 | 
132 |   struct ibv_mr* RegisterSendBuf(const void* buf, size_t len);
133 |   void SendRegistered(int dest_rank, const void* buf, struct ibv_mr* mr_buf,
134 |                       size_t len, bool blocking = true);
135 | 
136 |   struct ibv_mr* RegisterRecvBuf(void* buf, size_t len);
137 |   void RecvRegistered(int src_rank, const void* buf, struct ibv_mr* mr_buf,
138 |                       size_t len, bool blocking = true);
139 | 
140 |   void PopMrAndDereg(std::queue<struct ibv_mr*>* q);
141 | 
142 | #ifdef USE_CUDA
143 |   std::unique_ptr<MemoryPool<ConstantMemoryAllocator>> pool_;
144 | 
145 |   void* tmp_gpu_buffer_ = NULL;
146 |   size_t tmp_gpu_buffer_size_ = 0;
147 | #endif
148 | 
149 |   // need destruction variables
150 |   struct ibv_device** dev_list_ = NULL;
151 |   struct ibv_context* context_ = NULL;
152 | 
153 |   // Protection Domain
154 |   struct ibv_pd* pd_ = NULL;
155 | 
156 |   // local communication
157 |   int my_rank_ = -1;
158 |   size_t world_size_;
159 | 
160 |   // allreduce range func
161 |   // Splits buffer based given chunk size
162 |   std::vector<std::pair<size_t, size_t>> SplitBuffer(size_t len_elements,
163 |                                                      size_t len_per_element);
164 |   // Defines map (rank |-> chunk_ids)
165 |   std::vector<std::vector<int>> GetRankToChunk(
166 |       const std::vector<std::pair<size_t, size_t>>& ranges);
167 | 
168 |   struct timespec trace_start_;
169 | 
170 |   // receive is completed
171 |   std::vector<struct timespec> trace_received_;
172 | 
173 |   // reduction is completed
174 |   std::vector<struct timespec> trace_reduced_;
175 | 
176 |   // issue send
177 |   std::vector<struct timespec> trace_issue_send_;
178 | 
179 |   // issue copy-kernel call
180 |   std::vector<struct timespec> trace_issue_copy_kernel_;
181 | 
182 |   // issue reduce-kernel call
183 |   std::vector<struct timespec> trace_issue_redu_kernel_;
184 | 
185 |   // issue recv
186 |   std::vector<struct timespec> trace_issue_recv_;
187 | 
188 |   // others
189 |   std::vector<struct timespec> trace_other_;
190 | };
191 | 
192 | #include "ibcomm/allreduce_cpu_impl.h"
193 | #include "ibcomm/allreduce_cuda_impl.h"
194 | 


--------------------------------------------------------------------------------
/ibcomm/ibverbs_communicator_cuda.cpp:
--------------------------------------------------------------------------------
 1 | // Copyright (C) 2017-2018 by Preferred Networks, Inc. All right reserved.
 2 | 
 3 | #ifdef USE_CUDA
 4 | #include "ibcomm/ibverbs_communicator.h"
 5 | 
 6 | #include <cuda.h>
 7 | #include <cuda_runtime.h>
 8 | 
 9 | #include "ibcomm/memory_pool.h"
10 | 
11 | namespace {
12 | int ReadWorkGpuMemorySize() {
13 |   const char* size = getenv("IBCOMM_WORK_GPU_MEMORY_SIZE");
14 | 
15 |   if (size != NULL) {
16 |     int size_int = atoi(size);
17 | 
18 |     return size_int;
19 |   }
20 | 
21 |   return -1;  // use default size
22 | }
23 | 
24 | };  // namespace
25 | 
26 | void IBVerbsCommunicator::PrepareMemoryPool() {
27 |   pool_.reset(new MemoryPool<ConstantMemoryAllocator>(this));
28 | 
29 |   tmp_gpu_buffer_size_ = ReadWorkGpuMemorySize();
30 | 
31 |   if (tmp_gpu_buffer_size_ == -1) {
32 |     tmp_gpu_buffer_size_ = 32 * 1024 * 1024;
33 |   }
34 | 
35 |   CUDACHECK(
36 |       cudaMalloc(static_cast<void**>(&tmp_gpu_buffer_), tmp_gpu_buffer_size_));
37 | }
38 | 
39 | #endif
40 | 


--------------------------------------------------------------------------------
/ibcomm/memory_pool.cpp:
--------------------------------------------------------------------------------
 1 | // Copyright (C) 2017-2018 by Preferred Networks, Inc. All right reserved.
 2 | 
 3 | #include "ibcomm/memory_pool.h"
 4 | 
 5 | #include "ibcomm/ibverbs_communicator.h"
 6 | #include "ibcomm/util.h"
 7 | 
 8 | #ifdef USE_CUDA
 9 | 
10 | // ~~~ Memory class ~~~ //
11 | Memory::Memory(MemoryBlock* block, size_t offset)
12 |     : block_(*block), offset_(offset) {
13 |   if (block == nullptr)
14 |     util::IbcommError(__FILE__, __LINE__,
15 |                       util::IBCOMM_ERROR_CODE::INVALID_ARGUMENT,
16 |                       "block is nullptr.");
17 | }
18 | 
19 | Memory* Memory::SetStream(cudaStream_t stream) {
20 |   stream_ = stream;
21 |   return this;
22 | }
23 | 
24 | Memory* Memory::UnsetStream() { return SetStream(NULL); }
25 | 
26 | // ~~~ MemoryBlock class ~~~ //
27 | MemoryBlock::MemoryBlock(size_t size, IBVerbsCommunicator* comm)
28 |     : comm_(*comm), length_(size) {
29 |   if (comm == nullptr)
30 |     util::IbcommError(__FILE__, __LINE__,
31 |                       util::IBCOMM_ERROR_CODE::INVALID_ARGUMENT,
32 |                       "comm is nullptr.");
33 | 
34 |   // not thread safe
35 |   CUDACHECK(cudaHostAlloc(&ptr_, length_, cudaHostAllocDefault));
36 |   mr_ = comm->RegisterRecvBuf(ptr_, length_);
37 | }
38 | 
39 | MemoryBlock::~MemoryBlock() {
40 |   ibv_dereg_mr(mr_);
41 |   CUDACHECK(cudaFreeHost(ptr_));
42 | }
43 | 
44 | ConstantMemoryAllocator::ConstantMemoryAllocator(size_t initial_size,
45 |                                                  IBVerbsCommunicator* comm)
46 |     : size_(initial_size), comm_(*comm) {
47 |   if (comm == nullptr)
48 |     util::IbcommError(__FILE__, __LINE__,
49 |                       util::IBCOMM_ERROR_CODE::INVALID_ARGUMENT,
50 |                       "comm is nullptr.");
51 | }
52 | 
53 | std::unique_ptr<MemoryBlock> ConstantMemoryAllocator::Allocate() {
54 |   return std::unique_ptr<MemoryBlock>(new MemoryBlock(size_, &comm_));
55 | }
56 | 
57 | #endif
58 | 


--------------------------------------------------------------------------------
/ibcomm/memory_pool.h:
--------------------------------------------------------------------------------
  1 | // Copyright (C) 2017-2018 by Preferred Networks, Inc. All right reserved.
  2 | 
  3 | #pragma once
  4 | 
  5 | #ifdef USE_CUDA
  6 | 
  7 | #include <cuda.h>
  8 | #include <cuda_runtime.h>
  9 | 
 10 | #include <memory>
 11 | #include <queue>
 12 | #include <vector>
 13 | 
 14 | #include "ibcomm/util.h"
 15 | 
 16 | class IBVerbsCommunicator;
 17 | 
 18 | /* Concept:
 19 |  * MemoryPool : manages (raw-) `MemoryBlock`. unit of `ibv_reg_mr`.
 20 |  * MemoryController : manages `Memory`. unit of `cudaStream`.
 21 |  * MemoryAllocator : allocates `MemoryBlock`.
 22 |  *  MemoryAllocator(size_t initial_size, IBVerbsCommunicator& comm);
 23 |  *  std::unique_ptr<MemoryBlock> Allocate();
 24 |  * MemoryBlock : memory.
 25 |  * Memory : chunk.
 26 |  */
 27 | 
 28 | class MemoryBlock {
 29 |  public:
 30 |   // ctor
 31 |   MemoryBlock(size_t size, IBVerbsCommunicator* comm);
 32 | 
 33 |   // Manages raw pointers thus we need to delete copy and move ctors.
 34 |   // copy
 35 |   MemoryBlock(const MemoryBlock&) noexcept = delete;
 36 |   MemoryBlock& operator=(const MemoryBlock&) noexcept = delete;
 37 | 
 38 |   // move
 39 |   MemoryBlock(MemoryBlock&&) noexcept = delete;
 40 |   MemoryBlock& operator=(MemoryBlock&&) noexcept = delete;
 41 | 
 42 |   ~MemoryBlock();
 43 | 
 44 |   inline void* ptr() { return ptr_; }
 45 |   inline size_t length() const { return length_; }
 46 |   inline struct ibv_mr* mr() { return mr_; }
 47 | 
 48 |  private:
 49 |   IBVerbsCommunicator& comm_;
 50 | 
 51 |   void* ptr_;
 52 |   size_t length_;
 53 |   struct ibv_mr* mr_;
 54 | };
 55 | 
 56 | class Memory {
 57 |  public:
 58 |   Memory(MemoryBlock* block, size_t offset);
 59 | 
 60 |   inline void* ptr() {
 61 |     return static_cast<void*>(static_cast<char*>(block_.ptr()) + offset_);
 62 |   }
 63 |   inline cudaStream_t stream() { return stream_; }
 64 |   inline struct ibv_mr* mr() { return block_.mr(); }
 65 |   Memory* SetStream(cudaStream_t stream);
 66 |   Memory* UnsetStream();
 67 | 
 68 |  private:
 69 |   MemoryBlock& block_;
 70 |   size_t offset_;
 71 |   cudaStream_t stream_;
 72 | };
 73 | 
 74 | template <class MemoryAllocator>
 75 | class MemoryController;
 76 | 
 77 | template <class MemoryAllocator>
 78 | class MemoryPool {
 79 |   friend class MemoryController<MemoryAllocator>;
 80 | 
 81 |  public:
 82 |   static constexpr int DefaultMaxNumCudaStream = 128;
 83 |   static constexpr int DefaultPreAllocSize = 64 * 1024 * 1024;  // 64 MB.
 84 | 
 85 |   // ctor
 86 |   explicit MemoryPool(IBVerbsCommunicator* comm)
 87 |       : comm_(*comm),
 88 |         cuda_streams_(ReadNumCudaStream()),
 89 |         allocator_(ReadPreAllocSize(), comm) {
 90 |     if (comm == nullptr)
 91 |       util::IbcommError(__FILE__, __LINE__,
 92 |                         util::IBCOMM_ERROR_CODE::INVALID_ARGUMENT,
 93 |                         "comm is nullptr.");
 94 | 
 95 |     for (auto& stream : cuda_streams_) {
 96 |       CUDACHECK(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
 97 |     }
 98 | 
 99 |     Allocate();
100 |   }
101 | 
102 |   // Manages cudaStream_t thus we need to delete copy and move ctors.
103 |   // copy
104 |   MemoryPool(const MemoryPool&) = delete;
105 |   MemoryPool& operator=(const MemoryPool&) = delete;
106 | 
107 |   // move
108 |   MemoryPool(MemoryPool&&) = delete;
109 |   MemoryPool& operator=(MemoryPool&&) = delete;
110 | 
111 |   MemoryController<MemoryAllocator> GetController(size_t chunk_size) {
112 |     if (controller_in_use_) {
113 |       util::IbcommError(__FILE__, __LINE__,
114 |                         util::IBCOMM_ERROR_CODE::NOT_SUPPORTED,
115 |                         "Currently, MemoryController is in use.");
116 |     }
117 | 
118 |     controller_in_use_ = true;
119 |     return MemoryController<MemoryAllocator>(this, chunk_size, cuda_streams_,
120 |                                              memory_blocks_);
121 |   }
122 | 
123 |   ~MemoryPool() {
124 |     for (auto& stream : cuda_streams_) {
125 |       cudaStreamDestroy(stream);
126 |     }
127 |   }
128 | 
129 |  private:
130 |   IBVerbsCommunicator& comm_;
131 | 
132 |   MemoryAllocator allocator_;
133 |   std::vector<std::unique_ptr<MemoryBlock>> memory_blocks_;
134 |   std::vector<cudaStream_t> cuda_streams_;
135 |   bool controller_in_use_ = false;
136 | 
137 |   // Read NumCudaStream from environmental variable.
138 |   // Returns the default size `DefaultMaxNumCudaStream` if it is not set.
139 |   int ReadNumCudaStream() {
140 |     const char* envvar = getenv("IBCOMM_NUM_CUDA_STREAM");
141 |     if (envvar) {
142 |       int n = atoi(envvar);
143 | 
144 |       if (n <= 0 || n > 1024) {
145 |         util::IbcommError(
146 |             __FILE__, __LINE__, util::IBCOMM_ERROR_CODE::INVALID_ARGUMENT,
147 |             "Invalid value for IBCOMM_NUM_CUDA_STREAM: %s", envvar);
148 |       }
149 | 
150 |       return n;
151 |     } else {
152 |       return DefaultMaxNumCudaStream;
153 |     }
154 |   }
155 | 
156 |   // Read PreAllocateSize from environmental variable.
157 |   // Returns the default size `DefaultPreAllocSize` if it is not set.
158 |   int ReadPreAllocSize() {
159 |     const char* envvar = getenv("IBCOMM_MEMORY_POOL_PRE_ALLOC");
160 |     if (envvar) {
161 |       int n = atoi(envvar);
162 | 
163 |       if (n < 4 || n > 1 * 1024 * 1024 * 1024) {
164 |         util::IbcommError(
165 |             __FILE__, __LINE__, util::IBCOMM_ERROR_CODE::INVALID_ARGUMENT,
166 |             "Invalid value for IBCOMM_MEMORY_POOL_PRE_ALLOC: %s", envvar);
167 |       }
168 | 
169 |       return n;
170 |     } else {
171 |       return DefaultPreAllocSize;
172 |     }
173 |   }
174 | 
175 |   void CompleteMemoryController() { controller_in_use_ = false; }
176 | 
177 |   std::unique_ptr<MemoryBlock>& Allocate() {
178 |     memory_blocks_.push_back(allocator_.Allocate());
179 |     return memory_blocks_.back();
180 |   }
181 | 
182 |   cudaStream_t AddCudaStream() {
183 |     cudaStream_t stream;
184 |     CUDACHECK(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
185 |     cuda_streams_.push_back(stream);
186 | 
187 |     return stream;
188 |   }
189 | };
190 | 
191 | template <class MemoryAllocator>
192 | class MemoryController {
193 |  public:
194 |   MemoryController(MemoryPool<MemoryAllocator>* pool, size_t chunk_size,
195 |                    const std::vector<cudaStream_t>& streams,
196 |                    const std::vector<std::unique_ptr<MemoryBlock>>& blocks)
197 |       : pool_(*pool),
198 |         chunk_size_(chunk_size),
199 |         streams_(streams),
200 |         blocks_(blocks) {
201 |     if (pool == nullptr)
202 |       util::IbcommError(__FILE__, __LINE__,
203 |                         util::IBCOMM_ERROR_CODE::INVALID_ARGUMENT,
204 |                         "pool is nullptr.");
205 | 
206 |     for (auto stream : streams_) vacant_streams_.push(stream);
207 | 
208 |     for (const auto& block : blocks_) {
209 |       AddMemoryBlockToVacantMemories(block.get());
210 |     }
211 |   }
212 | 
213 |   // Manages Memory thus we need to delete copy ctors.
214 |   // copy
215 |   MemoryController(const MemoryController&) = delete;
216 |   MemoryController& operator=(const MemoryController&) = delete;
217 | 
218 |   // move
219 |   MemoryController(MemoryController&&) = default;
220 |   MemoryController& operator=(MemoryController&&) = default;
221 | 
222 |   Memory* getMemory() {
223 |     if (vacant_streams_.empty()) {
224 |       vacant_streams_.push(pool_.AddCudaStream());
225 |     }
226 |     cudaStream_t stream = vacant_streams_.front();
227 |     vacant_streams_.pop();
228 | 
229 |     if (vacant_memories_.empty()) {
230 |       AddMemoryBlockToVacantMemories(pool_.Allocate().get());
231 |     }
232 |     Memory* memory = vacant_memories_.front();
233 |     vacant_memories_.pop();
234 | 
235 |     return memory->SetStream(stream);
236 |   }
237 | 
238 |   void returnMemory(Memory* memory) {
239 |     auto stream = memory->stream();
240 |     vacant_memories_.push(memory->UnsetStream());
241 |     vacant_streams_.push(stream);
242 |   }
243 | 
244 |   ~MemoryController() {
245 |     while (!vacant_memories_.empty()) {
246 |       auto memory = vacant_memories_.front();
247 |       vacant_memories_.pop();
248 | 
249 |       delete memory;
250 |     }
251 |     pool_.CompleteMemoryController();
252 |   }
253 | 
254 |  private:
255 |   MemoryPool<MemoryAllocator>& pool_;
256 |   size_t chunk_size_;
257 |   const std::vector<std::unique_ptr<MemoryBlock>>& blocks_;
258 |   const std::vector<cudaStream_t>& streams_;
259 | 
260 |   std::queue<cudaStream_t> vacant_streams_;
261 |   std::queue<Memory*> vacant_memories_;
262 | 
263 |   void AddMemoryBlockToVacantMemories(MemoryBlock* block) {
264 |     if (block == nullptr)
265 |       util::IbcommError(__FILE__, __LINE__,
266 |                         util::IBCOMM_ERROR_CODE::INVALID_ARGUMENT,
267 |                         "block is nullptr.");
268 | 
269 |     for (int i = 0; (i + 1) * chunk_size_ < block->length(); i++) {
270 |       vacant_memories_.push(new Memory(block, i * chunk_size_));
271 |     }
272 |   }
273 | };
274 | 
275 | class ConstantMemoryAllocator {
276 |  public:
277 |   ConstantMemoryAllocator(size_t initial_size, IBVerbsCommunicator* comm);
278 |   std::unique_ptr<MemoryBlock> Allocate();
279 | 
280 |  private:
281 |   IBVerbsCommunicator& comm_;
282 |   size_t size_;
283 | };
284 | 
285 | #endif
286 | 


--------------------------------------------------------------------------------
/ibcomm/util.h:
--------------------------------------------------------------------------------
  1 | // Copyright (C) 2017-2018 by Preferred Networks, Inc. All right reserved.
  2 | 
  3 | #pragma once
  4 | 
  5 | #include <sys/time.h>
  6 | 
  7 | #include <cassert>
  8 | #include <cstdint>
  9 | #include <cstdio>
 10 | #include <cstdlib>
 11 | #include <cstring>
 12 | 
 13 | #include <sstream>
 14 | #include <stdexcept>
 15 | #include <string>
 16 | #include <vector>
 17 | 
 18 | #ifdef USE_CUDA
 19 | #define CUDACHECK(cmd)                                       \
 20 |   do {                                                       \
 21 |     cudaError_t e = cmd;                                     \
 22 |     if (e != cudaSuccess) {                                  \
 23 |       util::IbcommError(__FILE__, __LINE__,                  \
 24 |                         util::IBCOMM_ERROR_CODE::CUDA_ERROR, \
 25 |                         cudaGetErrorString(e));              \
 26 |     }                                                        \
 27 |                                                              \
 28 |   } while (0)
 29 | #endif
 30 | 
 31 | namespace util {
 32 | enum class IBCOMM_ERROR_CODE : int {
 33 |   INVALID_ARGUMENT = 1,
 34 | 
 35 |   // Error occured in InfiniBand Verbs call.
 36 |   IBVERBS_ERROR = 2,
 37 | 
 38 |   // Error occured in CUDA call.
 39 |   CUDA_ERROR = 3,
 40 | 
 41 |   NOT_SUPPORTED = 4
 42 | };
 43 | 
 44 | template <typename... Args>
 45 | void IbcommError(const char* filename, int line, IBCOMM_ERROR_CODE error_code,
 46 |                  const char* format, Args const&... args) {
 47 |   fprintf(stderr, "Error occured at %s:L%d.\n", filename, line);
 48 |   fprintf(stderr, format, args...);
 49 |   fputs("", stderr);
 50 | 
 51 |   exit(static_cast<int>(error_code));
 52 | }
 53 | 
 54 | template <typename... Args>
 55 | void IbcommWarning(const char* filename, int line, const char* format,
 56 |                    Args const&... args) {
 57 |   fprintf(stderr, "Warning occured at %s:L%d.\n", filename, line);
 58 |   fprintf(stderr, format, args...);
 59 |   fputs("", stderr);
 60 | }
 61 | 
 62 | inline void trace(std::vector<struct timespec>* v) {
 63 |   if (v == nullptr)
 64 |     util::IbcommError(__FILE__, __LINE__,
 65 |                       util::IBCOMM_ERROR_CODE::INVALID_ARGUMENT,
 66 |                       "v is nullptr.");
 67 |   struct timespec ts;
 68 |   clock_gettime(CLOCK_MONOTONIC_RAW, &ts);
 69 |   v->push_back(ts);
 70 | }
 71 | 
 72 | class MalformedNumber : public std::runtime_error {
 73 |  public:
 74 |   explicit MalformedNumber(const std::string& ss)
 75 |       : std::runtime_error(ss.c_str()) {}
 76 | };
 77 | 
 78 | // Parse a string and get a buffer size or chunk size. SI prefix is supported.
 79 | // If any error occurs, success is set to false and the error message is
 80 | // assigned to msg.
 81 | inline int64_t parse_number(const char* str) {
 82 |   std::string n;
 83 |   int64_t multiply = 1;
 84 |   int pos = 0;
 85 |   const int len = strlen(str);
 86 | 
 87 |   if (str[0] == '+' || str[0] == '-') {
 88 |     // accept '-' for now to detect value range error.
 89 |     n += str[0];
 90 |     pos = 1;
 91 |   }
 92 | 
 93 |   while (isdigit(str[pos]) && pos < len) {
 94 |     n += str[pos];
 95 |     pos++;
 96 |   }
 97 | 
 98 |   if (n.size() == 0) {
 99 |     // there seems no number
100 |     std::stringstream ss;
101 |     ss << "Illegal number format prefix in '" << str << "'";
102 |     throw MalformedNumber(ss.str());
103 |   }
104 | 
105 |   if (pos < len) {
106 |     // parse SI prefix
107 |     switch (str[pos]) {
108 |       case 'k':
109 |       case 'K':
110 |         multiply = 1024ul;
111 |         pos++;
112 |         break;
113 |       case 'm':
114 |       case 'M':
115 |         multiply = 1024ul * 1024;
116 |         pos++;
117 |         break;
118 |       case 'g':
119 |       case 'G':
120 |         multiply = 1024ul * 1024 * 1024;
121 |         pos++;
122 |         break;
123 |         // default:
124 |         //   {
125 |         //     std::stringstream ss;
126 |         //     ss << "Illegal SI prefix in '" << str << "'";
127 |         //     throw MalformedNumber(ss.str());
128 |         //   }
129 |     }
130 |   }
131 | 
132 |   if (pos < len) {
133 |     // Last 'b' or 'B' (bytes) is optional. Other characters are not allowed.
134 |     if (!(str[pos] == 'b' || str[pos] == 'B')) {
135 |       std::stringstream ss;
136 |       ss << "Illegal SI prefix in '" << str << "'";
137 |       throw MalformedNumber(ss.str());
138 |     }
139 |     pos++;
140 |   }
141 |   if (pos < len) {
142 |     std::stringstream ss;
143 |     ss << "Illegal number format prefix in '" << str << "'";
144 |     throw MalformedNumber(ss.str());
145 |   }
146 | 
147 |   int64_t n2 = atol(n.c_str());
148 |   return n2 * multiply;
149 | }
150 | 
151 | template <typename T>
152 | inline T ceilDiv(T v1, T v2) {
153 |   return v1 % v2 ? v1 / v2 + 1 : v1 / v2;
154 | }
155 | 
156 | inline int GetExpOfTwo(int n) {
157 |   int p = 0;
158 | 
159 |   while (n != 0) {
160 |     if (n % 2 == 1) {
161 |       if (n == 1)
162 |         return p;
163 |       else
164 |         return 0;
165 |     }
166 | 
167 |     p++;
168 | 
169 |     n >>= 1;
170 |   }
171 | 
172 |   return 0;
173 | 
174 |   // returns p (2^p == n)
175 | }
176 | };  // namespace util
177 | 


--------------------------------------------------------------------------------
/mpinvcc.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # Dirty hack :)
 4 | if echo "$*" | grep -qE -- '-DUSE_CUDA' ; then
 5 |     NVCC=1
 6 | fi
 7 | 
 8 | if [ "$NVCC" == 1 ]; then
 9 |     export OMPI_CXX=nvcc
10 |     export MPICH_CXX=nvcc
11 |     nvcc_arch_flag="-arch sm_52"
12 | 
13 |     if echo "$*" | grep -qE "(.cpp|.cxx|.cc)$" ; then
14 |         xflag="-x cu"
15 |     else
16 |         xflag=
17 |     fi
18 | else
19 |     nvcc_arch_flag=
20 |     if [ -n "$CXX" ]; then
21 |         export OMPI_CXX=${CXX}
22 |         export MPICH_CXX=${CXX}
23 |     fi
24 | fi
25 | 
26 | CMD=$(mpicxx -show "$@" "$xflag" "$nvcc_arch_flag")
27 | 
28 | if [ "$NVCC" == 1 ]; then
29 |     CMD=$(echo $CMD | sed -e "s/-Wl,/-Xlinker /g")
30 |     CMD=$(echo $CMD | sed -e "s/\(-W[^ ][^ ]*\)/-Xcompiler \\1/g")
31 |     CMD=$(echo $CMD | sed -e "s/\\(-pthread\\)/-Xcompiler \\1/g")
32 |     CMD=$(echo $CMD | sed -e "s/\\(-rdynamic\\)/-Xcompiler \\1/g")
33 |     CMD=$(echo $CMD | sed -e "s/\\(-fPIC\\)/-Xcompiler \\1/g")
34 | fi
35 | 
36 | echo $CMD
37 | $CMD
38 | 


--------------------------------------------------------------------------------
/tests/allreduce_test.py:
--------------------------------------------------------------------------------
  1 | from subprocess import Popen
  2 | from typing import List
  3 | from typing import Optional
  4 | from typing import Union
  5 | import itertools
  6 | import os
  7 | import os.path
  8 | import pytest
  9 | import unittest
 10 | import sys
 11 | 
 12 | 
 13 | IBCOMM_INVALID_ARGUMENT = 1
 14 | IBCOMM_IBVERBS_ERROR = 2
 15 | IBCOMM_CUDA_ERROR = 3
 16 | IBCOMM_NOT_SUPPORTED = 4
 17 | 
 18 | ALGO_RING = "ring"
 19 | ALGO_RABEN = "rabenseifner"
 20 | 
 21 | IBCOMM_ALGORITHMS = [ALGO_RING, ALGO_RABEN]
 22 | 
 23 | 
 24 | def find_file(directory: str, fname: str) -> Optional[str]:
 25 |     """Find a file in directory. Used to find the allreduce_tester binary."""
 26 |     for root, dirs, files in os.walk(directory):
 27 |         if fname in files:
 28 |             return os.path.join(root, fname)
 29 | 
 30 |     # not found
 31 |     return None
 32 | 
 33 | 
 34 | def flatten1(lst: List[List]) -> List:
 35 |     return [item for sublist in lst for item in sublist]
 36 | 
 37 | 
 38 | def dict_to_envs(envs: dict) -> List[str]:
 39 |     """
 40 |     Expand a dict to command line arguments for Open MPI's '-x' option
 41 |     e.g.) {'FOO': 100} 
 42 |            ==> 
 43 |           ['-x', 'FOO=100']
 44 |     """
 45 |     z = itertools.zip_longest([],
 46 |                               ["{}={}".format(k,v) for k, v in envs.items()],
 47 |                               fillvalue='-x')
 48 | 
 49 |     return flatten1(list(z))
 50 | 
 51 | 
 52 | # Find the project directory
 53 | ProjectDir = os.path.join(os.path.dirname(__file__), os.pardir)
 54 | Tester = find_file(ProjectDir, 'allreduce_tester')
 55 | 
 56 | if not os.path.exists(Tester):
 57 |     sys.stderr.write("Please build 'allreduce_tester' before running unit tests.\n")
 58 |     exit(1)
 59 | 
 60 | 
 61 | class AllreduceTest(unittest.TestCase):
 62 |     @staticmethod
 63 |     def check(np: Union[int, str],
 64 |               algo: str,
 65 |               buffsize: Union[int, str],
 66 |               init_expr: str = "i*np+p",
 67 |               check_expr: str = "i*np*np+np*(np-1)/2",
 68 |               chunksize: Optional[Union[str, int]] = None):
 69 |         env = {}
 70 |         if chunksize is not None:
 71 |             env['IBCOMM_CHUNKSIZE'] = chunksize
 72 | 
 73 |         if 'NODEFILE' in os.environ:
 74 |             hostfile = os.environ['NODEFILE']
 75 |         elif 'PBS_NODEFILE' in os.environ:
 76 |             hostfile = os.environ['PBS_NODEFILE']
 77 |         else:
 78 |             hostfile = None
 79 | 
 80 |         assert algo in IBCOMM_ALGORITHMS, "{} is not support Allreduce algorithm.".format(algo)
 81 | 
 82 |         env_args = dict_to_envs(env)
 83 |         np = str(np)
 84 |         buffsize = str(buffsize)
 85 | 
 86 |         if hostfile is not None:
 87 |             hostfile = ['--hostfile', hostfile]
 88 |         else:
 89 |             hostfile = []
 90 | 
 91 |         cmd = ['timeout', '90s', 'mpiexec', '-np', np, *hostfile, *env_args, Tester, algo, buffsize, init_expr, check_expr]
 92 | 
 93 |         print()
 94 |         print(' '.join(cmd))
 95 |         p = Popen(cmd)
 96 |         out,err = p.communicate()
 97 |         return p.returncode
 98 | 
 99 |     def setUp(self):
100 |         pass
101 | 
102 |     def test_1proc(self):
103 |         # Allreduce works with just 1 process.
104 |         ret = AllreduceTest.check(algo=ALGO_RING, np=1, buffsize=1024)
105 |         assert ret == 0
106 | 
107 |         ret = AllreduceTest.check(algo=ALGO_RABEN, np=1, buffsize=1024)
108 |         assert ret == 0
109 | 
110 |     def test_small_buffer(self):
111 |         # Tests Ring-AllReduce
112 |         int_size = 4
113 | 
114 |         # Allreduce works with small buffer size
115 |         NP=1
116 |         ret = AllreduceTest.check(algo=ALGO_RING, np=NP, buffsize=NP * 2 * int_size)
117 |         assert ret == 0
118 | 
119 |         NP=2
120 |         ret = AllreduceTest.check(algo=ALGO_RING, np=NP, buffsize=NP * 2 * int_size)
121 |         assert ret == 0
122 | 
123 |         NP=3
124 |         ret = AllreduceTest.check(algo=ALGO_RING, np=NP, buffsize=NP * 2 * int_size)
125 |         assert ret == 0
126 | 
127 |         NP=5
128 |         ret = AllreduceTest.check(algo=ALGO_RING, np=NP, buffsize=NP * 2 * int_size)
129 |         assert ret == 0
130 | 
131 |         NP=2
132 |         # Relatively larger prime
133 |         ret = AllreduceTest.check(algo=ALGO_RING, np=NP, buffsize=2521 * int_size)
134 |         assert ret == 0
135 | 
136 |         # Tests Rabenseifner's algorithm
137 |         NP=1
138 |         ret = AllreduceTest.check(algo=ALGO_RABEN, np=NP, buffsize=NP * 2 * int_size)
139 |         assert ret == 0
140 | 
141 |         NP=2
142 |         ret = AllreduceTest.check(algo=ALGO_RABEN, np=NP, buffsize=NP * 2 * int_size)
143 |         assert ret == 0
144 | 
145 |         NP=3
146 |         ret = AllreduceTest.check(algo=ALGO_RABEN, np=NP, buffsize=NP * 2 * int_size)
147 |         assert ret == IBCOMM_NOT_SUPPORTED  # Currently, non-power-of-2 np is not supported.
148 | 
149 |         NP=5
150 |         ret = AllreduceTest.check(algo=ALGO_RABEN, np=NP, buffsize=NP * 2 * int_size)
151 |         assert ret == IBCOMM_NOT_SUPPORTED  # Currently, non-power-of-2 np is not supported.
152 | 
153 |         NP=2
154 |         # Relatively larger prime
155 |         ret = AllreduceTest.check(algo=ALGO_RABEN, np=NP, buffsize=2521 * int_size)
156 |         assert ret == 0
157 | 
158 |     def test_basic(self):
159 |         # Relatively larger buffer size and default chunksize
160 |         # Tests Ring-AllReduce
161 |         ret = AllreduceTest.check(algo=ALGO_RING, np=2, buffsize="128M")
162 |         assert ret == 0
163 | 
164 |         ret = AllreduceTest.check(algo=ALGO_RING, np=3, buffsize="128M")
165 |         assert ret == 0
166 | 
167 |         ret = AllreduceTest.check(algo=ALGO_RING, np=4, buffsize="128M")
168 |         assert ret == 0
169 | 
170 |         # Test Rabenseifner's algorithm
171 |         ret = AllreduceTest.check(algo=ALGO_RABEN, np=2, buffsize="128M")
172 |         assert ret == 0
173 | 
174 |         ret = AllreduceTest.check(algo=ALGO_RABEN, np=3, buffsize="128M")
175 |         assert ret == IBCOMM_NOT_SUPPORTED  # Currently, non-power-of-2 np is not supported.
176 | 
177 |         ret = AllreduceTest.check(algo=ALGO_RABEN, np=4, buffsize="128M")
178 |         assert ret == 0
179 | 
180 |     def test_chunk_size(self):
181 |         # Tests Ring-AllReduce
182 |         # for a buffer size 1024 and NP 2, change the IBCOMM_CHUNKSIZE from [4 to 128]
183 |         ret = AllreduceTest.check(algo=ALGO_RING, np=4, buffsize="1k", chunksize='4')
184 |         assert ret == 0
185 | 
186 |         ret = AllreduceTest.check(algo=ALGO_RING, np=4, buffsize="1k", chunksize='8')
187 |         assert ret == 0
188 | 
189 |         ret = AllreduceTest.check(algo=ALGO_RING, np=4, buffsize="1k", chunksize='16')
190 |         assert ret == 0
191 | 
192 |         ret = AllreduceTest.check(algo=ALGO_RING, np=4, buffsize="1k", chunksize='32')
193 |         assert ret == 0
194 | 
195 |         ret = AllreduceTest.check(algo=ALGO_RING, np=4, buffsize="1k", chunksize='64')
196 |         assert ret == 0
197 | 
198 |         ret = AllreduceTest.check(algo=ALGO_RING, np=4, buffsize="1k", chunksize='128')
199 |         assert ret == 0
200 | 
201 |         # Test of Rabenseifner's algorithm is not necessary because chunksize is not used.
202 | 
203 |     def test_invalid_error(self):
204 |         # Test if ibcomm checks chunk size
205 |         int_size = 4
206 |         for chunk_size in range(0, int_size):  # try 0, 1, 2, 3
207 |             # chunk_size must be a multiply of element type (which is int here)
208 |             ret = AllreduceTest.check(algo=ALGO_RING, np=4, buffsize="1k", chunksize=chunk_size)
209 |             assert ret == IBCOMM_INVALID_ARGUMENT
210 | 
211 |         # Chunk size < 0
212 |         ret = AllreduceTest.check(algo=ALGO_RING, np=4, buffsize="1k", chunksize="-128")
213 |         assert ret == IBCOMM_INVALID_ARGUMENT
214 | 
215 |         # Chunk size is too large
216 |         ret = AllreduceTest.check(algo=ALGO_RING, np=4, buffsize="1k", chunksize="1k")
217 |         assert ret == IBCOMM_INVALID_ARGUMENT
218 | 
219 |         # Check too short vector
220 |         int_size = 4
221 |         ret = AllreduceTest.check(algo=ALGO_RING, np=2, buffsize=int_size, init_expr="1", check_expr="np")
222 |         assert ret == IBCOMM_NOT_SUPPORTED
223 | 
224 |         ret = AllreduceTest.check(algo=ALGO_RABEN, np=2, buffsize=int_size, init_expr="1", check_expr="np")
225 |         assert ret == IBCOMM_NOT_SUPPORTED
226 | 
227 |     @pytest.mark.slow
228 |     def test_aging(self):
229 |         for i in range(100):
230 |             ret = AllreduceTest.check(algo=ALGO_RING, np=4, buffsize="128M")
231 |             assert ret == 0
232 | 
233 |         for i in range(100):
234 |             ret = AllreduceTest.check(algo=ALGO_RABEN, np=4, buffsize="128M")
235 |             assert ret == 0
236 | 


--------------------------------------------------------------------------------
/tests/sendrecv_test.cpp:
--------------------------------------------------------------------------------
 1 | // Copyright (C) 2017-2018 by Preferred Networks, Inc. All right reserved.
 2 | 
 3 | #include <cassert>
 4 | #include <cerrno>
 5 | #include <cstdio>
 6 | #include "ibcomm/ibverbs_communicator.h"
 7 | 
 8 | int main(void) {
 9 |   IBVerbsCommunicator comm(2);
10 | 
11 |   ProcessInfo pinfoA = comm.CreateQueuePair(1);
12 |   ProcessInfo pinfoB = comm.RegisterProcess(0, pinfoA);
13 |   comm.RegisterQueuePair(1, pinfoB);
14 | 
15 |   int value = 10;
16 |   int value2 = -1;
17 | 
18 |   comm.Send(0, &value, sizeof(value), false);
19 |   comm.Recv(1, &value2, sizeof(value2));
20 |   comm.SendWait(0);
21 | 
22 |   assert(value == value2);
23 | 
24 |   value2 = -1;
25 |   comm.Send(0, &value2, sizeof(value), false);
26 |   comm.Recv(1, &value, sizeof(value2));
27 |   comm.SendWait(0);
28 | 
29 |   assert(value == value2);
30 | 
31 |   return 0;
32 | }
33 | 


--------------------------------------------------------------------------------
/tests/unittest.cpp:
--------------------------------------------------------------------------------
 1 | // Copyright (C) 2017-2018 by Preferred Networks, Inc. All right reserved.
 2 | 
 3 | #include "gtest/gtest.h"
 4 | #include "ibcomm/util.h"
 5 | 
 6 | namespace {
 7 | 
 8 | class IBCommUtilTest : public ::testing::Test {
 9 |  protected:
10 | };
11 | 
12 | TEST_F(IBCommUtilTest, ParseNumberZero) {
13 |   EXPECT_EQ(0, util::parse_number("0"));
14 |   EXPECT_EQ(0, util::parse_number("0b"));
15 |   EXPECT_EQ(0, util::parse_number("0B"));
16 |   EXPECT_EQ(0, util::parse_number("0k"));
17 |   EXPECT_EQ(0, util::parse_number("0kb"));
18 |   EXPECT_EQ(0, util::parse_number("0K"));
19 |   EXPECT_EQ(0, util::parse_number("0m"));
20 |   EXPECT_EQ(0, util::parse_number("0mb"));
21 |   EXPECT_EQ(0, util::parse_number("0M"));
22 |   EXPECT_EQ(0, util::parse_number("0g"));
23 |   EXPECT_EQ(0, util::parse_number("0gb"));
24 |   EXPECT_EQ(0, util::parse_number("0G"));
25 | 
26 |   EXPECT_EQ(0, util::parse_number("-0"));
27 |   EXPECT_EQ(0, util::parse_number("-0b"));
28 |   EXPECT_EQ(0, util::parse_number("-0B"));
29 |   EXPECT_EQ(0, util::parse_number("-0k"));
30 |   EXPECT_EQ(0, util::parse_number("-0kb"));
31 |   EXPECT_EQ(0, util::parse_number("-0K"));
32 |   EXPECT_EQ(0, util::parse_number("-0m"));
33 |   EXPECT_EQ(0, util::parse_number("-0mb"));
34 |   EXPECT_EQ(0, util::parse_number("-0M"));
35 |   EXPECT_EQ(0, util::parse_number("-0g"));
36 |   EXPECT_EQ(0, util::parse_number("-0gb"));
37 |   EXPECT_EQ(0, util::parse_number("-0G"));
38 | }
39 | 
40 | TEST_F(IBCommUtilTest, ParseNumberPositive) {
41 |   EXPECT_EQ(1, util::parse_number("1"));
42 |   EXPECT_EQ(1, util::parse_number("1b"));
43 |   EXPECT_EQ(1, util::parse_number("1B"));
44 | 
45 |   EXPECT_EQ(1024, util::parse_number("1k"));
46 |   EXPECT_EQ(1024, util::parse_number("1kb"));
47 | 
48 |   EXPECT_EQ(31 * 1024, util::parse_number("31k"));
49 |   EXPECT_EQ(31 * 1024, util::parse_number("31kb"));
50 | 
51 |   EXPECT_EQ(713ul * 1024 * 1024, util::parse_number("713m"));
52 |   EXPECT_EQ(713ul * 1024 * 1024, util::parse_number("713mb"));
53 | }
54 | 
55 | TEST_F(IBCommUtilTest, ParseNumberMalformed) {
56 |   ASSERT_THROW(util::parse_number("0.5"), util::MalformedNumber);
57 |   ASSERT_THROW(util::parse_number("a"), util::MalformedNumber);
58 |   ASSERT_THROW(util::parse_number("b"), util::MalformedNumber);
59 |   ASSERT_THROW(util::parse_number("B"), util::MalformedNumber);
60 |   ASSERT_THROW(util::parse_number("0x"), util::MalformedNumber);
61 |   ASSERT_THROW(util::parse_number("97MiB"),
62 |                util::MalformedNumber);  // "Mibi byte" is not supported
63 | }
64 | 
65 | TEST_F(IBCommUtilTest, get_exp_of_two) {
66 |   ASSERT_EQ(0, util::get_exp_of_two(0));
67 |   ASSERT_EQ(1, util::get_exp_of_two(2));
68 |   ASSERT_EQ(7, util::get_exp_of_two(128));
69 |   ASSERT_EQ(0, util::get_exp_of_two(127));
70 |   ASSERT_EQ(0, util::get_exp_of_two(129));
71 | }
72 | 
73 | }  // namespace
74 | 


--------------------------------------------------------------------------------