├── .gitmodules ├── CMakeLists.txt ├── CPPLINT.cfg ├── LICENSE ├── README.md ├── examples └── allreduce.cpp ├── ibcomm ├── allreduce_cpu_impl.h ├── allreduce_cuda_impl.h ├── allreduce_tester.cpp ├── ibverbs_communicator.cpp ├── ibverbs_communicator.h ├── ibverbs_communicator_cuda.cpp ├── memory_pool.cpp ├── memory_pool.h └── util.h ├── mpinvcc.sh └── tests ├── allreduce_test.py ├── sendrecv_test.cpp └── unittest.cpp /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "grumpi"] 2 | path = grumpi 3 | url = https://github.com/keisukefukuda/grumpi.git 4 | [submodule "tinyexpr"] 5 | path = tinyexpr 6 | url = https://github.com/codeplea/tinyexpr.git 7 | -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | project("ibcomm") 2 | 3 | # CMake version 4 | cmake_minimum_required(VERSION 2.8 FATAL_ERROR) 5 | 6 | set(CMAKE_CXX_FLAGS "-Wno-missing-field-initializers -Wno-format-security -Wno-sign-compare") 7 | set(CMAKE_CXX_FLAGS_RELEASE "-O2") 8 | set(CMAKE_CXX_FLAGS_DEBUG "-g") 9 | 10 | if (NOT CMAKE_BUILD_TYPE) 11 | message(STATUS "Setting CMAKE_BUILD_TYPE=Release (default)") 12 | set(CMAKE_BUILD_TYPE Release) 13 | endif() 14 | 15 | find_package(MPI) 16 | find_package(CUDA) 17 | 18 | # ibverbs 19 | find_library(IBVERBS_LIBRARY 20 | NAMES ibverbs libibverbs libibverbs.so 21 | HINTS ENV LD_LIBRARY_PATH) 22 | 23 | find_path(IBVERBS_INCLUDE_PATH 24 | NAMES "infiniband/verbs.h" 25 | HINTS ENV CPATH) 26 | 27 | # google test 28 | find_library(GOOGLETEST_MAIN_LIBRARY 29 | NAMES gtest_main libgtest_main libgtest_main.a 30 | HINTS 31 | ${GOOGLETEST_ROOT} 32 | ${GOOGLETEST_ROOT}/build 33 | ${GOOGLETEST_ROOT}/build/googlemock/gtest 34 | ) 35 | 36 | find_library(GOOGLETEST_LIBRARY 37 | NAMES gtest libgtest libgtest.a 38 | HINTS 39 | ${GOOGLETEST_ROOT} 40 | ${GOOGLETEST_ROOT}/build 41 | ${GOOGLETEST_ROOT}/build/googlemock/gtest 42 | ) 43 | 44 | find_path(GOOGLETEST_INCLUDE_PATH 45 | NAMES "gtest/gtest.h" 46 | HINTS 47 | ${GOOGLETEST_ROOT} 48 | ${GOOGLETEST_ROOT}/include 49 | ${GOOGLETEST_ROOT}/googletest/include) 50 | 51 | message(STATUS "GOOGLETEST_ROOT=${GOOGLETEST_ROOT}") 52 | message(STATUS "GOOGLETEST_LIBRARY=${GOOGLETEST_LIBRARY}") 53 | message(STATUS "GOOGLETEST_INCLUDE_PATH=${GOOGLETEST_INCLUDE_PATH}") 54 | 55 | # Run mpicxx -show to get compile flags for MPI 56 | set(CMAKE_CXX_COMPILER "${CMAKE_SOURCE_DIR}/mpinvcc.sh") 57 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -Wall -Wextra -Wno-unused-variable -Wno-unused-parameter") 58 | 59 | include_directories(".") 60 | 61 | # for libraries 62 | if(CUDA_FOUND) 63 | add_library(ibcomm_cuda SHARED 64 | ibcomm/ibverbs_communicator.cpp 65 | ibcomm/ibverbs_communicator_cuda.cpp 66 | ibcomm/memory_pool.cpp 67 | ) 68 | if(USE_TRACE) 69 | target_compile_definitions(ibcomm_cuda 70 | PUBLIC "-DUSE_CUDA" 71 | PUBLIC "-DUSE_TRACE" 72 | ) 73 | else() 74 | target_compile_definitions(ibcomm_cuda 75 | PUBLIC "-DUSE_CUDA" 76 | ) 77 | endif() 78 | 79 | target_include_directories(ibcomm_cuda 80 | PUBLIC ${IBVERBS_INCLUDE_PATH} 81 | PUBLIC ${CUDA_INCLUDE_DIRS} 82 | ) 83 | target_link_libraries(ibcomm_cuda 84 | ${IBVERBS_LIBRARY} 85 | ${CUDA_LIBRARIES} 86 | ) 87 | endif() 88 | 89 | add_library(ibcomm SHARED ibcomm/ibverbs_communicator.cpp) 90 | if(USE_TRACE) 91 | target_compile_definitions(ibcomm 92 | PUBLIC "-DUSE_TRACE" 93 | ) 94 | endif() 95 | target_include_directories(ibcomm PUBLIC 
${IBVERBS_INCLUDE_PATH}) 96 | target_link_libraries(ibcomm ${IBVERBS_LIBRARY}) 97 | 98 | # for tests 99 | add_executable(sendrecv tests/sendrecv_test.cpp) 100 | target_link_libraries(sendrecv ibcomm) 101 | 102 | if(GOOGLETEST_INCLUDE_PATH) 103 | add_executable(unittest tests/unittest.cpp) 104 | target_link_libraries(unittest ${GOOGLETEST_LIBRARY} ${GOOGLETEST_MAIN_LIBRARY}) 105 | target_include_directories(unittest PUBLIC ${GOOGLETEST_INCLUDE_PATH}) 106 | target_include_directories(unittest PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}) 107 | endif() 108 | 109 | add_library(tinyexpr tinyexpr/tinyexpr.c) 110 | 111 | if(CUDA_FOUND) 112 | add_executable(allreduce_tester ibcomm/allreduce_tester.cpp) 113 | target_include_directories(allreduce_tester PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/tinyexpr) 114 | target_include_directories(allreduce_tester PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/grumpi) 115 | target_link_libraries(allreduce_tester ibcomm_cuda) 116 | target_link_libraries(allreduce_tester tinyexpr) 117 | endif() 118 | 119 | # for allreduce examples 120 | add_executable(allreduce examples/allreduce.cpp) 121 | target_link_libraries(allreduce ibcomm) 122 | set_target_properties(allreduce PROPERTIES 123 | RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/examples" 124 | ) 125 | 126 | if(CUDA_FOUND) 127 | add_executable(allreduce_cuda examples/allreduce.cpp) 128 | target_compile_definitions(allreduce_cuda 129 | PUBLIC "-DUSE_CUDA" 130 | ) 131 | target_link_libraries(allreduce_cuda ibcomm_cuda) 132 | set_target_properties(allreduce_cuda PROPERTIES 133 | RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/examples" 134 | ) 135 | endif() 136 | -------------------------------------------------------------------------------- /CPPLINT.cfg: -------------------------------------------------------------------------------- 1 | exclude_files=tinyexpr/* 2 | exclude_files=build/* 3 | exclude_files=grumpi/* 4 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2017-2018 Preferred Networks, Inc. 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy 4 | of this software and associated documentation files (the "Software"), to deal 5 | in the Software without restriction, including without limitation the rights 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | copies of the Software, and to permit persons to whom the Software is 8 | furnished to do so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in 11 | all copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 19 | THE SOFTWARE. 
-------------------------------------------------------------------------------- /README.md: --------------------------------------------------------------------------------
1 | # PFNProto: AllReduce prototype implementation for NVIDIA GPUs and InfiniBand
2 | PFNProto is a prototype library of the AllReduce collective operation.
3 | The library is highly optimized for widely used deep learning clusters equipped with NVIDIA GPUs and an InfiniBand interconnect. It achieves performance competitive with the fastest libraries in the world, including NVIDIA NCCL.
4 |
5 | PFNProto implements the following algorithms:
6 | - Ring-AllReduce for CPU / CUDA
7 | - Rabenseifner's Algorithm for CPU / CUDA
8 |
9 | For more details, please refer to our blog posts ([English](https://preferredresearch.jp/2018/07/10/technologies-behind-distributed-deep-learning-allreduce/), [Japanese](https://research.preferred.jp/2018/07/prototype-allreduce-library/)).
10 |
11 | # How to build
12 | ## Dependencies
13 | - InfiniBand Verbs
14 | - CMake 2.8+
15 | - MPI (to build examples and tests)
16 |
17 | ## Build
18 | ```sh
19 | mkdir build
20 | cd build
21 | cmake ..
22 | make
23 | ```
24 |
25 | # How to try the example
26 | - Requirements: a multi-node computing cluster equipped with NVIDIA GPUs and InfiniBand.
27 | - Prepare a hostfile.
28 | - Execute `examples/allreduce_cuda`.
29 | - This implementation is not optimized for more than one process per node, so run it with PPN=1.
30 | - (for Open MPI users) : `mpiexec -N 1 --hostfile "path_to_hostfile" examples/allreduce_cuda`
31 | - (for MPICH / MVAPICH2 users) : `mpiexec --ppn 1 --hostfile "path_to_hostfile" examples/allreduce_cuda`
32 | - You should see output like the following:
33 | ```
34 | $ cd build
35 | $ (prepare your hostfile)
36 | $ cat hosts
37 | node01
38 | node02
39 | ...
40 | node08
41 | $ mpiexec -N 1 --hostfile hosts examples/allreduce_cuda
42 | rank: 3 OK
43 | rank: 2 OK
44 | rank: 4 OK
45 | rank: 5 OK
46 | rank: 6 OK
47 | rank: 7 OK
48 | rank: 1 OK
49 | rank: 0 OK
50 | elapsed time : 9.750750e-02 [s]
51 | ```
52 |
53 | # Contribution
54 | Any contributions to this prototype are welcome.
55 | Please feel free to report an issue or send a pull request!
56 |
57 | # Limitations
58 | This library is a prototype for algorithm and performance demonstration purposes.
59 | It is not intended to be used in a production environment.
60 |
61 | In particular, there are several limitations, including:
62 | - No Python binding is provided.
63 | - It is not designed to be used together with ChainerMN.
64 | - The only supported reduction operation is sum (+).
65 | - The non-power-of-two extension of Rabenseifner's algorithm is not implemented.
66 | - It currently focuses on inter-node communication. Intra-node communication is not efficient because shared-memory and GPU-to-GPU DMA data transfers are not implemented.
67 |
68 | # Tuning Knobs
69 | You can control the runtime behaviour of PFNProto through the following environment variables.
70 | Memory and buffer sizes are all given in bytes; SI prefixes are not supported.
71 |
72 | ## IBCOMM_CHUNKSIZE
73 | - Chunk size in bytes for the AllReduce algorithms (both Ring-AllReduce and Rabenseifner's). PFNProto uses this size as the unit for every pipelined operation such as send, recv and reduction.
74 | - Default value: (len(send/recvbuf) in bytes) / (4 * N_OF_PROCESSES)
75 | - Supported range: (IBCOMM_CHUNKSIZE) <= (len(send/recvbuf) in bytes) / (2 * N_OF_PROCESSES).
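- Worked example: the bundled example reduces a 256 MiB float32 buffer across 8 nodes, so the default chunk size is 256 MiB / (4 * 8) = 8 MiB.

Below is a minimal sketch of overriding the chunk size at launch time; the `-x` flag for propagating environment variables is Open MPI-specific, so MPICH / MVAPICH2 users would pass `-genv IBCOMM_CHUNKSIZE 4194304` instead.

```
$ export IBCOMM_CHUNKSIZE=4194304   # 4 MiB, given in plain bytes (no SI prefix)
$ mpiexec -N 1 -x IBCOMM_CHUNKSIZE --hostfile hosts examples/allreduce_cuda
```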
76 |
77 | ## IBCOMM_MEMORY_POOL_PRE_ALLOC
78 | - PFNProto pre-allocates a MemoryPool of this size to hide the latency of memory allocation.
79 | - Default value: 67108864 (64 MiB).
80 |
81 | ## IBCOMM_WORK_GPU_MEMORY_SIZE
82 | - Initial working GPU memory size. PFNProto needs working GPU memory in Rabenseifner's algorithm to store a reduction result. If this size is smaller than `IBCOMM_CHUNKSIZE`, a runtime memory reallocation occurs.
83 | - Default value: 33554432 (32 MiB).
84 |
85 | ## IBCOMM_NUM_CUDA_STREAM
86 | - Total number of CUDA streams used.
87 | - Default value: 64
88 |
89 | ## Number of pre-allocated chunks
90 | - The number of pre-allocated chunks is computed from `IBCOMM_CHUNKSIZE` and `IBCOMM_MEMORY_POOL_PRE_ALLOC` by the following equation:
91 |   - (IBCOMM_MEMORY_POOL_PRE_ALLOC) / (IBCOMM_CHUNKSIZE)
92 |
93 | - Example: let IBCOMM_CHUNKSIZE be 4 MiB and IBCOMM_MEMORY_POOL_PRE_ALLOC be 64 MiB. Then (IBCOMM_MEMORY_POOL_PRE_ALLOC) / (IBCOMM_CHUNKSIZE) = 64 / 4 = 16, so 16 chunks of 4 MiB each will be pre-allocated.
94 |
95 | # APIs
96 | ```c++
97 | class IBVerbsCommunicator {
98 |  public:
99 |   explicit IBVerbsCommunicator(int world_size);
100 |
101 |   // Manages InfiniBand-related resources, so copy and move ctors are deleted.
102 |   IBVerbsCommunicator(const IBVerbsCommunicator&) noexcept = delete;
103 |   IBVerbsCommunicator& operator=(const IBVerbsCommunicator&) noexcept = delete;
104 |
105 |   // move
106 |   IBVerbsCommunicator(IBVerbsCommunicator&&) noexcept = delete;
107 |   IBVerbsCommunicator& operator=(IBVerbsCommunicator&&) noexcept = delete;
108 |
109 |   // connection management
110 |   struct ProcessInfo RegisterProcess(int dest_rank, struct ProcessInfo pinfo);
111 |   struct ProcessInfo CreateQueuePair(int dest_rank);
112 |   void RegisterQueuePair(int dest_rank, struct ProcessInfo pinfo);
113 |   void RegisterMyself(int my_rank);
114 |
115 |   void Send(int dest_rank, const void* buf, size_t len, bool blocking = true);
116 |   void Recv(int src_rank, void* buf, size_t len, bool blocking = true);
117 |
118 |   // wait (for non-blocking I/O)
119 |   bool SendPoll(int dest_rank);
120 |   bool RecvPoll(int src_rank);
121 |   void SendWait(int dest_rank);
122 |   void RecvWait(int src_rank);
123 |
124 |   // allreduce
125 |   template <typename T>
126 |   void AllreduceRing(const T* sendbuf, T* recvbuf, size_t len_elements);
127 |
128 |   template <typename T>
129 |   void AllreduceRabenseifner(const T* sendbuf, T* recvbuf, size_t len_elements);
130 |
131 |   template <typename T>
132 |   void AllreduceRingCuda(const T* sendbuf, T* recvbuf, size_t len_elements);
133 |
134 |   template <typename T>
135 |   void AllreduceRabenseifnerCuda(const T* sendbuf, T* recvbuf, size_t len_elements);
136 |
137 |   int my_rank_;
138 |   size_t world_size_;
139 | };
140 | ```
141 |
142 | ## Error Codes
143 | Error codes are defined in `ibcomm/util.h`.
144 | When an error occurs, ibcomm exits with one of these values as the exit code.
145 |
146 | ```cpp
147 | enum class IBCOMM_ERROR_CODE : int {
148 |   INVALID_ARGUMENT = 1,
149 |
150 |   // Error occurred in an InfiniBand Verbs call.
151 |   IBVERBS_ERROR = 2,
152 |
153 |   // Error occurred in a CUDA call.
154 |   CUDA_ERROR = 3,
155 |
156 |   NOT_SUPPORTED = 4
157 | };
158 | ```
159 |
160 | # How to run tests
161 | ## Setup allreduce integration tests
162 | Integration tests of the allreduce routines are implemented with the `pytest` module.
163 |
164 | ```
165 | $ pip install pytest
166 | ```
167 |
168 | The integration tests also depend on a few external libraries, which are included as git submodules; set them up and build as follows.
169 | 170 | ``` 171 | $ cd `Your cloned directory` 172 | $ git submodule init 173 | $ git submodule update 174 | $ mkdir -p build 175 | $ cd build 176 | $ cmake .. 177 | $ make -j 178 | 179 | # Make sure `allreduce_tester` is generated 180 | ``` 181 | 182 | ## Run allreduce integration tests 183 | ``` 184 | $ cd `Your cloned directory` 185 | $ pytest 186 | $ export HOSTFILE=hostfile # Optional 187 | $ pytest --capture=no # For more info 188 | $ pytest --capture=no -m "not slow" # Skip aging test 189 | ``` 190 | 191 | ## Setup and run C++ unit tests 192 | C++ unit tests depend on Google test (https://github.com/google/googletest). 193 | 194 | First, download and build Google test. 195 | 196 | ``` 197 | $ WORKING_DIR=/tmp # Your favorite directory 198 | $ cd ${WORKING_DIR} 199 | $ git clone https://github.com/google/googletest.git 200 | $ cd googletest 201 | $ mkdir build 202 | $ cmake .. 203 | 204 | $ cd `Your cloned directory` 205 | $ cd build 206 | $ cmake -D GOOGLETEST_ROOT=${WORKING_DIR}/googletest .. 207 | $ make 208 | $ ./unittest 209 | 210 | Running main() from 211 | [==========] Running 4 tests from 1 test case. 212 | [----------] Global test environment set-up. 213 | [----------] 4 tests from IBCommUtilTest 214 | [ RUN ] IBCommUtilTest.ParseNumberZero 215 | [ OK ] IBCommUtilTest.ParseNumberZero (0 ms) 216 | [ RUN ] IBCommUtilTest.ParseNumberPositive 217 | [ OK ] IBCommUtilTest.ParseNumberPositive (0 ms) 218 | [ RUN ] IBCommUtilTest.ParseNumberMalformed 219 | [ OK ] IBCommUtilTest.ParseNumberMalformed (0 ms) 220 | [ RUN ] IBCommUtilTest.get_exp_of_two 221 | [ OK ] IBCommUtilTest.get_exp_of_two (0 ms) 222 | [----------] 4 tests from IBCommUtilTest (0 ms total) 223 | 224 | [----------] Global test environment tear-down 225 | [==========] 4 tests from 1 test case ran. (0 ms total) 226 | [ PASSED ] 4 tests. 227 | ``` 228 | 229 | # Coding guideline 230 | We adopt Google C++ Style Guide ( https://google.github.io/styleguide/cppguide.html ). 231 | 232 | ``` 233 | $ pip install cpplint 234 | $ cpplint --recursive . 235 | ``` 236 | 237 | # Acknowledgements 238 | We would like to thank Mr. Minoru Nakamura for his comprehensive document 239 | on Infiniband Verbs API. (http://www.nminoru.jp/~nminoru/network/infiniband/) (In Japanese) 240 | 241 | # LICENSE 242 | MIT License 243 | -------------------------------------------------------------------------------- /examples/allreduce.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (C) 2017-2018 by Preferred Networks, Inc. All right reserved. 2 | 3 | #include 4 | 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | #ifdef USE_CUDA 11 | #include 12 | #include 13 | #include 14 | #include 15 | #endif 16 | 17 | #include "ibcomm/ibverbs_communicator.h" 18 | #include "ibcomm/util.h" // CUDACHECK 19 | 20 | // 256 [MiB] of allreduce float32 vector 21 | #define ARRAY_LENGTH 67108864 22 | 23 | // processes per node 24 | // PPN must be 1 in this prototype implementation 25 | // When PPN is not 1, this implementation isn't optimized. 
26 | #define PPN 1 27 | 28 | // warmup times 29 | #define WARMUP 3 30 | 31 | double inline GetTime() { 32 | struct timespec ts; 33 | clock_gettime(CLOCK_MONOTONIC_RAW, &ts); 34 | 35 | return ts.tv_sec + 1e-9 * ts.tv_nsec; 36 | } 37 | 38 | int main(int argc, char* argv[]) { 39 | MPI_Init(&argc, &argv); 40 | 41 | int mpi_rank, mpi_size; 42 | MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); 43 | MPI_Comm_size(MPI_COMM_WORLD, &mpi_size); 44 | 45 | #ifdef USE_CUDA 46 | int intra_rank = mpi_rank % PPN; 47 | CUDACHECK(cudaSetDevice(intra_rank)); 48 | #endif 49 | 50 | IBVerbsCommunicator comm(mpi_size); 51 | 52 | std::vector qps(mpi_size * 3); 53 | 54 | for (int i = 0; i < mpi_size; i++) { 55 | if (i == mpi_rank) { 56 | continue; 57 | } 58 | ProcessInfo pinfo = comm.CreateQueuePair(i); 59 | qps[i * 3 + 0] = pinfo.lid; 60 | qps[i * 3 + 1] = pinfo.qp_n; 61 | qps[i * 3 + 2] = pinfo.psn; 62 | } 63 | 64 | MPI_Alltoall(MPI_IN_PLACE, 3, MPI_UINT32_T, qps.data(), 3, MPI_UINT32_T, 65 | MPI_COMM_WORLD); 66 | 67 | for (int i = 0; i < mpi_size; i++) { 68 | if (i == mpi_rank) { 69 | comm.RegisterMyself(i); 70 | } else { 71 | ProcessInfo pinfo; 72 | pinfo.lid = qps[i * 3 + 0]; 73 | pinfo.qp_n = qps[i * 3 + 1]; 74 | pinfo.psn = qps[i * 3 + 2]; 75 | comm.RegisterQueuePair(i, pinfo); 76 | } 77 | } 78 | 79 | #ifdef USE_CUDA 80 | thrust::host_vector sendbuf(ARRAY_LENGTH); 81 | thrust::host_vector recvbuf(ARRAY_LENGTH); 82 | #else 83 | std::vector sendbuf(ARRAY_LENGTH); 84 | std::vector recvbuf(ARRAY_LENGTH); 85 | #endif 86 | 87 | // fixed seed 88 | std::mt19937 mt(0); 89 | // To avoid overflow, use short range of number. 90 | std::uniform_int_distribution rand(-1000, 1000); 91 | 92 | for (int i = 0; i < ARRAY_LENGTH; i++) { 93 | sendbuf[i] = rand(mt); 94 | } 95 | 96 | std::vector answer(ARRAY_LENGTH, 0); 97 | MPI_Allreduce(sendbuf.data(), answer.data(), ARRAY_LENGTH, MPI_FLOAT, MPI_SUM, 98 | MPI_COMM_WORLD); 99 | 100 | double start, end; 101 | 102 | #ifdef USE_CUDA 103 | thrust::device_vector gpu_sendbuf(ARRAY_LENGTH); 104 | thrust::device_vector gpu_recvbuf(ARRAY_LENGTH); 105 | gpu_sendbuf = sendbuf; 106 | 107 | for (int i = 0; i < WARMUP + 1; i++) { 108 | start = GetTime(); 109 | comm.AllreduceRingCuda(thrust::raw_pointer_cast(gpu_sendbuf.data()), 110 | thrust::raw_pointer_cast(gpu_recvbuf.data()), 111 | ARRAY_LENGTH); 112 | /* 113 | comm.AllreduceRabenseifnerCuda(thrust::raw_pointer_cast(gpu_sendbuf.data()), 114 | thrust::raw_pointer_cast(gpu_recvbuf.data()), 115 | ARRAY_LENGTH); 116 | */ 117 | end = GetTime(); 118 | } 119 | 120 | recvbuf = gpu_recvbuf; 121 | #else 122 | for (int i = 0; i < WARMUP + 1; i++) { 123 | start = GetTime(); 124 | comm.AllreduceRing(sendbuf.data(), recvbuf.data(), ARRAY_LENGTH); 125 | // comm.AllreduceRabenseifner(sendbuf.data(), recvbuf.data(), ARRAY_LENGTH); 126 | end = GetTime(); 127 | } 128 | #endif 129 | 130 | bool ok = true; 131 | for (int i = 0; i < ARRAY_LENGTH; i++) { 132 | if (fabs(answer[i] - recvbuf[i]) > 1e-6) { 133 | ok = false; 134 | std::cout << "wrong at " << mpi_rank << " rank " << i << " element " 135 | << answer[i] << ":" << recvbuf[i] << std::endl; 136 | } else { 137 | // std::cerr << "ok at " << mpi_rank << " rank " << i << " 138 | // element " << answer[i] << ":" << recvbuf[i] << std::endl; 139 | } 140 | } 141 | 142 | std::cout << "rank: " << mpi_rank << (ok ? 
" OK" : " FAIL") << std::endl; 143 | 144 | MPI_Finalize(); 145 | 146 | if (mpi_rank == 0) printf("elapsed time : %e [s]\n", end - start); 147 | 148 | return 0; 149 | } 150 | -------------------------------------------------------------------------------- /ibcomm/allreduce_cpu_impl.h: -------------------------------------------------------------------------------- 1 | // Copyright (C) 2017-2018 by Preferred Networks, Inc. All right reserved. 2 | 3 | #pragma once 4 | 5 | #include 6 | 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | 13 | #include "ibcomm/ibverbs_communicator.h" 14 | #include "ibcomm/util.h" 15 | 16 | template 17 | void _reduce_inplace(T* result, const T* value, size_t len_elements) { 18 | for (size_t i = 0; i < len_elements; i++) { 19 | result[i] += value[i]; 20 | } 21 | } 22 | 23 | template 24 | void IBVerbsCommunicator::AllreduceRing(const T* sendbuf, T* recvbuf, 25 | size_t len_elements) { 26 | if (world_size_ == 1) { 27 | memcpy(recvbuf, sendbuf, sizeof(T) * len_elements); 28 | return; 29 | } 30 | 31 | auto ranges = SplitBuffer(len_elements, sizeof(T)); 32 | auto rank_to_chunk = GetRankToChunk(ranges); 33 | 34 | auto chunks = ranges.size(); 35 | 36 | int from_rank = (my_rank_ - 1 + world_size_) % world_size_; 37 | int to_rank = (my_rank_ + 1) % world_size_; 38 | 39 | std::queue reg_q; 40 | for (int i = 0; i < world_size_ - 1; i++) { 41 | // [my_rank_ - 1, my_rank_) 42 | int rank = (my_rank_ - i - 1 + world_size_) % world_size_; 43 | 44 | for (auto it = rank_to_chunk[rank].rbegin(); 45 | it != rank_to_chunk[rank].rend(); ++it) { 46 | reg_q.push(*it); 47 | } 48 | } 49 | 50 | // send_mrs' mr needs SendPoll before deregistration 51 | // (used in ReduceScatter, AllGather). 52 | // However, recv_mrs' mr can deregistration immediately (used in AllGather). 
53 | std::queue send_mrs; 54 | std::queue recv_mrs; 55 | 56 | // cached mrs 57 | std::vector mrs(chunks, NULL); 58 | 59 | // HCA's Q length 60 | int send_q_elements = 0; 61 | int recv_q_elements = 0; 62 | 63 | std::queue first_send_q; 64 | std::queue first_send_q_buffering; 65 | 66 | for (auto it = rank_to_chunk[my_rank_].rbegin(); 67 | it != rank_to_chunk[my_rank_].rend(); ++it) { 68 | first_send_q.push(*it); 69 | } 70 | 71 | int current_recv_i = reg_q.front(); 72 | bool reduce_scatter_done = false; 73 | 74 | // ReduceScatter 75 | while (!reduce_scatter_done) { 76 | while ((reg_q.empty() || recv_q_elements > 0) && RecvPoll(from_rank)) { 77 | recv_q_elements--; 78 | 79 | auto range = ranges[current_recv_i]; 80 | size_t offset_elements = range.first; 81 | size_t elements = (range.second - range.first); 82 | size_t bytes = elements * sizeof(T); 83 | 84 | _reduce_inplace(recvbuf + offset_elements, sendbuf + offset_elements, 85 | elements); 86 | 87 | if (current_recv_i == rank_to_chunk[to_rank].front()) { 88 | reduce_scatter_done = true; 89 | } 90 | 91 | if (!(rank_to_chunk[to_rank].front() <= current_recv_i && 92 | current_recv_i <= rank_to_chunk[to_rank].back())) { 93 | if (first_send_q.empty() && first_send_q_buffering.empty()) { 94 | SendRegistered(to_rank, recvbuf + offset_elements, 95 | mrs[current_recv_i], bytes, false); 96 | send_q_elements++; 97 | } else { 98 | first_send_q_buffering.push(current_recv_i); 99 | } 100 | } 101 | 102 | current_recv_i = (current_recv_i - 1 + chunks) % chunks; 103 | } 104 | 105 | if (!reg_q.empty()) { 106 | int recv_key = reg_q.front(); 107 | reg_q.pop(); 108 | 109 | auto range = ranges[recv_key]; 110 | size_t offset_elements = range.first; 111 | size_t elements = (range.second - range.first); 112 | size_t bytes = elements * sizeof(T); 113 | 114 | mrs[recv_key] = RegisterRecvBuf(recvbuf + offset_elements, bytes); 115 | 116 | RecvRegistered(from_rank, recvbuf + offset_elements, mrs[recv_key], bytes, 117 | false); 118 | recv_q_elements++; 119 | } 120 | 121 | if (!first_send_q.empty()) { 122 | int i = first_send_q.front(); 123 | first_send_q.pop(); 124 | 125 | auto range = ranges[i]; 126 | 127 | size_t offset_elements = range.first; 128 | size_t elements = (range.second - range.first); 129 | size_t bytes = elements * sizeof(T); 130 | 131 | auto mr = RegisterSendBuf(sendbuf + offset_elements, bytes); 132 | 133 | SendRegistered(to_rank, sendbuf + offset_elements, mr, bytes, false); 134 | send_q_elements++; 135 | send_mrs.push(mr); 136 | } else { 137 | while (!first_send_q_buffering.empty()) { 138 | int i = first_send_q_buffering.front(); 139 | first_send_q_buffering.pop(); 140 | 141 | auto range = ranges[i]; 142 | size_t offset_elements = range.first; 143 | size_t elements = (range.second - range.first); 144 | size_t bytes = elements * sizeof(T); 145 | 146 | SendRegistered(to_rank, recvbuf + offset_elements, mrs[i], bytes, 147 | false); 148 | send_q_elements++; 149 | } 150 | } 151 | 152 | while (SendPoll(to_rank)) { 153 | send_q_elements--; 154 | if (!send_mrs.empty()) { 155 | PopMrAndDereg(&send_mrs); 156 | } 157 | } 158 | 159 | for (auto it = rank_to_chunk[my_rank_].begin(); 160 | it != rank_to_chunk[my_rank_].end(); ++it) { 161 | auto range = ranges[*it]; 162 | size_t offset_elements = range.first; 163 | size_t elements = (range.second - range.first); 164 | size_t bytes = elements * sizeof(T); 165 | 166 | if (mrs[*it] == NULL) { 167 | mrs[*it] = RegisterRecvBuf(recvbuf + offset_elements, bytes); 168 | 169 | // when 1 chunk is registered, exit this loop to 
recv early. 170 | break; 171 | } 172 | } 173 | } 174 | 175 | // need sync before AllGather 176 | assert(recv_q_elements == 0); 177 | while (send_q_elements != 0) { 178 | SendWait(to_rank); 179 | send_q_elements--; 180 | 181 | if (!send_mrs.empty()) { 182 | PopMrAndDereg(&send_mrs); 183 | } 184 | } 185 | 186 | // AllGather 187 | for (int i = 0; i < world_size_; i++) { 188 | int rank = (1 + my_rank_ - i + world_size_) % world_size_; 189 | 190 | for (auto it = rank_to_chunk[rank].rbegin(); 191 | it != rank_to_chunk[rank].rend(); ++it) { 192 | auto range = ranges[*it]; 193 | size_t offset_elements = range.first; 194 | size_t elements = (range.second - range.first); 195 | size_t bytes = elements * sizeof(T); 196 | 197 | if (rank != (my_rank_ + 1) % world_size_) { 198 | RecvRegistered(from_rank, recvbuf + offset_elements, mrs[*it], bytes, 199 | false); 200 | 201 | while (!RecvPoll(from_rank)) { 202 | if (SendPoll(to_rank)) { 203 | send_q_elements--; 204 | 205 | assert(!send_mrs.empty()); 206 | 207 | PopMrAndDereg(&send_mrs); 208 | } else if (!recv_mrs.empty()) { 209 | PopMrAndDereg(&recv_mrs); 210 | } 211 | } 212 | } 213 | 214 | if (rank != (my_rank_ + 2) % world_size_) { 215 | SendRegistered(to_rank, recvbuf + offset_elements, mrs[*it], bytes, 216 | false); 217 | send_mrs.push(mrs[*it]); 218 | send_q_elements++; 219 | } else { 220 | recv_mrs.push(mrs[*it]); 221 | } 222 | 223 | mrs[*it] = NULL; 224 | } 225 | } 226 | 227 | while (send_q_elements != 0) { 228 | SendWait(to_rank); 229 | send_q_elements--; 230 | 231 | if (!send_mrs.empty()) { 232 | PopMrAndDereg(&send_mrs); 233 | } 234 | } 235 | assert(send_mrs.empty()); 236 | 237 | assert(recv_q_elements == 0); 238 | 239 | while (!recv_mrs.empty()) { 240 | PopMrAndDereg(&recv_mrs); 241 | } 242 | 243 | return; 244 | } 245 | 246 | template 247 | void IBVerbsCommunicator::AllreduceRabenseifner(const T* sendbuf, T* recvbuf, 248 | size_t len_elements) { 249 | if (world_size_ == 1) { 250 | memcpy(recvbuf, sendbuf, sizeof(T) * len_elements); 251 | return; 252 | } 253 | 254 | int world_size_exp = util::GetExpOfTwo(world_size_); 255 | 256 | // check world_size is power-of-2 or not 257 | if (world_size_exp == 0) { 258 | util::IbcommError(__FILE__, __LINE__, 259 | util::IBCOMM_ERROR_CODE::NOT_SUPPORTED, 260 | "Currently, rabenseifner's algorithm doesn't support " 261 | "non-power-of-2 processes."); 262 | } 263 | 264 | std::vector> ranges; 265 | for (int i = 0; i < world_size_; i++) { 266 | int range_length = util::ceilDiv(len_elements, world_size_); 267 | 268 | ranges.emplace_back( 269 | range_length * i, 270 | std::min(range_length * (i + 1), static_cast(len_elements))); 271 | } 272 | 273 | T* tmp_buffer = static_cast(malloc(sizeof(T) * len_elements)); 274 | if (tmp_buffer == NULL) { 275 | std::cerr << "Allocation of tmp-buffer failed" << std::endl; 276 | return; 277 | } 278 | 279 | memcpy(recvbuf, sendbuf, sizeof(T) * len_elements); 280 | 281 | // process maintains current chunk_range [start_chunk, end_chunk). 
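  // Illustrative walk-through with 4 processes (world_size_exp == 2), seen
  // from rank 0 (chunk indices refer to `ranges`, one chunk per rank):
  //   step 0: pair rank 1 -> send chunks [0,2), recv + reduce chunks [2,4)
  //   step 1: pair rank 2 -> send chunk  [2,3), recv + reduce chunk  [3,4)
  // After recursive halving, rank 0 holds the fully reduced chunk [3,4);
  // the recursive-doubling AllGather below expands the owned range back to
  // [0,4), pairing with the same partners in reverse order.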
282 | int start_chunk = 0; 283 | int end_chunk = ranges.size(); 284 | // Reduce-Scatter (recursive halving) 285 | for (int step = 0; step < world_size_exp; step++) { 286 | int to_rank = my_rank_ ^ (1 << step); 287 | 288 | int send_chunk_start, send_chunk_end, recv_chunk_start, recv_chunk_end; 289 | if (my_rank_ < to_rank) { 290 | // I send front chunk 291 | send_chunk_start = start_chunk; 292 | send_chunk_end = end_chunk - (end_chunk - start_chunk) / 2; 293 | recv_chunk_start = send_chunk_end; 294 | recv_chunk_end = end_chunk; 295 | } else { 296 | // I send back chunk 297 | recv_chunk_start = start_chunk; 298 | recv_chunk_end = end_chunk - (end_chunk - start_chunk) / 2; 299 | send_chunk_start = recv_chunk_end; 300 | send_chunk_end = end_chunk; 301 | } 302 | 303 | Send(to_rank, recvbuf + ranges[send_chunk_start].first, 304 | sizeof(T) * 305 | (ranges[(send_chunk_end - 1 + ranges.size()) % ranges.size()] 306 | .second - 307 | ranges[send_chunk_start].first), 308 | false); 309 | 310 | Recv(to_rank, tmp_buffer + ranges[recv_chunk_start].first, 311 | sizeof(T) * 312 | (ranges[(recv_chunk_end - 1 + ranges.size()) % ranges.size()] 313 | .second - 314 | ranges[recv_chunk_start].first), 315 | false); 316 | 317 | RecvWait(to_rank); 318 | 319 | _reduce_inplace( 320 | recvbuf + ranges[recv_chunk_start].first, 321 | tmp_buffer + ranges[recv_chunk_start].first, 322 | ranges[(recv_chunk_end - 1 + ranges.size()) % ranges.size()].second - 323 | ranges[recv_chunk_start].first); 324 | 325 | SendWait(to_rank); 326 | 327 | start_chunk = recv_chunk_start; 328 | end_chunk = recv_chunk_end; 329 | } 330 | 331 | // AllGather (recursive doubling) 332 | for (int step = 0; step < world_size_exp; step++) { 333 | int to_rank = my_rank_ ^ (1 << (world_size_exp - step - 1)); 334 | 335 | int send_chunk_start, send_chunk_end, recv_chunk_start, recv_chunk_end; 336 | if (my_rank_ > to_rank) { 337 | // I send front chunk 338 | send_chunk_start = start_chunk; 339 | send_chunk_end = end_chunk; 340 | recv_chunk_start = send_chunk_end; 341 | recv_chunk_end = recv_chunk_start + end_chunk - start_chunk; 342 | } else { 343 | // I send back chunk 344 | send_chunk_start = start_chunk; 345 | send_chunk_end = end_chunk; 346 | recv_chunk_end = send_chunk_start; 347 | recv_chunk_start = recv_chunk_end - (end_chunk - start_chunk); 348 | } 349 | 350 | Send(to_rank, recvbuf + ranges[send_chunk_start].first, 351 | sizeof(T) * 352 | (ranges[(send_chunk_end - 1 + ranges.size()) % ranges.size()] 353 | .second - 354 | ranges[send_chunk_start].first), 355 | false); 356 | 357 | Recv(to_rank, recvbuf + ranges[recv_chunk_start].first, 358 | sizeof(T) * 359 | (ranges[(recv_chunk_end - 1 + ranges.size()) % ranges.size()] 360 | .second - 361 | ranges[recv_chunk_start].first), 362 | false); 363 | 364 | RecvWait(to_rank); 365 | SendWait(to_rank); 366 | 367 | start_chunk = std::min(send_chunk_start, recv_chunk_start); 368 | end_chunk = std::max(send_chunk_end, recv_chunk_end); 369 | } 370 | 371 | free(tmp_buffer); 372 | } 373 | -------------------------------------------------------------------------------- /ibcomm/allreduce_cuda_impl.h: -------------------------------------------------------------------------------- 1 | // Copyright (C) 2017-2018 by Preferred Networks, Inc. All right reserved. 
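//
// CUDA implementations of the Ring and Rabenseifner AllReduce algorithms.
// Device data is staged through a pool of ibverbs-registered staging buffers
// (see ibcomm/memory_pool.h), each associated with a CUDA stream, so that
// memory copies, reduction kernels and InfiniBand transfers can overlap.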
2 | 3 | #pragma once 4 | 5 | #ifdef USE_CUDA 6 | 7 | #include 8 | #include 9 | 10 | #include 11 | #include 12 | #include 13 | 14 | #include "ibcomm/ibverbs_communicator.h" 15 | #include "ibcomm/memory_pool.h" 16 | #include "ibcomm/util.h" 17 | 18 | #ifdef USE_TRACE 19 | #define TRACE(NAME) util::trace(&NAME); 20 | #else 21 | #define TRACE(NAME) 22 | #endif 23 | 24 | #define THREADS 512 25 | 26 | template 27 | __global__ void _reduce_inplace_cuda(T* result, const T* value, 28 | size_t len_elements) { 29 | #pragma unroll 30 | for (auto index = blockDim.x * blockIdx.x + threadIdx.x; index < len_elements; 31 | index += blockDim.x * gridDim.x) 32 | result[index] += value[index]; 33 | } 34 | 35 | template 36 | void IBVerbsCommunicator::AllreduceRingCuda(const T* sendbuf, T* recvbuf, 37 | size_t len_elements) { 38 | TRACE(trace_other_); 39 | 40 | if (world_size_ == 1) { 41 | CUDACHECK(cudaMemcpy(recvbuf, sendbuf, sizeof(T) * len_elements, 42 | cudaMemcpyDefault)); 43 | TRACE(trace_other_); 44 | 45 | return; 46 | } 47 | 48 | auto ranges = SplitBuffer(len_elements, sizeof(T)); 49 | auto rank_to_chunk = GetRankToChunk(ranges); 50 | 51 | auto chunks = ranges.size(); 52 | 53 | int from_rank = (my_rank_ - 1 + world_size_) % world_size_; 54 | int to_rank = (my_rank_ + 1) % world_size_; 55 | 56 | auto controller = 57 | pool_->GetController((ranges[0].second - ranges[0].first) * sizeof(T)); 58 | 59 | std::vector chunk_to_memory(chunks, NULL); 60 | 61 | std::queue reg_q; 62 | // Reduce-Scatter's recv 63 | for (int i = 0; i < world_size_ - 1; i++) { 64 | // [my_rank_ - 1, my_rank_) 65 | int rank = (my_rank_ - i - 1 + world_size_) % world_size_; 66 | 67 | for (auto it = rank_to_chunk[rank].rbegin(); 68 | it != rank_to_chunk[rank].rend(); ++it) { 69 | reg_q.push(*it); 70 | } 71 | } 72 | 73 | // AllGather's recv 74 | for (int i = 0; i < world_size_ - 1; i++) { 75 | // [my_rank_, my_rank_ -1, ..., my_rank_ + 1) 76 | int rank = (my_rank_ - i + world_size_) % world_size_; 77 | for (auto it = rank_to_chunk[rank].rbegin(); 78 | it != rank_to_chunk[rank].rend(); ++it) { 79 | reg_q.push(*it); 80 | } 81 | } 82 | 83 | std::queue first_send_q; 84 | std::queue first_send_q_buffering; 85 | 86 | for (auto it = rank_to_chunk[my_rank_].rbegin(); 87 | it != rank_to_chunk[my_rank_].rend(); ++it) { 88 | first_send_q.push(*it); 89 | } 90 | 91 | int current_recv_i = reg_q.front(); 92 | 93 | std::queue wait_send_q; 94 | std::queue wait_reduction_q; 95 | std::queue wait_send_completion_q; 96 | int remaining_recv_q_length = 0; 97 | bool reduce_scatter_phase = true; 98 | // last rank (end of allgather) 99 | const int final_rank = (my_rank_ + 2) % world_size_; 100 | 101 | TRACE(trace_other_); 102 | 103 | while (true) { 104 | while ((reduce_scatter_phase || wait_reduction_q.empty()) && 105 | RecvPoll(from_rank)) { 106 | TRACE(trace_received_); 107 | 108 | remaining_recv_q_length--; 109 | auto range = ranges[current_recv_i]; 110 | size_t offset_elements = range.first; 111 | size_t elements = (range.second - range.first); 112 | size_t bytes = elements * sizeof(T); 113 | 114 | using util::ceilDiv; 115 | 116 | const auto blocks = 117 | std::min(ceilDiv(elements, (size_t)THREADS), (size_t)(65535)); 118 | 119 | auto& mem = chunk_to_memory[current_recv_i]; 120 | 121 | TRACE(trace_received_); 122 | 123 | if (reduce_scatter_phase) { 124 | TRACE(trace_issue_redu_kernel_); 125 | } else { 126 | TRACE(trace_issue_copy_kernel_); 127 | } 128 | 129 | CUDACHECK(cudaMemcpyAsync(recvbuf + offset_elements, mem->ptr(), bytes, 130 | cudaMemcpyDefault, 
mem->stream())); 131 | if (reduce_scatter_phase) { 132 | _reduce_inplace_cuda<<stream()>>>( 133 | recvbuf + offset_elements, sendbuf + offset_elements, elements); 134 | CUDACHECK(cudaMemcpyAsync(mem->ptr(), recvbuf + offset_elements, bytes, 135 | cudaMemcpyDefault, mem->stream())); 136 | wait_reduction_q.push(current_recv_i); 137 | if (current_recv_i == rank_to_chunk[to_rank].front()) { 138 | reduce_scatter_phase = false; 139 | } 140 | 141 | TRACE(trace_issue_redu_kernel_); 142 | } else { 143 | TRACE(trace_issue_copy_kernel_); 144 | if (current_recv_i < rank_to_chunk[final_rank].front() || 145 | rank_to_chunk[final_rank].back() < current_recv_i) { 146 | TRACE(trace_issue_send_); 147 | 148 | SendRegistered(to_rank, mem->ptr(), mem->mr(), bytes, false); 149 | wait_send_completion_q.push(current_recv_i); 150 | 151 | TRACE(trace_issue_send_); 152 | } else { 153 | // NO NEED SEND because this is last allgather step. 154 | CUDACHECK(cudaStreamSynchronize(mem->stream())); 155 | controller.returnMemory(mem); 156 | mem = NULL; 157 | } 158 | } 159 | current_recv_i = (current_recv_i - 1 + chunks) % chunks; 160 | } 161 | 162 | // This means rank_to_chunk[final_rank].front() == current_recv_i in 163 | // RecvPoll loop (current_recv_i is already decremented) 164 | if (!reduce_scatter_phase && wait_reduction_q.empty() && 165 | current_recv_i == 166 | (rank_to_chunk[final_rank].front() - 1 + chunks) % chunks) 167 | break; // DONE! 168 | 169 | if (!wait_reduction_q.empty() && 170 | cudaStreamQuery(chunk_to_memory[wait_reduction_q.front()]->stream()) == 171 | cudaSuccess) { 172 | TRACE(trace_reduced_); 173 | 174 | int i = wait_reduction_q.front(); 175 | wait_reduction_q.pop(); 176 | 177 | auto range = ranges[i]; 178 | size_t elements = (range.second - range.first); 179 | size_t bytes = elements * sizeof(T); 180 | 181 | TRACE(trace_reduced_); 182 | // This send is reduce-scatter phase send and allgather phase first send, 183 | // thus we can send all chunk. 184 | if (first_send_q.empty() && wait_send_q.empty() && 185 | first_send_q_buffering.empty()) { 186 | TRACE(trace_issue_send_); 187 | 188 | auto mem = chunk_to_memory[i]; 189 | SendRegistered(to_rank, mem->ptr(), mem->mr(), bytes, false); 190 | wait_send_completion_q.push(i); 191 | 192 | TRACE(trace_issue_send_); 193 | } else { 194 | first_send_q_buffering.push(i); 195 | } 196 | } 197 | 198 | // When first_send is not completed, We cannot issue first-send's recv. 
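      // The guard below posts a new recv only while at most two recvs are
      // outstanding and the chunk at the head of reg_q has no pool memory
      // assigned yet; this bounds the number of pool buffers tied up by
      // in-flight recvs.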
199 | if (remaining_recv_q_length <= 2 && !reg_q.empty() && 200 | chunk_to_memory[reg_q.front()] == NULL) { 201 | TRACE(trace_issue_recv_); 202 | 203 | int recv_key = reg_q.front(); 204 | reg_q.pop(); 205 | 206 | auto range = ranges[recv_key]; 207 | size_t elements = (range.second - range.first); 208 | size_t bytes = elements * sizeof(T); 209 | 210 | auto mem = chunk_to_memory[recv_key] = controller.getMemory(); 211 | 212 | remaining_recv_q_length++; 213 | RecvRegistered(from_rank, mem->ptr(), mem->mr(), bytes, false); 214 | 215 | TRACE(trace_issue_recv_); 216 | } 217 | 218 | if (!first_send_q.empty()) { 219 | TRACE(trace_issue_copy_kernel_); 220 | 221 | int i = first_send_q.front(); 222 | first_send_q.pop(); 223 | 224 | auto range = ranges[i]; 225 | size_t offset_elements = range.first; 226 | size_t elements = (range.second - range.first); 227 | size_t bytes = elements * sizeof(T); 228 | 229 | auto mem = chunk_to_memory[i] = controller.getMemory(); 230 | CUDACHECK(cudaMemcpyAsync(mem->ptr(), sendbuf + offset_elements, bytes, 231 | cudaMemcpyDefault, mem->stream())); 232 | wait_send_q.push(i); 233 | 234 | TRACE(trace_issue_copy_kernel_); 235 | } 236 | 237 | if (!wait_send_q.empty() && 238 | cudaStreamQuery(chunk_to_memory[wait_send_q.front()]->stream()) == 239 | cudaSuccess) { 240 | TRACE(trace_issue_send_); 241 | 242 | int i = wait_send_q.front(); 243 | wait_send_q.pop(); 244 | 245 | auto range = ranges[i]; 246 | size_t elements = (range.second - range.first); 247 | size_t bytes = elements * sizeof(T); 248 | 249 | auto mem = chunk_to_memory[i]; 250 | 251 | SendRegistered(to_rank, mem->ptr(), mem->mr(), bytes, false); 252 | 253 | wait_send_completion_q.push(i); 254 | 255 | TRACE(trace_issue_send_); 256 | } else if (first_send_q.empty() && wait_send_q.empty() && 257 | !first_send_q_buffering.empty()) { 258 | TRACE(trace_issue_send_); 259 | while (!first_send_q_buffering.empty()) { 260 | int i = first_send_q_buffering.front(); 261 | first_send_q_buffering.pop(); 262 | 263 | auto range = ranges[i]; 264 | size_t elements = (range.second - range.first); 265 | size_t bytes = elements * sizeof(T); 266 | 267 | auto mem = chunk_to_memory[i]; 268 | 269 | SendRegistered(to_rank, mem->ptr(), mem->mr(), bytes, false); 270 | wait_send_completion_q.push(i); 271 | } 272 | TRACE(trace_issue_send_); 273 | } 274 | 275 | while (SendPoll(to_rank)) { 276 | TRACE(trace_other_); 277 | 278 | int complete_send_chunk_id = wait_send_completion_q.front(); 279 | wait_send_completion_q.pop(); 280 | CUDACHECK(cudaStreamSynchronize( 281 | chunk_to_memory[complete_send_chunk_id]->stream())); 282 | controller.returnMemory(chunk_to_memory[complete_send_chunk_id]); 283 | chunk_to_memory[complete_send_chunk_id] = NULL; 284 | 285 | TRACE(trace_other_); 286 | } 287 | } 288 | 289 | TRACE(trace_other_); 290 | while (!wait_send_completion_q.empty()) { 291 | SendWait(to_rank); 292 | int complete_send_chunk_id = wait_send_completion_q.front(); 293 | wait_send_completion_q.pop(); 294 | // We need sync because memcpy is issued. 295 | CUDACHECK(cudaStreamSynchronize( 296 | chunk_to_memory[complete_send_chunk_id]->stream())); 297 | controller.returnMemory(chunk_to_memory[complete_send_chunk_id]); 298 | chunk_to_memory[complete_send_chunk_id] = NULL; 299 | } 300 | TRACE(trace_other_); 301 | } 302 | 303 | class Chunk { 304 | public: 305 | int range_id_; 306 | 307 | int depth_; 308 | 309 | // pair_rank_ is no meaning in some context. 310 | int pair_rank_; 311 | 312 | // reduce_ is no meaning in some context. 
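  // (In AllreduceRabenseifnerCuda below: for entries on the recv queues,
  //  reduce_ == true marks Reduce-Scatter chunks that still need reduction;
  //  for entries on the send queues it marks AllGather-phase sends.)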
313 | bool reduce_; 314 | 315 | bool last_; 316 | 317 | Chunk(int range_id, int depth, int pair_rank = -1, bool reduce = false, 318 | bool last = false) 319 | : range_id_(range_id), 320 | depth_(depth), 321 | pair_rank_(pair_rank), 322 | reduce_(reduce), 323 | last_(last) {} 324 | }; 325 | 326 | template 327 | void IBVerbsCommunicator::AllreduceRabenseifnerCuda(const T* sendbuf, 328 | T* recvbuf, 329 | size_t len_elements) { 330 | TRACE(trace_other_); 331 | bool first_memcpy_done = false; 332 | CUDACHECK(cudaMemcpyAsync(recvbuf, sendbuf, sizeof(T) * len_elements, 333 | cudaMemcpyDefault)); 334 | 335 | if (world_size_ == 1) { 336 | CUDACHECK(cudaStreamSynchronize(0)); 337 | 338 | TRACE(trace_other_); 339 | return; 340 | } 341 | 342 | int world_size_exp = util::GetExpOfTwo(world_size_); 343 | 344 | // check world_size is power-of-2 or not 345 | if (world_size_exp == 0) { 346 | TRACE(trace_other_); 347 | util::IbcommError(__FILE__, __LINE__, 348 | util::IBCOMM_ERROR_CODE::NOT_SUPPORTED, 349 | "Currently, rabenseifner's algorithm doesn't support " 350 | "non-power-of-2 processes."); 351 | } 352 | 353 | auto ranges = SplitBuffer(len_elements, sizeof(T)); 354 | auto rank_to_chunk = GetRankToChunk(ranges); 355 | 356 | auto chunks = ranges.size(); 357 | 358 | auto controller = 359 | pool_->GetController((ranges[0].second - ranges[0].first) * sizeof(T)); 360 | std::vector chunk_to_memory(chunks, NULL); 361 | 362 | std::vector reduced_chunk(chunks, false); 363 | std::queue recv_q; 364 | std::queue wait_recv_q; 365 | std::queue send_q; 366 | std::queue wait_send_q; 367 | std::queue wait_send_copy_q; 368 | std::queue wait_reduction_q; 369 | std::queue first_send_q; 370 | std::queue first_send_q_buffering; 371 | 372 | // GPU working memory size check and realloc if need 373 | if (tmp_gpu_buffer_size_ < sizeof(T) * (ranges[0].second - ranges[0].first)) { 374 | // dealloc 375 | util::IbcommWarning( 376 | __FILE__, __LINE__, 377 | "IBCOMM_GPU_WORK_MEMORY_SIZE is smaller than chunk size.\n" 378 | "runtime-reallocation is occured."); 379 | CUDACHECK(cudaFree(tmp_gpu_buffer_)); 380 | 381 | // alloc 382 | tmp_gpu_buffer_size_ = sizeof(T) * (ranges[0].second - ranges[0].first); 383 | CUDACHECK(cudaMalloc(static_cast(&tmp_gpu_buffer_), 384 | tmp_gpu_buffer_size_)); 385 | } 386 | 387 | int start_rank = 0; 388 | int end_rank = world_size_; 389 | 390 | // Reduce-Scatter (recursive halving) 391 | for (int step = 0; step < world_size_exp; step++) { 392 | int to_rank = my_rank_ ^ (1 << step); 393 | 394 | int send_rank_start, send_rank_end, recv_rank_start, recv_rank_end; 395 | if (my_rank_ < to_rank) { 396 | // I send front rank 397 | send_rank_start = start_rank; 398 | send_rank_end = end_rank - (end_rank - start_rank) / 2; 399 | recv_rank_start = send_rank_end; 400 | recv_rank_end = end_rank; 401 | } else { 402 | // I send back rank 403 | recv_rank_start = start_rank; 404 | recv_rank_end = end_rank - (end_rank - start_rank) / 2; 405 | send_rank_start = recv_rank_end; 406 | send_rank_end = end_rank; 407 | } 408 | 409 | for (int recv_rank = recv_rank_start; recv_rank < recv_rank_end; 410 | recv_rank++) { 411 | for (auto chunk : rank_to_chunk[recv_rank]) { 412 | recv_q.emplace(chunk, step, to_rank, true); 413 | } 414 | } 415 | for (int send_rank = send_rank_start; send_rank < send_rank_end; 416 | send_rank++) { 417 | for (auto chunk : rank_to_chunk[send_rank]) { 418 | if (step == 0) { 419 | first_send_q.emplace(chunk, step, to_rank); 420 | } else { 421 | send_q.emplace(chunk, step, to_rank); 422 | } 423 | } 424 | } 
425 | 426 | start_rank = recv_rank_start; 427 | end_rank = recv_rank_end; 428 | } 429 | 430 | // AllGather (recursive doubling) 431 | for (int step = 0; step < world_size_exp; step++) { 432 | int to_rank = my_rank_ ^ (1 << (world_size_exp - step - 1)); 433 | 434 | int send_rank_start, send_rank_end, recv_rank_start, recv_rank_end; 435 | if (my_rank_ > to_rank) { 436 | // I send front rank 437 | send_rank_start = start_rank; 438 | send_rank_end = end_rank; 439 | recv_rank_start = send_rank_end; 440 | recv_rank_end = recv_rank_start + end_rank - start_rank; 441 | } else { 442 | // I send back rank 443 | send_rank_start = start_rank; 444 | send_rank_end = end_rank; 445 | recv_rank_end = send_rank_start; 446 | recv_rank_start = recv_rank_end - (end_rank - start_rank); 447 | } 448 | 449 | for (int recv_rank = recv_rank_start; recv_rank < recv_rank_end; 450 | recv_rank++) { 451 | for (auto chunk : rank_to_chunk[recv_rank]) { 452 | recv_q.emplace(chunk, step + world_size_exp, to_rank); 453 | } 454 | } 455 | 456 | for (int send_rank = send_rank_start; send_rank < send_rank_end; 457 | send_rank++) { 458 | for (auto chunk : rank_to_chunk[send_rank]) { 459 | send_q.emplace(chunk, step + world_size_exp, to_rank, true, 460 | step == (world_size_exp - 1)); 461 | } 462 | } 463 | 464 | start_rank = std::min(send_rank_start, recv_rank_start); 465 | end_rank = std::max(send_rank_end, recv_rank_end); 466 | } 467 | 468 | TRACE(trace_other_); 469 | 470 | while (!recv_q.empty() || !wait_recv_q.empty() || !send_q.empty() || 471 | !wait_send_q.empty() || !wait_send_copy_q.empty() || 472 | !wait_reduction_q.empty() || !first_send_q.empty() || 473 | !first_send_q_buffering.empty()) { 474 | while (wait_reduction_q.empty() && !wait_recv_q.empty() && 475 | RecvPoll(wait_recv_q.front().pair_rank_)) { 476 | TRACE(trace_received_); 477 | 478 | auto received = wait_recv_q.front(); 479 | wait_recv_q.pop(); 480 | 481 | auto range = ranges[received.range_id_]; 482 | size_t offset_elements = range.first; 483 | size_t elements = (range.second - range.first); 484 | size_t bytes = elements * sizeof(T); 485 | 486 | auto& mem = chunk_to_memory[received.range_id_]; 487 | 488 | TRACE(trace_received_); 489 | 490 | if (received.reduce_) { 491 | // Reduce-Scatter phase 492 | 493 | if (!first_memcpy_done) { 494 | TRACE(trace_other_); 495 | CUDACHECK(cudaStreamSynchronize(0)); 496 | first_memcpy_done = true; 497 | TRACE(trace_other_); 498 | } 499 | 500 | TRACE(trace_issue_redu_kernel_); 501 | 502 | // tmp_gpu_buffer <- mem 503 | CUDACHECK(cudaMemcpyAsync(tmp_gpu_buffer_, mem->ptr(), bytes, 504 | cudaMemcpyDefault, mem->stream())); 505 | 506 | const auto blocks = 507 | std::min(util::ceilDiv(elements, (size_t)THREADS), (size_t)(65535)); 508 | 509 | // recvbuf += tmp_gpu_buffer ( on GPU ) 510 | _reduce_inplace_cuda<<stream()>>>( 511 | recvbuf + offset_elements, static_cast(tmp_gpu_buffer_), 512 | elements); 513 | 514 | // mem <- recvbuf 515 | CUDACHECK(cudaMemcpyAsync(mem->ptr(), recvbuf + offset_elements, bytes, 516 | cudaMemcpyDefault, mem->stream())); 517 | 518 | received.depth_++; 519 | wait_reduction_q.push(received); 520 | 521 | TRACE(trace_issue_redu_kernel_); 522 | } else { 523 | // AllGather phase 524 | 525 | TRACE(trace_issue_copy_kernel_); 526 | 527 | // recvbuf <- mem 528 | CUDACHECK(cudaMemcpyAsync(recvbuf + offset_elements, mem->ptr(), bytes, 529 | cudaMemcpyDefault, mem->stream())); 530 | 531 | reduced_chunk[received.range_id_] = true; 532 | 533 | TRACE(trace_issue_copy_kernel_); 534 | } 535 | } 536 | 537 | if 
(!wait_reduction_q.empty() && 538 | cudaStreamQuery( 539 | chunk_to_memory[wait_reduction_q.front().range_id_]->stream()) == 540 | cudaSuccess) { 541 | TRACE(trace_reduced_); 542 | 543 | auto reduced = wait_reduction_q.front(); 544 | wait_reduction_q.pop(); 545 | 546 | auto range = ranges[reduced.range_id_]; 547 | size_t elements = (range.second - range.first); 548 | size_t bytes = elements * sizeof(T); 549 | 550 | auto& mem = chunk_to_memory[reduced.range_id_]; 551 | 552 | TRACE(trace_reduced_); 553 | 554 | if (!send_q.empty() && send_q.front().range_id_ == reduced.range_id_ && 555 | send_q.front().depth_ == reduced.depth_) { 556 | auto send_range = send_q.front(); 557 | send_q.pop(); 558 | 559 | if (first_send_q.empty() && wait_send_copy_q.empty() && 560 | first_send_q_buffering.empty()) { 561 | TRACE(trace_issue_send_); 562 | 563 | SendRegistered(send_range.pair_rank_, mem->ptr(), mem->mr(), bytes, 564 | false); 565 | wait_send_q.push(send_range); 566 | 567 | TRACE(trace_issue_send_); 568 | } else { 569 | first_send_q_buffering.push(send_range); 570 | } 571 | 572 | if (send_range.reduce_) { 573 | // AllGather phase 574 | reduced_chunk[send_range.range_id_] = true; 575 | } 576 | } else { 577 | CUDACHECK(cudaStreamSynchronize(mem->stream())); 578 | controller.returnMemory(mem); 579 | mem = NULL; 580 | } 581 | } 582 | 583 | if (wait_recv_q.size() <= 2 && !recv_q.empty() && 584 | chunk_to_memory[recv_q.front().range_id_] == NULL) { 585 | TRACE(trace_issue_recv_); 586 | 587 | auto recv_range = recv_q.front(); 588 | recv_q.pop(); 589 | 590 | auto range = ranges[recv_range.range_id_]; 591 | size_t elements = (range.second - range.first); 592 | size_t bytes = elements * sizeof(T); 593 | 594 | auto mem = chunk_to_memory[recv_range.range_id_] = controller.getMemory(); 595 | 596 | RecvRegistered(recv_range.pair_rank_, mem->ptr(), mem->mr(), bytes, 597 | false); 598 | wait_recv_q.push(recv_range); 599 | 600 | TRACE(trace_issue_recv_); 601 | } 602 | 603 | while (first_send_q.empty() && wait_send_copy_q.empty() && 604 | first_send_q_buffering.empty() && !send_q.empty() && 605 | reduced_chunk[send_q.front().range_id_]) { 606 | TRACE(trace_issue_send_); 607 | 608 | auto send_range = send_q.front(); 609 | send_q.pop(); 610 | 611 | auto range = ranges[send_range.range_id_]; 612 | size_t elements = (range.second - range.first); 613 | size_t bytes = elements * sizeof(T); 614 | 615 | auto mem = chunk_to_memory[send_range.range_id_]; 616 | 617 | SendRegistered(send_range.pair_rank_, mem->ptr(), mem->mr(), bytes, 618 | false); 619 | wait_send_q.push(send_range); 620 | 621 | TRACE(trace_issue_send_); 622 | } 623 | 624 | if (!first_send_q.empty()) { 625 | TRACE(trace_issue_copy_kernel_); 626 | 627 | auto send_range = first_send_q.front(); 628 | first_send_q.pop(); 629 | 630 | auto range = ranges[send_range.range_id_]; 631 | size_t offset_elements = range.first; 632 | size_t elements = (range.second - range.first); 633 | size_t bytes = elements * sizeof(T); 634 | 635 | auto mem = chunk_to_memory[send_range.range_id_] = controller.getMemory(); 636 | CUDACHECK(cudaMemcpyAsync(mem->ptr(), sendbuf + offset_elements, bytes, 637 | cudaMemcpyDefault, mem->stream())); 638 | 639 | wait_send_copy_q.push(send_range); 640 | 641 | TRACE(trace_issue_copy_kernel_); 642 | } 643 | 644 | if (!wait_send_copy_q.empty() && 645 | cudaStreamQuery( 646 | chunk_to_memory[wait_send_copy_q.front().range_id_]->stream()) == 647 | cudaSuccess) { 648 | TRACE(trace_issue_send_); 649 | 650 | auto send_range = wait_send_copy_q.front(); 651 | 
wait_send_copy_q.pop(); 652 | 653 | auto range = ranges[send_range.range_id_]; 654 | size_t elements = (range.second - range.first); 655 | size_t bytes = elements * sizeof(T); 656 | 657 | auto mem = chunk_to_memory[send_range.range_id_]; 658 | 659 | SendRegistered(send_range.pair_rank_, mem->ptr(), mem->mr(), bytes, 660 | false); 661 | wait_send_q.push(send_range); 662 | 663 | TRACE(trace_issue_send_); 664 | } 665 | 666 | while (first_send_q.empty() && wait_send_copy_q.empty() && 667 | !first_send_q_buffering.empty()) { 668 | TRACE(trace_issue_send_); 669 | 670 | auto send_range = first_send_q_buffering.front(); 671 | first_send_q_buffering.pop(); 672 | 673 | auto range = ranges[send_range.range_id_]; 674 | size_t elements = (range.second - range.first); 675 | size_t bytes = elements * sizeof(T); 676 | 677 | auto mem = chunk_to_memory[send_range.range_id_]; 678 | 679 | SendRegistered(send_range.pair_rank_, mem->ptr(), mem->mr(), bytes, 680 | false); 681 | wait_send_q.push(send_range); 682 | 683 | TRACE(trace_issue_send_); 684 | } 685 | 686 | while (!wait_send_q.empty() && SendPoll(wait_send_q.front().pair_rank_)) { 687 | TRACE(trace_other_); 688 | 689 | auto send_range = wait_send_q.front(); 690 | wait_send_q.pop(); 691 | 692 | if (send_range.reduce_ && !send_range.last_) { 693 | // AllGather phase and non-last AllGather send 694 | // We need to send a chunk which is already sent, 695 | // so we still hold data on CPU-memory. 696 | } else { 697 | auto& mem = chunk_to_memory[send_range.range_id_]; 698 | 699 | CUDACHECK(cudaStreamSynchronize(mem->stream())); 700 | controller.returnMemory(mem); 701 | mem = NULL; 702 | } 703 | 704 | TRACE(trace_other_); 705 | } 706 | } 707 | 708 | TRACE(trace_other_); 709 | for (auto& mem : chunk_to_memory) { 710 | if (mem != NULL) { 711 | CUDACHECK(cudaStreamSynchronize(mem->stream())); 712 | controller.returnMemory(mem); 713 | mem = NULL; 714 | } 715 | } 716 | TRACE(trace_other_); 717 | } 718 | 719 | #endif 720 | -------------------------------------------------------------------------------- /ibcomm/allreduce_tester.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (C) 2017-2018 by Preferred Networks, Inc. All right reserved. 2 | // allreduce_tester.cpp 3 | // 4 | // A helper program for allreduce integration test. 5 | // 6 | // Usage: 7 | // $ mpiexec -n ${NP} ./allreduce_tester [algorithm] [buffer size] [init expr] 8 | // [check expr] 9 | // 10 | // * [algorithm] : Name of Allreduce Algorithm. "ring" and 11 | // "rabenseifner" is supported. 12 | // 13 | // * [buffer size] : Size of the target buffer. Suffix "k", "m", "g" are 14 | // allowed. 15 | // ex.) 1024, 128M, 10k 16 | // * [init expr] : Target bufffer is initialized with this expression in 17 | // an elementwise manner. 18 | // For details of expressions, see 19 | // https://github.com/codeplea/tinyexpr Additional 20 | // variables are supported: 21 | // - p : Process rank 22 | // - np : Number of processes (e.g. mpi_size) 23 | // - n : Number of elementso of the target buffer 24 | // (NOT size in bytes) 25 | // - nb : Size of the target buffer in bytes 26 | // - i : Index of the element in buffer 27 | // ex.) 28 | // (1) init_expr = "1", n = 4, np = 2 29 | // Rank 0: [1, 1, 1, 1] 30 | // Rank 1: [1, 1, 1, 1] 31 | // (2) init_expr = "1/np*p+i", n = 4, np = 2 32 | // Rank 0: [0.0, 1.0, 2.0, 3.0] # [1/2*0+0, 33 | // 1/2*0+1, ...] Rank 1: [0.5, 1.5, 2.5, 3.5] # 34 | // [1/2*1+0, 1/2*1+1, ... 
] 35 | // 36 | // * [check expr] : Target buffer is checked after Allreduce operation 37 | // using check expr. 38 | // The grammar of expressions is identical to [init 39 | // expr] 40 | // 41 | 42 | #include 43 | #include 44 | #include 45 | #include 46 | 47 | #include 48 | #include 49 | #include 50 | #include 51 | #include 52 | #include 53 | #include 54 | #include 55 | 56 | #include "grumpi/grumpi.hpp" 57 | #include "ibcomm/ibverbs_communicator.h" 58 | #include "ibcomm/util.h" 59 | #include "tinyexpr/tinyexpr.h" 60 | 61 | class TinyExpr { 62 | std::vector vars_; 63 | te_expr *expr_; 64 | 65 | public: 66 | TinyExpr() : vars_(), expr_(nullptr) {} 67 | 68 | void set_variable(const char *name, const void *address, int type, 69 | void *context) { 70 | te_variable va; 71 | va.name = name; 72 | va.address = address; 73 | va.type = type; 74 | va.context = context; 75 | vars_.push_back(va); 76 | } 77 | 78 | void compile(const std::string &expr) { compile(expr.c_str()); } 79 | 80 | void compile(const char *expr) { 81 | int err; 82 | expr_ = 83 | te_compile(expr, vars_.data(), static_cast(vars_.size()), &err); 84 | 85 | if (!expr_) { 86 | std::stringstream ss; 87 | ss << "Invalid expression: '" << expr << "'"; 88 | throw std::runtime_error(ss.str()); 89 | } 90 | } 91 | 92 | double eval() { 93 | if (!expr_) { 94 | throw std::runtime_error("Expression must be compiled before eval()"); 95 | } 96 | return te_eval(expr_); 97 | } 98 | }; 99 | 100 | class Communicator { 101 | MPI_Comm mpi_comm_; 102 | int size_; 103 | int rank_; 104 | std::unique_ptr ibcomm_; 105 | 106 | public: 107 | explicit Communicator(MPI_Comm comm = MPI_COMM_WORLD) : mpi_comm_(comm) { 108 | MPI_Comm_rank(mpi_comm_, &rank_); 109 | MPI_Comm_size(mpi_comm_, &size_); 110 | ibcomm_.reset(new IBVerbsCommunicator(size_)); 111 | 112 | std::vector qps(size_ * 3); 113 | 114 | for (int i = 0; i < size_; i++) { 115 | if (i == rank_) { 116 | continue; 117 | } 118 | ProcessInfo pinfo = ibcomm_->CreateQueuePair(i); 119 | qps[i * 3 + 0] = pinfo.lid; 120 | qps[i * 3 + 1] = pinfo.qp_n; 121 | qps[i * 3 + 2] = pinfo.psn; 122 | } 123 | 124 | MPI_Alltoall(MPI_IN_PLACE, 3, MPI_UINT32_T, qps.data(), 3, MPI_UINT32_T, 125 | comm); 126 | 127 | for (int i = 0; i < size_; i++) { 128 | if (i == rank_) { 129 | ibcomm_->RegisterMyself(i); 130 | } else { 131 | ProcessInfo pinfo; 132 | pinfo.lid = qps[i * 3 + 0]; 133 | pinfo.qp_n = qps[i * 3 + 1]; 134 | pinfo.psn = qps[i * 3 + 2]; 135 | ibcomm_->RegisterQueuePair(i, pinfo); 136 | } 137 | } 138 | } 139 | 140 | void die(const std::string &errmsg, int retcode = 1) { 141 | if (rank_ == 0) { 142 | std::cerr << errmsg << std::endl; 143 | } 144 | exit(retcode); 145 | } 146 | 147 | template 148 | void allreduce(const std::string &algorithm_type, 149 | const thrust::device_vector &sendbuf_d, 150 | thrust::device_vector *recvbuf_d) { 151 | if (recvbuf_d == nullptr) 152 | util::IbcommError(__FILE__, __LINE__, 153 | util::IBCOMM_ERROR_CODE::INVALID_ARGUMENT, 154 | "recvbuf_d is nullptr."); 155 | 156 | if (algorithm_type == "ring") { 157 | ibcomm_->AllreduceRingCuda(sendbuf_d.data().get(), 158 | recvbuf_d->data().get(), sendbuf_d.size()); 159 | } else if (algorithm_type == "rabenseifner") { 160 | ibcomm_->AllreduceRabenseifnerCuda( 161 | sendbuf_d.data().get(), recvbuf_d->data().get(), sendbuf_d.size()); 162 | } else { 163 | die("Error: Unsupported algorithm"); 164 | } 165 | } 166 | }; 167 | 168 | /** 169 | * Application main class 170 | */ 171 | template 172 | class AllreduceTester { 173 | public: 174 | using ElemType = T; 175 
| 176 | private: 177 | MPI_Comm comm_; 178 | int mpi_rank_; 179 | int mpi_size_; 180 | 181 | Communicator ibcomm_; 182 | 183 | // target array size 184 | size_t array_nbytes_; // array size in bytes 185 | size_t num_elems_; // array length 186 | 187 | double var_p; // "p" variable in expressions (process rank) 188 | double var_np; // "np" variable in expressions (number of processes, i.e. 189 | // mpi_size) 190 | double var_n; // "n" variable in expressions (number of elements in buffer) 191 | double var_nb; // "nb" variable in expressions (size of buffer in bytes) 192 | 193 | std::unique_ptr init_expr_; 194 | std::unique_ptr check_expr_; 195 | 196 | void usage(int argc, char **argv) { 197 | if (mpi_rank_ == 0) { 198 | std::cerr << "Usage: " << argv[0] << " " 199 | << "[algorithm] " 200 | << "[array size (Bytes)] " 201 | << "[init expr] " 202 | << "[check expr]" << std::endl; 203 | } 204 | exit(-1); 205 | } 206 | 207 | void die(const std::string &errmsg, int retcode = 1) { 208 | if (mpi_rank_ == 0) { 209 | std::cerr << errmsg << std::endl; 210 | } 211 | exit(retcode); 212 | } 213 | 214 | size_t parse_nbytes(const char *src) { 215 | size_t i = 0; 216 | size_t n = 0; 217 | 218 | while (std::isdigit(src[i])) { 219 | n = n * 10 + (src[i] - '0'); 220 | i++; 221 | } 222 | 223 | if (src[i] != 0) { 224 | switch (src[i]) { 225 | case 'k': 226 | case 'K': 227 | n *= 1024; 228 | break; 229 | case 'm': 230 | case 'M': 231 | n *= 1024 * 1024; 232 | break; 233 | case 'g': 234 | case 'G': 235 | n *= 1024 * 1024 * 1024; 236 | break; 237 | default: 238 | std::stringstream ss; 239 | ss << "Cannot parse an array size: '" << src << "'" << std::endl; 240 | die(ss.str()); 241 | } 242 | } 243 | 244 | i++; 245 | 246 | return n; 247 | } 248 | 249 | std::tuple parse_args( 250 | int argc, char **argv) { 251 | if (argc != 5) { 252 | usage(argc, argv); 253 | } 254 | // Parse argument 1 255 | std::string algorithm = argv[1]; 256 | 257 | // Parse argument 2 (array length (bytes)) 258 | size_t nbytes = parse_nbytes(argv[2]); 259 | 260 | // Parse argument 3 (initializing expression) 261 | std::string init_expr = argv[3]; 262 | 263 | std::string check_expr = argv[4]; 264 | 265 | return std::make_tuple(algorithm, nbytes, init_expr, check_expr); 266 | } 267 | 268 | void setup_sendbuf(thrust::host_vector *buf, 269 | const std::string &init_expr_str) { 270 | if (buf == nullptr) 271 | util::IbcommError(__FILE__, __LINE__, 272 | util::IBCOMM_ERROR_CODE::INVALID_ARGUMENT, 273 | "buf is nullptr."); 274 | 275 | TinyExpr expr; 276 | 277 | expr.set_variable("p", &var_p, TE_VARIABLE, nullptr); 278 | expr.set_variable("np", &var_np, TE_VARIABLE, nullptr); 279 | expr.set_variable("n", &var_n, TE_VARIABLE, nullptr); 280 | expr.set_variable("nb", &var_nb, TE_VARIABLE, nullptr); 281 | 282 | for (size_t i = 0; i < buf->size(); i++) { 283 | double var_i = i; 284 | expr.set_variable("i", &var_i, TE_VARIABLE, nullptr); 285 | expr.compile(init_expr_str); 286 | (*buf)[i] = expr.eval(); 287 | } 288 | } 289 | 290 | std::tuple> check_recvbuf( 291 | const thrust::host_vector &buf, const std::string &check_expr_str) { 292 | constexpr double eps = 1e-12; 293 | 294 | std::vector msgs; 295 | bool check_ok = true; 296 | 297 | TinyExpr expr; 298 | 299 | expr.set_variable("p", &var_p, TE_VARIABLE, nullptr); 300 | expr.set_variable("np", &var_np, TE_VARIABLE, nullptr); 301 | expr.set_variable("n", &var_n, TE_VARIABLE, nullptr); 302 | expr.set_variable("nb", &var_nb, TE_VARIABLE, nullptr); 303 | 304 | for (size_t i = 0; i < buf.size(); i++) { 305 | double 
var_i = i; 306 | expr.set_variable("i", &var_i, TE_VARIABLE, nullptr); 307 | expr.compile(check_expr_str); 308 | 309 | double res = buf[i]; 310 | double ans = expr.eval(); 311 | 312 | if (std::abs(res - ans) > eps) { 313 | std::stringstream ss; 314 | ss << "Error: Element [" << i << "] must be " << ans 315 | << " but actually is " << res; 316 | msgs.push_back(ss.str()); 317 | check_ok = false; 318 | } 319 | } 320 | 321 | return std::make_tuple(check_ok, msgs); 322 | } 323 | 324 | void report_errors(bool check_ok, const std::vector &msgs) { 325 | for (int i = 0; i < mpi_size_; i++) { 326 | if (i == mpi_rank_) { 327 | if (!check_ok) { 328 | size_t report_num = std::min(msgs.size(), (size_t)1000); 329 | for (size_t i = 0; i < report_num; i++) { 330 | std::cerr << "[Rank " << mpi_rank_ << "] " << msgs[i] << std::endl; 331 | } 332 | } 333 | } 334 | MPI_Barrier(MPI_COMM_WORLD); 335 | } 336 | } 337 | 338 | thrust::host_vector run_allreduce( 339 | const std::string &algorithm_type, 340 | const thrust::host_vector &sendbuf) { 341 | thrust::host_vector recvbuf(num_elems_); 342 | thrust::device_vector recvbuf_d(num_elems_); 343 | thrust::device_vector sendbuf_d(num_elems_); 344 | 345 | sendbuf_d = sendbuf; 346 | 347 | ibcomm_.allreduce(algorithm_type, sendbuf_d, &recvbuf_d); 348 | 349 | recvbuf = recvbuf_d; 350 | return recvbuf; 351 | } 352 | 353 | public: 354 | explicit AllreduceTester(MPI_Comm comm) : comm_(comm), ibcomm_(comm_) { 355 | MPI_Comm_size(comm_, &mpi_size_); 356 | MPI_Comm_rank(comm_, &mpi_rank_); 357 | } 358 | 359 | // Check allreduce 360 | int run(int argc, char **argv) { 361 | std::string algorithm_type, init_expr_str, check_expr_str; 362 | std::tie(algorithm_type, array_nbytes_, init_expr_str, check_expr_str) = 363 | parse_args(argc, argv); 364 | 365 | std::vector supported_algorithms = {"ring", "rabenseifner"}; 366 | 367 | if (std::find(supported_algorithms.begin(), supported_algorithms.end(), 368 | algorithm_type) == supported_algorithms.end()) { 369 | std::stringstream ss; 370 | ss << "Error: Unsupported algorithm " << algorithm_type << std::endl; 371 | 372 | ss << "Supported algorithms: "; 373 | for (auto algo : supported_algorithms) { 374 | ss << algo << ", "; 375 | } 376 | die(ss.str()); 377 | } 378 | 379 | if (array_nbytes_ < sizeof(ElemType)) { 380 | if (mpi_rank_ == 0) { 381 | std::cerr << "Warning: specified array size is " 382 | << "smaller than the element size(" << sizeof(ElemType) 383 | << "). " 384 | << "Ceiling it up to " << sizeof(ElemType) << " [bytes]" 385 | << std::endl; 386 | } 387 | array_nbytes_ = sizeof(ElemType); 388 | } 389 | 390 | num_elems_ = array_nbytes_ / sizeof(ElemType); 391 | var_np = mpi_size_; 392 | var_p = mpi_rank_; 393 | var_n = array_nbytes_ / sizeof(ElemType); 394 | var_nb = array_nbytes_; 395 | 396 | thrust::host_vector sendbuf(num_elems_); 397 | setup_sendbuf(&sendbuf, init_expr_str); 398 | 399 | auto recvbuf = run_allreduce(algorithm_type, sendbuf); 400 | 401 | bool check_ok; 402 | std::vector msgs; 403 | std::tie(check_ok, msgs) = check_recvbuf(recvbuf, check_expr_str); 404 | 405 | report_errors(check_ok, msgs); 406 | 407 | int status_all = (check_ok ? 
0 : 1); 408 | MPI_Allreduce(&status_all, &status_all, 1, MPI_INT, MPI_SUM, 409 | MPI_COMM_WORLD); 410 | 411 | return status_all; 412 | } 413 | }; 414 | 415 | int main(int argc, char **argv) { 416 | using ElemType = int; 417 | 418 | MPI_Init(&argc, &argv); 419 | 420 | int ngpus = -1; 421 | CUDACHECK(cudaGetDeviceCount(&ngpus)); 422 | 423 | int intra_rank; 424 | grumpi::Comm_local_rank(MPI_COMM_WORLD, &intra_rank); 425 | 426 | CUDACHECK(cudaSetDevice(intra_rank % ngpus)); 427 | 428 | AllreduceTester tester(MPI_COMM_WORLD); 429 | 430 | int status = tester.run(argc, argv); 431 | 432 | MPI_Finalize(); 433 | 434 | return status; 435 | } 436 | -------------------------------------------------------------------------------- /ibcomm/ibverbs_communicator.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (C) 2017-2018 by Preferred Networks, Inc. All right reserved. 2 | 3 | #include "ibcomm/ibverbs_communicator.h" 4 | 5 | #include 6 | 7 | #include 8 | #include 9 | #include 10 | 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | 17 | #include "ibcomm/util.h" 18 | 19 | #ifdef USE_CUDA 20 | #include 21 | #include 22 | #endif 23 | 24 | IBVerbsCommunicator::IBVerbsCommunicator() {} 25 | IBVerbsCommunicator::IBVerbsCommunicator(int world_size) { Init(world_size); } 26 | 27 | void IBVerbsCommunicator::Init(int world_size) { 28 | if (initialized_) { 29 | util::IbcommWarning(__FILE__, __LINE__, 30 | "IBVerbsCommunicator is already initialized."); 31 | return; 32 | } 33 | 34 | int ret = ibv_fork_init(); 35 | if (ret) { 36 | int errno_backup = errno; 37 | util::IbcommWarning(__FILE__, __LINE__, "Failure: ibv_fork_init (errno=%d)", 38 | errno_backup); 39 | } 40 | 41 | int devices; 42 | dev_list_ = ibv_get_device_list(&devices); 43 | 44 | if (!dev_list_) { 45 | int errno_backup = errno; 46 | util::IbcommError(__FILE__, __LINE__, 47 | util::IBCOMM_ERROR_CODE::IBVERBS_ERROR, 48 | "Failure: ibv_get_device_list (errno=%d)", errno_backup); 49 | } 50 | 51 | for (int i = 0; i < devices; i++) { 52 | ibv_device* device = dev_list_[i]; 53 | 54 | if (!device) { 55 | continue; 56 | } 57 | 58 | context_ = ibv_open_device(device); 59 | 60 | if (!context_) { 61 | continue; 62 | } 63 | } 64 | 65 | if (!context_) { 66 | util::IbcommError(__FILE__, __LINE__, 67 | util::IBCOMM_ERROR_CODE::IBVERBS_ERROR, 68 | "Failure: No HCA can use"); 69 | } 70 | 71 | ret = ibv_query_port(context_, 1, &port_attr_); 72 | 73 | if (ret != 0 || port_attr_.lid == 0) { 74 | // error handling 75 | util::IbcommError(__FILE__, __LINE__, 76 | util::IBCOMM_ERROR_CODE::IBVERBS_ERROR, 77 | "Failure: ibv_query_port"); 78 | } 79 | 80 | pd_ = ibv_alloc_pd(context_); 81 | 82 | if (!pd_) { 83 | util::IbcommError(__FILE__, __LINE__, 84 | util::IBCOMM_ERROR_CODE::IBVERBS_ERROR, 85 | "Failure: ibv_alloc_pd"); 86 | } 87 | 88 | world_size_ = world_size; 89 | pq_world_ = std::vector(world_size_); 90 | psn_world_ = std::vector(world_size_); 91 | mr_world_ = std::vector>( 92 | world_size_, std::pair(NULL, NULL)); 93 | 94 | #ifdef USE_CUDA 95 | PrepareMemoryPool(); 96 | #endif 97 | 98 | initialized_ = true; 99 | } 100 | 101 | IBVerbsCommunicator::~IBVerbsCommunicator() { 102 | // release queues 103 | for (size_t i = 0; i < pq_world_.size(); i++) { 104 | if (pq_world_[i].queue_pair != NULL) 105 | ibv_destroy_qp(pq_world_[i].queue_pair); 106 | 107 | if (pq_world_[i].recv_complete_queue != NULL) 108 | ibv_destroy_cq(pq_world_[i].recv_complete_queue); 109 | 110 | if (pq_world_[i].send_complete_queue != 
NULL) 111 | ibv_destroy_cq(pq_world_[i].send_complete_queue); 112 | } 113 | 114 | // release memory region which is nonblocking-io but not freed. 115 | for (size_t i = 0; i < mr_world_.size(); i++) { 116 | // send 117 | if (mr_world_[i].first != NULL) { 118 | ibv_dereg_mr(mr_world_[i].first); 119 | mr_world_[i].first = NULL; 120 | } 121 | 122 | // recv 123 | if (mr_world_[i].second != NULL) { 124 | ibv_dereg_mr(mr_world_[i].second); 125 | mr_world_[i].second = NULL; 126 | } 127 | } 128 | 129 | #ifdef USE_CUDA 130 | pool_.reset(); 131 | #endif 132 | 133 | if (pd_ != NULL) { 134 | ibv_dealloc_pd(pd_); 135 | } 136 | 137 | if (context_ != NULL) { 138 | ibv_close_device(context_); 139 | } 140 | 141 | if (dev_list_ != NULL) { 142 | ibv_free_device_list(dev_list_); 143 | } 144 | 145 | #ifdef USE_TRACE 146 | DumpTrace(); 147 | #endif 148 | 149 | #ifdef USE_CUDA 150 | if (tmp_gpu_buffer_ != NULL) { 151 | CUDACHECK(cudaFree(tmp_gpu_buffer_)); 152 | tmp_gpu_buffer_ = NULL; 153 | } 154 | #endif 155 | } 156 | 157 | namespace { 158 | double timeDiffMillis(const struct timespec& t1, const struct timespec& t2) { 159 | return (t2.tv_sec - t1.tv_sec) * 1e3 + (t2.tv_nsec - t1.tv_nsec) * 1e-6; 160 | } 161 | 162 | void DumpTraceFromVector(std::ofstream& stream, struct timespec origin, 163 | const std::vector& vector) { 164 | for (int i = 0; i < vector.size(); i += 2) { 165 | stream << timeDiffMillis(origin, vector[i]) << ","; 166 | stream << timeDiffMillis(vector[i], vector[i + 1]) << ","; 167 | } 168 | } 169 | } // namespace 170 | 171 | void IBVerbsCommunicator::DumpTrace() const { 172 | std::stringstream ss; 173 | const char* base = getenv("IBCOMM_TRACE_FILE"); 174 | base = base ? base : "ibcomm_trace"; 175 | ss << base << "_" << my_rank_ << ".dat"; 176 | std::ofstream trace_log; 177 | trace_log.open(ss.str().c_str()); 178 | 179 | if (!trace_log.good()) { 180 | std::cerr << "ERROR: ofstream open failed" << std::endl; 181 | } else { 182 | trace_log << std::scientific; 183 | 184 | trace_log << "received,"; 185 | DumpTraceFromVector(trace_log, trace_start_, trace_received_); 186 | trace_log << std::endl; 187 | 188 | trace_log << "reduced,"; 189 | DumpTraceFromVector(trace_log, trace_start_, trace_reduced_); 190 | trace_log << std::endl; 191 | 192 | trace_log << "issue-send,"; 193 | DumpTraceFromVector(trace_log, trace_start_, trace_issue_send_); 194 | trace_log << std::endl; 195 | 196 | trace_log << "issue-copy-kernel,"; 197 | DumpTraceFromVector(trace_log, trace_start_, trace_issue_copy_kernel_); 198 | trace_log << std::endl; 199 | 200 | trace_log << "issue-redu-kernel,"; 201 | DumpTraceFromVector(trace_log, trace_start_, trace_issue_redu_kernel_); 202 | trace_log << std::endl; 203 | 204 | trace_log << "issue-recv,"; 205 | DumpTraceFromVector(trace_log, trace_start_, trace_issue_recv_); 206 | trace_log << std::endl; 207 | 208 | trace_log << "other,"; 209 | DumpTraceFromVector(trace_log, trace_start_, trace_other_); 210 | trace_log << std::endl; 211 | } 212 | } 213 | 214 | void IBVerbsCommunicator::SetTimerBase() { 215 | clock_gettime(CLOCK_MONOTONIC_RAW, &trace_start_); 216 | } 217 | 218 | namespace { 219 | void modify_qp(struct ibv_qp* qp, uint32_t src_psn, uint16_t dest_lid, 220 | uint32_t dest_pqn, uint32_t dest_psn) { 221 | int ret; 222 | 223 | struct ibv_qp_attr init_attr = {}; 224 | init_attr.qp_state = IBV_QPS_INIT; 225 | init_attr.port_num = 1; 226 | init_attr.qp_access_flags = IBV_ACCESS_LOCAL_WRITE; 227 | 228 | ret = ibv_modify_qp( 229 | qp, &init_attr, 230 | IBV_QP_STATE | IBV_QP_PKEY_INDEX | 
IBV_QP_PORT | IBV_QP_ACCESS_FLAGS); 231 | if (ret != 0) { 232 | util::IbcommError(__FILE__, __LINE__, 233 | util::IBCOMM_ERROR_CODE::IBVERBS_ERROR, 234 | "Failure: ibv_modify_qp(1)"); 235 | } 236 | 237 | struct ibv_qp_attr rtr_attr = {}; 238 | rtr_attr.qp_state = IBV_QPS_RTR; 239 | rtr_attr.path_mtu = IBV_MTU_4096; 240 | rtr_attr.dest_qp_num = dest_pqn; 241 | rtr_attr.rq_psn = dest_psn; 242 | rtr_attr.max_dest_rd_atomic = 0; 243 | 244 | // retry_speed faster 245 | rtr_attr.min_rnr_timer = 1; 246 | 247 | // retry_speed slower 248 | // rtr_attr.min_rnr_timer = 0; 249 | 250 | rtr_attr.ah_attr.is_global = 0; 251 | rtr_attr.ah_attr.dlid = dest_lid; 252 | rtr_attr.ah_attr.sl = 0; 253 | rtr_attr.ah_attr.src_path_bits = 0; 254 | rtr_attr.ah_attr.port_num = 1; 255 | 256 | ret = ibv_modify_qp(qp, &rtr_attr, 257 | IBV_QP_STATE | IBV_QP_AV | IBV_QP_PATH_MTU | 258 | IBV_QP_DEST_QPN | IBV_QP_RQ_PSN | 259 | IBV_QP_MAX_DEST_RD_ATOMIC | IBV_QP_MIN_RNR_TIMER); 260 | if (ret != 0) { 261 | util::IbcommError(__FILE__, __LINE__, 262 | util::IBCOMM_ERROR_CODE::IBVERBS_ERROR, 263 | "Failure: ibv_modify_qp(2)"); 264 | } 265 | 266 | struct ibv_qp_attr rts_attr = {}; 267 | rts_attr.qp_state = IBV_QPS_RTS; 268 | rts_attr.timeout = 1; 269 | rts_attr.retry_cnt = 7; 270 | rts_attr.rnr_retry = 7; 271 | rts_attr.sq_psn = src_psn; 272 | rts_attr.max_rd_atomic = 0; 273 | 274 | ret = ibv_modify_qp(qp, &rts_attr, 275 | IBV_QP_STATE | IBV_QP_TIMEOUT | IBV_QP_RETRY_CNT | 276 | IBV_QP_RNR_RETRY | IBV_QP_SQ_PSN | 277 | IBV_QP_MAX_QP_RD_ATOMIC); 278 | if (ret != 0) { 279 | util::IbcommError(__FILE__, __LINE__, 280 | util::IBCOMM_ERROR_CODE::IBVERBS_ERROR, 281 | "Failure: ibv_modify_qp(3)"); 282 | } 283 | } 284 | } // namespace 285 | 286 | struct ProcessInfo IBVerbsCommunicator::RegisterProcess( 287 | int dest_rank, struct ProcessInfo pinfo) { 288 | struct ProcessInfo my_pinfo = CreateQueuePair(dest_rank); 289 | 290 | modify_qp(pq_world_[dest_rank].queue_pair, psn_world_[dest_rank], pinfo.lid, 291 | pinfo.qp_n, pinfo.psn); 292 | 293 | return my_pinfo; 294 | } 295 | 296 | struct ProcessInfo IBVerbsCommunicator::CreateQueuePair(int dest_rank) { 297 | ibv_cq *send_complete_queue, *recv_complete_queue; 298 | 299 | send_complete_queue = ibv_create_cq(context_, 1024 * 1024, NULL, NULL, 0); 300 | recv_complete_queue = ibv_create_cq(context_, 1024 * 1024, NULL, NULL, 0); 301 | 302 | if (!send_complete_queue) { 303 | util::IbcommError(__FILE__, __LINE__, 304 | util::IBCOMM_ERROR_CODE::IBVERBS_ERROR, 305 | "Failure: ibv_create_cq of send cq"); 306 | } 307 | 308 | if (!recv_complete_queue) { 309 | util::IbcommError(__FILE__, __LINE__, 310 | util::IBCOMM_ERROR_CODE::IBVERBS_ERROR, 311 | "Failure: ibv_create_cq of recv cq"); 312 | } 313 | 314 | uint32_t my_psn = random() % 0xFFFFFF; 315 | psn_world_[dest_rank] = my_psn; 316 | 317 | struct ibv_qp_init_attr qp_init_attr = {}; 318 | qp_init_attr.qp_type = IBV_QPT_RC; 319 | qp_init_attr.send_cq = send_complete_queue; 320 | qp_init_attr.recv_cq = recv_complete_queue; 321 | qp_init_attr.cap.max_send_wr = 8192; 322 | qp_init_attr.cap.max_recv_wr = 8192; 323 | qp_init_attr.cap.max_send_sge = 1; 324 | qp_init_attr.cap.max_recv_sge = 1; 325 | qp_init_attr.sq_sig_all = 1; 326 | 327 | struct ibv_qp* queue_pair; 328 | queue_pair = ibv_create_qp(pd_, &qp_init_attr); 329 | 330 | if (!queue_pair) { 331 | util::IbcommError(__FILE__, __LINE__, 332 | util::IBCOMM_ERROR_CODE::IBVERBS_ERROR, 333 | "Failure: ibv_create_cq"); 334 | } 335 | 336 | pq_world_[dest_rank] = 337 | ProcessQueue(send_complete_queue, 
recv_complete_queue, queue_pair); 338 | 339 | struct ProcessInfo my_pinfo = {}; 340 | my_pinfo.lid = port_attr_.lid; 341 | my_pinfo.psn = my_psn; 342 | my_pinfo.qp_n = queue_pair->qp_num; 343 | 344 | return my_pinfo; 345 | } 346 | 347 | void IBVerbsCommunicator::RegisterQueuePair(int dest_rank, 348 | struct ProcessInfo pinfo) { 349 | const auto& pqueue = pq_world_[dest_rank]; 350 | 351 | modify_qp(pqueue.queue_pair, psn_world_[dest_rank], pinfo.lid, pinfo.qp_n, 352 | pinfo.psn); 353 | } 354 | 355 | void IBVerbsCommunicator::RegisterMyself(int my_rank) { 356 | this->my_rank_ = my_rank; 357 | } 358 | 359 | struct ibv_mr* IBVerbsCommunicator::RegisterSendBuf(const void* buf, 360 | size_t len) { 361 | struct ibv_mr* mr_buf = ibv_reg_mr(pd_, const_cast(buf), len, 0); 362 | if (mr_buf == 0) { 363 | util::IbcommError(__FILE__, __LINE__, 364 | util::IBCOMM_ERROR_CODE::IBVERBS_ERROR, 365 | "Failure: ibv_reg_mr on send"); 366 | } 367 | 368 | return mr_buf; 369 | } 370 | 371 | void IBVerbsCommunicator::Send(int dest_rank, const void* buf, size_t len, 372 | bool blocking) { 373 | auto& save = mr_world_[dest_rank].first; 374 | 375 | if (save != NULL) { 376 | util::IbcommError(__FILE__, __LINE__, 377 | util::IBCOMM_ERROR_CODE::NOT_SUPPORTED, 378 | "SendWait must be called before next non-blocking send."); 379 | } 380 | 381 | save = RegisterSendBuf(buf, len); 382 | 383 | SendRegistered(dest_rank, buf, save, len, blocking); 384 | } 385 | 386 | void IBVerbsCommunicator::SendRegistered(int dest_rank, const void* buf, 387 | struct ibv_mr* mr_buf, size_t len, 388 | bool blocking) { 389 | int ret; 390 | struct ibv_sge sge = {}; 391 | sge.addr = (uint64_t)(uintptr_t)buf; 392 | sge.length = len; 393 | sge.lkey = mr_buf->lkey; 394 | 395 | struct ibv_send_wr send_wr = {}; 396 | send_wr.wr_id = (uint64_t)(uintptr_t)buf; 397 | send_wr.sg_list = &sge; 398 | send_wr.num_sge = 1; 399 | send_wr.opcode = IBV_WR_SEND; 400 | 401 | const auto& pq = pq_world_[dest_rank]; 402 | 403 | struct ibv_send_wr* bad_wr; 404 | ret = ibv_post_send(pq.queue_pair, &send_wr, &bad_wr); 405 | if (ret != 0) { 406 | util::IbcommError(__FILE__, __LINE__, 407 | util::IBCOMM_ERROR_CODE::IBVERBS_ERROR, 408 | "Failure: ibv_post_send"); 409 | } 410 | 411 | if (blocking) { 412 | SendWait(dest_rank); 413 | } 414 | } 415 | 416 | bool IBVerbsCommunicator::SendPoll(int dest_rank) { 417 | int ret; 418 | const auto& pq = pq_world_[dest_rank]; 419 | struct ibv_wc wc = {}; 420 | bool ok = false; 421 | 422 | ret = ibv_poll_cq(pq.send_complete_queue, 1, &wc); 423 | if (ret == 0) return false; 424 | 425 | if (ret < 0) { 426 | util::IbcommError(__FILE__, __LINE__, 427 | util::IBCOMM_ERROR_CODE::IBVERBS_ERROR, 428 | "Failure: ibv_poll_cq"); 429 | } 430 | 431 | if (wc.status != IBV_WC_SUCCESS && wc.status == IBV_WC_LOC_PROT_ERR) { 432 | util::IbcommError(__FILE__, __LINE__, 433 | util::IBCOMM_ERROR_CODE::IBVERBS_ERROR, 434 | "Failure: send completion error %d", wc.status); 435 | } 436 | 437 | switch (wc.opcode) { 438 | case IBV_WC_SEND: 439 | ok = true; 440 | 441 | break; 442 | 443 | default: 444 | util::IbcommError(__FILE__, __LINE__, 445 | util::IBCOMM_ERROR_CODE::IBVERBS_ERROR, 446 | "Failure: SendPoll %d", wc.opcode); 447 | } 448 | 449 | if (ok) { 450 | // unregister memory region from non-blocking io wait list 451 | auto& save = mr_world_[dest_rank].first; 452 | if (save != NULL) { 453 | ibv_dereg_mr(save); 454 | save = NULL; 455 | } 456 | 457 | return true; 458 | } 459 | 460 | return false; 461 | } 462 | 463 | void IBVerbsCommunicator::SendWait(int 
dest_rank) { 464 | while (!SendPoll(dest_rank)) { 465 | } 466 | 467 | // unregister memory region from non-blocking io wait list 468 | auto& save = mr_world_[dest_rank].first; 469 | if (save != NULL) { 470 | ibv_dereg_mr(save); 471 | save = NULL; 472 | } 473 | } 474 | 475 | struct ibv_mr* IBVerbsCommunicator::RegisterRecvBuf(void* buf, size_t len) { 476 | struct ibv_mr* mr_buf = ibv_reg_mr(pd_, buf, len, IBV_ACCESS_LOCAL_WRITE); 477 | if (mr_buf == 0) { 478 | util::IbcommError(__FILE__, __LINE__, 479 | util::IBCOMM_ERROR_CODE::IBVERBS_ERROR, 480 | "Failure: ibv_reg_mr on recv"); 481 | } 482 | 483 | return mr_buf; 484 | } 485 | 486 | void IBVerbsCommunicator::Recv(int src_rank, void* buf, size_t len, 487 | bool blocking) { 488 | auto& save = mr_world_[src_rank].second; 489 | 490 | if (save != NULL) { 491 | util::IbcommError(__FILE__, __LINE__, 492 | util::IBCOMM_ERROR_CODE::NOT_SUPPORTED, 493 | "RecvWait must be called before next non-blocking send."); 494 | } 495 | 496 | save = RegisterRecvBuf(buf, len); 497 | 498 | RecvRegistered(src_rank, buf, save, len, blocking); 499 | } 500 | 501 | void IBVerbsCommunicator::RecvRegistered(int src_rank, const void* buf, 502 | struct ibv_mr* mr_buf, size_t len, 503 | bool blocking) { 504 | struct ibv_sge sge = {}; 505 | sge.addr = (uint64_t)(uintptr_t)buf; 506 | sge.length = len; 507 | sge.lkey = mr_buf->lkey; 508 | 509 | struct ibv_recv_wr recv_wr = {}; 510 | recv_wr.wr_id = (uint64_t)(uintptr_t)buf; 511 | recv_wr.sg_list = &sge; 512 | recv_wr.num_sge = 1; 513 | 514 | const auto& pq = pq_world_[src_rank]; 515 | 516 | struct ibv_recv_wr* bad_wr; 517 | int ret = ibv_post_recv(pq.queue_pair, &recv_wr, &bad_wr); 518 | if (ret != 0) { 519 | util::IbcommError(__FILE__, __LINE__, 520 | util::IBCOMM_ERROR_CODE::IBVERBS_ERROR, 521 | "Failure: ibv_post_recv"); 522 | } 523 | 524 | if (blocking) { 525 | RecvWait(src_rank); 526 | } 527 | } 528 | 529 | bool IBVerbsCommunicator::RecvPoll(int src_rank) { 530 | int ret; 531 | const auto& pq = pq_world_[src_rank]; 532 | struct ibv_wc wc = {}; 533 | bool ok = false; 534 | 535 | ret = ibv_poll_cq(pq.recv_complete_queue, 1, &wc); 536 | if (ret == 0) return false; 537 | 538 | if (ret < 0) { 539 | util::IbcommError(__FILE__, __LINE__, 540 | util::IBCOMM_ERROR_CODE::IBVERBS_ERROR, 541 | "Failure: ibv_poll_cq"); 542 | } 543 | 544 | if (wc.status != IBV_WC_SUCCESS && wc.status == IBV_WC_LOC_PROT_ERR) { 545 | util::IbcommError(__FILE__, __LINE__, 546 | util::IBCOMM_ERROR_CODE::IBVERBS_ERROR, 547 | "Failure: recv completion error %d", wc.status); 548 | } 549 | 550 | switch (wc.opcode) { 551 | case IBV_WC_RECV: 552 | ok = true; 553 | 554 | break; 555 | 556 | default: 557 | util::IbcommError(__FILE__, __LINE__, 558 | util::IBCOMM_ERROR_CODE::IBVERBS_ERROR, 559 | "Failure: RecvPoll %d", wc.opcode); 560 | } 561 | 562 | if (ok) { 563 | // unregister memory region from non-blocking io wait list 564 | auto& save = mr_world_[src_rank].second; 565 | if (save != NULL) { 566 | ibv_dereg_mr(save); 567 | save = NULL; 568 | } 569 | return true; 570 | } 571 | return false; 572 | } 573 | 574 | void IBVerbsCommunicator::RecvWait(int src_rank) { 575 | while (!RecvPoll(src_rank)) { 576 | } 577 | 578 | // unregister memory region from non-blocking io wait list 579 | auto& save = mr_world_[src_rank].second; 580 | if (save != NULL) { 581 | ibv_dereg_mr(save); 582 | save = NULL; 583 | } 584 | } 585 | 586 | void IBVerbsCommunicator::Bcast(void* buf, size_t len, int root) { 587 | // This function provides naive Bcast; 588 | 589 | if (my_rank_ == root) { 
590 | // Bcast root 591 | for (size_t i = 0; i < pq_world_.size(); i++) { 592 | if (static_cast(i) == my_rank_) continue; 593 | 594 | Send(i, buf, len, false); 595 | } 596 | 597 | for (size_t i = 0; i < pq_world_.size(); i++) { 598 | if (static_cast(i) == my_rank_) continue; 599 | 600 | SendWait(i); 601 | } 602 | } else { 603 | // Bcast non-root 604 | Recv(root, buf, len); 605 | } 606 | } 607 | 608 | void IBVerbsCommunicator::PopMrAndDereg(std::queue* q) { 609 | if (q == nullptr) 610 | util::IbcommError(__FILE__, __LINE__, 611 | util::IBCOMM_ERROR_CODE::INVALID_ARGUMENT, 612 | "q is nullptr."); 613 | 614 | ibv_dereg_mr(q->front()); 615 | q->pop(); 616 | } 617 | 618 | namespace { 619 | int ReadChunkSize() { 620 | const char* size = getenv("IBCOMM_CHUNKSIZE"); 621 | 622 | if (size != NULL) { 623 | int size_int = atoi(size); 624 | 625 | if (size_int <= 0) 626 | util::IbcommError(__FILE__, __LINE__, 627 | util::IBCOMM_ERROR_CODE::INVALID_ARGUMENT, 628 | "IBCOMM_CHUNKSIZE must be greater than 1"); 629 | 630 | return size_int; 631 | } 632 | 633 | return -1; // use default size 634 | } 635 | }; // namespace 636 | 637 | std::vector> IBVerbsCommunicator::SplitBuffer( 638 | size_t len_elements, size_t sizeof_element) { 639 | int chunks; 640 | size_t elements_per_chunk; 641 | 642 | if (len_elements < world_size_) { 643 | util::IbcommError( 644 | __FILE__, __LINE__, util::IBCOMM_ERROR_CODE::NOT_SUPPORTED, 645 | "Input vector is too short for current Allreduce algorithm.\n" 646 | "Incrase the number of the input vector to be larger than number of " 647 | "processes.\n"); 648 | } 649 | 650 | int env_chunk_bytes = ReadChunkSize(); 651 | if (env_chunk_bytes != -1) { 652 | // chunk_size is selected manually. 653 | if (env_chunk_bytes % sizeof_element != 0) { 654 | util::IbcommError( 655 | __FILE__, __LINE__, util::IBCOMM_ERROR_CODE::INVALID_ARGUMENT, 656 | "Selected `IBCOMM_CHUNKSIZE` is not divisible by `sizeof(T)`."); 657 | } 658 | 659 | elements_per_chunk = env_chunk_bytes / sizeof_element; 660 | chunks = util::ceilDiv((size_t)len_elements, (size_t)elements_per_chunk); 661 | 662 | if (chunks < 2 * world_size_) { 663 | util::IbcommError(__FILE__, __LINE__, 664 | util::IBCOMM_ERROR_CODE::INVALID_ARGUMENT, 665 | "Selected `IBCOMM_CHUNKSIZE` is too large.\n" 666 | "Satisfy 2 * `world_size` <= `allreduce_bufsize` / " 667 | "`IBCOMM_CHUNKSIZE`."); 668 | } 669 | } else { 670 | chunks = 4 * world_size_; 671 | elements_per_chunk = util::ceilDiv((size_t)len_elements, (size_t)chunks); 672 | } 673 | 674 | std::vector> ranges; 675 | for (auto i = 0; i < chunks; i++) { 676 | int start_index = elements_per_chunk * i; 677 | int end_index = std::min(len_elements, elements_per_chunk * (i + 1)); 678 | 679 | if (start_index < end_index) ranges.emplace_back(start_index, end_index); 680 | } 681 | 682 | return ranges; 683 | } 684 | 685 | std::vector> IBVerbsCommunicator::GetRankToChunk( 686 | const std::vector>& ranges) { 687 | std::vector> rank_to_chunk(world_size_); 688 | size_t chunks_per_rank = ranges.size() / world_size_; 689 | size_t chunks_per_rank_remainer = ranges.size() % world_size_; 690 | int chunk_id = 0; 691 | 692 | for (int i = 0; i < world_size_; i++) { 693 | for (int j = 0; j < chunks_per_rank; j++) { 694 | rank_to_chunk[i].push_back(chunk_id); 695 | chunk_id++; 696 | } 697 | 698 | if (i < chunks_per_rank_remainer) { 699 | rank_to_chunk[i].push_back(chunk_id); 700 | chunk_id++; 701 | } 702 | } 703 | 704 | assert(chunk_id == ranges.size()); 705 | 706 | return rank_to_chunk; 707 | } 708 | 
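Note: when IBCOMM_CHUNKSIZE is unset, SplitBuffer above cuts the input into roughly 4 * world_size equally sized chunks, and GetRankToChunk hands out floor(chunks / world_size) chunks per rank, spreading any remainder over the lowest ranks. The standalone sketch below (not part of the library source; world_size and the buffer length are made-up example values) mirrors that default path and prints the index ranges each rank ends up owning.

// chunking_sketch.cpp -- illustrative sketch only; mirrors the default path of
// IBVerbsCommunicator::SplitBuffer / GetRankToChunk (IBCOMM_CHUNKSIZE unset).
// world_size and len_elements are hypothetical example values.
#include <algorithm>
#include <cstdio>
#include <utility>
#include <vector>

static size_t CeilDiv(size_t a, size_t b) { return a % b ? a / b + 1 : a / b; }

int main() {
  const size_t world_size = 4;     // hypothetical process count
  const size_t len_elements = 50;  // hypothetical buffer length (elements)

  // Default policy: 4 chunks per rank; the trailing chunk may be shorter,
  // and empty chunks are dropped.
  const size_t chunks = 4 * world_size;
  const size_t per_chunk = CeilDiv(len_elements, chunks);

  std::vector<std::pair<size_t, size_t>> ranges;
  for (size_t i = 0; i < chunks; i++) {
    const size_t start = per_chunk * i;
    const size_t end = std::min(len_elements, per_chunk * (i + 1));
    if (start < end) ranges.emplace_back(start, end);
  }

  // rank -> chunk ids: floor(|ranges| / world_size) chunks per rank, with the
  // remainder assigned one-by-one to the lowest ranks (same shape as
  // GetRankToChunk).
  const size_t per_rank = ranges.size() / world_size;
  const size_t remainder = ranges.size() % world_size;
  size_t chunk_id = 0;
  for (size_t r = 0; r < world_size; r++) {
    std::printf("rank %zu owns:", r);
    const size_t n = per_rank + (r < remainder ? 1 : 0);
    for (size_t j = 0; j < n; j++, chunk_id++) {
      std::printf(" [%zu, %zu)", ranges[chunk_id].first,
                  ranges[chunk_id].second);
    }
    std::printf("\n");
  }
  return 0;
}

With these example values the sketch produces 13 non-empty chunks of at most 4 elements, with rank 0 owning 4 of them and ranks 1-3 owning 3 each, consistent with the assert(chunk_id == ranges.size()) invariant in GetRankToChunk.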
-------------------------------------------------------------------------------- /ibcomm/ibverbs_communicator.h: -------------------------------------------------------------------------------- 1 | // Copyright (C) 2017-2018 by Preferred Networks, Inc. All right reserved. 2 | 3 | #pragma once 4 | 5 | #include 6 | 7 | #include 8 | #include 9 | 10 | #include 11 | #include 12 | #include 13 | 14 | #ifdef USE_CUDA 15 | #include 16 | #include 17 | 18 | #include "ibcomm/memory_pool.h" 19 | 20 | template 21 | class MemoryPool; 22 | #endif 23 | 24 | struct ProcessInfo { 25 | uint16_t lid; 26 | uint32_t qp_n; 27 | uint32_t psn; 28 | }; 29 | 30 | struct ProcessQueue { 31 | struct ibv_cq* send_complete_queue; 32 | struct ibv_cq* recv_complete_queue; 33 | struct ibv_qp* queue_pair; 34 | 35 | ProcessQueue() { 36 | send_complete_queue = NULL; 37 | recv_complete_queue = NULL; 38 | queue_pair = NULL; 39 | } 40 | 41 | ProcessQueue(struct ibv_cq* scq, struct ibv_cq* rcq, struct ibv_qp* qp) { 42 | send_complete_queue = scq; 43 | recv_complete_queue = rcq; 44 | queue_pair = qp; 45 | } 46 | 47 | // copy 48 | ProcessQueue(const ProcessQueue&) = delete; 49 | ProcessQueue& operator=(const ProcessQueue&) = delete; 50 | 51 | // move 52 | // queues' are managed by IBVerbsCommunicator. 53 | ProcessQueue(ProcessQueue&&) noexcept = default; 54 | ProcessQueue& operator=(ProcessQueue&&) noexcept = default; 55 | }; 56 | 57 | class Memory; 58 | 59 | class IBVerbsCommunicator { 60 | // to export registerSendBuf, registerRecvBuf 61 | friend class MemoryBlock; 62 | 63 | public: 64 | // ctor 65 | IBVerbsCommunicator(); 66 | explicit IBVerbsCommunicator(int world_size); 67 | 68 | // Manages infiniband-related resources thus we need to delete copy and move 69 | // ctors. copy 70 | IBVerbsCommunicator(const IBVerbsCommunicator&) noexcept = delete; 71 | IBVerbsCommunicator& operator=(const IBVerbsCommunicator&) noexcept = delete; 72 | 73 | // move 74 | IBVerbsCommunicator(IBVerbsCommunicator&&) noexcept = delete; 75 | IBVerbsCommunicator& operator=(IBVerbsCommunicator&&) noexcept = delete; 76 | 77 | // dtor 78 | ~IBVerbsCommunicator(); 79 | 80 | // init 81 | void Init(int world_size); 82 | 83 | // connection management 84 | struct ProcessInfo RegisterProcess(int dest_rank, struct ProcessInfo pinfo); 85 | struct ProcessInfo CreateQueuePair(int dest_rank); 86 | void RegisterQueuePair(int dest_rank, struct ProcessInfo pinfo); 87 | void RegisterMyself(int my_rank); 88 | 89 | // send 90 | void Send(int dest_rank, const void* buf, size_t len, bool blocking = true); 91 | 92 | // recv 93 | void Recv(int src_rank, void* buf, size_t len, bool blocking = true); 94 | 95 | // wait ( for non-blocking io ) 96 | bool SendPoll(int dest_rank); 97 | bool RecvPoll(int src_rank); 98 | void SendWait(int dest_rank); 99 | void RecvWait(int src_rank); 100 | 101 | // allreduce 102 | template 103 | void AllreduceRing(const T* sendbuf, T* recvbuf, size_t len_elements); 104 | 105 | template 106 | void AllreduceRabenseifner(const T* sendbuf, T* recvbuf, size_t len_elements); 107 | 108 | #ifdef USE_CUDA 109 | template 110 | void AllreduceRingCuda(const T* sendbuf, T* recvbuf, size_t len_elements); 111 | 112 | template 113 | void AllreduceRabenseifnerCuda(const T* sendbuf, T* recvbuf, 114 | size_t len_elements); 115 | 116 | void PrepareMemoryPool(); 117 | #endif 118 | 119 | // bcast 120 | void Bcast(void* buf, size_t len, int root); 121 | 122 | void SetTimerBase(); 123 | void DumpTrace() const; 124 | 125 | private: 126 | bool initialized_ = false; 127 | struct 
ibv_port_attr port_attr_ = {}; 128 | std::vector pq_world_; 129 | std::vector psn_world_; 130 | std::vector> mr_world_; 131 | 132 | struct ibv_mr* RegisterSendBuf(const void* buf, size_t len); 133 | void SendRegistered(int dest_rank, const void* buf, struct ibv_mr* mr_buf, 134 | size_t len, bool blocking = true); 135 | 136 | struct ibv_mr* RegisterRecvBuf(void* buf, size_t len); 137 | void RecvRegistered(int src_rank, const void* buf, struct ibv_mr* mr_buf, 138 | size_t len, bool blocking = true); 139 | 140 | void PopMrAndDereg(std::queue* q); 141 | 142 | #ifdef USE_CUDA 143 | std::unique_ptr> pool_; 144 | 145 | void* tmp_gpu_buffer_ = NULL; 146 | size_t tmp_gpu_buffer_size_ = 0; 147 | #endif 148 | 149 | // need destruction variables 150 | struct ibv_device** dev_list_ = NULL; 151 | struct ibv_context* context_ = NULL; 152 | 153 | // Protection Domain 154 | struct ibv_pd* pd_ = NULL; 155 | 156 | // local communication 157 | int my_rank_ = -1; 158 | size_t world_size_; 159 | 160 | // allreduce range func 161 | // Splits buffer based given chunk size 162 | std::vector> SplitBuffer(size_t len_elements, 163 | size_t len_per_element); 164 | // Defines map (rank |-> chunk_ids) 165 | std::vector> GetRankToChunk( 166 | const std::vector>& ranges); 167 | 168 | struct timespec trace_start_; 169 | 170 | // receive is completed 171 | std::vector trace_received_; 172 | 173 | // reduction is completed 174 | std::vector trace_reduced_; 175 | 176 | // issue send 177 | std::vector trace_issue_send_; 178 | 179 | // issue copy-kernel call 180 | std::vector trace_issue_copy_kernel_; 181 | 182 | // issue reduce-kernel call 183 | std::vector trace_issue_redu_kernel_; 184 | 185 | // issue recv 186 | std::vector trace_issue_recv_; 187 | 188 | // others 189 | std::vector trace_other_; 190 | }; 191 | 192 | #include "ibcomm/allreduce_cpu_impl.h" 193 | #include "ibcomm/allreduce_cuda_impl.h" 194 | -------------------------------------------------------------------------------- /ibcomm/ibverbs_communicator_cuda.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (C) 2017-2018 by Preferred Networks, Inc. All right reserved. 2 | 3 | #ifdef USE_CUDA 4 | #include "ibcomm/ibverbs_communicator.h" 5 | 6 | #include 7 | #include 8 | 9 | #include "ibcomm/memory_pool.h" 10 | 11 | namespace { 12 | int ReadWorkGpuMemorySize() { 13 | const char* size = getenv("IBCOMM_WORK_GPU_MEMORY_SIZE"); 14 | 15 | if (size != NULL) { 16 | int size_int = atoi(size); 17 | 18 | return size_int; 19 | } 20 | 21 | return -1; // use default size 22 | } 23 | 24 | }; // namespace 25 | 26 | void IBVerbsCommunicator::PrepareMemoryPool() { 27 | pool_.reset(new MemoryPool(this)); 28 | 29 | tmp_gpu_buffer_size_ = ReadWorkGpuMemorySize(); 30 | 31 | if (tmp_gpu_buffer_size_ == -1) { 32 | tmp_gpu_buffer_size_ = 32 * 1024 * 1024; 33 | } 34 | 35 | CUDACHECK( 36 | cudaMalloc(static_cast(&tmp_gpu_buffer_), tmp_gpu_buffer_size_)); 37 | } 38 | 39 | #endif 40 | -------------------------------------------------------------------------------- /ibcomm/memory_pool.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (C) 2017-2018 by Preferred Networks, Inc. All right reserved. 
2 | 3 | #include "ibcomm/memory_pool.h" 4 | 5 | #include "ibcomm/ibverbs_communicator.h" 6 | #include "ibcomm/util.h" 7 | 8 | #ifdef USE_CUDA 9 | 10 | // ~~~ Memory class ~~~ // 11 | Memory::Memory(MemoryBlock* block, size_t offset) 12 | : block_(*block), offset_(offset) { 13 | if (block == nullptr) 14 | util::IbcommError(__FILE__, __LINE__, 15 | util::IBCOMM_ERROR_CODE::INVALID_ARGUMENT, 16 | "block is nullptr."); 17 | } 18 | 19 | Memory* Memory::SetStream(cudaStream_t stream) { 20 | stream_ = stream; 21 | return this; 22 | } 23 | 24 | Memory* Memory::UnsetStream() { return SetStream(NULL); } 25 | 26 | // ~~~ MemoryBlock class ~~~ // 27 | MemoryBlock::MemoryBlock(size_t size, IBVerbsCommunicator* comm) 28 | : comm_(*comm), length_(size) { 29 | if (comm == nullptr) 30 | util::IbcommError(__FILE__, __LINE__, 31 | util::IBCOMM_ERROR_CODE::INVALID_ARGUMENT, 32 | "comm is nullptr."); 33 | 34 | // not thread safe 35 | CUDACHECK(cudaHostAlloc(&ptr_, length_, cudaHostAllocDefault)); 36 | mr_ = comm->RegisterRecvBuf(ptr_, length_); 37 | } 38 | 39 | MemoryBlock::~MemoryBlock() { 40 | ibv_dereg_mr(mr_); 41 | CUDACHECK(cudaFreeHost(ptr_)); 42 | } 43 | 44 | ConstantMemoryAllocator::ConstantMemoryAllocator(size_t initial_size, 45 | IBVerbsCommunicator* comm) 46 | : size_(initial_size), comm_(*comm) { 47 | if (comm == nullptr) 48 | util::IbcommError(__FILE__, __LINE__, 49 | util::IBCOMM_ERROR_CODE::INVALID_ARGUMENT, 50 | "comm is nullptr."); 51 | } 52 | 53 | std::unique_ptr ConstantMemoryAllocator::Allocate() { 54 | return std::unique_ptr(new MemoryBlock(size_, &comm_)); 55 | } 56 | 57 | #endif 58 | -------------------------------------------------------------------------------- /ibcomm/memory_pool.h: -------------------------------------------------------------------------------- 1 | // Copyright (C) 2017-2018 by Preferred Networks, Inc. All right reserved. 2 | 3 | #pragma once 4 | 5 | #ifdef USE_CUDA 6 | 7 | #include 8 | #include 9 | 10 | #include 11 | #include 12 | #include 13 | 14 | #include "ibcomm/util.h" 15 | 16 | class IBVerbsCommunicator; 17 | 18 | /* Concept: 19 | * MemoryPool : manages (raw-) `MemoryBlock`. unit of `ibv_reg_mr`. 20 | * MemoryController : manages `Memory`. unit of `cudaStream`. 21 | * MemoryAllocator : allocates `MemoryBlock`. 22 | * MemoryAllocator(size_t initial_size, IBVerbsCommunicator& comm); 23 | * std::unique_ptr Allocate(); 24 | * MemoryBlock : memory. 25 | * Memory : chunk. 26 | */ 27 | 28 | class MemoryBlock { 29 | public: 30 | // ctor 31 | MemoryBlock(size_t size, IBVerbsCommunicator* comm); 32 | 33 | // Manages raw pointers thus we need to delete copy and move ctors. 
34 | // copy 35 | MemoryBlock(const MemoryBlock&) noexcept = delete; 36 | MemoryBlock& operator=(const MemoryBlock&) noexcept = delete; 37 | 38 | // move 39 | MemoryBlock(MemoryBlock&&) noexcept = delete; 40 | MemoryBlock& operator=(MemoryBlock&&) noexcept = delete; 41 | 42 | ~MemoryBlock(); 43 | 44 | inline void* ptr() { return ptr_; } 45 | inline size_t length() const { return length_; } 46 | inline struct ibv_mr* mr() { return mr_; } 47 | 48 | private: 49 | IBVerbsCommunicator& comm_; 50 | 51 | void* ptr_; 52 | size_t length_; 53 | struct ibv_mr* mr_; 54 | }; 55 | 56 | class Memory { 57 | public: 58 | Memory(MemoryBlock* block, size_t offset); 59 | 60 | inline void* ptr() { 61 | return static_cast(static_cast(block_.ptr()) + offset_); 62 | } 63 | inline cudaStream_t stream() { return stream_; } 64 | inline struct ibv_mr* mr() { return block_.mr(); } 65 | Memory* SetStream(cudaStream_t stream); 66 | Memory* UnsetStream(); 67 | 68 | private: 69 | MemoryBlock& block_; 70 | size_t offset_; 71 | cudaStream_t stream_; 72 | }; 73 | 74 | template 75 | class MemoryController; 76 | 77 | template 78 | class MemoryPool { 79 | friend class MemoryController; 80 | 81 | public: 82 | static constexpr int DefaultMaxNumCudaStream = 128; 83 | static constexpr int DefaultPreAllocSize = 64 * 1024 * 1024; // 64 MB. 84 | 85 | // ctor 86 | explicit MemoryPool(IBVerbsCommunicator* comm) 87 | : comm_(*comm), 88 | cuda_streams_(ReadNumCudaStream()), 89 | allocator_(ReadPreAllocSize(), comm) { 90 | if (comm == nullptr) 91 | util::IbcommError(__FILE__, __LINE__, 92 | util::IBCOMM_ERROR_CODE::INVALID_ARGUMENT, 93 | "comm is nullptr."); 94 | 95 | for (auto& stream : cuda_streams_) { 96 | CUDACHECK(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); 97 | } 98 | 99 | Allocate(); 100 | } 101 | 102 | // Manages cudaStream_t thus we need to delete copy and move ctors. 103 | // copy 104 | MemoryPool(const MemoryPool&) = delete; 105 | MemoryPool& operator=(const MemoryPool&) = delete; 106 | 107 | // move 108 | MemoryPool(MemoryPool&&) = delete; 109 | MemoryPool& operator=(MemoryPool&&) = delete; 110 | 111 | MemoryController GetController(size_t chunk_size) { 112 | if (controller_in_use_) { 113 | util::IbcommError(__FILE__, __LINE__, 114 | util::IBCOMM_ERROR_CODE::NOT_SUPPORTED, 115 | "Currently, MemoryController is in use."); 116 | } 117 | 118 | controller_in_use_ = true; 119 | return MemoryController(this, chunk_size, cuda_streams_, 120 | memory_blocks_); 121 | } 122 | 123 | ~MemoryPool() { 124 | for (auto& stream : cuda_streams_) { 125 | cudaStreamDestroy(stream); 126 | } 127 | } 128 | 129 | private: 130 | IBVerbsCommunicator& comm_; 131 | 132 | MemoryAllocator allocator_; 133 | std::vector> memory_blocks_; 134 | std::vector cuda_streams_; 135 | bool controller_in_use_ = false; 136 | 137 | // Read NumCudaStream from environmental variable. 138 | // Returns the default size `DefaultMaxNumCudaStream` if it is not set. 139 | int ReadNumCudaStream() { 140 | const char* envvar = getenv("IBCOMM_NUM_CUDA_STREAM"); 141 | if (envvar) { 142 | int n = atoi(envvar); 143 | 144 | if (n <= 0 || n > 1024) { 145 | util::IbcommError( 146 | __FILE__, __LINE__, util::IBCOMM_ERROR_CODE::INVALID_ARGUMENT, 147 | "Invalid value for IBCOMM_NUM_CUDA_STREAM: %s", envvar); 148 | } 149 | 150 | return n; 151 | } else { 152 | return DefaultMaxNumCudaStream; 153 | } 154 | } 155 | 156 | // Read PreAllocateSize from environmental variable. 157 | // Returns the default size `DefaultPreAllocSize` if it is not set. 
158 | int ReadPreAllocSize() { 159 | const char* envvar = getenv("IBCOMM_MEMORY_POOL_PRE_ALLOC"); 160 | if (envvar) { 161 | int n = atoi(envvar); 162 | 163 | if (n < 4 || n > 1 * 1024 * 1024 * 1024) { 164 | util::IbcommError( 165 | __FILE__, __LINE__, util::IBCOMM_ERROR_CODE::INVALID_ARGUMENT, 166 | "Invalid value for IBCOMM_MEMORY_POOL_PRE_ALLOC: %s", envvar); 167 | } 168 | 169 | return n; 170 | } else { 171 | return DefaultPreAllocSize; 172 | } 173 | } 174 | 175 | void CompleteMemoryController() { controller_in_use_ = false; } 176 | 177 | std::unique_ptr& Allocate() { 178 | memory_blocks_.push_back(allocator_.Allocate()); 179 | return memory_blocks_.back(); 180 | } 181 | 182 | cudaStream_t AddCudaStream() { 183 | cudaStream_t stream; 184 | CUDACHECK(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); 185 | cuda_streams_.push_back(stream); 186 | 187 | return stream; 188 | } 189 | }; 190 | 191 | template 192 | class MemoryController { 193 | public: 194 | MemoryController(MemoryPool* pool, size_t chunk_size, 195 | const std::vector& streams, 196 | const std::vector>& blocks) 197 | : pool_(*pool), 198 | chunk_size_(chunk_size), 199 | streams_(streams), 200 | blocks_(blocks) { 201 | if (pool == nullptr) 202 | util::IbcommError(__FILE__, __LINE__, 203 | util::IBCOMM_ERROR_CODE::INVALID_ARGUMENT, 204 | "pool is nullptr."); 205 | 206 | for (auto stream : streams_) vacant_streams_.push(stream); 207 | 208 | for (const auto& block : blocks_) { 209 | AddMemoryBlockToVacantMemories(block.get()); 210 | } 211 | } 212 | 213 | // Manages Memory thus we need to delete copy ctors. 214 | // copy 215 | MemoryController(const MemoryController&) = delete; 216 | MemoryController& operator=(const MemoryController&) = delete; 217 | 218 | // move 219 | MemoryController(MemoryController&&) = default; 220 | MemoryController& operator=(MemoryController&&) = default; 221 | 222 | Memory* getMemory() { 223 | if (vacant_streams_.empty()) { 224 | vacant_streams_.push(pool_.AddCudaStream()); 225 | } 226 | cudaStream_t stream = vacant_streams_.front(); 227 | vacant_streams_.pop(); 228 | 229 | if (vacant_memories_.empty()) { 230 | AddMemoryBlockToVacantMemories(pool_.Allocate().get()); 231 | } 232 | Memory* memory = vacant_memories_.front(); 233 | vacant_memories_.pop(); 234 | 235 | return memory->SetStream(stream); 236 | } 237 | 238 | void returnMemory(Memory* memory) { 239 | auto stream = memory->stream(); 240 | vacant_memories_.push(memory->UnsetStream()); 241 | vacant_streams_.push(stream); 242 | } 243 | 244 | ~MemoryController() { 245 | while (!vacant_memories_.empty()) { 246 | auto memory = vacant_memories_.front(); 247 | vacant_memories_.pop(); 248 | 249 | delete memory; 250 | } 251 | pool_.CompleteMemoryController(); 252 | } 253 | 254 | private: 255 | MemoryPool& pool_; 256 | size_t chunk_size_; 257 | const std::vector>& blocks_; 258 | const std::vector& streams_; 259 | 260 | std::queue vacant_streams_; 261 | std::queue vacant_memories_; 262 | 263 | void AddMemoryBlockToVacantMemories(MemoryBlock* block) { 264 | if (block == nullptr) 265 | util::IbcommError(__FILE__, __LINE__, 266 | util::IBCOMM_ERROR_CODE::INVALID_ARGUMENT, 267 | "block is nullptr."); 268 | 269 | for (int i = 0; (i + 1) * chunk_size_ < block->length(); i++) { 270 | vacant_memories_.push(new Memory(block, i * chunk_size_)); 271 | } 272 | } 273 | }; 274 | 275 | class ConstantMemoryAllocator { 276 | public: 277 | ConstantMemoryAllocator(size_t initial_size, IBVerbsCommunicator* comm); 278 | std::unique_ptr Allocate(); 279 | 280 | private: 281 | 
IBVerbsCommunicator& comm_; 282 | size_t size_; 283 | }; 284 | 285 | #endif 286 | -------------------------------------------------------------------------------- /ibcomm/util.h: -------------------------------------------------------------------------------- 1 | // Copyright (C) 2017-2018 by Preferred Networks, Inc. All right reserved. 2 | 3 | #pragma once 4 | 5 | #include 6 | 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | 13 | #include 14 | #include 15 | #include 16 | #include 17 | 18 | #ifdef USE_CUDA 19 | #define CUDACHECK(cmd) \ 20 | do { \ 21 | cudaError_t e = cmd; \ 22 | if (e != cudaSuccess) { \ 23 | util::IbcommError(__FILE__, __LINE__, \ 24 | util::IBCOMM_ERROR_CODE::CUDA_ERROR, \ 25 | cudaGetErrorString(e)); \ 26 | } \ 27 | \ 28 | } while (0) 29 | #endif 30 | 31 | namespace util { 32 | enum class IBCOMM_ERROR_CODE : int { 33 | INVALID_ARGUMENT = 1, 34 | 35 | // Error occured in InfiniBand Verbs call. 36 | IBVERBS_ERROR = 2, 37 | 38 | // Error occured in CUDA call. 39 | CUDA_ERROR = 3, 40 | 41 | NOT_SUPPORTED = 4 42 | }; 43 | 44 | template 45 | void IbcommError(const char* filename, int line, IBCOMM_ERROR_CODE error_code, 46 | const char* format, Args const&... args) { 47 | fprintf(stderr, "Error occured at %s:L%d.\n", filename, line); 48 | fprintf(stderr, format, args...); 49 | fputs("", stderr); 50 | 51 | exit(static_cast(error_code)); 52 | } 53 | 54 | template 55 | void IbcommWarning(const char* filename, int line, const char* format, 56 | Args const&... args) { 57 | fprintf(stderr, "Warning occured at %s:L%d.\n", filename, line); 58 | fprintf(stderr, format, args...); 59 | fputs("", stderr); 60 | } 61 | 62 | inline void trace(std::vector* v) { 63 | if (v == nullptr) 64 | util::IbcommError(__FILE__, __LINE__, 65 | util::IBCOMM_ERROR_CODE::INVALID_ARGUMENT, 66 | "v is nullptr."); 67 | struct timespec ts; 68 | clock_gettime(CLOCK_MONOTONIC_RAW, &ts); 69 | v->push_back(ts); 70 | } 71 | 72 | class MalformedNumber : public std::runtime_error { 73 | public: 74 | explicit MalformedNumber(const std::string& ss) 75 | : std::runtime_error(ss.c_str()) {} 76 | }; 77 | 78 | // Parse a string and get a buffer size or chunk size. SI prefix is supported. 79 | // If any error occurs, success is set to false and the error message is 80 | // assigned to msg. 81 | inline int64_t parse_number(const char* str) { 82 | std::string n; 83 | int64_t multiply = 1; 84 | int pos = 0; 85 | const int len = strlen(str); 86 | 87 | if (str[0] == '+' || str[0] == '-') { 88 | // accept '-' for now to detect value range error. 89 | n += str[0]; 90 | pos = 1; 91 | } 92 | 93 | while (isdigit(str[pos]) && pos < len) { 94 | n += str[pos]; 95 | pos++; 96 | } 97 | 98 | if (n.size() == 0) { 99 | // there seems no number 100 | std::stringstream ss; 101 | ss << "Illegal number format prefix in '" << str << "'"; 102 | throw MalformedNumber(ss.str()); 103 | } 104 | 105 | if (pos < len) { 106 | // parse SI prefix 107 | switch (str[pos]) { 108 | case 'k': 109 | case 'K': 110 | multiply = 1024ul; 111 | pos++; 112 | break; 113 | case 'm': 114 | case 'M': 115 | multiply = 1024ul * 1024; 116 | pos++; 117 | break; 118 | case 'g': 119 | case 'G': 120 | multiply = 1024ul * 1024 * 1024; 121 | pos++; 122 | break; 123 | // default: 124 | // { 125 | // std::stringstream ss; 126 | // ss << "Illegal SI prefix in '" << str << "'"; 127 | // throw MalformedNumber(ss.str()); 128 | // } 129 | } 130 | } 131 | 132 | if (pos < len) { 133 | // Last 'b' or 'B' (bytes) is optional. Other characters are not allowed. 
134 | if (!(str[pos] == 'b' || str[pos] == 'B')) { 135 | std::stringstream ss; 136 | ss << "Illegal SI prefix in '" << str << "'"; 137 | throw MalformedNumber(ss.str()); 138 | } 139 | pos++; 140 | } 141 | if (pos < len) { 142 | std::stringstream ss; 143 | ss << "Illegal number format prefix in '" << str << "'"; 144 | throw MalformedNumber(ss.str()); 145 | } 146 | 147 | int64_t n2 = atol(n.c_str()); 148 | return n2 * multiply; 149 | } 150 | 151 | template 152 | inline T ceilDiv(T v1, T v2) { 153 | return v1 % v2 ? v1 / v2 + 1 : v1 / v2; 154 | } 155 | 156 | inline int GetExpOfTwo(int n) { 157 | int p = 0; 158 | 159 | while (n != 0) { 160 | if (n % 2 == 1) { 161 | if (n == 1) 162 | return p; 163 | else 164 | return 0; 165 | } 166 | 167 | p++; 168 | 169 | n >>= 1; 170 | } 171 | 172 | return 0; 173 | 174 | // returns p (2^p == n) 175 | } 176 | }; // namespace util 177 | -------------------------------------------------------------------------------- /mpinvcc.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Dirty hack :) 4 | if echo "$*" | grep -qE -- '-DUSE_CUDA' ; then 5 | NVCC=1 6 | fi 7 | 8 | if [ "$NVCC" == 1 ]; then 9 | export OMPI_CXX=nvcc 10 | export MPICH_CXX=nvcc 11 | nvcc_arch_flag="-arch sm_52" 12 | 13 | if echo "$*" | grep -qE "(.cpp|.cxx|.cc)$" ; then 14 | xflag="-x cu" 15 | else 16 | xflag= 17 | fi 18 | else 19 | nvcc_arch_flag= 20 | if [ -n "$CXX" ]; then 21 | export OMPI_CXX=${CXX} 22 | export MPICH_CXX=${CXX} 23 | fi 24 | fi 25 | 26 | CMD=$(mpicxx -show "$@" "$xflag" "$nvcc_arch_flag") 27 | 28 | if [ "$NVCC" == 1 ]; then 29 | CMD=$(echo $CMD | sed -e "s/-Wl,/-Xlinker /g") 30 | CMD=$(echo $CMD | sed -e "s/\(-W[^ ][^ ]*\)/-Xcompiler \\1/g") 31 | CMD=$(echo $CMD | sed -e "s/\\(-pthread\\)/-Xcompiler \\1/g") 32 | CMD=$(echo $CMD | sed -e "s/\\(-rdynamic\\)/-Xcompiler \\1/g") 33 | CMD=$(echo $CMD | sed -e "s/\\(-fPIC\\)/-Xcompiler \\1/g") 34 | fi 35 | 36 | echo $CMD 37 | $CMD 38 | -------------------------------------------------------------------------------- /tests/allreduce_test.py: -------------------------------------------------------------------------------- 1 | from subprocess import Popen 2 | from typing import List 3 | from typing import Optional 4 | from typing import Union 5 | import itertools 6 | import os 7 | import os.path 8 | import pytest 9 | import unittest 10 | import sys 11 | 12 | 13 | IBCOMM_INVALID_ARGUMENT = 1 14 | IBCOMM_IBVERBS_ERROR = 2 15 | IBCOMM_CUDA_ERROR = 3 16 | IBCOMM_NOT_SUPPORTED = 4 17 | 18 | ALGO_RING = "ring" 19 | ALGO_RABEN = "rabenseifner" 20 | 21 | IBCOMM_ALGORITHMS = [ALGO_RING, ALGO_RABEN] 22 | 23 | 24 | def find_file(directory: str, fname: str) -> Optional[str]: 25 | """Find a file in directory. Used to find the allreduce_tester binary.""" 26 | for root, dirs, files in os.walk(directory): 27 | if fname in files: 28 | return os.path.join(root, fname) 29 | 30 | # not found 31 | return None 32 | 33 | 34 | def flatten1(lst: List[List]) -> List: 35 | return [item for sublist in lst for item in sublist] 36 | 37 | 38 | def dict_to_envs(envs: dict) -> List[str]: 39 | """ 40 | Expand a dict to command line arguments for Open MPI's '-x' option 41 | e.g.) 
{'FOO': 100} 42 | ==> 43 | ['-x', 'FOO=100'] 44 | """ 45 | z = itertools.zip_longest([], 46 | ["{}={}".format(k,v) for k, v in envs.items()], 47 | fillvalue='-x') 48 | 49 | return flatten1(list(z)) 50 | 51 | 52 | # Find the project directory 53 | ProjectDir = os.path.join(os.path.dirname(__file__), os.pardir) 54 | Tester = find_file(ProjectDir, 'allreduce_tester') 55 | 56 | if not os.path.exists(Tester): 57 | sys.stderr.write("Please build 'allreduce_tester' before running unit tests.\n") 58 | exit(1) 59 | 60 | 61 | class AllreduceTest(unittest.TestCase): 62 | @staticmethod 63 | def check(np: Union[int, str], 64 | algo: str, 65 | buffsize: Union[int, str], 66 | init_expr: str = "i*np+p", 67 | check_expr: str = "i*np*np+np*(np-1)/2", 68 | chunksize: Optional[Union[str, int]] = None): 69 | env = {} 70 | if chunksize is not None: 71 | env['IBCOMM_CHUNKSIZE'] = chunksize 72 | 73 | if 'NODEFILE' in os.environ: 74 | hostfile = os.environ['NODEFILE'] 75 | elif 'PBS_NODEFILE' in os.environ: 76 | hostfile = os.environ['PBS_NODEFILE'] 77 | else: 78 | hostfile = None 79 | 80 | assert algo in IBCOMM_ALGORITHMS, "{} is not support Allreduce algorithm.".format(algo) 81 | 82 | env_args = dict_to_envs(env) 83 | np = str(np) 84 | buffsize = str(buffsize) 85 | 86 | if hostfile is not None: 87 | hostfile = ['--hostfile', hostfile] 88 | else: 89 | hostfile = [] 90 | 91 | cmd = ['timeout', '90s', 'mpiexec', '-np', np, *hostfile, *env_args, Tester, algo, buffsize, init_expr, check_expr] 92 | 93 | print() 94 | print(' '.join(cmd)) 95 | p = Popen(cmd) 96 | out,err = p.communicate() 97 | return p.returncode 98 | 99 | def setUp(self): 100 | pass 101 | 102 | def test_1proc(self): 103 | # Allreduce works with just 1 process. 104 | ret = AllreduceTest.check(algo=ALGO_RING, np=1, buffsize=1024) 105 | assert ret == 0 106 | 107 | ret = AllreduceTest.check(algo=ALGO_RABEN, np=1, buffsize=1024) 108 | assert ret == 0 109 | 110 | def test_small_buffer(self): 111 | # Tests Ring-AllReduce 112 | int_size = 4 113 | 114 | # Allreduce works with small buffer size 115 | NP=1 116 | ret = AllreduceTest.check(algo=ALGO_RING, np=NP, buffsize=NP * 2 * int_size) 117 | assert ret == 0 118 | 119 | NP=2 120 | ret = AllreduceTest.check(algo=ALGO_RING, np=NP, buffsize=NP * 2 * int_size) 121 | assert ret == 0 122 | 123 | NP=3 124 | ret = AllreduceTest.check(algo=ALGO_RING, np=NP, buffsize=NP * 2 * int_size) 125 | assert ret == 0 126 | 127 | NP=5 128 | ret = AllreduceTest.check(algo=ALGO_RING, np=NP, buffsize=NP * 2 * int_size) 129 | assert ret == 0 130 | 131 | NP=2 132 | # Relatively larger prime 133 | ret = AllreduceTest.check(algo=ALGO_RING, np=NP, buffsize=2521 * int_size) 134 | assert ret == 0 135 | 136 | # Tests Rabenseifner's algorithm 137 | NP=1 138 | ret = AllreduceTest.check(algo=ALGO_RABEN, np=NP, buffsize=NP * 2 * int_size) 139 | assert ret == 0 140 | 141 | NP=2 142 | ret = AllreduceTest.check(algo=ALGO_RABEN, np=NP, buffsize=NP * 2 * int_size) 143 | assert ret == 0 144 | 145 | NP=3 146 | ret = AllreduceTest.check(algo=ALGO_RABEN, np=NP, buffsize=NP * 2 * int_size) 147 | assert ret == IBCOMM_NOT_SUPPORTED # Currently, non-power-of-2 np is not supported. 148 | 149 | NP=5 150 | ret = AllreduceTest.check(algo=ALGO_RABEN, np=NP, buffsize=NP * 2 * int_size) 151 | assert ret == IBCOMM_NOT_SUPPORTED # Currently, non-power-of-2 np is not supported. 
152 | 153 | NP=2 154 | # Relatively larger prime 155 | ret = AllreduceTest.check(algo=ALGO_RABEN, np=NP, buffsize=2521 * int_size) 156 | assert ret == 0 157 | 158 | def test_basic(self): 159 | # Relatively larger buffer size and default chunksize 160 | # Tests Ring-AllReduce 161 | ret = AllreduceTest.check(algo=ALGO_RING, np=2, buffsize="128M") 162 | assert ret == 0 163 | 164 | ret = AllreduceTest.check(algo=ALGO_RING, np=3, buffsize="128M") 165 | assert ret == 0 166 | 167 | ret = AllreduceTest.check(algo=ALGO_RING, np=4, buffsize="128M") 168 | assert ret == 0 169 | 170 | # Test Rabenseifner's algorithm 171 | ret = AllreduceTest.check(algo=ALGO_RABEN, np=2, buffsize="128M") 172 | assert ret == 0 173 | 174 | ret = AllreduceTest.check(algo=ALGO_RABEN, np=3, buffsize="128M") 175 | assert ret == IBCOMM_NOT_SUPPORTED # Currently, non-power-of-2 np is not supported. 176 | 177 | ret = AllreduceTest.check(algo=ALGO_RABEN, np=4, buffsize="128M") 178 | assert ret == 0 179 | 180 | def test_chunk_size(self): 181 | # Tests Ring-AllReduce 182 | # for a buffer size 1024 and NP 2, change the IBCOMM_CHUNKSIZE from [4 to 128] 183 | ret = AllreduceTest.check(algo=ALGO_RING, np=4, buffsize="1k", chunksize='4') 184 | assert ret == 0 185 | 186 | ret = AllreduceTest.check(algo=ALGO_RING, np=4, buffsize="1k", chunksize='8') 187 | assert ret == 0 188 | 189 | ret = AllreduceTest.check(algo=ALGO_RING, np=4, buffsize="1k", chunksize='16') 190 | assert ret == 0 191 | 192 | ret = AllreduceTest.check(algo=ALGO_RING, np=4, buffsize="1k", chunksize='32') 193 | assert ret == 0 194 | 195 | ret = AllreduceTest.check(algo=ALGO_RING, np=4, buffsize="1k", chunksize='64') 196 | assert ret == 0 197 | 198 | ret = AllreduceTest.check(algo=ALGO_RING, np=4, buffsize="1k", chunksize='128') 199 | assert ret == 0 200 | 201 | # Test of Rabenseifner's algorithm is not necessary because chunksize is not used. 202 | 203 | def test_invalid_error(self): 204 | # Test if ibcomm checks chunk size 205 | int_size = 4 206 | for chunk_size in range(0, int_size): # try 0, 1, 2, 3 207 | # chunk_size must be a multiply of element type (which is int here) 208 | ret = AllreduceTest.check(algo=ALGO_RING, np=4, buffsize="1k", chunksize=chunk_size) 209 | assert ret == IBCOMM_INVALID_ARGUMENT 210 | 211 | # Chunk size < 0 212 | ret = AllreduceTest.check(algo=ALGO_RING, np=4, buffsize="1k", chunksize="-128") 213 | assert ret == IBCOMM_INVALID_ARGUMENT 214 | 215 | # Chunk size is too large 216 | ret = AllreduceTest.check(algo=ALGO_RING, np=4, buffsize="1k", chunksize="1k") 217 | assert ret == IBCOMM_INVALID_ARGUMENT 218 | 219 | # Check too short vector 220 | int_size = 4 221 | ret = AllreduceTest.check(algo=ALGO_RING, np=2, buffsize=int_size, init_expr="1", check_expr="np") 222 | assert ret == IBCOMM_NOT_SUPPORTED 223 | 224 | ret = AllreduceTest.check(algo=ALGO_RABEN, np=2, buffsize=int_size, init_expr="1", check_expr="np") 225 | assert ret == IBCOMM_NOT_SUPPORTED 226 | 227 | @pytest.mark.slow 228 | def test_aging(self): 229 | for i in range(100): 230 | ret = AllreduceTest.check(algo=ALGO_RING, np=4, buffsize="128M") 231 | assert ret == 0 232 | 233 | for i in range(100): 234 | ret = AllreduceTest.check(algo=ALGO_RABEN, np=4, buffsize="128M") 235 | assert ret == 0 236 | -------------------------------------------------------------------------------- /tests/sendrecv_test.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (C) 2017-2018 by Preferred Networks, Inc. All right reserved. 
2 | 
3 | #include <cassert>
4 | #include 
5 | #include 
6 | #include "ibcomm/ibverbs_communicator.h"
7 | 
8 | int main(void) {  // single-process loopback: this process acts as both rank 0 and rank 1
9 |   IBVerbsCommunicator comm(2);
10 | 
11 |   ProcessInfo pinfoA = comm.CreateQueuePair(1);
12 |   ProcessInfo pinfoB = comm.RegisterProcess(0, pinfoA);
13 |   comm.RegisterQueuePair(1, pinfoB);
14 | 
15 |   int value = 10;
16 |   int value2 = -1;
17 | 
18 |   comm.Send(0, &value, sizeof(value), false);  // send is posted without waiting; completion is awaited by SendWait(0) below
19 |   comm.Recv(1, &value2, sizeof(value2));
20 |   comm.SendWait(0);
21 | 
22 |   assert(value == value2);
23 | 
24 |   value2 = -1;
25 |   comm.Send(0, &value2, sizeof(value), false);
26 |   comm.Recv(1, &value, sizeof(value2));
27 |   comm.SendWait(0);
28 | 
29 |   assert(value == value2);
30 | 
31 |   return 0;
32 | }
33 | 
--------------------------------------------------------------------------------
/tests/unittest.cpp:
--------------------------------------------------------------------------------
1 | // Copyright (C) 2017-2018 by Preferred Networks, Inc. All rights reserved.
2 | 
3 | #include "gtest/gtest.h"
4 | #include "ibcomm/util.h"
5 | 
6 | namespace {
7 | 
8 | class IBCommUtilTest : public ::testing::Test {
9 |  protected:
10 | };
11 | 
12 | TEST_F(IBCommUtilTest, ParseNumberZero) {
13 |   EXPECT_EQ(0, util::parse_number("0"));
14 |   EXPECT_EQ(0, util::parse_number("0b"));
15 |   EXPECT_EQ(0, util::parse_number("0B"));
16 |   EXPECT_EQ(0, util::parse_number("0k"));
17 |   EXPECT_EQ(0, util::parse_number("0kb"));
18 |   EXPECT_EQ(0, util::parse_number("0K"));
19 |   EXPECT_EQ(0, util::parse_number("0m"));
20 |   EXPECT_EQ(0, util::parse_number("0mb"));
21 |   EXPECT_EQ(0, util::parse_number("0M"));
22 |   EXPECT_EQ(0, util::parse_number("0g"));
23 |   EXPECT_EQ(0, util::parse_number("0gb"));
24 |   EXPECT_EQ(0, util::parse_number("0G"));
25 | 
26 |   EXPECT_EQ(0, util::parse_number("-0"));
27 |   EXPECT_EQ(0, util::parse_number("-0b"));
28 |   EXPECT_EQ(0, util::parse_number("-0B"));
29 |   EXPECT_EQ(0, util::parse_number("-0k"));
30 |   EXPECT_EQ(0, util::parse_number("-0kb"));
31 |   EXPECT_EQ(0, util::parse_number("-0K"));
32 |   EXPECT_EQ(0, util::parse_number("-0m"));
33 |   EXPECT_EQ(0, util::parse_number("-0mb"));
34 |   EXPECT_EQ(0, util::parse_number("-0M"));
35 |   EXPECT_EQ(0, util::parse_number("-0g"));
36 |   EXPECT_EQ(0, util::parse_number("-0gb"));
37 |   EXPECT_EQ(0, util::parse_number("-0G"));
38 | }
39 | 
40 | TEST_F(IBCommUtilTest, ParseNumberPositive) {
41 |   EXPECT_EQ(1, util::parse_number("1"));
42 |   EXPECT_EQ(1, util::parse_number("1b"));
43 |   EXPECT_EQ(1, util::parse_number("1B"));
44 | 
45 |   EXPECT_EQ(1024, util::parse_number("1k"));
46 |   EXPECT_EQ(1024, util::parse_number("1kb"));
47 | 
48 |   EXPECT_EQ(31 * 1024, util::parse_number("31k"));
49 |   EXPECT_EQ(31 * 1024, util::parse_number("31kb"));
50 | 
51 |   EXPECT_EQ(713ul * 1024 * 1024, util::parse_number("713m"));
52 |   EXPECT_EQ(713ul * 1024 * 1024, util::parse_number("713mb"));
53 | }
54 | 
55 | TEST_F(IBCommUtilTest, ParseNumberMalformed) {
56 |   ASSERT_THROW(util::parse_number("0.5"), util::MalformedNumber);
57 |   ASSERT_THROW(util::parse_number("a"), util::MalformedNumber);
58 |   ASSERT_THROW(util::parse_number("b"), util::MalformedNumber);
59 |   ASSERT_THROW(util::parse_number("B"), util::MalformedNumber);
60 |   ASSERT_THROW(util::parse_number("0x"), util::MalformedNumber);
61 |   ASSERT_THROW(util::parse_number("97MiB"),
62 |                util::MalformedNumber);  // the "MiB" (mebibyte) suffix is not supported
63 | }
64 | 
65 | TEST_F(IBCommUtilTest, get_exp_of_two) {
66 |   ASSERT_EQ(0, util::get_exp_of_two(0));
67 |   ASSERT_EQ(1, util::get_exp_of_two(2));
68 |   ASSERT_EQ(7, util::get_exp_of_two(128));
69 |   ASSERT_EQ(0, util::get_exp_of_two(127));  // non-powers of two yield 0
70 |   ASSERT_EQ(0, util::get_exp_of_two(129));
71 | }
72 | 
73 | }  // namespace
74 | 
--------------------------------------------------------------------------------