├── .gitmodules ├── README.md ├── omnireduce-DPDK ├── README.md ├── build_all.sh ├── daiet │ ├── .gitignore │ ├── Makefile │ ├── README.md │ ├── example │ │ ├── CMakeLists.txt │ │ ├── daiet.cfg │ │ └── main.cpp │ ├── experiments │ │ ├── exp1 │ │ │ ├── CMakeLists.txt │ │ │ ├── README.md │ │ │ ├── common.h │ │ │ └── main.cc │ │ └── exp2 │ │ │ ├── CMakeLists.txt │ │ │ ├── README.md │ │ │ ├── common.h │ │ │ ├── float16.cc │ │ │ ├── float32.cc │ │ │ ├── int32.cc │ │ │ └── switchml_dense.cc │ ├── ps │ │ ├── Makefile │ │ ├── ps.cfg │ │ └── src │ │ │ ├── common.cpp │ │ │ ├── common.hpp │ │ │ ├── dpdk.h │ │ │ ├── main.cpp │ │ │ ├── msgs.h │ │ │ ├── params.cpp │ │ │ ├── params.hpp │ │ │ ├── ps.cpp │ │ │ ├── ps.hpp │ │ │ ├── stats.cpp │ │ │ ├── stats.hpp │ │ │ ├── utils.cpp │ │ │ └── utils.hpp │ ├── scripts │ │ └── dpdk-config.sh │ └── src │ │ ├── DaietContext.cpp │ │ ├── DaietContext.hpp │ │ ├── common.cpp │ │ ├── common.hpp │ │ ├── daiet.cpp │ │ ├── daiet.hpp │ │ ├── dpdk.h │ │ ├── msgs.h │ │ ├── params.cpp │ │ ├── params.hpp │ │ ├── ps.cpp │ │ ├── ps.hpp │ │ ├── stats.cpp │ │ ├── stats.hpp │ │ ├── utils.cpp │ │ ├── utils.hpp │ │ ├── worker.cpp │ │ └── worker.hpp ├── docker │ ├── Dockerfile │ └── aggregator_Dockerfile ├── environment.yml ├── get_cuda_arch_code.sh ├── gloo.patch ├── prepare.sh └── pytorch.patch └── omnireduce-RDMA ├── Makefile ├── README.md ├── docker ├── Dockerfile └── README.md ├── docs └── tutorial.md ├── example ├── Makefile ├── README.md ├── aggregator_test.cpp ├── cuda_worker_test.cpp ├── omnireduce.cfg └── worker_test.cpp ├── frameworks_integration ├── horovod_patch │ ├── README.md │ ├── omnireduce-horovod.patch │ ├── test_hvd_tensorflow.py │ └── test_hvd_torch.py └── pytorch_patch │ ├── README.md │ └── omnireduce-pytorch.patch └── omnireduce ├── aggcontext.cpp ├── aggcontext.hpp ├── aggregator.cpp ├── aggregator.hpp ├── common.cpp ├── common.hpp ├── context.cpp ├── context.hpp ├── cuda_utils.cu ├── cuda_utils.hpp ├── omnireduce.cpp ├── omnireduce.hpp 
├── params.cpp ├── params.hpp ├── worker.cpp └── worker.hpp /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "omnireduce-DPDK/gloo"] 2 | path = omnireduce-DPDK/gloo 3 | url = https://github.com/facebookincubator/gloo.git 4 | [submodule "omnireduce-DPDK/pytorch"] 5 | path = omnireduce-DPDK/pytorch 6 | url = https://github.com/pytorch/pytorch.git 7 | [submodule "omnireduce-DPDK/daiet/lib/dpdk"] 8 | path = omnireduce-DPDK/daiet/lib/dpdk 9 | url = https://github.com/sands-lab/dpdk.git 10 | [submodule "omnireduce-DPDK/daiet/lib/vcl"] 11 | path = omnireduce-DPDK/daiet/lib/vcl 12 | url = https://github.com/vcoda/vectorclass.git 13 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # OmniReduce 2 | OmniReduce is an efficient sparse collective communication library. It maximizes effective bandwidth use by exploiting the sparsity of data. 3 | 4 | For clusters without RDMA support, OmniReduce uses Intel DPDK for kernel bypass. GPUDirect can also be used where available. 5 | 6 | ## Contents 7 | - omnireduce-DPDK: source code of DPDK-based OmniReduce 8 | - omnireduce-RDMA: source code of RDMA-based OmniReduce 9 | - [experiments](https://github.com/sands-lab/omnireduce-experiments): micro-benchmark and end-to-end scripts 10 | 11 | ## Publications 12 | 13 | [OmniReduce](https://sands.kaust.edu.sa/project/omnireduce/) accepted at SIGCOMM’21. 
14 | -------------------------------------------------------------------------------- /omnireduce-DPDK/README.md: -------------------------------------------------------------------------------- 1 | # OmniReduce-DPDK 2 | 3 | ## prepare submodules 4 | ```bash 5 | ./prepare.sh [--depth=10] # optional --depth shallow copys submodules 6 | ``` 7 | 8 | ## create conda environment 9 | ```bash 10 | conda env create --prefix ../env --file environment.yml 11 | ``` 12 | 13 | ## build 14 | ```bash 15 | conda activate ../env 16 | ./build_all.sh MLX5 CONDA INSTALL NOSCALING PYTORCH HOROVOD 17 | ``` 18 | 19 | ## offload bitmap (only supports PyTorch) 20 | ```bash 21 | conda activate ../env 22 | ./build_all.sh MLX5 CONDA INSTALL OFFLOAD_BITMAP NOSCALING PYTORCH 23 | ``` 24 | -------------------------------------------------------------------------------- /omnireduce-DPDK/build_all.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # FLAGS: INSTALL MLX5 COLOCATED LATENCIES TIMESTAMPS TIMERS DEBUG CONDA OFFLOAD_BITMAP NOSCALING PYTORCH ALGO2 COUNTERS NO_FILL_STORE 4 | set -e 5 | set -x 6 | 7 | CWD="$( cd -- "$(dirname "$0")" >/dev/null 2>&1 ; pwd -P )" 8 | DPDK_ARGS='-fPIC ' 9 | DAIET_ARGS='' 10 | EXP_ARGS='' 11 | PS_ARGS='' 12 | GLOO_CMAKE_ARGS='' 13 | 14 | if [[ $@ == *'CONDA'* ]]; then 15 | echo "will install libraries to ${CONDA_PREFIX:-'/'}" 16 | THIS_TIME=`date` 17 | echo "build_all.sh invoked at ${THIS_TIME} with $@" > ${CONDA_PREFIX}/build-info.txt 18 | fi 19 | 20 | if [[ $@ == *'MLX5'* ]]; then 21 | echo 'MLX5 SUPPORT' 22 | EXP_ARGS+='-DUSE_MLX5=1 ' 23 | fi 24 | if [[ $@ == *'MLX4'* ]]; then 25 | echo 'MLX4 SUPPORT' 26 | EXP_ARGS+='-DUSE_MLX4=1 ' 27 | fi 28 | if [[ $@ == *'COLOCATED'* ]]; then 29 | echo 'COLOCATED SET' 30 | DAIET_ARGS+='COLOCATED=ON ' 31 | fi 32 | if [[ $@ == *'LATENCIES'* ]]; then 33 | echo 'LATENCIES SET' 34 | DAIET_ARGS+='LATENCIES=ON ' 35 | fi 36 | if [[ $@ == *'TIMESTAMPS'* ]]; then 37 | echo 
'TIMESTAMPS SET' 38 | DAIET_ARGS+='TIMESTAMPS=ON ' 39 | fi 40 | if [[ $@ == *'COUNTERS'* ]]; then 41 | echo 'COUNTERS SET' 42 | DAIET_ARGS+='COUNTERS=ON ' 43 | fi 44 | if [[ $@ == *'ALGO2'* ]]; then 45 | echo 'ALGO2 SET' 46 | DAIET_ARGS+='ALGO2=ON ' 47 | PS_ARGS+='ALGO2=ON ' 48 | fi 49 | if [[ $@ == *'TIMERS'* ]]; then 50 | echo 'TIMERS SET' 51 | DAIET_ARGS+='TIMERS=ON ' 52 | PS_ARGS+='TIMERS=ON ' 53 | fi 54 | if [[ $@ == *'NO_FILL_STORE'* ]]; then 55 | echo 'NO_FILL_STORE SET' 56 | DAIET_ARGS+='NO_FILL_STORE=ON ' 57 | fi 58 | if [[ $@ == *'DEBUG'* ]]; then 59 | echo 'DEBUG SET' 60 | DAIET_ARGS+='DEBUG=ON COUNTERS=ON ' 61 | DPDK_ARGS+='-g -O0 ' 62 | PS_ARGS+='DEBUG=ON ' 63 | EXP_ARGS+='-DDEBUG=1 ' 64 | fi 65 | if [[ $@ == *'CONDA'* ]]; then 66 | GLOO_CMAKE_ARGS+='-DCMAKE_CXX_FLAGS="-D_GLIBCXX_USE_CXX11_ABI=1" ' 67 | GLOO_CMAKE_ARGS+="-DCMAKE_INSTALL_PREFIX=${CONDA_PREFIX} " 68 | EXP_ARGS+='-DCMAKE_CXX_FLAGS="-D_GLIBCXX_USE_CXX11_ABI=1"' 69 | DAIET_EXTRA_CXX_FLAGS+="-I${CONDA_PREFIX}/include -L${CONDA_PREFIX}/lib " 70 | fi 71 | if [[ $@ == *'OFFLOAD_BITMAP'* ]]; then 72 | echo 'OFFLOAD_BITMAP SET' 73 | DAIET_ARGS+='OFFLOAD_BITMAP=ON ' 74 | OFFLOAD_BITMAP=1 75 | else 76 | OFFLOAD_BITMAP=0 77 | fi 78 | if [[ $@ == *'NOSCALING'* ]]; then 79 | echo 'NOSCALING SET' 80 | DAIET_ARGS+='NOSCALING=ON ' 81 | PS_ARGS+='NOSCALING=ON ' 82 | fi 83 | 84 | # Build DPDK 85 | cd $CWD/daiet/lib/dpdk/ 86 | 87 | if [[ $@ != *'SKIP_DPDK'* ]]; then 88 | rm -rf build 89 | 90 | if [[ $@ == *'MLX5'* ]]; then 91 | sed -i 's/CONFIG_RTE_LIBRTE_MLX5_PMD=n/CONFIG_RTE_LIBRTE_MLX5_PMD=y/' config/common_base 92 | else 93 | sed -i 's/CONFIG_RTE_LIBRTE_MLX5_PMD=y/CONFIG_RTE_LIBRTE_MLX5_PMD=n/' config/common_base 94 | fi 95 | if [[ $@ == *'MLX4'* ]]; then 96 | sed -i 's/CONFIG_RTE_LIBRTE_MLX4_PMD=n/CONFIG_RTE_LIBRTE_MLX4_PMD=y/' config/common_base 97 | else 98 | sed -i 's/CONFIG_RTE_LIBRTE_MLX4_PMD=y/CONFIG_RTE_LIBRTE_MLX4_PMD=n/' config/common_base 99 | fi 100 | 101 | make defconfig 
T=x86_64-native-linuxapp-gcc 102 | make EXTRA_CFLAGS="${DPDK_ARGS}" -j 103 | 104 | if [[ $@ == *'INSTALL'* ]]; then 105 | if [[ $@ == *'CONDA'* ]]; then 106 | make install-sdk install-runtime prefix=${CONDA_PREFIX} 107 | else 108 | make install 109 | fi 110 | fi 111 | fi 112 | 113 | 114 | if [[ $@ != *'SKIP_DAIET'* ]]; then 115 | cd $CWD/daiet 116 | # Build DAIET 117 | make clean 118 | rm -rf build 119 | EXTRA_CXX_FLAGS=${DAIET_EXTRA_CXX_FLAGS} make ${DAIET_ARGS} -j 120 | if [[ $@ == *'INSTALL'* ]]; then 121 | if [[ $@ == *'CONDA'* ]]; then 122 | make libinstall PREFIX=${CONDA_PREFIX} 123 | else 124 | make libinstall 125 | fi 126 | fi 127 | fi 128 | 129 | if [[ $@ != *'SKIP_GLOO'* ]]; then 130 | cd $CWD/gloo 131 | # Build Gloo 132 | rm -rf build 133 | mkdir build 134 | cd build 135 | 136 | if [[ $@ == *'DEBUG'* ]]; then 137 | CXXFLAGS='-g -O0' cmake -DUSE_DAIET=1 -DUSE_REDIS=1 -DUSE_AVX=1 -DUSE_MPI=1 $GLOO_CMAKE_ARGS .. 138 | else 139 | cmake -DBUILD_TEST=OFF -DBUILD_BENCHMARK=OFF -DUSE_DAIET=1 -DUSE_REDIS=1 -DUSE_AVX=1 -DUSE_MPI=1 $GLOO_CMAKE_ARGS .. 140 | fi 141 | 142 | make -j 143 | if [[ $@ == *'INSTALL'* ]]; then 144 | cd $CWD/gloo/build 145 | if [[ $@ == *'CONDA'* ]]; then 146 | cmake -DCMAKE_INSTALL_PREFIX=${CONDA_PREFIX} .. 147 | fi 148 | make install 149 | fi 150 | fi 151 | 152 | 153 | # Build experiments 154 | if [[ $@ != *'SKIP_EXPS'* ]]; then 155 | cd $CWD/daiet/experiments/exp1/ 156 | mkdir -p build 157 | cd build 158 | find . ! -name 'daiet.cfg' ! -name '.' ! -name '..' -exec rm -rf {} + 159 | cmake ${EXP_ARGS} .. 160 | make -j 161 | fi 162 | 163 | if [[ $@ != *'SKIP_EXPS'* ]]; then 164 | cd $CWD/daiet/experiments/exp2/ 165 | mkdir -p build 166 | cd build 167 | find . ! -name 'daiet.cfg' ! -name '.' ! -name '..' -exec rm -rf {} + 168 | cmake ${EXP_ARGS} .. 169 | make -j 170 | fi 171 | 172 | # Build example 173 | if [[ $@ != *'SKIP_EXAMPLE'* ]]; then 174 | cd $CWD/daiet/example 175 | mkdir -p build 176 | cd build 177 | find . ! -name 'daiet.cfg' ! 
-name '.' ! -name '..' -exec rm -rf {} + 178 | cmake ${EXP_ARGS} .. 179 | make -j 180 | fi 181 | 182 | # Build dedicated PS 183 | if [[ $@ != *'SKIP_PS'* ]]; then 184 | cd $CWD/daiet/ps 185 | make clean 186 | make ${PS_ARGS} -j 187 | fi 188 | 189 | # Build PyTorch 190 | if [[ $@ == *'PYTORCH'* ]]; then 191 | cd $CWD/pytorch 192 | OFFLOAD_BITMAP=$OFFLOAD_BITMAP BUILD_TEST=0 BUILD_CAFFE2=0 USE_SYSTEM_NCCL=1 NCCL_INCLUDE_DIR=${CONDA_PREFIX}/include NCCL_LIB_DIR=${CONDA_PREFIX}/lib ${CONDA_PREFIX}/bin/python setup.py install --prefix=${CONDA_PREFIX} --record=`basename ${CONDA_PREFIX}`_files.txt 193 | fi 194 | -------------------------------------------------------------------------------- /omnireduce-DPDK/daiet/.gitignore: -------------------------------------------------------------------------------- 1 | *.log 2 | /build 3 | /lib/dpdk/build 4 | /example/build 5 | /ps/build 6 | /experiments/exp1/build 7 | /experiments/exp2/build 8 | -------------------------------------------------------------------------------- /omnireduce-DPDK/daiet/Makefile: -------------------------------------------------------------------------------- 1 | # DAIET project 2 | # author: amedeo.sapio@kaust.edu.sa 3 | 4 | ifeq ($(DAIET_PATH),) 5 | DAIET_PATH = $(shell pwd) 6 | export DAIET_PATH 7 | endif 8 | 9 | RTE_SDK = ${DAIET_PATH}/lib/dpdk 10 | RTE_TARGET = build 11 | 12 | include $(RTE_SDK)/mk/rte.vars.mk 13 | 14 | # App name 15 | APPNAME = daiet 16 | 17 | # binary name 18 | LIB = libdaiet.a 19 | 20 | # install directory 21 | PREFIX = /usr/local 22 | 23 | # all source are stored in SRCS-y 24 | SRCS-y := $(shell find ${DAIET_PATH}/src -maxdepth 1 -name "*.cpp") 25 | HDRS := $(shell find ${DAIET_PATH}/src -maxdepth 1 -name "*.hpp" -o -name "*.h") 26 | 27 | #SIMDFLAGS = -msse2 -mssse3 -msse4.1 -msse4.2 -mavx -fabi-version=0 -mfma -mavx2 -mavx512f -mavx512dq -mavx512cd -mavx512bw -mavx512vl 28 | CXXFLAGS += -Wall -Wextra -std=c++11 -fPIC -I ${DAIET_PATH}/../gloo/ -I ${DAIET_PATH}/lib/ 29 | LDFLAGS 
+= -lstdc++ -l boost_program_options 30 | 31 | ifeq ($(COLOCATED),ON) 32 | $(info "COLOCATED ON") 33 | CXXFLAGS += -DCOLOCATED 34 | endif 35 | 36 | ifeq ($(MLX),ON) 37 | $(info "MLX ON") 38 | CXXFLAGS += -DMLX 39 | endif 40 | 41 | ifeq ($(LATENCIES),ON) 42 | $(info "LATENCIES ON") 43 | CXXFLAGS += -DLATENCIES 44 | endif 45 | 46 | ifeq ($(TIMERS),ON) 47 | $(info "TIMERS ON") 48 | CXXFLAGS += -DTIMERS 49 | endif 50 | 51 | ifeq ($(NOSCALING),ON) 52 | $(info "NOSCALING ON") 53 | CXXFLAGS += -DNOSCALING 54 | endif 55 | 56 | ifeq ($(ALGO2),ON) 57 | $(info "ALGO2 ON") 58 | CXXFLAGS += -DALGO2 59 | endif 60 | 61 | ifeq ($(COUNTERS),ON) 62 | $(info "COUNTERS ON") 63 | CXXFLAGS += -DCOUNTERS 64 | endif 65 | 66 | ifeq ($(NO_FILL_STORE),ON) 67 | $(info "NO_FILL_STORE ON") 68 | CXXFLAGS += -DNO_FILL_STORE 69 | endif 70 | 71 | ifeq ($(DEBUG),ON) 72 | $(info "DEBUG ON") 73 | CXXFLAGS += -DDEBUG -g -O0 74 | else 75 | CXXFLAGS += -O3 76 | endif 77 | 78 | ifeq ($(OFFLOAD_BITMAP),ON) 79 | $(info "OFFLOAD BITMAP ON") 80 | CXXFLAGS += -DOFFLOAD_BITMAP 81 | endif 82 | 83 | .PHONY: local_install 84 | local_install: _postbuild 85 | $(Q)$(MAKE) clean 86 | $(RM) build/_postbuild 87 | $(RM) _postclean 88 | mkdir -p build/include/$(APPNAME) 89 | cp $(HDRS) build/include/$(APPNAME) 90 | 91 | include $(RTE_SDK)/mk/rte.extlib.mk 92 | 93 | distclean: clean 94 | $(RM) -r build 95 | 96 | .PHONY: libinstall 97 | libinstall: 98 | mkdir -p $(DESTDIR)$(PREFIX)/lib 99 | mkdir -p $(DESTDIR)$(PREFIX)/include/$(APPNAME) 100 | cp $(RTE_OUTPUT)/$(LIB) $(DESTDIR)$(PREFIX)/lib/$(LIB) 101 | cp $(HDRS) $(DESTDIR)$(PREFIX)/include/$(APPNAME) 102 | 103 | .PHONY: libuninstall 104 | libuninstall: 105 | $(RM) $(DESTDIR)$(PREFIX)/lib/$(LIB) 106 | $(RM) -r $(DESTDIR)$(PREFIX)/include/$(APPNAME) 107 | -------------------------------------------------------------------------------- /omnireduce-DPDK/daiet/README.md: -------------------------------------------------------------------------------- 1 | # DAIET 2 | ### 
Install dependencies 3 | - Dependencies: 4 | `make, coreutils, gcc, libc headers, kernel headers, libnuma-dev, Python version 2.7+ or 3.2+, kmod, pciutils, build-essential, boost-program-options, cmake, libhiredis-dev` 5 | 6 | In Debian/Ubuntu: 7 | ```sh 8 | apt install -f make \ 9 | coreutils \ 10 | gcc \ 11 | libc6-dev \ 12 | linux-headers-$(uname -r) \ 13 | libnuma-dev \ 14 | python \ 15 | kmod \ 16 | pciutils \ 17 | build-essential \ 18 | libboost-program-options-dev \ 19 | libboost-all-dev \ 20 | cmake \ 21 | libhiredis-dev 22 | ``` 23 | 24 | In Fedora: 25 | ```sh 26 | dnf install make \ 27 | automake \ 28 | coreutils \ 29 | gcc \ 30 | gcc-c++ \ 31 | glibc-devel \ 32 | kernel-devel \ 33 | kernel-headers \ 34 | numactl-devel \ 35 | python \ 36 | kmod \ 37 | pciutils \ 38 | boost-program-options 39 | ``` 40 | 41 | ### DPDK Setup 42 | - See [here](https://doc.dpdk.org/guides/linux_gsg/sys_reqs.html) 43 | 44 | ### Compile DPDK 45 | - Run: 46 | ```sh 47 | cd lib/dpdk 48 | make defconfig T=x86_64-native-linuxapp-gcc 49 | make -j 50 | sudo make install 51 | cd ../.. 52 | ``` 53 | 54 | ### Bind the interfaces 55 | - Set the name of the interface to bind with DPDK in `dpdk-config.sh` 56 | - Run: `. 
./dpdk-config.sh` 57 | 58 | ### Compile the DAIET library 59 | - Run: 60 | ```sh 61 | make -j 62 | sudo make libinstall 63 | ``` 64 | 65 | ### Configuration 66 | - Configuration parameters (e.g., number of workers, IPs and ports) in `daiet.cfg` 67 | 68 | ## Utils 69 | 70 | - Get the hugepage size: 71 | ```sh 72 | awk '/Hugepagesize/ {print $2}' /proc/meminfo 73 | ``` 74 | 75 | - Get the total huge page numbers: 76 | ```sh 77 | awk '/HugePages_Total/ {print $2} ' /proc/meminfo 78 | ``` 79 | 80 | - Unmount the hugepages: 81 | ```sh 82 | umount `awk '/hugetlbfs/ {print $2}' /proc/mounts` 83 | ``` 84 | 85 | - Mount hugepage folder: 86 | ```sh 87 | mkdir -p /mnt/huge 88 | mount -t hugetlbfs nodev /mnt/huge 89 | ``` 90 | 91 | - Check the CPU layout using using the DPDK cpu\_layout utility: 92 | ```sh 93 | cd lib/dpdk 94 | usertools/cpu_layout.py 95 | ``` 96 | 97 | - Check your NIC id and related socket id: 98 | ```sh 99 | # List all the NICs with PCI address and device IDs. 100 | lspci -nn | grep Eth 101 | ``` 102 | - Check the PCI device related numa node id: 103 | ```sh 104 | cat /sys/bus/pci/devices/0000\:xx\:00.x/numa_node 105 | ``` 106 | Usually 0x:00.x is on socket 0 and 8x:00.x is on socket 1. 107 | Note: To get the best performance, ensure that the core and NICs are in the same socket. In the example above 85:00.0 is on socket 1 and should be used by cores on socket 1 for the best performance. 
108 | 109 | > Note: 110 | > to support C++ applications, DPDK is patched with: 111 | > ```sh 112 | > cd lib/dpdk 113 | > patch -p1 < cpp_support.patch 114 | > ``` 115 | -------------------------------------------------------------------------------- /omnireduce-DPDK/daiet/example/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | project(switchml) 2 | cmake_minimum_required(VERSION 3.5) 3 | 4 | set(USE_MLX5_DEFAULT OFF) 5 | set(USE_MLX4_DEFAULT OFF) 6 | set(DEBUG_DEFAULT OFF) 7 | 8 | # Options 9 | option(USE_MLX5 "Use MLX5 and ibverbs" ${USE_MLX5_DEFAULT}) 10 | option(USE_MLX4 "Use MLX4 and ibverbs" ${USE_MLX4_DEFAULT}) 11 | option(DEBUG "Compile in debug mode" ${DEBUG_DEFAULT}) 12 | 13 | if(DEBUG) 14 | message(WARNING "Compile in debug mode") 15 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -fPIC -g -O0") 16 | else() 17 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -fPIC -O3 -DNDEBUG") 18 | endif() 19 | 20 | add_definitions("-DHIREDIS_NESTED_INCLUDE") 21 | 22 | include_directories(${CMAKE_SOURCE_DIR}/../../gloo) 23 | include_directories(${CMAKE_SOURCE_DIR}/../build/include) 24 | link_directories(${CMAKE_SOURCE_DIR}/../build) 25 | link_directories(${CMAKE_SOURCE_DIR}/../lib/dpdk/build/lib) 26 | 27 | add_executable(example main.cpp) 28 | 29 | target_link_libraries(example -Wl,--whole-archive daiet dpdk -Wl,--no-whole-archive dl numa boost_chrono boost_system boost_thread boost_program_options pthread) 30 | 31 | if(USE_MLX5) 32 | target_link_libraries(example ibverbs mlx5 mnl) 33 | endif() 34 | if(USE_MLX4) 35 | target_link_libraries(example ibverbs mlx4 mnl) 36 | endif() 37 | -------------------------------------------------------------------------------- /omnireduce-DPDK/daiet/example/daiet.cfg: -------------------------------------------------------------------------------- 1 | # 2 | # DAIET project 3 | # author: amedeo.sapio@kaust.edu.sa 4 | # 5 | 6 | [daiet] 7 | # Number of workers 8 | num_workers = 
8 9 | # Weights per packet 10 | num_updates = 35 11 | # Maximum number of pending messages 12 | max_num_pending_messages = 256 13 | # Worker UDP port 14 | worker_port = 4000 15 | # Parameter Server UDP port 16 | ps_port = 48864 17 | # Worker IP 18 | worker_ip = 10.0.0.1 19 | # Parameter Server IP and MACS 20 | ps_ips = 10.0.0.101, 10.0.0.102, 10.0.0.103, 10.0.0.104, 10.0.0.105, 10.0.0.106, 10.0.0.107, 10.0.0.108 21 | ps_macs = 0c:c4:7a:63:76:ab, 0c:c4:7a:63:76:ed, 0c:c4:7a:63:75:51, 0c:c4:7a:63:78:33, 0c:c4:7a:63:76:eb, 0c:c4:7a:63:76:dd, 0c:c4:7a:63:76:e3, 0c:c4:7a:63:78:31 22 | 23 | [dpdk] 24 | # Number of cores 25 | cores = 0-3 26 | # Process prefix 27 | prefix = daiet 28 | # Extra EAL options 29 | extra_eal_options = 30 | # Port id 31 | port_id = 0 32 | # Pool and pool cache sizes 33 | pool_size = 262144 34 | pool_cache_size = 512 35 | # Number of packets in a burst 36 | burst_rx = 64 37 | burst_tx = 64 38 | # Bulk drain timer (microseconds) 39 | bulk_drain_tx_us = 10 40 | -------------------------------------------------------------------------------- /omnireduce-DPDK/daiet/example/main.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | using namespace daiet; 9 | using namespace std; 10 | 11 | void signal_handler(int signum) { 12 | if (signum == SIGINT || signum == SIGTERM) { 13 | cout << " Signal " << signum << " received, preparing to exit..."; 14 | exit(EXIT_SUCCESS); 15 | } 16 | } 17 | 18 | int main() { 19 | 20 | DaietContext& ctx = DaietContext::getInstance(); 21 | 22 | long int count = 1024 * 1024 * 200; 23 | int num_rounds = 10; 24 | int num_workers = 2; 25 | float base_value = 1.2; // must be [1,2[ 26 | double accuracy=0.0001; 27 | int min_exp = -126 + log2(num_workers * count * num_rounds); 28 | int max_exp = 127 - log2(num_workers * count * num_rounds); 29 | int exp = min_exp; 30 | 31 | int faulty = 0, neg = 1, base_value_int = 
(base_value-1)*10; 32 | double avg_err = 0, err = 0; 33 | 34 | int32_t* p = new int32_t[count]; 35 | int32_t expected_int; 36 | 37 | float* fp = new float[count]; 38 | float expected_float; 39 | 40 | /* Set signal handler */ 41 | signal(SIGINT, signal_handler); 42 | signal(SIGTERM, signal_handler); 43 | 44 | for (int jj = 1; jj <= num_rounds; jj++) { 45 | 46 | std::cout << "INT round " << jj << std::endl; 47 | 48 | faulty = 0; 49 | neg = 1; 50 | 51 | for (int i = 0; i < count; i++) { 52 | p[i] = neg * base_value_int * jj * i; 53 | 54 | neg = -neg; 55 | } 56 | 57 | auto begin = std::chrono::high_resolution_clock::now(); 58 | if (!ctx.try_daiet(p, count,1)){ 59 | cout << "Daiet failed"; 60 | exit(EXIT_FAILURE); 61 | } 62 | auto end = std::chrono::high_resolution_clock::now(); 63 | 64 | neg = 1; 65 | for (int i = 0; i < count; i++) { 66 | 67 | expected_int = neg * base_value_int * jj * i * num_workers; 68 | 69 | if (p[i] != expected_int) { 70 | faulty++; 71 | std::cerr << "Index: " << i 72 | << " Received: " << p[i] 73 | << " Expected: " << expected_int << std::endl; 74 | } 75 | 76 | neg = -neg; 77 | } 78 | 79 | std::cout << "Done INT round " << jj 80 | << ": Faulty: " << faulty 81 | << " Time: " << std::chrono::duration_cast(end - begin).count() 82 | << " ms" << std::endl; 83 | } 84 | 85 | for (int jj = 1; jj <= num_rounds; jj++) { 86 | 87 | std::cout << "FLOAT round " << jj << std::endl; 88 | 89 | faulty = 0; 90 | neg = 1; 91 | 92 | for (int i = 0; i < count; i++) { 93 | 94 | fp[i] = neg * ldexpf(base_value,exp) * jj * i; 95 | 96 | neg = -neg; 97 | exp++; 98 | 99 | if (exp > max_exp) 100 | exp=min_exp; 101 | } 102 | 103 | auto begin = std::chrono::high_resolution_clock::now(); 104 | if (!ctx.try_daiet(fp, count,1)){ 105 | cout << "Daiet failed"; 106 | exit(EXIT_FAILURE); 107 | } 108 | 109 | auto end = std::chrono::high_resolution_clock::now(); 110 | 111 | neg = 1; 112 | 113 | for (int i = 0; i < count; i++) { 114 | 115 | expected_float = neg * ldexpf(base_value,exp) 
* jj * i * num_workers; 116 | 117 | err = abs(expected_float - fp[i]) / abs(expected_float); 118 | 119 | if (err > accuracy){ 120 | 121 | faulty++; 122 | avg_err += err; 123 | 124 | std::cerr << "Index: " << i 125 | << " Received: " << fp[i] 126 | << " Expected: " << expected_float 127 | << " Error: " << err*100<<"%"< max_exp) 134 | exp=min_exp; 135 | } 136 | 137 | avg_err = avg_err * 100 / count; 138 | 139 | std::cout << "Done FLOAT round " << jj 140 | << ": Faulty: " << faulty 141 | << " AVG err: "<< avg_err 142 | <<"% Time: " << std::chrono::duration_cast(end - begin).count() 143 | << " ms" << std::endl; 144 | } 145 | 146 | exit(EXIT_SUCCESS); 147 | } 148 | -------------------------------------------------------------------------------- /omnireduce-DPDK/daiet/experiments/exp1/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | project(switchml) 2 | cmake_minimum_required(VERSION 3.5) 3 | 4 | set(USE_VANILLA_DEFAULT OFF) 5 | set(USE_MLX5_DEFAULT OFF) 6 | set(USE_MLX4_DEFAULT OFF) 7 | set(DEBUG_DEFAULT OFF) 8 | 9 | # Options 10 | option(USE_VANILLA "Use vanilla version of gloo" ${USE_VANILLA_DEFAULT}) 11 | option(USE_MLX5 "Use MLX5 and ibverbs" ${USE_MLX5_DEFAULT}) 12 | option(USE_MLX4 "Use MLX4 and ibverbs" ${USE_MLX4_DEFAULT}) 13 | option(DEBUG "Compile in debug mode" ${DEBUG_DEFAULT}) 14 | 15 | if(DEBUG) 16 | message(WARNING "Compile in debug mode") 17 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -fPIC -g -O0") 18 | else() 19 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -fPIC -O3 -DNDEBUG") 20 | endif() 21 | 22 | add_definitions("-DHIREDIS_NESTED_INCLUDE") 23 | 24 | link_directories(${CMAKE_SOURCE_DIR}/../../../gloo/build/gloo) 25 | include_directories(${CMAKE_SOURCE_DIR}/../../../gloo/build) 26 | include_directories(${CMAKE_SOURCE_DIR}/../../../gloo) 27 | 28 | if(NOT USE_VANILLA) 29 | include_directories(${CMAKE_SOURCE_DIR}/../../build/include) 30 | 
link_directories(${CMAKE_SOURCE_DIR}/../../build) 31 | link_directories(${CMAKE_SOURCE_DIR}/../../lib/dpdk/build/lib) 32 | else() 33 | message(WARNING "Compiling with vanilla gloo") 34 | endif() 35 | 36 | add_executable(exp1 main.cc) 37 | 38 | if(NOT USE_VANILLA) 39 | target_link_libraries(exp1 -Wl,--whole-archive daiet dpdk -Wl,--no-whole-archive dl numa boost_chrono boost_system boost_thread boost_program_options) 40 | endif() 41 | 42 | if(USE_MLX5) 43 | target_link_libraries(exp1 ibverbs mlx5 mnl) 44 | endif() 45 | 46 | if(USE_MLX4) 47 | target_link_libraries(exp1 ibverbs mlx4 mnl) 48 | endif() 49 | 50 | target_link_libraries(exp1 gloo hiredis pthread) 51 | -------------------------------------------------------------------------------- /omnireduce-DPDK/daiet/experiments/exp1/README.md: -------------------------------------------------------------------------------- 1 | # DEPENDENDENCIES 2 | apt install libboost-chrono-dev libboost-system-dev libboost-thread-dev 3 | 4 | # COMPILE 5 | Build gloo with the "-DUSE\_REDIS=ON" option 6 | -------------------------------------------------------------------------------- /omnireduce-DPDK/daiet/experiments/exp1/common.h: -------------------------------------------------------------------------------- 1 | #include "malloc.h" 2 | // Align buffers to 32 bytes to support vectorized code 3 | const size_t kBufferAlignment = 32; 4 | 5 | template 6 | class aligned_allocator { 7 | public: 8 | using value_type = T; 9 | using pointer = value_type*; 10 | using const_pointer = const value_type*; 11 | using reference = value_type&; 12 | using const_reference = const value_type&; 13 | using size_type = std::size_t; 14 | using difference_type = std::ptrdiff_t; 15 | 16 | template 17 | struct rebind { 18 | using other = aligned_allocator; 19 | }; 20 | 21 | inline explicit aligned_allocator() = default; 22 | inline ~aligned_allocator() = default; 23 | inline explicit aligned_allocator(const aligned_allocator& a) = default; 24 | 25 | inline 
pointer address(reference r) { 26 | return &r; 27 | } 28 | inline const_pointer address(const_reference r) { 29 | return &r; 30 | } 31 | 32 | inline pointer allocate( 33 | size_type sz, 34 | typename std::allocator::const_pointer = 0) { 35 | auto x = memalign(ALIGNMENT, sizeof(T) * sz); 36 | return reinterpret_cast(x); 37 | } 38 | 39 | void deallocate(pointer p, size_type /*sz*/) { 40 | free(p); 41 | } 42 | }; 43 | -------------------------------------------------------------------------------- /omnireduce-DPDK/daiet/experiments/exp1/main.cc: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | #include "gloo/barrier_all_to_one.h" 10 | #include "gloo/allreduce_halving_doubling.h" 11 | #include "gloo/rendezvous/context.h" 12 | #include "gloo/rendezvous/redis_store.h" 13 | #include "gloo/rendezvous/prefix_store.h" 14 | #include "gloo/transport/tcp/device.h" 15 | #if GLOO_USE_IBVERBS 16 | #include "gloo/transport/ibverbs/device.h" 17 | #endif 18 | 19 | #include "common.h" 20 | 21 | using namespace std; 22 | 23 | shared_ptr context; 24 | 25 | void signal_handler(int signum) { 26 | 27 | if (signum == SIGINT || signum == SIGTERM) { 28 | 29 | cerr << " Signal " << signum << " received!"; 30 | 31 | #ifdef DAIET 32 | context->daietContext.StopMaster(); 33 | #endif 34 | exit(1); 35 | } 36 | } 37 | 38 | int main(int argc, char* argv[]) { 39 | 40 | if (argc != 8) { 41 | #if GLOO_USE_IBVERBS 42 | cout << " Usage: " << argv[0] << " [rdma:|tcp:]INTERFACE REDIS_SERVER_IP PREFIX NUM_WORKERS RANK TENSOR_SIZE NUM_ROUNDS" << endl; 43 | #else 44 | cout << " Usage: " << argv[0] << " INTERFACE REDIS_SERVER_IP PREFIX NUM_WORKERS RANK TENSOR_SIZE NUM_ROUNDS" << endl; 45 | #endif 46 | return 0; 47 | } 48 | 49 | /* Set signal handler */ 50 | signal(SIGINT, signal_handler); 51 | signal(SIGTERM, signal_handler); 52 | 53 | vector> data; 54 | int roundnum = 0; 55 | 56 | 
// GLOO transport 57 | std::shared_ptr dev; 58 | 59 | #if GLOO_USE_IBVERBS 60 | if (strncmp("rdma:", argv[1], 5) == 0) { 61 | string name(argv[1] + 5); 62 | gloo::transport::ibverbs::attr attr = { 63 | .name = name, 64 | .port = 1, 65 | .index = 0, 66 | }; 67 | dev = gloo::transport::ibverbs::CreateDevice(attr); 68 | } else { 69 | if (strncmp("tcp:", argv[1], 4) == 0) { 70 | argv[1] += 4; 71 | } 72 | string iface(argv[1]); 73 | gloo::transport::tcp::attr attr; 74 | attr.iface = iface; 75 | dev = gloo::transport::tcp::CreateDevice(attr); 76 | } 77 | #else 78 | gloo::transport::tcp::attr attr; 79 | string iface(argv[1]); 80 | attr.iface = iface; 81 | dev = gloo::transport::tcp::CreateDevice(attr); 82 | #endif 83 | 84 | // Rendezvous 85 | auto redisStore = gloo::rendezvous::RedisStore(argv[2]); 86 | string prefix = argv[3]; 87 | auto prefixStore = gloo::rendezvous::PrefixStore(prefix, redisStore); 88 | 89 | const int size = atoi(argv[4]); 90 | const int rank = atoi(argv[5]); 91 | const int tensor_size = atoi(argv[6]); 92 | const int num_rounds = atoi(argv[7]); 93 | 94 | // Init data 95 | data.reserve(tensor_size); 96 | cout << "-- Tensor initialization" << endl; 97 | for (int i = 0; i < tensor_size; i++) { 98 | data.insert(data.begin()+i, 1); 99 | } 100 | cout << "---- Ended" << endl; 101 | 102 | vector ptrs; 103 | ptrs.push_back(&data[0]); 104 | 105 | int count = data.size(); 106 | 107 | // Context 108 | context = make_shared(rank, size); 109 | context->connectFullMesh(prefixStore, dev); 110 | 111 | auto barrier = make_shared(context); 112 | 113 | barrier->run(); 114 | 115 | //Warm up rounds 116 | for (int i=0; i<10; i++){ 117 | auto allreduce = make_shared>(context, ptrs, count); 118 | allreduce->run(); 119 | } 120 | 121 | // Start rounds 122 | for (roundnum = 0; roundnum < num_rounds; roundnum++) { 123 | // Instantiate the collective algorithm 124 | auto allreduce = make_shared>(context, ptrs, count); 125 | 126 | cout << "-- Allreduce Round " << roundnum << endl; 
# omnireduce-DPDK/daiet/experiments/exp2/CMakeLists.txt
#
# fix: cmake_minimum_required() must be called before project();
# fix: the four per-target add_executable/target_link_libraries stanzas were
#      copy-pasted — deduplicated with a foreach() loop (same link order:
#      daiet/dpdk group first, then ibverbs, then gloo/hiredis/pthread).
cmake_minimum_required(VERSION 3.5)
project(switchml)
find_package(MPI)

set(USE_VANILLA_DEFAULT OFF)
set(USE_MLX5_DEFAULT OFF)
set(USE_MLX4_DEFAULT OFF)
set(DEBUG_DEFAULT OFF)

# Options
option(USE_VANILLA "Use vanilla version of gloo" ${USE_VANILLA_DEFAULT})
option(USE_MLX5 "Use MLX5 and ibverbs" ${USE_MLX5_DEFAULT})
option(USE_MLX4 "Use MLX4 and ibverbs" ${USE_MLX4_DEFAULT})
option(DEBUG "Compile in debug mode" ${DEBUG_DEFAULT})

if(DEBUG)
    message(WARNING "Compile in debug mode")
    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -fPIC -g -O0")
else()
    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -fPIC -O3 -DNDEBUG")
endif()

add_definitions("-DHIREDIS_NESTED_INCLUDE")

link_directories(${CMAKE_SOURCE_DIR}/../../../gloo/build/gloo)
include_directories(${CMAKE_SOURCE_DIR}/../../../gloo/build)
include_directories(${CMAKE_SOURCE_DIR}/../../../gloo)
include_directories(SYSTEM ${MPI_INCLUDE_PATH})

if(NOT USE_VANILLA)
    include_directories(${CMAKE_SOURCE_DIR}/../../build/include)
    link_directories(${CMAKE_SOURCE_DIR}/../../build)
    link_directories(${CMAKE_SOURCE_DIR}/../../lib/dpdk/build/lib)
else()
    message(WARNING "Compiling with vanilla gloo")
endif()

set(EXP_TARGETS float16 float32 int32 switchml_dense)

foreach(target IN LISTS EXP_TARGETS)
    add_executable(${target} ${target}.cc)

    if(NOT USE_VANILLA)
        target_link_libraries(${target} -Wl,--whole-archive daiet dpdk -Wl,--no-whole-archive dl numa boost_chrono boost_system boost_thread boost_program_options)
    endif()
    if(USE_MLX5)
        target_link_libraries(${target} ibverbs mlx5 mnl)
    endif()
    if(USE_MLX4)
        target_link_libraries(${target} ibverbs mlx4 mnl)
    endif()

    target_link_libraries(${target} gloo hiredis pthread)
endforeach()

# Only the MPI-driven benchmark links against MPI.
target_link_libraries(switchml_dense ${MPI_C_LIBRARIES})
/omnireduce-DPDK/daiet/experiments/exp2/README.md: -------------------------------------------------------------------------------- 1 | # DEPENDENDENCIES 2 | apt install libboost-chrono-dev libboost-system-dev libboost-thread-dev 3 | 4 | # COMPILE 5 | Build gloo with the "-DUSE\_REDIS=ON" option 6 | -------------------------------------------------------------------------------- /omnireduce-DPDK/daiet/experiments/exp2/common.h: -------------------------------------------------------------------------------- 1 | #include "malloc.h" 2 | // Align buffers to 32 bytes to support vectorized code 3 | const size_t kBufferAlignment = 32; 4 | 5 | template 6 | class aligned_allocator { 7 | static_assert( 8 | !(ALIGNMENT & (ALIGNMENT - 1)), 9 | "alignment must be a power of 2"); 10 | 11 | public: 12 | using value_type = T; 13 | using pointer = value_type*; 14 | using const_pointer = const value_type*; 15 | using reference = value_type&; 16 | using const_reference = const value_type&; 17 | using size_type = std::size_t; 18 | using difference_type = std::ptrdiff_t; 19 | 20 | template 21 | struct rebind { 22 | using other = aligned_allocator; 23 | }; 24 | 25 | inline explicit aligned_allocator() = default; 26 | inline ~aligned_allocator() = default; 27 | inline explicit aligned_allocator(const aligned_allocator& a) = default; 28 | 29 | inline pointer address(reference r) { 30 | return &r; 31 | } 32 | 33 | inline const_pointer address(const_reference r) { 34 | return &r; 35 | } 36 | 37 | inline pointer allocate( 38 | size_type sz, 39 | typename std::allocator::const_pointer = 0) { 40 | pointer p; 41 | if (posix_memalign( 42 | reinterpret_cast(&p), ALIGNMENT, sizeof(T) * sz)) { 43 | abort(); 44 | } 45 | return p; 46 | } 47 | 48 | void deallocate(pointer p, size_type /*sz*/) { 49 | free(p); 50 | } 51 | }; 52 | -------------------------------------------------------------------------------- /omnireduce-DPDK/daiet/experiments/exp2/float16.cc: 
-------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | #include "gloo/allreduce_halving_doubling.h" 7 | #include "gloo/rendezvous/context.h" 8 | #include "gloo/rendezvous/redis_store.h" 9 | #include "gloo/rendezvous/prefix_store.h" 10 | #include "gloo/transport/tcp/device.h" 11 | #include "gloo/barrier_all_to_one.h" 12 | #include "gloo/types.h" 13 | 14 | #include 15 | 16 | #include "common.h" 17 | 18 | using namespace std; 19 | 20 | shared_ptr context; 21 | 22 | void signal_handler(int signum) { 23 | 24 | if (signum == SIGINT || signum == SIGTERM) { 25 | 26 | cerr << " Signal " << signum << " received!"; 27 | 28 | #ifdef DAIET 29 | context->daietContext.StopMaster(); 30 | #endif 31 | exit(1); 32 | 33 | } 34 | } 35 | 36 | int main(int argc, char* argv[]) { 37 | 38 | if (argc != 8) { 39 | cout << " Usage: " << argv[0] << " INTERFACE REDIS_SERVER_IP PREFIX NUM_WORKERS RANK TENSOR_SIZE NUM_ROUNDS" << endl; 40 | return 0; 41 | } 42 | 43 | /* Set signal handler */ 44 | signal(SIGINT, signal_handler); 45 | signal(SIGTERM, signal_handler); 46 | 47 | vector> base_data; 48 | vector> data; 49 | int roundnum = 0; 50 | 51 | gloo::float16 elem = gloo::cpu_float2half_rn(0.01), expected; 52 | 53 | // GLOO transport 54 | gloo::transport::tcp::attr attr; 55 | attr.iface = argv[1]; 56 | auto dev = gloo::transport::tcp::CreateDevice(attr); 57 | 58 | // Rendezvous 59 | auto redisStore = gloo::rendezvous::RedisStore(argv[2]); 60 | string prefix = argv[3]; 61 | auto prefixStore = gloo::rendezvous::PrefixStore(prefix, redisStore); 62 | 63 | const int size = atoi(argv[4]); 64 | const int rank = atoi(argv[5]); 65 | const int tensor_size = atoi(argv[6]); 66 | const int num_rounds = atoi(argv[7]); 67 | int num_last_rounds = 0; 68 | 69 | // Init data 70 | base_data.reserve(tensor_size); 71 | data.resize(tensor_size); 72 | cout << "-- Tensor initialization" << endl; 73 | for (int i = 0; i < tensor_size; i++) { 
74 | base_data.insert(base_data.begin() + i, gloo::cpu_float2half_rn(i%100)*elem); 75 | } 76 | copy(base_data.begin(), base_data.end(), data.begin()); 77 | cout << "---- Ended" << endl; 78 | 79 | vector ptrs; 80 | ptrs.push_back(&data[0]); 81 | 82 | int count = data.size(); 83 | 84 | // Context 85 | context = make_shared(rank, size); 86 | context->connectFullMesh(prefixStore, dev); 87 | 88 | auto barrier = make_shared(context); 89 | 90 | barrier->run(); 91 | 92 | //Warm up rounds 93 | for (int i = 0; i < 10; i++) { 94 | auto allreduce = make_shared>(context, ptrs, count); 95 | allreduce->run(); 96 | } 97 | copy(base_data.begin(), base_data.end(), data.begin()); 98 | 99 | // Start rounds 100 | for (roundnum = 0; roundnum < num_rounds; roundnum++) { 101 | 102 | if (roundnum % 10 == 0) { 103 | copy(base_data.begin(), base_data.end(), data.begin()); 104 | num_last_rounds = 0; 105 | } 106 | 107 | // Instantiate the collective algorithm 108 | auto allreduce = make_shared>(context, ptrs, count); 109 | 110 | cout << "-- Allreduce Round " << roundnum << endl; 111 | 112 | auto begin = chrono::high_resolution_clock::now(); 113 | // Run the algorithm 114 | allreduce->run(); 115 | 116 | auto end = chrono::high_resolution_clock::now(); 117 | 118 | cout << "---- Ended" << endl << "#ms " << chrono::duration_cast(end - begin).count() << endl; 119 | num_last_rounds++; 120 | 121 | } 122 | 123 | cout << "-- Final check" << endl; 124 | for (int i = 0; i < tensor_size; i++) { 125 | expected = (i%100) * gloo::cpu_half2float(elem) * powf(size, num_last_rounds); 126 | if (data[i] != expected) { 127 | cout << "---- Failed: index: " << i << " -> received " << data[i] << " instead of " << expected << endl; 128 | break; 129 | } 130 | } 131 | cout << "---- Ended" << endl; 132 | 133 | return 0; 134 | } 135 | 136 | -------------------------------------------------------------------------------- /omnireduce-DPDK/daiet/experiments/exp2/float32.cc: 
-------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | #include "gloo/allreduce_halving_doubling.h" 7 | #include "gloo/rendezvous/context.h" 8 | #include "gloo/rendezvous/redis_store.h" 9 | #include "gloo/rendezvous/prefix_store.h" 10 | #include "gloo/transport/tcp/device.h" 11 | #include "gloo/barrier_all_to_one.h" 12 | #include "gloo/allreduce.h" 13 | 14 | #include 15 | 16 | #include "common.h" 17 | 18 | using namespace std; 19 | 20 | shared_ptr context; 21 | 22 | void signal_handler(int signum) { 23 | 24 | if (signum == SIGINT || signum == SIGTERM) { 25 | 26 | cerr << " Signal " << signum << " received!"; 27 | 28 | #ifdef DAIET 29 | context->daietContext.StopMaster(); 30 | #endif 31 | exit(1); 32 | } 33 | } 34 | 35 | int main(int argc, char* argv[]) { 36 | 37 | if (argc != 8) { 38 | cout << " Usage: " << argv[0] << " INTERFACE REDIS_SERVER_IP PREFIX NUM_WORKERS RANK TENSOR_SIZE NUM_ROUNDS" << endl; 39 | return 0; 40 | } 41 | 42 | /* Set signal handler */ 43 | signal(SIGINT, signal_handler); 44 | signal(SIGTERM, signal_handler); 45 | 46 | vector> base_data; 47 | vector> data; 48 | int roundnum = 0; 49 | 50 | float elem = 0.01, expected = 0; 51 | 52 | // GLOO transport 53 | gloo::transport::tcp::attr attr; 54 | attr.iface = argv[1]; 55 | auto dev = gloo::transport::tcp::CreateDevice(attr); 56 | 57 | // Rendezvous 58 | auto redisStore = gloo::rendezvous::RedisStore(argv[2]); 59 | string prefix = argv[3]; 60 | auto prefixStore = gloo::rendezvous::PrefixStore(prefix, redisStore); 61 | 62 | const int size = atoi(argv[4]); 63 | const int rank = atoi(argv[5]); 64 | const int tensor_size = atoi(argv[6]); 65 | const int num_rounds = atoi(argv[7]); 66 | int num_last_rounds = 0; 67 | 68 | // Init data 69 | base_data.reserve(tensor_size); 70 | data.resize(tensor_size); 71 | cout << "-- Tensor initialization" << endl; 72 | for (int i = 0; i < tensor_size; i++) { 73 | 
base_data.insert(base_data.begin() + i, (i%100)*elem); 74 | } 75 | copy(base_data.begin(), base_data.end(), data.begin()); 76 | cout << "---- Ended" << endl; 77 | 78 | vector ptrs; 79 | ptrs.push_back(&data[0]); 80 | 81 | int count = data.size(); 82 | 83 | // Context 84 | context = make_shared(rank, size); 85 | context->connectFullMesh(prefixStore, dev); 86 | 87 | auto barrier = make_shared(context); 88 | 89 | barrier->run(); 90 | 91 | //Warm up rounds 92 | for (int i = 0; i < 10; i++) { 93 | gloo::AllreduceOptions opts(context); 94 | opts.setOutputs(ptrs, count); 95 | opts.setReduceFunction( 96 | static_cast( 97 | &gloo::sum)); 98 | gloo::allreduce(opts); 99 | //auto allreduce = make_shared>(context, ptrs, count); 100 | //allreduce->run(); 101 | } 102 | copy(base_data.begin(), base_data.end(), data.begin()); 103 | 104 | // Start rounds 105 | for (roundnum = 0; roundnum < num_rounds; roundnum++) { 106 | 107 | if (roundnum % 10 == 0) { 108 | copy(base_data.begin(), base_data.end(), data.begin()); 109 | num_last_rounds = 0; 110 | } 111 | 112 | // Instantiate the collective algorithm 113 | //auto allreduce = make_shared>(context, ptrs, count); 114 | 115 | cout << "-- Allreduce Round " << roundnum << endl; 116 | 117 | auto begin = chrono::high_resolution_clock::now(); 118 | // Run the algorithm 119 | //allreduce->run(); 120 | 121 | gloo::AllreduceOptions opts(context); 122 | opts.setOutputs(ptrs, count); 123 | opts.setReduceFunction( 124 | static_cast( 125 | &gloo::sum)); 126 | gloo::allreduce(opts); 127 | auto end = chrono::high_resolution_clock::now(); 128 | 129 | cout << "---- Ended" << endl << "#ms " << chrono::duration_cast(end - begin).count() << endl; 130 | num_last_rounds++; 131 | 132 | } 133 | 134 | cout << "-- Final check" << endl; 135 | for (int i = 0; i < tensor_size; i++) { 136 | expected = (i%100) * elem * powf(size, num_last_rounds); 137 | if (data[i] != expected) { 138 | cout << "---- Failed: index: " << i << " -> received " << data[i] << " instead of " 
<< expected << endl; 139 | break; 140 | } 141 | } 142 | cout << "---- Ended" << endl; 143 | 144 | return 0; 145 | } 146 | -------------------------------------------------------------------------------- /omnireduce-DPDK/daiet/experiments/exp2/int32.cc: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | #include "gloo/allreduce_halving_doubling.h" 7 | #include "gloo/rendezvous/context.h" 8 | #include "gloo/rendezvous/redis_store.h" 9 | #include "gloo/rendezvous/prefix_store.h" 10 | #include "gloo/transport/tcp/device.h" 11 | #include "gloo/barrier_all_to_one.h" 12 | 13 | #include 14 | 15 | #include "common.h" 16 | 17 | using namespace std; 18 | 19 | shared_ptr context; 20 | 21 | void signal_handler(int signum) { 22 | 23 | if (signum == SIGINT || signum == SIGTERM) { 24 | 25 | cerr << " Signal " << signum << " received!"; 26 | 27 | #ifdef DAIET 28 | context->daietContext.StopMaster(); 29 | #endif 30 | exit(1); 31 | } 32 | } 33 | 34 | int main(int argc, char* argv[]) { 35 | 36 | if (argc != 8) { 37 | cout << " Usage: " << argv[0] << " INTERFACE REDIS_SERVER_IP PREFIX NUM_WORKERS RANK TENSOR_SIZE NUM_ROUNDS" << endl; 38 | return 0; 39 | } 40 | 41 | /* Set signal handler */ 42 | signal(SIGINT, signal_handler); 43 | signal(SIGTERM, signal_handler); 44 | 45 | vector> base_data; 46 | vector> data; 47 | int roundnum = 0; 48 | 49 | int32_t elem = 1, expected = 0; 50 | 51 | // GLOO transport 52 | gloo::transport::tcp::attr attr; 53 | attr.iface = argv[1]; 54 | auto dev = gloo::transport::tcp::CreateDevice(attr); 55 | 56 | // Rendezvous 57 | auto redisStore = gloo::rendezvous::RedisStore(argv[2]); 58 | string prefix = argv[3]; 59 | auto prefixStore = gloo::rendezvous::PrefixStore(prefix, redisStore); 60 | 61 | const int size = atoi(argv[4]); 62 | const int rank = atoi(argv[5]); 63 | const int tensor_size = atoi(argv[6]); 64 | const int num_rounds = atoi(argv[7]); 65 | int num_last_rounds 
= 0; 66 | 67 | // Init data 68 | base_data.reserve(tensor_size); 69 | data.resize(tensor_size); 70 | cout << "-- Tensor initialization" << endl; 71 | for (int i = 0; i < tensor_size; i++) { 72 | base_data.insert(base_data.begin() + i, (i%100)*elem); 73 | } 74 | copy(base_data.begin(), base_data.end(), data.begin()); 75 | cout << "---- Ended" << endl; 76 | 77 | vector ptrs; 78 | ptrs.push_back(&data[0]); 79 | 80 | int count = data.size(); 81 | 82 | // Context 83 | context = make_shared(rank, size); 84 | context->connectFullMesh(prefixStore, dev); 85 | 86 | auto barrier = make_shared(context); 87 | 88 | barrier->run(); 89 | 90 | //Warm up rounds 91 | for (int i = 0; i < 10; i++) { 92 | auto allreduce = make_shared>(context, ptrs, count); 93 | allreduce->run(); 94 | } 95 | copy(base_data.begin(), base_data.end(), data.begin()); 96 | 97 | // Start rounds 98 | for (roundnum = 0; roundnum < num_rounds; roundnum++) { 99 | 100 | if (roundnum % 10 == 0) { 101 | copy(base_data.begin(), base_data.end(), data.begin()); 102 | num_last_rounds = 0; 103 | } 104 | 105 | // Instantiate the collective algorithm 106 | auto allreduce = make_shared>(context, ptrs, count); 107 | 108 | cout << "-- Allreduce Round " << roundnum << endl; 109 | 110 | auto begin = chrono::high_resolution_clock::now(); 111 | // Run the algorithm 112 | allreduce->run(); 113 | 114 | auto end = chrono::high_resolution_clock::now(); 115 | 116 | cout << "---- Ended" << endl << "#ms " << chrono::duration_cast(end - begin).count() << endl; 117 | num_last_rounds++; 118 | 119 | } 120 | 121 | cout << "-- Final check" << endl; 122 | for (int i = 0; i < tensor_size; i++) { 123 | expected = (i%100) * elem * powf(size, num_last_rounds); 124 | if (data[i] != expected) { 125 | cout << "---- Failed: index: " << i << " -> received " << data[i] << " instead of " << expected << endl; 126 | break; 127 | } 128 | } 129 | cout << "---- Ended" << endl; 130 | 131 | return 0; 132 | } 133 | 
-------------------------------------------------------------------------------- /omnireduce-DPDK/daiet/experiments/exp2/switchml_dense.cc: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | #include "gloo/allreduce_halving_doubling.h" 9 | #include "gloo/rendezvous/context.h" 10 | #include "gloo/rendezvous/redis_store.h" 11 | #include "gloo/rendezvous/prefix_store.h" 12 | #include "gloo/transport/tcp/device.h" 13 | #include "gloo/barrier_all_to_one.h" 14 | 15 | #include 16 | #include 17 | 18 | #include "mpi.h" 19 | #include "common.h" 20 | 21 | using namespace std; 22 | 23 | //#define SAVE_RESULT 24 | #define OUTPUT_RANK 0 25 | #define INTTYPE 26 | 27 | #ifdef INTTYPE 28 | typedef int ValType; 29 | #else 30 | typedef float ValType; 31 | #endif 32 | 33 | void set_seed(unsigned int seed) { 34 | srand(seed); 35 | srand48(seed); 36 | } 37 | 38 | void set_seed_random(int id) { 39 | set_seed(clock() + (id * 147)); 40 | } 41 | 42 | // Between 0 (included) and max (excluded) 43 | unsigned int get_random_int(unsigned int max) { 44 | return rand()%max; 45 | } 46 | 47 | // Between 0 (included) and max(excluded) 48 | float get_random_float(unsigned int max) { 49 | return drand48()*max; 50 | } 51 | 52 | ValType get_random_value() { 53 | #ifdef FLOATTYPE 54 | return get_random_float(100) - 50; 55 | #elif defined(INTTYPE) 56 | return get_random_int(200) - 100; // Change to int if this changes 57 | #endif 58 | } 59 | 60 | void create_sparse(const unsigned dim, const float density, ValType* v, const int blocksize) { 61 | // Create indices from 0 to dim 62 | 63 | int block_num = (int)(dim/blocksize); 64 | int count = (int)(density*block_num); 65 | std::vector indices(block_num); 66 | std::iota (indices.begin(), indices.end(), 0); 67 | 68 | // Random suffel indices 69 | std::random_shuffle ( indices.begin(), indices.end() ); 70 | // Sort first count items 71 | 
std::sort( indices.begin(), indices.begin() + count); 72 | 73 | size_t idx = 0; 74 | for(std::vector::const_iterator index = indices.begin(); index != indices.end() && index < indices.begin() + count; ++index) { 75 | for(int i=(*index)*blocksize;i<(*index+1)*blocksize; i++){ 76 | ValType val = get_random_value(); 77 | v[i]= val; 78 | } 79 | } 80 | return; 81 | } 82 | 83 | shared_ptr context; 84 | 85 | void signal_handler(int signum) { 86 | 87 | if (signum == SIGINT || signum == SIGTERM) { 88 | 89 | cerr << " Signal " << signum << " received!"; 90 | 91 | #ifdef DAIET 92 | context->daietContext.StopMaster(); 93 | #endif 94 | exit(1); 95 | } 96 | } 97 | 98 | int main(int argc, char* argv[]) { 99 | 100 | 101 | MPI_Init(&argc, &argv); 102 | if (argc != 9) { 103 | cout << " Usage: " << argv[0] << " INTERFACE REDIS_SERVER_IP PREFIX NUM_WORKERS RANK TENSOR_SIZE NUM_ROUNDS DENSITY" << endl; 104 | return 0; 105 | } 106 | 107 | int myrank, worldsize; 108 | MPI_Comm_size(MPI_COMM_WORLD, &worldsize); 109 | MPI_Comm_rank(MPI_COMM_WORLD, &myrank); 110 | 111 | /* Set signal handler */ 112 | signal(SIGINT, signal_handler); 113 | signal(SIGTERM, signal_handler); 114 | 115 | vector> base_data; 116 | vector> data; 117 | vector> results; 118 | 119 | 120 | int roundnum = 0; 121 | 122 | float elem = 0.01, expected = 0; 123 | 124 | // GLOO transport 125 | gloo::transport::tcp::attr attr; 126 | attr.iface = argv[1]; 127 | auto dev = gloo::transport::tcp::CreateDevice(attr); 128 | 129 | // Rendezvous 130 | auto redisStore = gloo::rendezvous::RedisStore(argv[2]); 131 | time_t t = time(0); 132 | char ch[64]; 133 | strftime(ch, sizeof(ch), "%Y-%m-%d-%H-%M-%S", localtime(&t)); 134 | string prefix = ch; 135 | auto prefixStore = gloo::rendezvous::PrefixStore(prefix, redisStore); 136 | 137 | const int size = worldsize; 138 | const int rank = myrank; 139 | const int tensor_size = atoi(argv[6]); 140 | const int num_rounds = atoi(argv[7]); 141 | const float density = atof(argv[8]); 142 | const int 
blocksize = 256; 143 | int num_last_rounds = 0; 144 | int* timecost = (int*)malloc(sizeof(int)*num_rounds); 145 | 146 | // Init data 147 | set_seed_random(rank); 148 | srand(time(NULL)); 149 | int cnt = (int)(density*tensor_size); 150 | base_data.resize(tensor_size); 151 | data.resize(tensor_size); 152 | results.resize(tensor_size); 153 | cout << "-- Tensor initialization" << endl; 154 | create_sparse(tensor_size, density, &base_data[0], blocksize); 155 | copy(base_data.begin(), base_data.end(), data.begin()); 156 | copy(base_data.begin(), base_data.end(), results.begin()); 157 | cout << "---- Ended" << endl; 158 | 159 | vector ptrs; 160 | ptrs.push_back(&data[0]); 161 | 162 | ValType* result_ptr = &results[0]; 163 | 164 | 165 | int count = data.size(); 166 | 167 | // Context 168 | context = make_shared(rank, size); 169 | context->connectFullMesh(prefixStore, dev); 170 | 171 | auto barrier = make_shared(context); 172 | 173 | barrier->run(); 174 | 175 | //Warm up rounds 176 | for (int i = 0; i < 10; i++) { 177 | auto allreduce = make_shared>(context, ptrs, count); 178 | allreduce->run(); 179 | } 180 | copy(base_data.begin(), base_data.end(), data.begin()); 181 | copy(base_data.begin(), base_data.end(), results.begin()); 182 | 183 | // Start rounds 184 | for (roundnum = 0; roundnum < num_rounds; roundnum++) { 185 | MPI_Barrier(MPI_COMM_WORLD); 186 | double t_mpi, maxT; 187 | if (roundnum % 5 == 0) { 188 | copy(base_data.begin(), base_data.end(), data.begin()); 189 | copy(base_data.begin(), base_data.end(), results.begin()); 190 | num_last_rounds = 0; 191 | } 192 | 193 | MPI_Allreduce(MPI_IN_PLACE, result_ptr, count, MPI_INT, MPI_SUM, MPI_COMM_WORLD); 194 | 195 | // Instantiate the collective algorithm 196 | 197 | auto allreduce = make_shared>(context, ptrs, count); 198 | 199 | //cout << "-- Allreduce Round " << roundnum << endl; 200 | 201 | auto begin = chrono::high_resolution_clock::now(); 202 | // Run the algorithm 203 | t_mpi = -MPI_Wtime(); 204 | 
allreduce->run(); 205 | t_mpi += MPI_Wtime(); 206 | MPI_Reduce(&t_mpi, &maxT, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD); 207 | if (myrank==OUTPUT_RANK) { 208 | printf("switchml Dense Allreduce: %f secs\n", maxT); 209 | } 210 | 211 | auto end = chrono::high_resolution_clock::now(); 212 | 213 | //cout << "---- Ended" << endl << "#ms " << chrono::duration_cast(end - begin).count() << endl; 214 | num_last_rounds++; 215 | timecost[roundnum] = (int)(maxT*1000000); 216 | MPI_Barrier(MPI_COMM_WORLD); 217 | } 218 | 219 | cout << "-- Final check" << endl; 220 | for (int i = 0; i < tensor_size; i++) { 221 | //if (i<100) cout< received " << data[i] << " instead of " << expected << endl; 225 | break; 226 | } 227 | } 228 | cout << "---- Ended" << endl; 229 | #ifdef SAVE_RESULT 230 | if(rank==OUTPUT_RANK){ 231 | FILE *fp = NULL; 232 | char* filename = (char*)malloc(200*sizeof(char)); 233 | strcpy(filename, "result/switchML_Dense"); 234 | strcat(filename, "-"); 235 | char temp[20]; 236 | sprintf(temp, "%d", tensor_size); 237 | strcat(filename, temp); 238 | strcat(filename, "-"); 239 | char temp2[20]; 240 | sprintf(temp2, "%f", density); 241 | strcat(filename, temp2); 242 | strcat(filename, ".txt"); 243 | fp = fopen(filename, "w+"); 244 | fprintf(fp, "%s", "switchML_Dense\n"); 245 | for(int j=0; j 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | 23 | #include 24 | #include 25 | #include 26 | #include 27 | 28 | #include "dpdk.h" 29 | #include "msgs.h" 30 | 31 | namespace daiet { 32 | 33 | extern volatile bool force_quit; 34 | } 35 | -------------------------------------------------------------------------------- /omnireduce-DPDK/daiet/ps/src/dpdk.h: -------------------------------------------------------------------------------- 1 | /** 2 | * DAIET project 3 | * author: amedeo.sapio@kaust.edu.sa 4 | */ 5 | 6 | #pragma once 7 | 8 | #include 9 | 
#include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include 23 | #include 24 | #include 25 | #include 26 | #include 27 | #include 28 | #include 29 | #include 30 | #include 31 | #include 32 | #include 33 | #include 34 | #include 35 | #include 36 | #include 37 | -------------------------------------------------------------------------------- /omnireduce-DPDK/daiet/ps/src/msgs.h: -------------------------------------------------------------------------------- 1 | /** 2 | * DAIET project 3 | * author: amedeo.sapio@kaust.edu.sa 4 | */ 5 | 6 | #pragma once 7 | 8 | namespace daiet { 9 | 10 | #ifdef __cplusplus 11 | extern "C" { 12 | #endif 13 | 14 | /** 15 | * DAIET Header 16 | */ 17 | struct daiet_hdr { 18 | uint32_t tsi; /**< tensor start index */ 19 | uint16_t pool_index; /**< pool index */ 20 | uint32_t next_tsi; /**< next tensor start index */ 21 | #ifdef NOSCALING 22 | uint8_t data_type; 23 | #endif 24 | }__attribute__((__packed__)); 25 | 26 | struct entry_hdr { 27 | int32_t upd; /**< vector entry */ 28 | }__attribute__((__packed__)); 29 | #ifndef NOSCALING 30 | struct exp_hdr { 31 | int16_t exp; /**< exponent */ 32 | }__attribute__((__packed__)); 33 | #endif 34 | #ifdef __cplusplus 35 | } 36 | #endif 37 | 38 | } // End namespace -------------------------------------------------------------------------------- /omnireduce-DPDK/daiet/ps/src/params.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * DAIET project 3 | * author: amedeo.sapio@kaust.edu.sa 4 | */ 5 | 6 | #include "params.hpp" 7 | #include 8 | 9 | namespace po = boost::program_options; 10 | 11 | namespace daiet { 12 | 13 | struct dpdk_data dpdk_data; 14 | struct dpdk_params dpdk_par; 15 | daiet_params daiet_par; 16 | 17 | void parse_parameters() { 18 | 19 | string config_file; 20 | ifstream ifs; 21 | uint16_t ps_port; 22 | uint32_t 
num_updates; 23 | 24 | po::options_description dpdk_options("DPDK options"); 25 | po::options_description daiet_options("DAIET options"); 26 | po::options_description config_file_options; 27 | 28 | dpdk_options.add_options() 29 | ("dpdk.cores", po::value(&dpdk_par.corestr)->default_value("0-2"), "List of cores") 30 | ("dpdk.prefix", po::value(&dpdk_par.prefix)->default_value("daiet"), "Process prefix") 31 | ("dpdk.extra_eal_options", po::value(&dpdk_par.eal_options)->default_value(""), "Extra EAL options") 32 | ("dpdk.port_id", po::value(&dpdk_par.portid)->default_value(0), "Port ID") 33 | ("dpdk.pool_size", po::value(&dpdk_par.pool_size)->default_value(8192 * 32), "Pool size") 34 | ("dpdk.pool_cache_size", po::value(&dpdk_par.pool_cache_size)->default_value(256 * 2), "Pool cache size") 35 | ("dpdk.burst_rx", po::value(&dpdk_par.burst_rx)->default_value(64), "RX burst size") 36 | ("dpdk.burst_tx", po::value(&dpdk_par.burst_tx)->default_value(64), "TX burst size") 37 | ("dpdk.bulk_drain_tx_us", po::value(&dpdk_par.bulk_drain_tx_us)->default_value(100), "TX bulk drain timer (us)"); 38 | 39 | daiet_options.add_options() 40 | ("daiet.ps_port", po::value(&ps_port)->default_value(48879), "PS UDP port") 41 | ("daiet.max_num_pending_messages", po::value(&(daiet_par.getMaxNumPendingMessages()))->default_value(256), "Max number of pending, unaggregated messages") 42 | ("daiet.num_updates", po::value(&num_updates)->default_value(32), "Number of updates per packet") 43 | ("daiet.num_workers", po::value(&(daiet_par.getNumWorkers()))->default_value(0), "Number of workers"); 44 | 45 | config_file_options.add(daiet_options).add(dpdk_options); 46 | 47 | config_file = "/etc/ps.cfg"; 48 | ifs.open(config_file.c_str()); 49 | if(!ifs.good()){ 50 | ifs.close(); 51 | 52 | char hostname[500]; 53 | if (gethostname(hostname,sizeof(hostname))!=0) 54 | LOG_FATAL("gethostname failed: "+ string(strerror(errno))); 55 | 56 | config_file = "ps-"+string(hostname)+".cfg"; 57 | 
ifs.open(config_file.c_str()); 58 | if(!ifs.good()){ 59 | ifs.close(); 60 | 61 | config_file = "ps.cfg"; 62 | ifs.open(config_file.c_str()); 63 | if(!ifs.good()){ 64 | ifs.close(); 65 | LOG_FATAL("No config file found! (/etc/ps.cfg, ps-"+string(hostname)+".cfg, ps.cfg)"); 66 | } 67 | } 68 | } 69 | LOG_INFO("Configuration file "+config_file); 70 | 71 | po::variables_map vm; 72 | po::store(po::parse_config_file(ifs, config_file_options), vm); 73 | po::notify(vm); 74 | 75 | daiet_par.setBasePsPort(ps_port); 76 | daiet_par.setNumUpdates(num_updates); 77 | 78 | if (daiet_par.getNumWorkers()<=0) 79 | LOG_FATAL("Number of workers must be greater than 0."); 80 | } 81 | 82 | void print_dpdk_params() { 83 | 84 | LOG_INFO("** DPDK parameters **"); 85 | LOG_INFO("Cores: " + dpdk_par.corestr); 86 | LOG_INFO("Port ID: " + to_string(dpdk_par.portid)); 87 | LOG_INFO("Port RX ring size: " + to_string(dpdk_par.port_rx_ring_size)); 88 | LOG_INFO("Port TX ring size: " + to_string(dpdk_par.port_tx_ring_size)); 89 | LOG_INFO("Pool size: " + to_string(dpdk_par.pool_size)); 90 | LOG_INFO("Pool cache size: " + to_string(dpdk_par.pool_cache_size)); 91 | LOG_INFO("Burst size RX: " + to_string(dpdk_par.burst_rx)); 92 | LOG_INFO("Burst size TX: " + to_string(dpdk_par.burst_tx)); 93 | LOG_INFO("Burst drain TX us: " + to_string(dpdk_par.bulk_drain_tx_us)); 94 | LOG_INFO("Prefix: " + dpdk_par.prefix); 95 | LOG_INFO("Extra EAL options: " + dpdk_par.eal_options); 96 | } 97 | 98 | daiet_params::daiet_params() { 99 | 100 | // Defaults 101 | num_updates = 32; 102 | 103 | max_num_pending_messages = 256; 104 | 105 | tx_flags = PKT_TX_IP_CKSUM | PKT_TX_IPV4 | PKT_TX_UDP_CKSUM; 106 | 107 | ps_port = 5000; 108 | 109 | num_workers = 0; 110 | } 111 | 112 | daiet_params::~daiet_params() { 113 | } 114 | 115 | void daiet_params::print_params() { 116 | 117 | LOG_INFO("** DAIET parameters **"); 118 | LOG_INFO("Num updates: " + to_string(num_updates)); 119 | LOG_INFO("Max num pending messages: " + 
to_string(max_num_pending_messages)); 120 | LOG_INFO("PS port: " + to_string(ps_port)); 121 | LOG_INFO("Num workers: " + to_string(num_workers)); 122 | } 123 | 124 | uint16_t& daiet_params::getNumWorkers() { 125 | return num_workers; 126 | } 127 | 128 | void daiet_params::setNumUpdates(uint32_t numUpdates) { 129 | num_updates = numUpdates; 130 | } 131 | 132 | void daiet_params::setBasePsPort(uint16_t psPort) { 133 | ps_port = psPort; 134 | 135 | } 136 | } 137 | -------------------------------------------------------------------------------- /omnireduce-DPDK/daiet/ps/src/params.hpp: -------------------------------------------------------------------------------- 1 | /** 2 | * DAIET project 3 | * author: amedeo.sapio@kaust.edu.sa 4 | */ 5 | 6 | #pragma once 7 | 8 | #include "common.hpp" 9 | #include "utils.hpp" 10 | 11 | using namespace std; 12 | 13 | namespace daiet { 14 | 15 | void print_dpdk_params(); 16 | void parse_parameters(); 17 | 18 | struct dpdk_data { 19 | 20 | // Buffer pool size 21 | uint32_t pool_buffer_size; 22 | uint16_t core_to_thread_id[RTE_MAX_LCORE]; 23 | 24 | dpdk_data() { 25 | // Defaults 26 | 27 | pool_buffer_size = RTE_MBUF_DEFAULT_BUF_SIZE; 28 | } 29 | }__rte_cache_aligned; 30 | 31 | extern struct dpdk_data dpdk_data; 32 | 33 | struct dpdk_params { 34 | 35 | // Ports 36 | uint16_t portid; 37 | uint16_t port_rx_ring_size; 38 | uint16_t port_tx_ring_size; 39 | 40 | // Buffer pool 41 | uint32_t pool_size; 42 | uint32_t pool_cache_size; 43 | 44 | // Burst sizes 45 | uint32_t burst_rx; 46 | uint32_t burst_tx; 47 | uint32_t bulk_drain_tx_us; 48 | 49 | // Extra EAL options 50 | string eal_options; 51 | 52 | // Process prefix 53 | string prefix; 54 | 55 | // Cores string 56 | string corestr; 57 | 58 | dpdk_params() { 59 | // Defaults 60 | 61 | portid = 0; 62 | port_rx_ring_size = 1024; 63 | port_tx_ring_size = 1024; 64 | 65 | pool_size = 8192 * 32; 66 | pool_cache_size = 256 * 2; 67 | 68 | burst_rx = 64; 69 | burst_tx = 64; 70 | bulk_drain_tx_us = 
10; 71 | 72 | prefix = "daiet"; 73 | eal_options = ""; 74 | 75 | corestr = ""; 76 | } 77 | }__rte_cache_aligned; 78 | 79 | extern struct dpdk_params dpdk_par; 80 | 81 | class daiet_params { 82 | private: 83 | 84 | uint32_t num_updates; 85 | 86 | uint32_t max_num_pending_messages; 87 | 88 | uint64_t tx_flags; 89 | 90 | uint16_t ps_port; 91 | 92 | uint16_t num_workers; 93 | 94 | public: 95 | daiet_params(); 96 | ~daiet_params(); 97 | 98 | void print_params(); 99 | 100 | uint16_t& getNumWorkers(); 101 | 102 | __rte_always_inline uint32_t getNumUpdates() const { 103 | return num_updates; 104 | } 105 | 106 | __rte_always_inline uint32_t& getMaxNumPendingMessages() { 107 | return max_num_pending_messages; 108 | } 109 | 110 | void setNumUpdates(uint32_t); 111 | 112 | __rte_always_inline int64_t getTxFlags() const { 113 | return tx_flags; 114 | } 115 | 116 | __rte_always_inline uint16_t getBasePsPort() const { 117 | return ps_port; 118 | } 119 | 120 | void setBasePsPort(uint16_t); 121 | }; 122 | 123 | extern daiet_params daiet_par; 124 | } 125 | -------------------------------------------------------------------------------- /omnireduce-DPDK/daiet/ps/src/ps.hpp: -------------------------------------------------------------------------------- 1 | /** 2 | * DAIET project 3 | * author: amedeo.sapio@kaust.edu.sa 4 | */ 5 | 6 | #pragma once 7 | 8 | namespace daiet { 9 | 10 | void ps_setup(); 11 | void ps_cleanup(); 12 | int ps(void*); 13 | } 14 | -------------------------------------------------------------------------------- /omnireduce-DPDK/daiet/ps/src/stats.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * DAIET project 3 | * author: amedeo.sapio@kaust.edu.sa 4 | */ 5 | 6 | #include "stats.hpp" 7 | #include "utils.hpp" 8 | 9 | namespace daiet { 10 | 11 | pkt_statistics pkt_stats; 12 | 13 | pkt_statistics::pkt_statistics() : total_ps_tx(0), total_ps_rx(0) { 14 | } 15 | 16 | void pkt_statistics::init(uint32_t nb_ps) { 17 | 18 
| total_ps_tx = 0; 19 | total_ps_rx = 0; 20 | 21 | ps_tx.resize(nb_ps); 22 | ps_rx.resize(nb_ps); 23 | } 24 | 25 | void pkt_statistics::set_ps(uint32_t psid, uint64_t tx, uint64_t rx) { 26 | 27 | boost::unique_lock lock(ps_mutex); 28 | 29 | ps_tx[psid] = tx; 30 | ps_rx[psid] = rx; 31 | 32 | total_ps_tx += tx; 33 | total_ps_rx += rx; 34 | } 35 | 36 | void pkt_statistics::dump(){ 37 | 38 | LOG_INFO("PS TX " + to_string(total_ps_tx)); 39 | LOG_INFO("PS RX " + to_string(total_ps_rx)); 40 | 41 | for (uint32_t i = 0; i < ps_tx.size(); i++) { 42 | 43 | LOG_INFO("## PS" + to_string(i)); 44 | LOG_INFO("TX " + to_string(ps_tx[i])); 45 | LOG_INFO("RX " + to_string(ps_rx[i])); 46 | } 47 | } 48 | } 49 | -------------------------------------------------------------------------------- /omnireduce-DPDK/daiet/ps/src/stats.hpp: -------------------------------------------------------------------------------- 1 | /** 2 | * DAIET project 3 | * author: amedeo.sapio@kaust.edu.sa 4 | */ 5 | 6 | #pragma once 7 | 8 | #include 9 | #include 10 | 11 | #include "common.hpp" 12 | 13 | using namespace std; 14 | 15 | namespace daiet { 16 | 17 | class pkt_statistics { 18 | 19 | public: 20 | pkt_statistics(); 21 | void dump(); 22 | 23 | void init(uint32_t); 24 | void set_ps(uint32_t, uint64_t, uint64_t); 25 | 26 | private: 27 | 28 | boost::mutex ps_mutex; 29 | uint64_t total_ps_tx; 30 | uint64_t total_ps_rx; 31 | vector ps_tx; 32 | vector ps_rx; 33 | }; 34 | 35 | extern pkt_statistics pkt_stats; 36 | } 37 | -------------------------------------------------------------------------------- /omnireduce-DPDK/daiet/ps/src/utils.hpp: -------------------------------------------------------------------------------- 1 | /** 2 | * DAIET project 3 | * author: amedeo.sapio@kaust.edu.sa 4 | */ 5 | 6 | #pragma once 7 | 8 | #include 9 | 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | 17 | #include "common.hpp" 18 | 19 | using namespace std; 20 | 21 | namespace daiet { 22 | 
23 | extern std::ofstream daiet_log; 24 | 25 | template 26 | void LOG_FATAL(T) __attribute__((used)); 27 | template 28 | void LOG_ERROR(T) __attribute__((used)); 29 | template 30 | void LOG_INFO(T) __attribute__((used)); 31 | 32 | #ifdef DEBUG 33 | template 34 | void LOG_DEBUG(T) __attribute__((used)); 35 | #else 36 | #define LOG_DEBUG(T) 37 | #endif 38 | 39 | template 40 | string to_hex(T); 41 | 42 | vector split(const string &); 43 | vector split(const string &, const string &); 44 | 45 | string mac_to_str(const rte_ether_addr); 46 | string mac_to_str(const uint64_t, bool = true); 47 | int64_t str_to_mac(string const&, bool = true); 48 | string ip_to_str(uint32_t); 49 | 50 | void swap_eth_addr(rte_ether_hdr *); 51 | void deep_copy_single_segment_pkt(rte_mbuf*, const rte_mbuf*); 52 | void check_port_link_status(uint16_t); 53 | void print_packet(struct rte_ether_hdr *, uint16_t); 54 | void print_dev_info(struct rte_eth_dev_info&); 55 | void print_dev_stats(uint16_t); 56 | void print_dev_xstats(uint16_t); 57 | 58 | } // End namespace 59 | -------------------------------------------------------------------------------- /omnireduce-DPDK/daiet/scripts/dpdk-config.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -x 4 | 5 | IFACE="eno1" 6 | 7 | cwd=$(pwd) 8 | 9 | RTE_SDK=$cwd/../lib/dpdk 10 | RTE_TARGET=build 11 | 12 | cd $RTE_SDK/$RTE_TARGET 13 | 14 | modprobe uio 15 | insmod kmod/igb_uio.ko 16 | 17 | cd ../usertools 18 | 19 | ./dpdk-devbind.py --bind=igb_uio ${IFACE} 20 | 21 | cd $cwd 22 | -------------------------------------------------------------------------------- /omnireduce-DPDK/daiet/src/DaietContext.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * DAIET project 3 | * author: amedeo.sapio@kaust.edu.sa 4 | */ 5 | 6 | #include "DaietContext.hpp" 7 | #include "daiet.hpp" 8 | #include "utils.hpp" 9 | #include "gloo/common/error.h" 10 | 11 | namespace 
daiet { 12 | 13 | void* DaietMaster(void *ctx) { 14 | 15 | DaietContext* d_ctx_ptr = (DaietContext *) ctx; 16 | 17 | d_ctx_ptr->ret = master(d_ctx_ptr); 18 | 19 | return NULL; 20 | } 21 | 22 | DaietContext::DaietContext() : 23 | num_worker_threads (1), master_ready(0), data_ready(0), results(0), tensor_update_ptr(NULL), result_id(0), one_msec(1) { 24 | 25 | tid_counter.store(0); 26 | StartMaster(); 27 | } 28 | 29 | DaietContext::~DaietContext() { 30 | 31 | StopMaster(); 32 | } 33 | 34 | void DaietContext::set_num_worker_threads(uint32_t nt){ 35 | num_worker_threads = nt; 36 | } 37 | 38 | void DaietContext::wait_master_ready() { 39 | boost::unique_lock lock(master_ready_mutex); 40 | 41 | while (master_ready!=num_worker_threads) 42 | master_ready_event.wait(lock); 43 | } 44 | 45 | void DaietContext::set_master_ready() { 46 | 47 | boost::unique_lock lock(master_ready_mutex); 48 | 49 | if ((++master_ready) == num_worker_threads) 50 | master_ready_event.notify_one(); 51 | } 52 | 53 | void DaietContext::send_tensor(TensorUpdate* tuptr) { 54 | boost::unique_lock lock(data_ready_mutex); 55 | 56 | while (data_ready!=0) 57 | data_pop_event.wait(lock); 58 | 59 | tensor_update_ptr = tuptr; 60 | data_ready = num_worker_threads; 61 | data_push_event.notify_all(); 62 | } 63 | 64 | bool DaietContext::receive_tensor(TensorUpdate& tu, uint16_t worker_id) { 65 | boost::unique_lock lock(data_ready_mutex); 66 | 67 | while (data_ready!=(uint32_t)(worker_id+1)) { 68 | if (data_push_event.wait_for(lock, one_msec) == boost::cv_status::timeout) 69 | return false; 70 | } 71 | 72 | tu = *tensor_update_ptr; // Copy 73 | 74 | if (data_ready != 1){ 75 | #ifdef OFFLOAD_BITMAP 76 | tu.block_count /= num_worker_threads; 77 | if (tu.block_count%num_worker_threads>worker_id) 78 | tu.block_count += 1; 79 | tu.count = tu.block_count * block_size; 80 | #else 81 | tu.count /= num_worker_threads; 82 | #endif 83 | } else { 84 | tu.count -= tu.start_idx; 85 | } 86 | 87 | tensor_update_ptr->start_idx += 
tu.count; 88 | 89 | if ((--data_ready) == 0) 90 | data_pop_event.notify_one(); 91 | 92 | return true; 93 | } 94 | 95 | bool DaietContext::send_result(const int32_t rid) { 96 | boost::unique_lock lock(result_mutex); 97 | 98 | while (results == num_worker_threads) { 99 | if (result_pop_event.wait_for(lock, one_msec) == boost::cv_status::timeout) 100 | return false; 101 | } 102 | 103 | if ((++results)==num_worker_threads) { 104 | result_id = rid; 105 | result_push_event.notify_all(); 106 | } 107 | 108 | return true; 109 | } 110 | 111 | void DaietContext::receive_result(const int32_t rid) { 112 | boost::unique_lock lock(result_mutex); 113 | 114 | while (results != num_worker_threads && result_id != rid) 115 | result_push_event.wait(lock); 116 | 117 | results = 0; 118 | result_id = 0; 119 | 120 | result_pop_event.notify_all(); 121 | } 122 | 123 | void DaietContext::StartMaster() { 124 | 125 | /* Launch dpdk master thread */ 126 | if (pthread_create(&masterThread, NULL, DaietMaster, this)) 127 | GLOO_THROW("Error starting master dpdk thread"); 128 | 129 | //Wait for EAL setup 130 | wait_master_ready(); 131 | } 132 | 133 | void DaietContext::StopMaster() { 134 | 135 | force_quit = true; 136 | 137 | int join_ret = pthread_join(masterThread, NULL); 138 | if (join_ret) 139 | GLOO_THROW("Error joining master dpdk thread: returned ", join_ret); 140 | 141 | if (this->ret < 0) 142 | GLOO_THROW("Master dpdk thread returned ", this->ret); 143 | 144 | } 145 | 146 | #ifdef OFFLOAD_BITMAP 147 | void DaietContext::AllReduce(gloo::float16* ptr, int count, uint8_t* bitmap_ptr, int block_count) { 148 | int32_t tensor_id = tid_counter.fetch_add(1)+1; 149 | TensorUpdate tu; 150 | tu.ptr = ptr; 151 | tu.count = count; 152 | tu.start_idx = 0; 153 | tu.id = tensor_id; 154 | tu.type = FLOAT16; 155 | tu.bitmap_ptr = bitmap_ptr; 156 | tu.block_count = block_count; 157 | send_tensor(&tu); 158 | receive_result(tensor_id); 159 | } 160 | #endif 161 | 162 | void DaietContext::AllReduce(gloo::float16* 
ptr, int count) { 163 | 164 | int32_t tensor_id = tid_counter.fetch_add(1)+1; 165 | TensorUpdate tu; 166 | tu.ptr = ptr; 167 | tu.count = count; 168 | tu.start_idx = 0; 169 | tu.id = tensor_id; 170 | tu.type = FLOAT16; 171 | 172 | send_tensor(&tu); 173 | receive_result(tensor_id); 174 | } 175 | 176 | #ifdef OFFLOAD_BITMAP 177 | void DaietContext::AllReduce(float* ptr, int count, uint8_t* bitmap_ptr, int block_count) { 178 | int32_t tensor_id = tid_counter.fetch_add(1)+1; 179 | TensorUpdate tu; 180 | tu.ptr = ptr; 181 | tu.count = count; 182 | tu.start_idx = 0; 183 | tu.id = tensor_id; 184 | tu.type = FLOAT32; 185 | tu.bitmap_ptr = bitmap_ptr; 186 | tu.block_count = block_count; 187 | send_tensor(&tu); 188 | receive_result(tensor_id); 189 | } 190 | #endif 191 | 192 | void DaietContext::AllReduce(float* ptr, int count) { 193 | 194 | int32_t tensor_id = tid_counter.fetch_add(1)+1; 195 | TensorUpdate tu; 196 | tu.ptr = ptr; 197 | tu.count = count; 198 | tu.start_idx = 0; 199 | tu.id = tensor_id; 200 | tu.type = FLOAT32; 201 | 202 | send_tensor(&tu); 203 | receive_result(tensor_id); 204 | } 205 | 206 | #ifdef OFFLOAD_BITMAP 207 | void DaietContext::AllReduce(int32_t* ptr, int count, uint8_t* bitmap_ptr, int block_count) { 208 | int32_t tensor_id = tid_counter.fetch_add(1)+1; 209 | TensorUpdate tu; 210 | tu.ptr = ptr; 211 | tu.count = count; 212 | tu.start_idx = 0; 213 | tu.id = tensor_id; 214 | tu.type = INT32; 215 | tu.bitmap_ptr = bitmap_ptr; 216 | tu.block_count = block_count; 217 | send_tensor(&tu); 218 | receive_result(tensor_id); 219 | } 220 | #endif 221 | 222 | void DaietContext::AllReduce(int32_t* ptr, int count) { 223 | 224 | int32_t tensor_id = tid_counter.fetch_add(1)+1; 225 | TensorUpdate tu; 226 | tu.ptr = ptr; 227 | tu.count = count; 228 | tu.start_idx = 0; 229 | tu.id = tensor_id; 230 | tu.type = INT32; 231 | 232 | send_tensor(&tu); 233 | receive_result(tensor_id); 234 | } 235 | 236 | bool DaietContext::try_daiet(gloo::float16* ptr, int count, int fn_) { 
237 | if (fn_ == 1) { //sum 238 | 239 | AllReduce(ptr, count); 240 | 241 | return true; 242 | } 243 | 244 | return false; 245 | } 246 | 247 | bool DaietContext::try_daiet(float* ptr, int count, int fn_) { 248 | if (fn_ == 1) { //sum 249 | 250 | AllReduce(ptr, count); 251 | 252 | return true; 253 | } 254 | 255 | return false; 256 | } 257 | 258 | bool DaietContext::try_daiet(int32_t* ptr, int count, int fn_) { 259 | if (fn_ == 1) { //sum 260 | 261 | AllReduce(ptr, count); 262 | 263 | return true; 264 | } 265 | 266 | return false; 267 | } 268 | 269 | bool DaietContext::try_daiet(__attribute__((unused)) void* ptr, __attribute__((unused)) int count, __attribute__((unused)) int fn_) { 270 | 271 | return false; 272 | } 273 | } 274 | -------------------------------------------------------------------------------- /omnireduce-DPDK/daiet/src/DaietContext.hpp: -------------------------------------------------------------------------------- 1 | /** 2 | * DAIET project 3 | * author: amedeo.sapio@kaust.edu.sa 4 | */ 5 | 6 | #pragma once 7 | 8 | #define DAIET 9 | 10 | #include 11 | #include 12 | #include "gloo/types.h" 13 | 14 | namespace daiet { 15 | 16 | void *DaietMaster(void *ctx); 17 | 18 | enum TensorUpdateType { 19 | NONE = 0, INT32 = 1, FLOAT32 = 2, FLOAT16 = 3 20 | }; 21 | 22 | struct TensorUpdate { 23 | void* ptr; 24 | int count; 25 | int start_idx; 26 | int32_t id; 27 | TensorUpdateType type; 28 | #ifdef OFFLOAD_BITMAP 29 | uint8_t* bitmap_ptr; 30 | int block_count; 31 | #endif 32 | }; 33 | 34 | /* Singleton class*/ 35 | class DaietContext { 36 | public: 37 | 38 | static DaietContext& getInstance() { 39 | // Guaranteed to be destroyed and instantiated on first use. 
40 | static DaietContext instance; 41 | return instance; 42 | } 43 | 44 | DaietContext(DaietContext const&) = delete; 45 | void operator=(DaietContext const&) = delete; 46 | 47 | void wait_master_ready(); 48 | void set_master_ready(); 49 | void set_num_worker_threads(uint32_t); 50 | 51 | void receive_result(const int32_t); 52 | bool send_result(const int32_t); 53 | bool receive_tensor(TensorUpdate&, uint16_t); 54 | void send_tensor(TensorUpdate*); 55 | 56 | void StartMaster(); 57 | void StopMaster(); 58 | 59 | #ifdef OFFLOAD_BITMAP 60 | void AllReduce(gloo::float16*, int, uint8_t*, int); 61 | void AllReduce(float*, int, uint8_t*, int); 62 | void AllReduce(int32_t*, int, uint8_t*, int); 63 | static const uint32_t block_size = 256; 64 | #endif 65 | void AllReduce(gloo::float16*, int); 66 | void AllReduce(float*, int); 67 | void AllReduce(int32_t*, int); 68 | 69 | bool try_daiet(gloo::float16*, int, int); 70 | bool try_daiet(float*, int, int); 71 | bool try_daiet(int32_t*, int, int); 72 | bool try_daiet(void*, int, int); 73 | 74 | friend void *DaietMaster(void*); 75 | 76 | private: 77 | 78 | DaietContext(); 79 | virtual ~DaietContext(); 80 | 81 | pthread_t masterThread; 82 | int ret; 83 | 84 | std::atomic_uint_fast32_t tid_counter; 85 | boost::mutex master_ready_mutex, data_ready_mutex, result_mutex; 86 | boost::condition_variable master_ready_event, data_push_event, data_pop_event, result_push_event, result_pop_event; 87 | uint32_t num_worker_threads; 88 | 89 | // Shared 90 | uint32_t master_ready; 91 | uint32_t data_ready; 92 | uint32_t results; 93 | TensorUpdate* tensor_update_ptr; 94 | int32_t result_id; 95 | // *** 96 | 97 | boost::chrono::milliseconds one_msec; 98 | }; 99 | } 100 | 101 | -------------------------------------------------------------------------------- /omnireduce-DPDK/daiet/src/common.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * DAIET project 3 | * author: amedeo.sapio@kaust.edu.sa 4 | */ 5 | 
6 | #include "common.hpp" 7 | 8 | namespace daiet { 9 | 10 | volatile bool force_quit; 11 | } 12 | -------------------------------------------------------------------------------- /omnireduce-DPDK/daiet/src/common.hpp: -------------------------------------------------------------------------------- 1 | /** 2 | * DAIET project 3 | * author: amedeo.sapio@kaust.edu.sa 4 | */ 5 | 6 | #pragma once 7 | 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | 23 | #include 24 | #include 25 | #include 26 | #include 27 | 28 | #include "dpdk.h" 29 | #include "msgs.h" 30 | 31 | namespace daiet { 32 | 33 | extern volatile bool force_quit; 34 | } 35 | -------------------------------------------------------------------------------- /omnireduce-DPDK/daiet/src/daiet.hpp: -------------------------------------------------------------------------------- 1 | /** 2 | * DAIET project 3 | * author: amedeo.sapio@kaust.edu.sa 4 | */ 5 | 6 | #pragma once 7 | 8 | #include "DaietContext.hpp" 9 | 10 | namespace daiet { 11 | 12 | int master(DaietContext* dctx); 13 | } 14 | -------------------------------------------------------------------------------- /omnireduce-DPDK/daiet/src/dpdk.h: -------------------------------------------------------------------------------- 1 | /** 2 | * DAIET project 3 | * author: amedeo.sapio@kaust.edu.sa 4 | */ 5 | 6 | #pragma once 7 | 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include 23 | #include 24 | #include 25 | #include 26 | #include 27 | #include 28 | #include 29 | #include 30 | #include 31 | #include 32 | #include 33 | #include 34 | #include 35 | #include 36 | #include 37 | -------------------------------------------------------------------------------- 
/omnireduce-DPDK/daiet/src/msgs.h: -------------------------------------------------------------------------------- 1 | /** 2 | * DAIET project 3 | * author: amedeo.sapio@kaust.edu.sa 4 | */ 5 | 6 | #pragma once 7 | 8 | namespace daiet { 9 | 10 | #ifdef __cplusplus 11 | extern "C" { 12 | #endif 13 | 14 | /** 15 | * DAIET Header 16 | */ 17 | struct daiet_hdr { 18 | uint32_t tsi; /**< tensor start index */ 19 | uint16_t pool_index; /**< pool index */ 20 | uint32_t next_tsi; /**< next tensor start index */ 21 | #ifdef NOSCALING 22 | uint8_t data_type; 23 | #endif 24 | }__attribute__((__packed__)); 25 | 26 | struct entry_hdr { 27 | int32_t upd; /**< vector entry */ 28 | }__attribute__((__packed__)); 29 | #ifndef NOSCALING 30 | struct exp_hdr { 31 | int16_t exp; /**< exponent */ 32 | }__attribute__((__packed__)); 33 | #endif 34 | #ifdef __cplusplus 35 | } 36 | #endif 37 | 38 | } // End namespace -------------------------------------------------------------------------------- /omnireduce-DPDK/daiet/src/params.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * DAIET project 3 | * author: amedeo.sapio@kaust.edu.sa 4 | */ 5 | 6 | #include "params.hpp" 7 | #include 8 | 9 | namespace po = boost::program_options; 10 | 11 | namespace daiet { 12 | 13 | struct dpdk_data dpdk_data; 14 | struct dpdk_params dpdk_par; 15 | daiet_params daiet_par; 16 | 17 | void parse_parameters() { 18 | 19 | string config_file; 20 | ifstream ifs; 21 | uint16_t worker_port, ps_port; 22 | uint32_t num_updates; 23 | string worker_ip_str, ps_ips_str, ps_macs_str; 24 | 25 | po::options_description dpdk_options("DPDK options"); 26 | po::options_description daiet_options("DAIET options"); 27 | po::options_description config_file_options; 28 | 29 | dpdk_options.add_options() 30 | ("dpdk.cores", po::value(&dpdk_par.corestr)->default_value("0-2"), "List of cores") 31 | ("dpdk.prefix", po::value(&dpdk_par.prefix)->default_value("daiet"), "Process prefix") 32 | 
("dpdk.extra_eal_options", po::value(&dpdk_par.eal_options)->default_value(""), "Extra EAL options") 33 | ("dpdk.port_id", po::value(&dpdk_par.portid)->default_value(0), "Port ID") 34 | ("dpdk.pool_size", po::value(&dpdk_par.pool_size)->default_value(8192 * 32), "Pool size") 35 | ("dpdk.pool_cache_size", po::value(&dpdk_par.pool_cache_size)->default_value(256 * 2), "Pool cache size") 36 | ("dpdk.burst_rx", po::value(&dpdk_par.burst_rx)->default_value(64), "RX burst size") 37 | ("dpdk.burst_tx", po::value(&dpdk_par.burst_tx)->default_value(64), "TX burst size") 38 | ("dpdk.bulk_drain_tx_us", po::value(&dpdk_par.bulk_drain_tx_us)->default_value(100), "TX bulk drain timer (us)"); 39 | 40 | daiet_options.add_options() 41 | ("daiet.worker_ip", po::value(&worker_ip_str)->default_value("10.0.0.1"), "IP address of this worker") 42 | ("daiet.worker_port", po::value(&worker_port)->default_value(4000), "Worker UDP port") 43 | ("daiet.ps_port", po::value(&ps_port)->default_value(48879), "PS UDP port") 44 | ("daiet.ps_ips", po::value(&ps_ips_str)->required(), "Comma-separated list of PS IP addresses") 45 | ("daiet.ps_macs", po::value(&ps_macs_str)->required(), "Comma-separated list of PS MAC addresses") 46 | ("daiet.max_num_pending_messages", po::value(&(daiet_par.getMaxNumPendingMessages()))->default_value(256), "Max number of pending, unaggregated messages") 47 | ("daiet.num_updates", po::value(&num_updates)->default_value(32), "Number of updates per packet") 48 | ("daiet.num_workers", po::value(&(daiet_par.getNumWorkers()))->default_value(0), "Number of workers") 49 | ("daiet.sync_blocks", po::value(&(daiet_par.getSyncBlocks()))->default_value(10), "Synchronization Blocks ") 50 | #ifdef TIMERS 51 | ("daiet.timeout", po::value(&(daiet_par.getTimeout()))->default_value(1), "Timeout in millisecond") 52 | #endif 53 | ; 54 | 55 | config_file_options.add(daiet_options).add(dpdk_options); 56 | 57 | config_file = "/etc/daiet.cfg"; 58 | ifs.open(config_file.c_str()); 59 | 
if(!ifs.good()){ 60 | ifs.close(); 61 | 62 | char hostname[500]; 63 | if (gethostname(hostname,sizeof(hostname))!=0) 64 | LOG_FATAL("gethostname failed: "+ string(strerror(errno))); 65 | 66 | config_file = "daiet-"+string(hostname)+".cfg"; 67 | ifs.open(config_file.c_str()); 68 | if(!ifs.good()){ 69 | ifs.close(); 70 | 71 | config_file = "daiet.cfg"; 72 | ifs.open(config_file.c_str()); 73 | if(!ifs.good()){ 74 | ifs.close(); 75 | LOG_FATAL("No config file found! (/etc/daiet.cfg, daiet-"+string(hostname)+".cfg, daiet.cfg)"); 76 | } 77 | } 78 | } 79 | LOG_INFO("Configuration file "+config_file); 80 | 81 | po::variables_map vm; 82 | po::store(po::parse_config_file(ifs, config_file_options), vm); 83 | po::notify(vm); 84 | 85 | if (!daiet_par.setWorkerIp(worker_ip_str)) 86 | LOG_FATAL("Invalid worker IP: " + worker_ip_str); 87 | 88 | daiet_par.setBaseWorkerPort(worker_port); 89 | daiet_par.setBasePsPort(ps_port); 90 | 91 | if (!daiet_par.setPs(ps_ips_str, ps_macs_str)) 92 | LOG_FATAL("Invalid PS address: \n" + ps_ips_str + "\n" + ps_macs_str); 93 | 94 | daiet_par.setNumUpdates(num_updates); 95 | 96 | if (daiet_par.getNumWorkers()<=0) 97 | LOG_FATAL("Number of workers must be positive."); 98 | daiet_par.print_params(); 99 | } 100 | 101 | void print_dpdk_params() { 102 | 103 | LOG_INFO("** DPDK parameters **"); 104 | LOG_INFO("Cores: " + dpdk_par.corestr); 105 | LOG_INFO("Port ID: " + to_string(dpdk_par.portid)); 106 | LOG_INFO("Port RX ring size: " + to_string(dpdk_par.port_rx_ring_size)); 107 | LOG_INFO("Port TX ring size: " + to_string(dpdk_par.port_tx_ring_size)); 108 | LOG_INFO("Pool size: " + to_string(dpdk_par.pool_size)); 109 | LOG_INFO("Pool cache size: " + to_string(dpdk_par.pool_cache_size)); 110 | LOG_INFO("Burst size RX: " + to_string(dpdk_par.burst_rx)); 111 | LOG_INFO("Burst size TX: " + to_string(dpdk_par.burst_tx)); 112 | LOG_INFO("Burst drain TX us: " + to_string(dpdk_par.bulk_drain_tx_us)); 113 | LOG_INFO("Prefix: " + dpdk_par.prefix); 114 | 
LOG_INFO("Extra EAL options: " + dpdk_par.eal_options); 115 | } 116 | 117 | daiet_params::daiet_params() { 118 | 119 | // Defaults 120 | num_updates = 32; 121 | 122 | max_num_pending_messages = 256; 123 | 124 | tx_flags = PKT_TX_IP_CKSUM | PKT_TX_IPV4 | PKT_TX_UDP_CKSUM; 125 | 126 | worker_port = 4000; 127 | ps_port = 48879; 128 | worker_ip_be = rte_cpu_to_be_32(0x0a000001); 129 | 130 | ps_ips_be = NULL; 131 | 132 | ps_macs_be = NULL; 133 | 134 | num_ps = 0; 135 | 136 | num_workers = 0; 137 | } 138 | 139 | daiet_params::~daiet_params() { 140 | if (ps_ips_be != NULL) 141 | delete[] ps_ips_be; 142 | if (ps_macs_be != NULL) 143 | delete[] ps_macs_be; 144 | } 145 | 146 | void daiet_params::print_params() { 147 | 148 | LOG_INFO("** DAIET parameters **"); 149 | LOG_INFO("Num updates: " + to_string(num_updates)); 150 | LOG_INFO("Max num pending messages: " + to_string(max_num_pending_messages)); 151 | LOG_INFO("Worker port: " + to_string(worker_port)); 152 | LOG_INFO("PS port: " + to_string(ps_port)); 153 | 154 | LOG_INFO("Worker IP: " + ip_to_str(worker_ip_be)); 155 | 156 | for (uint32_t i = 0; i < num_ps; i++) { 157 | 158 | LOG_INFO("PS" + to_string(i) + ": " + mac_to_str(ps_macs_be[i]) + " " + ip_to_str(ps_ips_be[i])); 159 | } 160 | 161 | LOG_INFO("Num workers: " + to_string(num_workers)); 162 | } 163 | 164 | uint16_t& daiet_params::getNumWorkers() { 165 | return num_workers; 166 | } 167 | 168 | uint32_t& daiet_params::getSyncBlocks() { 169 | return sync_blocks; 170 | } 171 | 172 | void daiet_params::setNumUpdates(uint32_t numUpdates) { 173 | num_updates = numUpdates; 174 | } 175 | 176 | void daiet_params::setBaseWorkerPort(uint16_t workerPort) { 177 | worker_port = workerPort; 178 | } 179 | 180 | void daiet_params::setBasePsPort(uint16_t psPort) { 181 | ps_port = psPort; 182 | 183 | } 184 | 185 | /* 186 | * Returns false if the IP is invalid 187 | */ 188 | bool daiet_params::setWorkerIp(string workerIp) { 189 | 190 | struct in_addr addr; 191 | 192 | if 
(inet_aton(workerIp.c_str(), &addr) == 0) 193 | return false; 194 | 195 | worker_ip_be = addr.s_addr; 196 | return true; 197 | } 198 | 199 | bool daiet_params::setPs(string psIps, string psMacs) { 200 | 201 | int64_t rc; 202 | 203 | vector ips = split(psIps, ", "); 204 | vector macs = split(psMacs, ", "); 205 | 206 | num_ps = ips.size() < macs.size() ? ips.size() : macs.size(); 207 | 208 | if (ps_ips_be != NULL) 209 | delete[] ps_ips_be; 210 | if (ps_macs_be != NULL) 211 | delete[] ps_macs_be; 212 | 213 | ps_ips_be = new uint32_t[num_ps]; 214 | ps_macs_be = new uint64_t[num_ps]; 215 | 216 | struct in_addr addr; 217 | 218 | for (uint32_t i = 0; i < num_ps; i++) { 219 | 220 | if (inet_aton(ips[i].c_str(), &addr) == 0) 221 | return false; 222 | 223 | ps_ips_be[i] = addr.s_addr; 224 | 225 | rc = str_to_mac(macs[i]); 226 | if (rc < 0) 227 | return false; 228 | 229 | ps_macs_be[i] = rc; 230 | } 231 | 232 | return true; 233 | } 234 | } -------------------------------------------------------------------------------- /omnireduce-DPDK/daiet/src/params.hpp: -------------------------------------------------------------------------------- 1 | /** 2 | * DAIET project 3 | * author: amedeo.sapio@kaust.edu.sa 4 | */ 5 | 6 | #pragma once 7 | 8 | #include "common.hpp" 9 | #include "utils.hpp" 10 | 11 | using namespace std; 12 | 13 | namespace daiet { 14 | 15 | void print_dpdk_params(); 16 | void parse_parameters(); 17 | 18 | struct dpdk_data { 19 | 20 | // Buffer pool size 21 | uint32_t pool_buffer_size; 22 | uint16_t core_to_thread_id[RTE_MAX_LCORE]; 23 | 24 | dpdk_data() { 25 | // Defaults 26 | 27 | pool_buffer_size = RTE_MBUF_DEFAULT_BUF_SIZE; 28 | } 29 | }__rte_cache_aligned; 30 | 31 | extern struct dpdk_data dpdk_data; 32 | 33 | struct dpdk_params { 34 | 35 | // Ports 36 | uint16_t portid; 37 | uint16_t port_rx_ring_size; 38 | uint16_t port_tx_ring_size; 39 | 40 | // Buffer pool 41 | uint32_t pool_size; 42 | uint32_t pool_cache_size; 43 | 44 | // Burst sizes 45 | uint32_t 
burst_rx; 46 | uint32_t burst_tx; 47 | uint32_t bulk_drain_tx_us; 48 | 49 | // Extra EAL options 50 | string eal_options; 51 | 52 | // Process prefix 53 | string prefix; 54 | 55 | // Cores string 56 | string corestr; 57 | 58 | dpdk_params() { 59 | // Defaults 60 | 61 | portid = 0; 62 | port_rx_ring_size = 1024; 63 | port_tx_ring_size = 1024; 64 | 65 | pool_size = 8192 * 32; 66 | pool_cache_size = 256 * 2; 67 | 68 | burst_rx = 64; 69 | burst_tx = 64; 70 | bulk_drain_tx_us = 10; 71 | 72 | prefix = "daiet"; 73 | eal_options = ""; 74 | 75 | corestr = ""; 76 | } 77 | }__rte_cache_aligned; 78 | 79 | extern struct dpdk_params dpdk_par; 80 | 81 | class daiet_params { 82 | private: 83 | 84 | uint32_t num_updates; 85 | 86 | uint32_t max_num_pending_messages; 87 | 88 | uint64_t tx_flags; 89 | 90 | uint16_t worker_port; 91 | uint16_t ps_port; 92 | uint32_t worker_ip_be; 93 | 94 | uint32_t* ps_ips_be; 95 | 96 | uint64_t* ps_macs_be; 97 | 98 | uint32_t num_ps; 99 | 100 | #ifdef TIMERS 101 | double timeout; 102 | #endif 103 | 104 | uint16_t num_workers; 105 | 106 | uint32_t sync_blocks; 107 | 108 | public: 109 | daiet_params(); 110 | ~daiet_params(); 111 | 112 | void print_params(); 113 | 114 | uint16_t& getNumWorkers(); 115 | 116 | uint32_t& getSyncBlocks(); 117 | 118 | __rte_always_inline uint32_t getNumUpdates() const { 119 | return num_updates; 120 | } 121 | 122 | __rte_always_inline uint32_t& getMaxNumPendingMessages() { 123 | return max_num_pending_messages; 124 | } 125 | 126 | void setNumUpdates(uint32_t); 127 | 128 | __rte_always_inline int64_t getTxFlags() const { 129 | return tx_flags; 130 | } 131 | 132 | __rte_always_inline uint16_t getBaseWorkerPort() const { 133 | return worker_port; 134 | } 135 | 136 | void setBaseWorkerPort(uint16_t workerPort); 137 | 138 | __rte_always_inline uint16_t getBasePsPort() const { 139 | return ps_port; 140 | } 141 | 142 | void setBasePsPort(uint16_t); 143 | 144 | /* 145 | * Returns false if the IP is invalid 146 | */ 147 | bool 
setWorkerIp(string); 148 | 149 | __rte_always_inline uint32_t getWorkerIpBe() { 150 | return worker_ip_be; 151 | } 152 | 153 | __rte_always_inline const uint32_t* getPsIpsBe() { 154 | return ps_ips_be; 155 | } 156 | 157 | __rte_always_inline const uint64_t* getPsMacsBe() { 158 | return ps_macs_be; 159 | } 160 | 161 | __rte_always_inline uint32_t getPsIpBe(int i) { 162 | return ps_ips_be[i % num_ps]; 163 | } 164 | 165 | __rte_always_inline uint64_t getPsMacBe(int i) { 166 | return ps_macs_be[i % num_ps]; 167 | } 168 | 169 | bool setPs(string, string); 170 | 171 | __rte_always_inline uint32_t getNumPs() const { 172 | return num_ps; 173 | } 174 | 175 | #ifdef TIMERS 176 | __rte_always_inline double& getTimeout() { 177 | return timeout; 178 | } 179 | #endif 180 | }; 181 | 182 | extern daiet_params daiet_par; 183 | } -------------------------------------------------------------------------------- /omnireduce-DPDK/daiet/src/ps.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * DAIET project 3 | * author: amedeo.sapio@kaust.edu.sa 4 | */ 5 | 6 | #ifdef COLOCATED 7 | #include "ps.hpp" 8 | #include "common.hpp" 9 | #include "utils.hpp" 10 | #include "params.hpp" 11 | #include "stats.hpp" 12 | 13 | using namespace std; 14 | 15 | namespace daiet { 16 | 17 | struct mac_ip_pair { 18 | struct rte_ether_addr mac; 19 | uint32_t be_ip; 20 | }; 21 | 22 | thread_local static uint32_t num_updates; 23 | thread_local static mac_ip_pair* ps_workers_ip_to_mac; 24 | thread_local static uint32_t known_workers = 0; 25 | 26 | thread_local static int32_t** ps_aggregated_messages; 27 | thread_local static uint32_t* ps_received_message_counters; 28 | 29 | thread_local static uint16_t ps_port_be; 30 | 31 | #ifdef DEBUG 32 | __rte_always_inline struct daiet_hdr * is_daiet_pkt_to_ps(struct rte_ether_hdr* eth_hdr, uint16_t size) { 33 | 34 | int idx; 35 | uint16_t etherType; 36 | struct rte_ipv4_hdr* ip_hdr; 37 | struct rte_udp_hdr* rte_udp_hdr; 38 | 
39 | idx = sizeof(struct rte_ether_hdr); 40 | etherType = rte_be_to_cpu_16(eth_hdr->ether_type); 41 | 42 | if (etherType == RTE_ETHER_TYPE_IPV4 && size >= idx + sizeof(struct rte_ipv4_hdr)) { 43 | 44 | idx += sizeof(struct rte_ipv4_hdr); 45 | ip_hdr = (struct rte_ipv4_hdr *) (eth_hdr + 1); 46 | 47 | if (ip_hdr->next_proto_id == IPPROTO_UDP && size >= idx + sizeof(struct rte_udp_hdr)) { 48 | idx += sizeof(struct rte_udp_hdr); 49 | rte_udp_hdr = (struct rte_udp_hdr *) (ip_hdr + 1); 50 | 51 | if (rte_udp_hdr->dst_port == ps_port_be && size >= idx + sizeof(struct daiet_hdr)) { 52 | 53 | return (struct daiet_hdr *) (rte_udp_hdr + 1); 54 | } 55 | } 56 | } 57 | return NULL; 58 | } 59 | #endif 60 | 61 | __rte_always_inline void ps_msg_setup(struct daiet_hdr * daiet, uint16_t pool_index) { 62 | 63 | struct entry_hdr *entry; 64 | int32_t* base_ptr = ps_aggregated_messages[pool_index]; 65 | 66 | entry = (struct entry_hdr *) (daiet + 1); 67 | for (uint32_t i = 0; i < num_updates; i++, entry++) { 68 | entry->upd = rte_cpu_to_be_32(base_ptr[i]); 69 | base_ptr[i] = 0; 70 | } 71 | } 72 | 73 | /* Returns true if the aggregation for the offset is complete */ 74 | __rte_always_inline bool ps_aggregate_message(struct daiet_hdr* daiet, uint32_t be_src_ip, struct rte_ether_addr src_mac, uint16_t pool_index, uint16_t num_workers) { 75 | 76 | struct entry_hdr * entry = (struct entry_hdr *) (daiet + 1); 77 | int32_t* base_ptr = ps_aggregated_messages[pool_index]; 78 | 79 | for (uint32_t i = 0; i < num_updates; i++, entry++) { 80 | base_ptr[i] += rte_be_to_cpu_32(entry->upd); 81 | } 82 | 83 | if (unlikely(known_workers < num_workers)) { 84 | 85 | bool found = false; 86 | 87 | for (uint32_t i = 0; i < known_workers && !found; i++) { 88 | 89 | if (ps_workers_ip_to_mac[i].be_ip==be_src_ip) 90 | found = true; 91 | } 92 | 93 | if (!found) { 94 | 95 | // New worker 96 | char ipstring[INET_ADDRSTRLEN]; 97 | 98 | if (unlikely(inet_ntop(AF_INET, &be_src_ip, ipstring, INET_ADDRSTRLEN) == NULL)) { 99 
| LOG_FATAL("Wrong IP: error " + to_string(errno)); 100 | } 101 | 102 | LOG_INFO("Worker: " + string(ipstring) + " " + mac_to_str(src_mac)); 103 | 104 | ps_workers_ip_to_mac[known_workers].mac = src_mac; 105 | ps_workers_ip_to_mac[known_workers].be_ip = be_src_ip; 106 | known_workers++; 107 | } 108 | } 109 | 110 | ps_received_message_counters[pool_index]--; 111 | 112 | if (unlikely(ps_received_message_counters[pool_index]==0)) { 113 | ps_received_message_counters[pool_index] = num_workers; 114 | return true; 115 | } 116 | 117 | return false; 118 | } 119 | 120 | void ps_setup() { 121 | } 122 | 123 | void ps_cleanup() { 124 | } 125 | 126 | int ps(void* num_worker_threads) { 127 | 128 | int ret; 129 | 130 | unsigned lcore_id; 131 | unsigned nb_rx = 0, j = 0, i = 0, nb_tx = 0, sent = 0; 132 | 133 | uint16_t ps_id, id_shift = *((uint16_t*)(num_worker_threads)); 134 | uint16_t num_workers = daiet_par.getNumWorkers(); 135 | const uint32_t max_num_pending_messages = daiet_par.getMaxNumPendingMessages(); 136 | num_updates = daiet_par.getNumUpdates(); 137 | uint64_t ps_tx = 0, ps_rx = 0; 138 | 139 | struct rte_mempool *pool; 140 | string pool_name = "ps_pool"; 141 | struct rte_mbuf** pkts_burst; 142 | struct rte_mbuf* m; 143 | struct rte_mbuf** clone_burst; 144 | 145 | struct rte_ether_hdr* eth; 146 | struct rte_ipv4_hdr * ip; 147 | struct rte_udp_hdr * udp; 148 | struct daiet_hdr* daiet; 149 | uint16_t pool_index = 0, start_pool_index = 0; 150 | 151 | // Get core ID 152 | lcore_id = rte_lcore_id(); 153 | ps_id = dpdk_data.core_to_thread_id[lcore_id]; 154 | LOG_DEBUG("PS core: " + to_string(lcore_id) + " PS id: " + to_string(ps_id)); 155 | 156 | start_pool_index = (ps_id - id_shift) * max_num_pending_messages; 157 | ps_port_be = rte_cpu_to_be_16(daiet_par.getBasePsPort() + ps_id - id_shift); 158 | 159 | ps_aggregated_messages = (int32_t**) rte_malloc_socket(NULL, max_num_pending_messages * sizeof(int32_t*), RTE_CACHE_LINE_SIZE, rte_socket_id()); 160 | if 
(ps_aggregated_messages == NULL) 161 | LOG_FATAL("Failed PS aggregated messages allocation!"); 162 | 163 | for (i = 0; i < max_num_pending_messages; i++) { 164 | ps_aggregated_messages[i] = (int32_t*) rte_zmalloc_socket(NULL, num_updates * sizeof(int32_t), RTE_CACHE_LINE_SIZE, rte_socket_id()); 165 | if (ps_aggregated_messages[i] == NULL) 166 | LOG_FATAL("Failed PS aggregated messages allocation: element " + to_string(i)); 167 | } 168 | 169 | ps_received_message_counters = (uint32_t*) rte_zmalloc_socket(NULL, max_num_pending_messages * sizeof(uint32_t), RTE_CACHE_LINE_SIZE, rte_socket_id()); 170 | if (ps_received_message_counters == NULL) 171 | LOG_FATAL("Failed PS aggregated messages allocation!"); 172 | 173 | for (i = 0; i < max_num_pending_messages; i++) { 174 | ps_received_message_counters[i] = num_workers; 175 | } 176 | 177 | ps_workers_ip_to_mac = (mac_ip_pair*) rte_zmalloc_socket(NULL, num_workers * sizeof(struct mac_ip_pair), RTE_CACHE_LINE_SIZE, rte_socket_id()); 178 | if (ps_workers_ip_to_mac == NULL) 179 | LOG_FATAL("PS thread: cannot allocate ps_workers_ip_to_mac"); 180 | 181 | pkts_burst = (rte_mbuf **) rte_malloc_socket(NULL, dpdk_par.burst_rx * sizeof(struct rte_mbuf*), RTE_CACHE_LINE_SIZE, rte_socket_id()); 182 | if (pkts_burst == NULL) 183 | LOG_FATAL("PS thread: cannot allocate pkts burst"); 184 | 185 | clone_burst = (rte_mbuf **) rte_malloc_socket(NULL, num_workers * sizeof(struct rte_mbuf*), RTE_CACHE_LINE_SIZE, rte_socket_id()); 186 | if (clone_burst == NULL) 187 | LOG_FATAL("PS thread: cannot allocate clone burst"); 188 | 189 | // Init the buffer pool 190 | pool_name = pool_name + to_string(ps_id); 191 | pool = rte_pktmbuf_pool_create(pool_name.c_str(), dpdk_par.pool_size, dpdk_par.pool_cache_size, 0, dpdk_data.pool_buffer_size, rte_socket_id()); 192 | if (pool == NULL) 193 | LOG_FATAL("Cannot init mbuf pool: " + string(rte_strerror(rte_errno))); 194 | 195 | while (!force_quit) { 196 | 197 | nb_rx = rte_eth_rx_burst(dpdk_par.portid, ps_id, 
pkts_burst, dpdk_par.burst_rx); 198 | 199 | for (j = 0; j < nb_rx; j++) { 200 | 201 | m = pkts_burst[j]; 202 | 203 | rte_prefetch0 (rte_pktmbuf_mtod(m, void *)); 204 | eth = rte_pktmbuf_mtod(m, struct rte_ether_hdr *); 205 | 206 | #ifdef DEBUG 207 | daiet = is_daiet_pkt_to_ps(eth, m->data_len); 208 | if (likely(daiet != NULL)) { 209 | #else 210 | daiet = (struct daiet_hdr *) ((uint8_t *) (eth+1) + sizeof(struct rte_ipv4_hdr) + sizeof(struct rte_udp_hdr)); 211 | #endif 212 | 213 | ps_rx++; 214 | ip = (struct rte_ipv4_hdr *) (eth + 1); 215 | udp = (struct rte_udp_hdr *) (ip + 1); 216 | 217 | pool_index = (rte_be_to_cpu_16(daiet->pool_index) & 0x7FFF) - start_pool_index; 218 | 219 | if (ps_aggregate_message(daiet, ip->src_addr, eth->s_addr, pool_index, num_workers)) { 220 | 221 | // Checksum offload 222 | m->l2_len = sizeof(struct rte_ether_hdr); 223 | m->l3_len = sizeof(struct rte_ipv4_hdr); 224 | m->ol_flags |= daiet_par.getTxFlags(); 225 | 226 | // Set src MAC 227 | rte_ether_addr_copy(&(eth->d_addr), &(eth->s_addr)); 228 | 229 | // Set src IP 230 | ip->hdr_checksum = 0; 231 | ip->src_addr = ip->dst_addr; 232 | 233 | // Swap ports 234 | swap((uint16_t&) (udp->dst_port), (uint16_t&) (udp->src_port)); 235 | udp->dgram_cksum = rte_ipv4_phdr_cksum(ip, m->ol_flags); 236 | 237 | ps_msg_setup(daiet, pool_index); 238 | 239 | // Allocate pkt burst 240 | ret = rte_pktmbuf_alloc_bulk(pool, clone_burst, num_workers); 241 | if (unlikely(ret < 0)) 242 | LOG_FATAL("Cannot allocate clone burst"); 243 | 244 | for (i = 0; i < num_workers; i++) { 245 | 246 | // Clone packet 247 | deep_copy_single_segment_pkt(clone_burst[i], m); 248 | 249 | eth = rte_pktmbuf_mtod(clone_burst[i], struct rte_ether_hdr *); 250 | 251 | // Set dst MAC 252 | rte_ether_addr_copy(&(ps_workers_ip_to_mac[i].mac), &(eth->d_addr)); 253 | 254 | // Set dst IP 255 | ip = (struct rte_ipv4_hdr *) (eth + 1); 256 | ip->dst_addr = ps_workers_ip_to_mac[i].be_ip; 257 | } 258 | 259 | // Send packet burst 260 | sent = 0; 261 
| do { 262 | nb_tx = rte_eth_tx_burst(dpdk_par.portid, ps_id,clone_burst, num_workers); 263 | 264 | sent += nb_tx; 265 | } while (sent < num_workers); 266 | 267 | ps_tx += num_workers; 268 | 269 | // Free original packet 270 | rte_pktmbuf_free(m); 271 | 272 | } else { 273 | // Free original packet 274 | rte_pktmbuf_free(m); 275 | } 276 | #ifdef DEBUG 277 | } else { 278 | // Free original packet 279 | rte_pktmbuf_free(m); 280 | } 281 | #endif 282 | } 283 | } 284 | 285 | // Set stats 286 | pkt_stats.set_ps(ps_id - id_shift, ps_tx, ps_rx); 287 | 288 | // Cleanup 289 | rte_free(clone_burst); 290 | rte_free(pkts_burst); 291 | 292 | rte_free(ps_workers_ip_to_mac); 293 | 294 | rte_free(ps_received_message_counters); 295 | 296 | for (uint32_t i = 0; i < max_num_pending_messages; i++) { 297 | rte_free(ps_aggregated_messages[i]); 298 | } 299 | 300 | rte_free(ps_aggregated_messages); 301 | 302 | return 0; 303 | } 304 | } 305 | #endif 306 | -------------------------------------------------------------------------------- /omnireduce-DPDK/daiet/src/ps.hpp: -------------------------------------------------------------------------------- 1 | /** 2 | * DAIET project 3 | * author: amedeo.sapio@kaust.edu.sa 4 | */ 5 | 6 | #ifdef COLOCATED 7 | #pragma once 8 | 9 | namespace daiet { 10 | 11 | void ps_setup(); 12 | void ps_cleanup(); 13 | int ps(void*); 14 | } 15 | #endif 16 | -------------------------------------------------------------------------------- /omnireduce-DPDK/daiet/src/stats.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * DAIET project 3 | * author: amedeo.sapio@kaust.edu.sa 4 | */ 5 | 6 | #include "stats.hpp" 7 | #include "utils.hpp" 8 | 9 | namespace daiet { 10 | 11 | pkt_statistics pkt_stats; 12 | 13 | pkt_statistics::pkt_statistics() : total_w_tx(0), total_w_rx(0), total_w_unsent(0) { 14 | #ifdef COLOCATED 15 | total_ps_tx = 0; 16 | total_ps_rx = 0; 17 | #endif 18 | } 19 | 20 | #ifndef COLOCATED 21 | void 
pkt_statistics::init(uint32_t nb_w) { 22 | #else 23 | void pkt_statistics::init(uint32_t nb_w, uint32_t nb_ps) { 24 | #endif 25 | 26 | total_w_tx = 0; 27 | total_w_rx = 0; 28 | total_w_unsent = 0; 29 | 30 | w_tx.resize(nb_w); 31 | w_rx.resize(nb_w); 32 | w_unsent.resize(nb_w); 33 | 34 | #ifdef COLOCATED 35 | total_ps_tx = 0; 36 | total_ps_rx = 0; 37 | 38 | ps_tx.resize(nb_ps); 39 | ps_rx.resize(nb_ps); 40 | #endif 41 | 42 | #ifdef TIMERS 43 | w_timeouts.resize(nb_w); 44 | #endif 45 | } 46 | 47 | void pkt_statistics::set_workers(uint16_t wid, uint64_t tx, uint64_t rx, uint64_t unsent) { 48 | 49 | boost::unique_lock lock(w_mutex); 50 | 51 | w_tx[wid] = tx; 52 | w_rx[wid] = rx; 53 | w_unsent[wid] = unsent; 54 | 55 | total_w_tx += tx; 56 | total_w_rx += rx; 57 | total_w_unsent += unsent; 58 | } 59 | 60 | #ifdef COLOCATED 61 | void pkt_statistics::set_ps(uint32_t psid, uint64_t tx, uint64_t rx) { 62 | 63 | boost::unique_lock lock(ps_mutex); 64 | 65 | ps_tx[psid] = tx; 66 | ps_rx[psid] = rx; 67 | 68 | total_ps_tx += tx; 69 | total_ps_rx += rx; 70 | } 71 | #endif 72 | 73 | #ifdef TIMERS 74 | void pkt_statistics::set_timeouts(uint32_t wid, uint64_t timeouts) { 75 | 76 | boost::unique_lock lock(timeouts_mutex); 77 | 78 | w_timeouts[wid] = timeouts; 79 | 80 | total_timeouts += timeouts; 81 | } 82 | #endif 83 | 84 | void pkt_statistics::dump(){ 85 | 86 | #ifndef COLOCATED 87 | LOG_INFO("TX " + to_string(total_w_tx)); 88 | LOG_INFO("RX " + to_string(total_w_rx)); 89 | LOG_INFO("UNSENT " + to_string(total_w_unsent)); 90 | #else 91 | LOG_INFO("Worker TX " + to_string(total_w_tx)); 92 | LOG_INFO("Worker RX " + to_string(total_w_rx)); 93 | LOG_INFO("Worker UNSENT " + to_string(total_w_unsent)); 94 | LOG_INFO("PS TX " + to_string(total_ps_tx)); 95 | LOG_INFO("PS RX " + to_string(total_ps_rx)); 96 | #endif 97 | 98 | #ifdef TIMERS 99 | LOG_INFO("Timeouts " + to_string(total_timeouts)); 100 | #endif 101 | 102 | for (uint32_t i = 0; i < w_tx.size(); i++) { 103 | 104 | LOG_INFO("## 
Worker " + to_string(i)); 105 | LOG_INFO("TX " + to_string(w_tx[i])); 106 | LOG_INFO("RX " + to_string(w_rx[i])); 107 | LOG_INFO("UNSENT " + to_string(w_unsent[i])); 108 | #ifdef TIMERS 109 | LOG_INFO("Timeouts " + to_string(w_timeouts[i])); 110 | #endif 111 | } 112 | 113 | #ifdef COLOCATED 114 | for (uint32_t i = 0; i < ps_tx.size(); i++) { 115 | 116 | LOG_INFO("## PS" + to_string(i)); 117 | LOG_INFO("TX " + to_string(ps_tx[i])); 118 | LOG_INFO("RX " + to_string(ps_rx[i])); 119 | } 120 | #endif 121 | } 122 | } 123 | -------------------------------------------------------------------------------- /omnireduce-DPDK/daiet/src/stats.hpp: -------------------------------------------------------------------------------- 1 | /** 2 | * DAIET project 3 | * author: amedeo.sapio@kaust.edu.sa 4 | */ 5 | 6 | #pragma once 7 | 8 | #include 9 | #include 10 | 11 | #include "common.hpp" 12 | 13 | using namespace std; 14 | 15 | namespace daiet { 16 | 17 | class pkt_statistics { 18 | 19 | public: 20 | pkt_statistics(); 21 | void set_workers(uint16_t, uint64_t, uint64_t, uint64_t); 22 | void dump(); 23 | 24 | #ifndef COLOCATED 25 | void init(uint32_t); 26 | #else 27 | void init(uint32_t, uint32_t); 28 | void set_ps(uint32_t, uint64_t, uint64_t); 29 | #endif 30 | 31 | #ifdef TIMERS 32 | void set_timeouts(uint32_t, uint64_t); 33 | #endif 34 | 35 | private: 36 | 37 | boost::mutex w_mutex; 38 | uint64_t total_w_tx; 39 | uint64_t total_w_rx; 40 | uint64_t total_w_unsent; 41 | vector w_tx; 42 | vector w_rx; 43 | vector w_unsent; 44 | 45 | #ifdef COLOCATED 46 | boost::mutex ps_mutex; 47 | uint64_t total_ps_tx; 48 | uint64_t total_ps_rx; 49 | vector ps_tx; 50 | vector ps_rx; 51 | #endif 52 | 53 | #ifdef TIMERS 54 | boost::mutex timeouts_mutex; 55 | vector w_timeouts; 56 | uint64_t total_timeouts; 57 | #endif 58 | }; 59 | 60 | extern pkt_statistics pkt_stats; 61 | } 62 | -------------------------------------------------------------------------------- /omnireduce-DPDK/daiet/src/utils.hpp: 
-------------------------------------------------------------------------------- 1 | /** 2 | * DAIET project 3 | * author: amedeo.sapio@kaust.edu.sa 4 | */ 5 | 6 | #pragma once 7 | 8 | #include 9 | 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | 17 | #include "common.hpp" 18 | 19 | using namespace std; 20 | 21 | namespace daiet { 22 | 23 | extern std::ofstream daiet_log; 24 | 25 | template 26 | void LOG_FATAL(T) __attribute__((used)); 27 | template 28 | void LOG_ERROR(T) __attribute__((used)); 29 | template 30 | void LOG_INFO(T) __attribute__((used)); 31 | 32 | #ifdef DEBUG 33 | template 34 | void LOG_DEBUG(T) __attribute__((used)); 35 | #else 36 | #define LOG_DEBUG(T) 37 | #endif 38 | 39 | template 40 | string to_hex(T); 41 | 42 | vector split(const string &); 43 | vector split(const string &, const string &); 44 | 45 | string mac_to_str(const rte_ether_addr); 46 | string mac_to_str(const uint64_t, bool = true); 47 | int64_t str_to_mac(string const&, bool = true); 48 | string ip_to_str(uint32_t); 49 | 50 | void swap_eth_addr(rte_ether_hdr *); 51 | void deep_copy_single_segment_pkt(rte_mbuf*, const rte_mbuf*); 52 | void check_port_link_status(uint16_t); 53 | void print_packet(struct rte_ether_hdr *, uint16_t); 54 | void print_dev_info(struct rte_eth_dev_info&); 55 | void print_dev_stats(uint16_t); 56 | void print_dev_xstats(uint16_t); 57 | 58 | } // End namespace 59 | -------------------------------------------------------------------------------- /omnireduce-DPDK/daiet/src/worker.hpp: -------------------------------------------------------------------------------- 1 | /** 2 | * DAIET project 3 | * author: amedeo.sapio@kaust.edu.sa 4 | */ 5 | 6 | #pragma once 7 | 8 | #include "common.hpp" 9 | 10 | namespace daiet { 11 | 12 | void worker_setup(); 13 | void worker_cleanup(); 14 | int worker(void*); 15 | } 16 | -------------------------------------------------------------------------------- /omnireduce-DPDK/docker/Dockerfile: 
-------------------------------------------------------------------------------- 1 | FROM nvidia/cuda:10.1-devel-ubuntu18.04 2 | RUN apt-get update && \ 3 | DEBIAN_FRONTEND="noninteractive" apt-get install -qy \ 4 | autotools-dev \ 5 | bison \ 6 | build-essential \ 7 | ca-certificates \ 8 | chrpath \ 9 | coreutils \ 10 | debhelper \ 11 | dh-python \ 12 | dpatch \ 13 | ethtool \ 14 | flex \ 15 | gcc \ 16 | gfortran \ 17 | git \ 18 | graphviz \ 19 | iproute2 \ 20 | kmod \ 21 | libboost-program-options-dev \ 22 | libboost-chrono-dev \ 23 | libboost-system-dev \ 24 | libboost-thread-dev \ 25 | libc6-dev \ 26 | libelf1 \ 27 | libgfortran3 \ 28 | libglib2.0-0 \ 29 | libhiredis-dev \ 30 | libjpeg-dev \ 31 | libltdl-dev \ 32 | libmnl-dev \ 33 | libnl-3-200 \ 34 | libnl-3-dev \ 35 | libnl-route-3-200 \ 36 | libnl-route-3-dev \ 37 | libnuma-dev \ 38 | libnuma1 \ 39 | libpng-dev \ 40 | libpython3-dev \ 41 | libssl1.0.0 \ 42 | linux-headers-$(uname -r) \ 43 | linux-modules-$(uname -r) \ 44 | lsb-release \ 45 | lsof \ 46 | m4 \ 47 | net-tools \ 48 | openssh-client \ 49 | openssh-server \ 50 | pciutils \ 51 | perl \ 52 | pkg-config \ 53 | python3 \ 54 | python3-dev \ 55 | python3-distutils \ 56 | swig \ 57 | tk \ 58 | udev \ 59 | vim \ 60 | wget && rm -rf /var/lib/apt/lists/* 61 | 62 | 63 | # Allow OpenSSH to talk to containers without asking for confirmation 64 | RUN mkdir -p /var/run/sshd && cat /etc/ssh/ssh_config | grep -v 'StrictHostKeyChecking' > /etc/ssh/ssh_config.new && \ 65 | echo " StrictHostKeyChecking no" >> /etc/ssh/ssh_config.new && \ 66 | mv /etc/ssh/ssh_config.new /etc/ssh/ssh_config 67 | 68 | # MLNX driver 69 | ARG MOFED_VER=5.3-1.0.0.1 70 | RUN mkdir -p /tmp/mofed && cd /tmp/mofed && \ 71 | wget http://content.mellanox.com/ofed/MLNX_OFED-${MOFED_VER}/MLNX_OFED_LINUX-${MOFED_VER}-ubuntu18.04-$(uname -m).tgz && \ 72 | tar -xzvf *.tgz && \ 73 | */mlnxofedinstall --user-space-only --without-fw-update --upstream-libs --dpdk --force && \ 74 | cd /tmp && \ 75 | rm 
-rf mofed 76 | 77 | # mamba 78 | RUN cd ~ && \ 79 | wget -O Mambaforge.sh https://github.com/conda-forge/miniforge/releases/latest/download/Mambaforge-$(uname)-$(uname -m).sh && \ 80 | bash Mambaforge.sh -b && \ 81 | /root/mambaforge/bin/mamba install \ 82 | pip \ 83 | python=3.7.*=*_cpython \ 84 | cudnn=7.6 \ 85 | nccl=2.4 \ 86 | cudatoolkit \ 87 | jupyter \ 88 | matplotlib \ 89 | astunparse numpy ninja pyyaml mkl mkl-include setuptools cmake cffi typing_extensions future six requests dataclasses \ 90 | magma-cuda101 -y -c pytorch && \ 91 | rm Mambaforge.sh 92 | 93 | # Install Open MPI 94 | RUN mkdir /tmp/openmpi && \ 95 | cd /tmp/openmpi && \ 96 | wget -q https://www.open-mpi.org/software/ompi/v4.1/downloads/openmpi-4.1.1.tar.gz && \ 97 | tar zxf openmpi-4.1.1.tar.gz && \ 98 | cd openmpi-4.1.1 && \ 99 | ./configure --enable-orterun-prefix-by-default && \ 100 | make -j $(nproc) all && \ 101 | make install && \ 102 | ldconfig && \ 103 | rm -rf /tmp/openmpi 104 | 105 | # Create a wrapper for OpenMPI to allow running as root by default 106 | # Configure OpenMPI to run good defaults: 107 | # --bind-to none --map-by slot --mca btl_tcp_if_exclude lo,docker0 108 | RUN mv /usr/local/bin/mpirun /usr/local/bin/mpirun.real && \ 109 | echo '#!/bin/bash' > /usr/local/bin/mpirun && \ 110 | echo 'mpirun.real --allow-run-as-root "$@"' >> /usr/local/bin/mpirun && \ 111 | chmod a+x /usr/local/bin/mpirun && \ 112 | echo "hwloc_base_binding_policy = none" >> /usr/local/etc/openmpi-mca-params.conf && \ 113 | echo "rmaps_base_mapping_policy = slot" >> /usr/local/etc/openmpi-mca-params.conf && \ 114 | echo "btl_tcp_if_exclude = lo,docker0" >> /usr/local/etc/openmpi-mca-params.conf 115 | 116 | SHELL ["/root/mambaforge/bin/conda", "run", "--no-capture-output", "-n", "base", "/bin/bash", "-c"] 117 | ENV PATH="/root/mambaforge/bin:/root/mambaforge/condabin:${PATH}" 118 | ENV CPLUS_INCLUDE_PATH=/root/mambaforge/include LIBRARY_PATH=/root/mambaforge/lib LD_LIBRARY_PATH=/root/mambaforge/lib 
119 | ARG TORCH_CUDA_ARCH_LIST 120 | RUN cd ~ && git clone --branch docker --depth 1 https://github.com/ChenYuHo/omnireduce.git && cd omnireduce && ./prepare.sh --depth 1 && \ 121 | ./build_all.sh INSTALL MLX5 TIMERS CONDA OFFLOAD_BITMAP NOSCALING PYTORCH ALGO2 122 | 123 | ARG EXPS_BASE_PATH=/root 124 | ARG EXPS_PATH=$EXPS_BASE_PATH/exps 125 | ARG EXPS_GIT_LINK=https://github.com/Phlix1/exps.git 126 | 127 | RUN cd $EXPS_BASE_PATH && git clone $EXPS_GIT_LINK 128 | 129 | #For benchmark 130 | RUN cd $EXPS_PATH/benchmark && ln -s ~/omnireduce/daiet/example/daiet.cfg daiet.cfg 131 | 132 | #For DeepLight 133 | RUN mamba install scikit-learn python=3.7.*=*_cpython -y 134 | RUN cd $EXPS_PATH/models/DeepLight && ln -s ~/omnireduce/daiet/example/daiet.cfg daiet.cfg 135 | 136 | #For LSTM 137 | RUN mamba install cython python=3.7.*=*_cpython -y 138 | RUN cd $EXPS_PATH/models/LSTM/lm/log_uniform && make && python setup.py install 139 | RUN cd $EXPS_PATH/models/LSTM && ln -s ~/omnireduce/daiet/example/daiet.cfg daiet.cfg 140 | 141 | #For NCF 142 | RUN mamba install numpy-indexed python=3.7.*=*_cpython -y 143 | RUN pip install mlperf_compliance 144 | RUN cd $EXPS_PATH/models/NCF && ln -s ~/omnireduce/daiet/example/daiet.cfg daiet.cfg 145 | 146 | #For CNN 147 | RUN mamba install pillow python=3.7.*=*_cpython -y 148 | RUN mamba install torchvision=0.8.0 python=3.7.*=*_cpython -c pytorch --no-deps -y 149 | RUN pip install --extra-index-url https://developer.download.nvidia.com/compute/redist --upgrade nvidia-dali-cuda100 150 | RUN cd /usr/local && git clone https://github.com/NVIDIA/apex && cd apex && git reset --hard a651e2c24ecf97cbf367fd3f330df36760e1c597 && \ 151 | pip install -v --disable-pip-version-check --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ./ 152 | RUN cd $EXPS_PATH/models/CNN && ln -s ~/omnireduce/daiet/example/daiet.cfg daiet.cfg 153 | 154 | 155 | #For BERT 156 | RUN pip install nvidia-pyindex 157 | RUN pip install nvidia-dllogger 158 | 
RUN mamba install unzip -y 159 | RUN cd $EXPS_PATH/models/BERT/dataset/checkpoint && \ 160 | wget --content-disposition https://api.ngc.nvidia.com/v2/models/nvidia/bert_pyt_ckpt_large_qa_squad11_amp/versions/19.09.0/zip -O bert_pyt_ckpt_large_qa_squad11_amp_19.09.0.zip && \ 161 | unzip bert_pyt_ckpt_large_qa_squad11_amp_19.09.0.zip 162 | RUN cd $EXPS_PATH/models/BERT && ln -s ~/omnireduce/daiet/example/daiet.cfg daiet.cfg && mkdir results 163 | 164 | ARG OMNIREDUCE_CONTAINER_PORT=2222 165 | ENV OMNIREDUCE_CONTAINER_PORT ${OMNIREDUCE_CONTAINER_PORT} 166 | -------------------------------------------------------------------------------- /omnireduce-DPDK/docker/aggregator_Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ubuntu:18.04 2 | RUN apt-get update && \ 3 | DEBIAN_FRONTEND=noninteractive apt-get install -qy \ 4 | autotools-dev \ 5 | bison \ 6 | build-essential \ 7 | ca-certificates \ 8 | chrpath \ 9 | coreutils \ 10 | debhelper \ 11 | dh-python \ 12 | dpatch \ 13 | ethtool \ 14 | flex \ 15 | gcc \ 16 | gfortran \ 17 | git \ 18 | graphviz \ 19 | iproute2 \ 20 | kmod \ 21 | libboost-program-options-dev \ 22 | libboost-chrono-dev \ 23 | libboost-system-dev \ 24 | libboost-thread-dev \ 25 | libc6-dev \ 26 | libelf1 \ 27 | libgfortran3 \ 28 | libglib2.0-0 \ 29 | libhiredis-dev \ 30 | libjpeg-dev \ 31 | libltdl-dev \ 32 | libmnl-dev \ 33 | libnl-3-200 \ 34 | libnl-3-dev \ 35 | libnl-route-3-200 \ 36 | libnl-route-3-dev \ 37 | libnuma-dev \ 38 | libnuma1 \ 39 | libpng-dev \ 40 | libpython3-dev \ 41 | libssl1.0.0 \ 42 | linux-headers-$(uname -r) \ 43 | linux-modules-$(uname -r) \ 44 | lsb-release \ 45 | lsof \ 46 | m4 \ 47 | net-tools \ 48 | openssh-client \ 49 | openssh-server \ 50 | pciutils \ 51 | perl \ 52 | pkg-config \ 53 | python3 \ 54 | python3-dev \ 55 | python3-distutils \ 56 | swig \ 57 | tk \ 58 | udev \ 59 | vim \ 60 | wget && rm -rf /var/lib/apt/lists/* 61 | 62 | 63 | # Allow OpenSSH to talk to 
containers without asking for confirmation 64 | RUN mkdir -p /var/run/sshd && cat /etc/ssh/ssh_config | grep -v StrictHostKeyChecking > /etc/ssh/ssh_config.new && \ 65 | echo " StrictHostKeyChecking no" >> /etc/ssh/ssh_config.new && \ 66 | mv /etc/ssh/ssh_config.new /etc/ssh/ssh_config 67 | 68 | # MLNX driver 69 | ARG MOFED_VER=5.3-1.0.0.1 70 | RUN mkdir -p /tmp/mofed && cd /tmp/mofed && \ 71 | wget http://content.mellanox.com/ofed/MLNX_OFED-${MOFED_VER}/MLNX_OFED_LINUX-${MOFED_VER}-ubuntu18.04-$(uname -m).tgz && \ 72 | tar -xzvf *.tgz && \ 73 | */mlnxofedinstall --user-space-only --without-fw-update --upstream-libs --dpdk --force && \ 74 | cd /tmp && \ 75 | rm -rf mofed 76 | 77 | ## Install Open MPI 78 | #RUN mkdir /tmp/openmpi && \ 79 | # cd /tmp/openmpi && \ 80 | # wget -q https://www.open-mpi.org/software/ompi/v4.1/downloads/openmpi-4.1.1.tar.gz && \ 81 | # tar zxf openmpi-4.1.1.tar.gz && \ 82 | # cd openmpi-4.1.1 && \ 83 | # ./configure --enable-orterun-prefix-by-default && \ 84 | # make -j $(nproc) all && \ 85 | # make install && \ 86 | # ldconfig && \ 87 | # rm -rf /tmp/openmpi 88 | # 89 | ## Create a wrapper for OpenMPI to allow running as root by default 90 | ## Configure OpenMPI to run good defaults: 91 | ## --bind-to none --map-by slot --mca btl_tcp_if_exclude lo,docker0 92 | #RUN mv /usr/local/bin/mpirun /usr/local/bin/mpirun.real && \ 93 | # echo '#!/bin/bash' > /usr/local/bin/mpirun && \ 94 | # echo 'mpirun.real --allow-run-as-root "$@"' >> /usr/local/bin/mpirun && \ 95 | # chmod a+x /usr/local/bin/mpirun && \ 96 | # echo "hwloc_base_binding_policy = none" >> /usr/local/etc/openmpi-mca-params.conf && \ 97 | # echo "rmaps_base_mapping_policy = slot" >> /usr/local/etc/openmpi-mca-params.conf && \ 98 | # echo "btl_tcp_if_exclude = lo,docker0" >> /usr/local/etc/openmpi-mca-params.conf 99 | 100 | RUN cd ~ && git clone --branch docker --depth 1 https://github.com/ChenYuHo/omnireduce.git && cd omnireduce && git submodule update --init --depth 1 --recursive 
daiet && \ 101 | ./build_all.sh INSTALL MLX5 TIMERS NOSCALING ALGO2 SKIP_DAIET SKIP_GLOO SKIP_EXPS SKIP_EXAMPLE 102 | 103 | ARG OMNIREDUCE_CONTAINER_PORT=2222 104 | ENV OMNIREDUCE_CONTAINER_PORT ${OMNIREDUCE_CONTAINER_PORT} 105 | -------------------------------------------------------------------------------- /omnireduce-DPDK/environment.yml: -------------------------------------------------------------------------------- 1 | name: omnireduce 2 | 3 | channels: 4 | - conda-forge 5 | - defaults 6 | - pytorch 7 | 8 | dependencies: 9 | - absl-py<0.9 # WideAndDeep 10 | # - apache-beam # WideAndDeep 11 | - defaults::boost-cpp=1.65 # compatible with daiet 12 | - cffi # horovod, pytorch 13 | - cloudpickle # horovod 14 | - cmake # pytorch 15 | - cudatoolkit-dev=10.1 16 | - cudnn=7.6 17 | - cupy # ncf 18 | - cython=0.28 # pytorch ssd detection 19 | - h5py 20 | - html2text # bert 21 | - libprotobuf=3.8 # tensorflow build 22 | - magma-cuda101 23 | - mkl # pytorch 24 | - mkl-include # pytorch 25 | - mpi4py 26 | - nccl=2.4 27 | - networkx # bert 28 | - ninja # pytorch 29 | - nltk # bert 30 | - numpy<2 # pytorch 31 | - openmpi=4.0 32 | - pandas # ncf 33 | - pip=20.0 34 | - pip: 35 | - git+git://github.com/NVIDIA/dllogger#egg=dllogger # ncf 36 | #- gluoncv 37 | #- mxnet-cu101mkl==1.6.0 38 | - opt_einsum # WideAndDeep 39 | - ray[rllib] 40 | - sacremoses==0.0.35 # Transformer-XL 41 | - scikit-learn # DLRM 42 | - tensorflow-transform==0.21.* # WideAndDeep 43 | - progressbar # bert 44 | - protobuf # WideAndDeep 45 | - psutil # horovod 46 | - pycocotools=2.0 # pytorch ssd detection 47 | - pycparser # horovod 48 | - pydot<2 # WideAndDeep 49 | - py-opencv # GAN 50 | - pyspark 51 | - pytest # bert 52 | - python=3.7 53 | - pyyaml # horovod, pytorch 54 | - scikit-image=0.15 # pytorch ssd detection 55 | - scipy 56 | - setuptools # pytorch 57 | - six<2 # WideAndDeep 58 | - tensorflow-gpu=1.15 59 | - toposort # bert 60 | - tqdm 61 | 
-------------------------------------------------------------------------------- /omnireduce-DPDK/get_cuda_arch_code.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # create a 'here document' that is code we compile and use to probe the card 4 | cat << EOF > /tmp/cudaComputeVersion.cu 5 | #include <stdio.h> 6 | int main() 7 | { 8 | cudaDeviceProp prop; 9 | cudaGetDeviceProperties(&prop,0); 10 | printf("%d.%d\n", prop.major,prop.minor); 11 | } 12 | EOF 13 | 14 | # probe the card and cleanup 15 | /usr/local/cuda/bin/nvcc /tmp/cudaComputeVersion.cu -o /tmp/cudaComputeVersion 16 | /tmp/cudaComputeVersion 17 | rm /tmp/cudaComputeVersion.cu 18 | rm /tmp/cudaComputeVersion 19 | -------------------------------------------------------------------------------- /omnireduce-DPDK/prepare.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | SCRIPTPATH="$( cd -- "$(dirname "$0")" >/dev/null 2>&1 ; pwd -P )" 4 | git submodule update --init "$@" --recursive 5 | cd $SCRIPTPATH/pytorch 6 | git apply $SCRIPTPATH/pytorch.patch 7 | git rm third_party/gloo 8 | cd third_party/protobuf 9 | git fetch --unshallow 10 | git checkout 09745575a923640154bcf307fba8aedff47f240a 11 | cd $SCRIPTPATH/gloo 12 | git apply $SCRIPTPATH/gloo.patch 13 | -------------------------------------------------------------------------------- /omnireduce-RDMA/Makefile: -------------------------------------------------------------------------------- 1 | ifeq ($(OMNIREDUCE_PATH),) 2 | OMNIREDUCE_PATH = $(shell pwd) 3 | export OMNIREDUCE_PATH 4 | endif 5 | 6 | SOURCEDIR := ${OMNIREDUCE_PATH}/omnireduce 7 | DESTDIR := ${OMNIREDUCE_PATH}/build 8 | 9 | INCLUDE :=-I ${OMNIREDUCE_PATH} 10 | LDFLAGS := -shared -lstdc++ 11 | LDLIBS := -libverbs -lboost_system -lboost_thread -lboost_chrono -lboost_program_options 12 | CXXFLAGS := -O3 -std=c++11 13 | ifeq ($(USE_CUDA),ON) 14 | $(info "USE_CUDA ON") 15 | CXXFLAGS += 
-DUSE_CUDA --compiler-options -fPIC 16 | CC := nvcc 17 | LD := nvcc 18 | else 19 | CXXFLAGS += -fPIC 20 | CC := g++ 21 | LD := g++ 22 | endif 23 | 24 | SOURCE:=${wildcard ${SOURCEDIR}/*.cpp} 25 | OBJS:=${patsubst ${SOURCEDIR}/%.cpp,${SOURCEDIR}/%.o,${SOURCE}} 26 | 27 | ifeq ($(USE_CUDA),ON) 28 | SOURCE:=${wildcard ${SOURCEDIR}/*.cu} 29 | OBJS+=${patsubst ${SOURCEDIR}/%.cu,${SOURCEDIR}/%.o,${SOURCE}} 30 | endif 31 | 32 | TARGET_LIB := libomnireduce.so 33 | 34 | all:${OBJS} 35 | ${LD} ${LDFLAGS} -o ${SOURCEDIR}/${TARGET_LIB} ${OBJS} ${LDLIBS} 36 | mkdir -p ${DESTDIR}/include/omnireduce 37 | cp ${SOURCEDIR}/${TARGET_LIB} ${DESTDIR} 38 | cp ${SOURCEDIR}/*.hpp ${DESTDIR}/include/omnireduce 39 | 40 | ${SOURCEDIR}/%.o:${SOURCEDIR}/%.cpp 41 | ${CC} -c ${CXXFLAGS} $< -o ${SOURCEDIR}/$*.o ${INCLUDE} 42 | 43 | ${SOURCEDIR}/%.o:${SOURCEDIR}/%.cu 44 | ${CC} -c ${CXXFLAGS} $< -o ${SOURCEDIR}/$*.o ${INCLUDE} 45 | 46 | .PHONY: clean 47 | 48 | clean: 49 | rm ${SOURCEDIR}/*.so ${SOURCEDIR}/*.o -rf 50 | rm -rf ${DESTDIR} 51 | -------------------------------------------------------------------------------- /omnireduce-RDMA/README.md: -------------------------------------------------------------------------------- 1 | # OmniReduce-RDMA 2 | 3 | ## Getting Started 4 | The simplest way to start is to use our [docker image](https://github.com/sands-lab/omnireduce/tree/master/omnireduce-RDMA/docker). We provide a [tutorial](https://github.com/sands-lab/omnireduce/blob/master/omnireduce-RDMA/docs/tutorial.md) to help you run RDMA-based OmniReduce with docker image quickly. 5 | Below, we introduce how to build and use OmniReduce. 6 | 7 | ### Building 8 | OmniReduce is built to run on Linux and the dependencies include CUDA, ibverbs and Boost C++ library. 
9 | To build OmniReduce, run: 10 | 11 | git clone https://github.com/sands-lab/omnireduce 12 | cd omnireduce-RDMA 13 | make USE_CUDA=ON 14 | 15 | ### Examples 16 | Basic examples are provided under the [example](https://github.com/sands-lab/omnireduce/tree/master/omnireduce-RDMA/example) folder. 17 | To reproduce the evaluation in our SIGCOMM'21 paper, find the code at this [repo](https://github.com/sands-lab/omnireduce-experiments). 18 | 19 | ## Frameworks Integration 20 | OmniReduce is only integrated with PyTorch currently. The integration method is under the [frameworks_integration](https://github.com/sands-lab/omnireduce/tree/master/omnireduce-RDMA/frameworks_integration/pytorch_patch) folder. 21 | 22 | ## Limitations 23 | 24 | - Only support AllReduce operation 25 | - Only support int32 and float data type 26 | -------------------------------------------------------------------------------- /omnireduce-RDMA/docker/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM nvidia/cuda:10.1-devel-ubuntu18.04 2 | 3 | ARG https_proxy 4 | ARG http_proxy 5 | 6 | ARG OMNIREDUCE_BASE_PATH=/usr/local 7 | ARG OMNIREDUCE_PATH=$OMNIREDUCE_BASE_PATH/omnireduce 8 | ARG OMNIREDUCE_GIT_LINK=https://github.com/Phlix1/omnireduce.git 9 | ARG OMNIREDUCE_BRANCH=master 10 | 11 | ARG DEBIAN_FRONTEND=noninteractive 12 | RUN apt-get update 13 | RUN apt-get install -y --allow-downgrades --allow-change-held-packages --no-install-recommends \ 14 | build-essential \ 15 | tzdata \ 16 | ca-certificates \ 17 | git \ 18 | curl \ 19 | wget \ 20 | vim \ 21 | cmake \ 22 | lsb-release \ 23 | libcudnn7=7.6.0.64-1+cuda10.1 \ 24 | libnuma-dev \ 25 | ibverbs-providers \ 26 | librdmacm-dev \ 27 | ibverbs-utils \ 28 | rdmacm-utils \ 29 | libibverbs-dev \ 30 | python3 \ 31 | python3-dev \ 32 | python3-pip \ 33 | python3-setuptools \ 34 | libnccl2=2.4.7-1+cuda10.1 \ 35 | libnccl-dev=2.4.7-1+cuda10.1 \ 36 | iputils-ping \ 37 | net-tools \ 38 | perftest 39 | 40 | 
RUN apt-get install -y --no-install-recommends openssh-client openssh-server && \ 41 | mkdir -p /var/run/sshd 42 | 43 | RUN apt-get install -y --no-install-recommends libboost-all-dev=1.65.1.0ubuntu1 44 | 45 | RUN cd /usr/local && \ 46 | wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O ./miniconda.sh && \ 47 | bash miniconda.sh -b -p /usr/local/conda && \ 48 | rm miniconda.sh 49 | ENV PATH $PATH:/usr/local/conda/bin 50 | 51 | RUN conda install -y -c conda-forge -c defaults -c pytorch magma-cuda101 mkl mkl-include ninja numpy=1.20.1 pyyaml scipy setuptools six=1.15.0 cffi typing_extensions future requests dataclasses 52 | 53 | RUN cd $OMNIREDUCE_BASE_PATH && git clone $OMNIREDUCE_GIT_LINK && cd $OMNIREDUCE_PATH && make USE_CUDA=ON && cp $OMNIREDUCE_PATH/build/libomnireduce.so /usr/lib/x86_64-linux-gnu/ && \ 54 | cp -r $OMNIREDUCE_PATH/build/include/omnireduce /usr/include/ && cd $OMNIREDUCE_PATH/example && CUDA_HOME=/usr/local/cuda/ make USE_CUDA=ON 55 | 56 | RUN cd $OMNIREDUCE_BASE_PATH && git clone --recursive https://github.com/pytorch/pytorch && cd $OMNIREDUCE_BASE_PATH/pytorch && git checkout 57bffc3a8e4fee0cce31e1ff1f662ccf7b16db57 && \ 57 | git submodule sync && git submodule update --init --recursive && git apply $OMNIREDUCE_PATH/frameworks_integration/pytorch_patch/omnireduce-pytorch.patch && \ 58 | USE_SYSTEM_NCCL=0 /usr/local/conda/bin/python setup.py install 59 | 60 | ##experiments 61 | 62 | ARG EXPS_BASE_PATH=/home 63 | ARG EXPS_PATH=$EXPS_BASE_PATH/exps 64 | ARG EXPS_GIT_LINK=https://github.com/Phlix1/exps.git 65 | 66 | RUN cd /usr/bin && rm python 67 | 68 | RUN cd $EXPS_BASE_PATH && git clone $EXPS_GIT_LINK 69 | 70 | #For benchmark 71 | RUN cd $EXPS_PATH/benchmark && ln -s $OMNIREDUCE_PATH/example/omnireduce.cfg omnireduce.cfg 72 | 73 | #For DeepLight 74 | RUN pip install -U scikit-learn 75 | RUN cd $EXPS_PATH/models/DeepLight && ln -s $OMNIREDUCE_PATH/example/omnireduce.cfg omnireduce.cfg 76 | 77 | #For LSTM 78 | RUN 
conda install -y -c conda-forge -c defaults -c pytorch cython 79 | RUN cd $EXPS_PATH/models/LSTM/lm/log_uniform && make && python setup.py install 80 | RUN cd $EXPS_PATH/models/LSTM && ln -s $OMNIREDUCE_PATH/example/omnireduce.cfg omnireduce.cfg 81 | 82 | #For NCF 83 | RUN conda install -y -c conda-forge numpy-indexed 84 | RUN pip install mlperf_compliance 85 | RUN cd $EXPS_PATH/models/NCF && ln -s $OMNIREDUCE_PATH/example/omnireduce.cfg omnireduce.cfg 86 | 87 | #For CNN 88 | RUN pip install Pillow 89 | RUN pip install torchvision===0.8.0 --no-dependencies 90 | RUN pip install --extra-index-url https://developer.download.nvidia.com/compute/redist --upgrade nvidia-dali-cuda100 91 | RUN cd /usr/local && git clone https://github.com/NVIDIA/apex && cd apex && git reset --hard a651e2c24ecf97cbf367fd3f330df36760e1c597 && \ 92 | pip install -v --disable-pip-version-check --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ./ 93 | RUN cd $EXPS_PATH/models/CNN && ln -s $OMNIREDUCE_PATH/example/omnireduce.cfg omnireduce.cfg 94 | 95 | #For BERT 96 | RUN pip install nvidia-pyindex 97 | RUN pip install nvidia-dllogger 98 | RUN conda install -y unzip 99 | RUN cd $EXPS_PATH/models/BERT/dataset/checkpoint && \ 100 | wget --content-disposition https://api.ngc.nvidia.com/v2/models/nvidia/bert_pyt_ckpt_large_qa_squad11_amp/versions/19.09.0/zip -O bert_pyt_ckpt_large_qa_squad11_amp_19.09.0.zip && \ 101 | unzip bert_pyt_ckpt_large_qa_squad11_amp_19.09.0.zip 102 | RUN cd $EXPS_PATH/models/BERT && ln -s $OMNIREDUCE_PATH/example/omnireduce.cfg omnireduce.cfg && mkdir results 103 | 104 | RUN pip install jupyter 105 | RUN pip install matplotlib 106 | RUN sed -i 's/#PermitRootLogin prohibit-password/PermitRootLogin yes/' /etc/ssh/sshd_config 107 | RUN sed -i 's/# StrictHostKeyChecking ask/ StrictHostKeyChecking no/' /etc/ssh/ssh_config 108 | ENTRYPOINT /usr/sbin/sshd -p 2222 && /bin/bash 109 | 
-------------------------------------------------------------------------------- /omnireduce-RDMA/docker/README.md: -------------------------------------------------------------------------------- 1 | # Docker usage 2 | The docker image includes PyTorch with OmniReduce and some experiments in [this repo](https://github.com/sands-lab/omnireduce-experiments). 3 | To build the docker image, run: 4 | 5 | docker build -t omnireduce/pytorch:exps . -f Dockerfile -------------------------------------------------------------------------------- /omnireduce-RDMA/docs/tutorial.md: -------------------------------------------------------------------------------- 1 | # Tutorial 2 | In this tutorial, we will introduce how to use OmniReduce. We use the docker image to ensure that you don't encounter problems with the system environment. We take the benchmark in [this repo](https://github.com/sands-lab/omnireduce-experiments) as an example to introduce how to use OmniReduce step by step. 3 | 4 | ## Build Image 5 | Build the docker image according to [this](https://github.com/sands-lab/omnireduce/tree/master/omnireduce-RDMA/docker). 6 | 7 | ## Distributed Training (RDMA) 8 | Let's say you have two workers and two aggregators. Each worker has one GPU. Assume that the network interface to use is `eth0` and the IP addresses are as follows: 9 | 10 | | Machine | IP address | 11 | |--|--| 12 | | worker-0 | 10.0.0.10 | 13 | | worker-1 | 10.0.0.11 | 14 | | aggregator-0 | 10.0.0.20 | 15 | | aggregator-1 | 10.0.0.21 | 16 | 17 | ### Create configuration file 18 | Firstly, you need to create the `omnireduce.cfg` according to [this](https://github.com/sands-lab/omnireduce/tree/master/omnireduce-RDMA/example#1-configuration-file). 
The following parameters need to be updated: 19 | 20 | | Parameter | Value | 21 | |--|--| 22 | | `num_worker` | 2 | 23 | | `num_aggregator` | 2 | 24 | | `worker_ips` | 10.0.0.10,10.0.0.11 | 25 | | `aggregator_ips` | 10.0.0.20,10.0.0.21 | 26 | 27 | If your worker supports GPUDirect, set `direct_memory` to be 1. With regard to RDMA configuration, you need to update the related parameters according to your system information. You can use the MLNX OFED's [show_gids](https://enterprise-support.nvidia.com/s/article/understanding-show-gids-script) script to get the device (`ib_hca`), port (`ib_port`) and GID index (`gid_idx`). 28 | 29 | ### Run benchmark 30 | 31 | For aggregator-0 and aggregator-1: 32 | 33 | docker run -it --net=host --cap-add=IPC_LOCK --device=/dev/infiniband/uverbs1 omnireduce/pytorch:exps /bin/bash 34 | # now you are in docker environment 35 | # step 1: update /usr/local/omnireduce/example/omnireduce.cfg 36 | # step 2: start aggregator 37 | cd /usr/local/omnireduce/example 38 | ./aggregator 39 | 40 | For worker-0 41 | 42 | docker run -it --gpus all --net=host --cap-add=IPC_LOCK --device=/dev/infiniband/uverbs1 omnireduce/pytorch:exps /bin/bash 43 | # now you are in docker environment 44 | # step 1: update /usr/local/omnireduce/example/omnireduce.cfg 45 | # step 2: start worker 0 46 | cd /home/exps/benchmark 47 | CUDA_VISIBLE_DEVICES=0 GLOO_SOCKET_IFNAME=eth0 python benchmark.py -d 1.0 --backend gloo -t 26214400 -r 0 -s 2 --ip 10.0.0.10 48 | 49 | For worker-1 50 | 51 | docker run -it --gpus all --net=host --cap-add=IPC_LOCK --device=/dev/infiniband/uverbs1 omnireduce/pytorch:exps /bin/bash 52 | # now you are in docker environment 53 | # step 1: update /usr/local/omnireduce/example/omnireduce.cfg 54 | # step 2: start worker 1 55 | cd /home/exps/benchmark 56 | CUDA_VISIBLE_DEVICES=0 GLOO_SOCKET_IFNAME=eth0 python benchmark.py -d 1.0 --backend gloo -t 26214400 -r 1 -s 2 --ip 10.0.0.10 57 | 58 | ### Run end-to-end 59 | To run the end-to-end experiments, please refer 
to [this](https://github.com/sands-lab/omnireduce-experiments/tree/master/models). Here we take LSTM training as an example. 60 | 61 | #### LSTM training 62 | 63 | For aggregator-0 and aggregator-1: 64 | 65 | docker run -it --net=host --cap-add=IPC_LOCK --device=/dev/infiniband/uverbs1 omnireduce/pytorch:exps /bin/bash 66 | # now you are in docker environment 67 | # step 1: update /usr/local/omnireduce/example/omnireduce.cfg 68 | # step 2: start aggregator 69 | cd /usr/local/omnireduce/example 70 | ./aggregator 71 | 72 | For worker-0 73 | 74 | docker run -it --gpus all --net=host --cap-add=IPC_LOCK --device=/dev/infiniband/uverbs1 omnireduce/pytorch:exps /bin/bash 75 | # now you are in docker environment 76 | # step 1: update /usr/local/omnireduce/example/omnireduce.cfg 77 | # step 2: start worker 0 78 | cd /home/exps/models/LSTM 79 | CUDA_VISIBLE_DEVICES=0 GLOO_SOCKET_IFNAME=eth0 OMPI_COMM_WORLD_SIZE=2 OMPI_COMM_WORLD_RANK=0 OMPI_COMM_WORLD_LOCAL_RANK=0 ./run.sh --init tcp://10.0.0.10:4000 --backend gloo 80 | 81 | For worker-1 82 | 83 | docker run -it --gpus all --net=host --cap-add=IPC_LOCK --device=/dev/infiniband/uverbs1 omnireduce/pytorch:exps /bin/bash 84 | # now you are in docker environment 85 | # step 1: update /usr/local/omnireduce/example/omnireduce.cfg 86 | # step 2: start worker 1 87 | cd /home/exps/models/LSTM 88 | CUDA_VISIBLE_DEVICES=0 GLOO_SOCKET_IFNAME=eth0 OMPI_COMM_WORLD_SIZE=2 OMPI_COMM_WORLD_RANK=1 OMPI_COMM_WORLD_LOCAL_RANK=0 ./run.sh --init tcp://10.0.0.10:4000 --backend gloo 89 | -------------------------------------------------------------------------------- /omnireduce-RDMA/example/Makefile: -------------------------------------------------------------------------------- 1 | MPICC := mpicxx 2 | CC := g++ 3 | INCLUDE :=-I../build/include 4 | LIBPATH :=-L../build 5 | LDLIBS := -libverbs -lboost_system -lboost_thread -lboost_chrono -lboost_program_options -lomnireduce 6 | CXXFLAGS := -Wall -Wextra -fPIC -O3 -std=c++11 7 | 8 | APPS := worker 
aggregator 9 | ifeq ($(USE_CUDA),ON) 10 | INCLUDE += -I${CUDA_HOME}/include 11 | LIBPATH += -L${CUDA_HOME}/lib -L${CUDA_HOME}/lib64 12 | APPS += cuda_worker 13 | CXXFLAGS += -DUSE_CUDA 14 | LDLIBS += -lcudart 15 | endif 16 | 17 | all: ${APPS} 18 | 19 | worker: worker_test.cpp 20 | ${MPICC} ${INCLUDE} ${LIBPATH} ${CXXFLAGS} -o $@ $^ ${LDLIBS} 21 | 22 | cuda_worker: cuda_worker_test.cpp 23 | ${MPICC} ${INCLUDE} ${LIBPATH} ${CXXFLAGS} -o $@ $^ ${LDLIBS} 24 | 25 | aggregator: aggregator_test.cpp 26 | ${CC} ${INCLUDE} ${LIBPATH} ${CXXFLAGS} -o $@ $^ ${LDLIBS} 27 | 28 | clean: 29 | rm -f *.o ${APPS} cuda_worker 30 | -------------------------------------------------------------------------------- /omnireduce-RDMA/example/README.md: -------------------------------------------------------------------------------- 1 | # OmniReduce Examples 2 | ## Building 3 | MPI compiler (MPICH/OpenMPI) is required to build the example. 4 | To build example, run: 5 | ``` shell 6 | make USE_CUDA=ON 7 | ``` 8 | After building, the output programs include `worker`, `cuda_worker` and `aggregator`. 9 | ## Run example 10 | ### 1. Configuration file 11 | Before running the example, the [omnireduce.cfg](https://github.com/sands-lab/omnireduce/blob/master/omnireduce-RDMA/example/omnireduce.cfg) requires to be edited according to the cluster. This file needs to be copied to all the workers and aggregators. 12 | Below, we introduce the parameters in the configuration file. 13 | - **RDMA configuration** 14 | - **`ib_hca`** specify which RDMA interfaces to use for communication. Example:mlx5_1. 15 | - **`ib_port`**: specify the port number of the RDMA interface. 16 | - **`gid_idx`**: specify GID index. 17 | - **`sl`**: set the service level. 18 | - **`num_threads`**: number of threads used for communication for both workers and aggregators. 19 | - **`worker_cores`** and **`aggregator_cores`**: set CPU affinity for threads. The number of values should be equal to the `num_threads` parameter. 
value -1 means no CPU affinity setting and values $\geq$ 0 mean the core ids for different threads. 20 | - **Worker configuration** 21 | - **`num_workers`**: number of workers. 22 | - **`threshold`**: threshold for calculating block bitmap. 23 | - **`direct_memory`**: enable GPUDirect. Value 1 means using GDR. 24 | - **`buffer_size`**: send/recv buffer size (only used when `direct_memory`=1). 25 | - **`message_size`**: RDMA message size. 26 | - **`block_size`**: block size used in OmniReduce algorithm. 27 | - **`gpu_devId`**: index of the used GPU. 28 | - **`worker_ips`**: IP addresses of workers, used for negotiation. 29 | - **Aggregator configuration** 30 | - **`num_aggregators`**: number of aggregators. 31 | - **`aggregator_ips`**: IP addresses of aggregators, used for negotiation. 32 | 33 | `bitmap_chunk_size` and `adaptive_blocksize` are not used in the current version. 34 | 35 | ### 2. Run aggregators 36 | Copy the `aggregator` program to each aggregator and run: 37 | 38 | ./aggregator 39 | 40 | ### 3. Run workers 41 | Run the `worker` (or `cuda_worker`) program with `mpirun` from one worker. Here is an example with MPICH. 
42 | 43 | mpirun -n num_workers -hosts IP_1,...,IP_n ./cuda_worker 44 | -------------------------------------------------------------------------------- /omnireduce-RDMA/example/aggregator_test.cpp: -------------------------------------------------------------------------------- 1 | #include "omnireduce/aggcontext.hpp" 2 | int main() { 3 | omnireduce::AggContext& omniContext = omnireduce::AggContext::getInstance(); 4 | return 0; 5 | } 6 | -------------------------------------------------------------------------------- /omnireduce-RDMA/example/cuda_worker_test.cpp: -------------------------------------------------------------------------------- 1 | #include "omnireduce/context.hpp" 2 | #include 3 | #include 4 | #include "mpi.h" 5 | #include 6 | #define DATA_TYPE float 7 | //#define DATA_TYPE int 8 | 9 | int main(int argc, char *argv[]) { 10 | int devID=0; 11 | cudaSetDevice(devID); 12 | cudaDeviceProp deviceProps; 13 | cudaGetDeviceProperties(&deviceProps, devID); 14 | cudaStream_t stream; 15 | cudaStreamCreate(&stream); 16 | printf("CUDA device [%s]\n", deviceProps.name); 17 | MPI_Init(&argc, &argv); 18 | int myrank=0, worldsize=1; 19 | MPI_Comm_size(MPI_COMM_WORLD, &worldsize); 20 | MPI_Comm_rank(MPI_COMM_WORLD, &myrank); 21 | omnireduce::OmniContext& omniContext = omnireduce::OmniContext::getInstance(); 22 | srand(omniContext.workerId+1); 23 | uint32_t block_size = omnireduce::omnireduce_par.getBlockSize(); 24 | uint32_t tensor_size = 67108864; 25 | uint32_t block_count = tensor_size/block_size; 26 | if (tensor_size%block_size!=0) 27 | block_count += 1; 28 | DATA_TYPE *input = (DATA_TYPE *)malloc(tensor_size*sizeof(DATA_TYPE)); 29 | DATA_TYPE *d_input; 30 | cudaMalloc((void **)&d_input, tensor_size*sizeof(DATA_TYPE)); 31 | cudaMemset(d_input, 0, tensor_size*sizeof(DATA_TYPE)); 32 | DATA_TYPE *output = (DATA_TYPE *)malloc(tensor_size*sizeof(DATA_TYPE)); 33 | DATA_TYPE *output_dev = (DATA_TYPE *)malloc(tensor_size*sizeof(DATA_TYPE)); 34 | memset(input, 0, 
tensor_size*sizeof(DATA_TYPE)); 35 | uint8_t *bitmap = (uint8_t *)malloc(block_count*sizeof(uint8_t)); 36 | double density_ratio = 0.01; 37 | double rnum = 0; 38 | for(uint32_t i=0; i 3 | #include 4 | #include "mpi.h" 5 | #define DATA_TYPE float 6 | //#define DATA_TYPE int 7 | 8 | int main(int argc, char *argv[]) { 9 | MPI_Init(&argc, &argv); 10 | int myrank=0, worldsize=1; 11 | MPI_Comm_size(MPI_COMM_WORLD, &worldsize); 12 | MPI_Comm_rank(MPI_COMM_WORLD, &myrank); 13 | omnireduce::OmniContext& omniContext = omnireduce::OmniContext::getInstance(); 14 | srand(omniContext.workerId+1); 15 | uint32_t block_size = omnireduce::omnireduce_par.getBlockSize(); 16 | uint32_t tensor_size = 67108864; 17 | uint32_t block_count = tensor_size/block_size; 18 | if (tensor_size%block_size!=0) 19 | block_count += 1; 20 | DATA_TYPE *input = (DATA_TYPE *)malloc(tensor_size*sizeof(DATA_TYPE)); 21 | DATA_TYPE *output = (DATA_TYPE *)malloc(tensor_size*sizeof(DATA_TYPE)); 22 | DATA_TYPE *data = (DATA_TYPE *)malloc(tensor_size*sizeof(DATA_TYPE)); 23 | memset(input, 0, tensor_size*sizeof(int)); 24 | uint8_t *bitmap = (uint8_t *)malloc(block_count*sizeof(uint8_t)); 25 | double density_ratio = 0.01; 26 | double rnum = 0; 27 | for(uint32_t i=0; i=5.3.1` to complie boost from source: 34 | 35 | tar zxvf boost_1_65_1.tar.gz 36 | cd boost_1_65_1 37 | ./bootstrap.sh 38 | ./b2 install --with=all 39 | 40 | ## 4. Install OmniReduce 41 | Build OmniReduce and copy the omnireduce `dynami library` and `header files` in the `build` folder to the system `library` and `include` path. 42 | 43 | cd horovod/third_party/omnireduce/omnireduce-RDMA 44 | make USE_CUDA=ON 45 | cp ./build/libomnireduce.so SYSTEM_LIBRARY_PATH 46 | cp -r ./build/include/omnireduce SYSTEM_INCLUDE_PATH 47 | cd horovod/third_party/omnireduce/omnireduce-RDMA/example 48 | make USE_CUDA=ON 49 | 50 | ## 5. Apply patch to Horovod 51 | 52 | cd horovod 53 | git apply omnireduce-horovod.patch 54 | 55 | ## 6. 
Build Horovod with OmniReduce 56 | Before installing Horovod, we need to install torch and tensorflow. The recommended version environment is: 57 | - **gcc version:** gcc>=5.3.1 58 | - **Python version:** Python <= 3.7 (3.8 is not supported by tensorflow1.15) 59 | - **Torch version:** Pytorch 1.6 for CUDA10.0, pytorch 1.7 for CUDA10.1 or CUDA11.0, pytorch1.8 for CUDA11.0 60 | - **Tensorflow version:** Tensorflow == 1.15 for CUDA10.0 (TensorFlow officially does not support CUDA11.0 for tensorflow1.x). Besides that, we currently do not support tensorflow2.x. 61 | 62 | According to the Horovod official [repository](https://github.com/horovod/horovod/tree/v0.19.4#install), we only replace `HOROVOD_GPU_ALLREDUCE=NCCL` with `HOROVOD_GPU_ALLREDUCE=OMNI` and then compile Horovod from source. We support OmniReduce for both TensorFlow and PyTorch in Horovod. 63 | 64 | cd horovod 65 | CC=`which gcc` CXX=`which g++` HOROVOD_WITHOUT_MXNET=1 HOROVOD_WITH_TENSORFLOW=1 HOROVOD_WITH_PYTORCH=1 HOROVOD_NCCL_LINK=SHARED HOROVOD_GPU_ALLREDUCE=OMNI HOROVOD_GPU_BROADCAST=NCCL python3.6 setup.py install 66 | 67 | ## 7. Running Horovod With OmniReduce 68 | 69 | Before running, we must configure the `omnireduce.cfg` file. Then: 70 | - Launch `./aggregator` in `horovod/third_party/omnireduce/omnireduce-RDMA/example` on all aggregator machines. 71 | - Configure the hostfile and use `mpirun` only on the master worker machine to launch Horovod. 
The entire command as follows: 72 | 73 | mpirun --hostfile ./hostfile -map-by slot --display-map --tag-output --timestamp-output --mca btl_tcp_if_exclude lo,docker0 -x NCCL_NET_GDR_READ=1 -x NCCL_IB_HCA=mlx5_0 -x NCCL_DEBUG=INFO -x NCCL_SOCKET_IFNAME=^lo,docker0 -x NCCL_IB_GID_INDEX=3 -x NCCL_IB_DISABLE=0 -x HOROVOD_MPI_THREADS_DISABLE=1 python3 test_hvd_torch.py (or test_hvd_tensorflow.py) 74 | 75 | The hostfile of 2 machines 2 gpus is configured as followed: 76 | 77 | machine1_ip port=xxx max_slots=1 78 | machine2_ip port=xxx max_slots=1 79 | 80 | Since each node/host uses 1 GPU, CUDA_VISIBLE_DEVICES is must be set one gpu device id. In order to GDR, we need to check the machine's topo by `nvidia-smi topo -m`and select the gpu device id which binds to the network card. 81 | 82 | ## 8. Horovod Timeline With OmniReduce 83 | The OmniReduce implements allreduce op contains synchronize which leads to inaccurate time-consuming at each stage (`MEMCPY_IN_FUSION_BUFFER`, `OMNI_ALLREDUCE` and `MEMCPY_OUT_FUSION_BUFFER`) in omniAllreduce's Execute function. For solving this problem, we support the feature in Horovod timeline which can accurate statistics time-consuming for synchronize's op. 84 | 85 | For example, the follow picture shows the `OMNI_ALLREDUCE` time-consuming. The red box marks the original horovod timeline time-consuming and the green box marks the time-consuming after correction. 
86 | 87 | ![image](https://user-images.githubusercontent.com/25579435/125772127-b00c1518-fe44-4461-bbd3-d92879d8d050.png) 88 | 89 | -------------------------------------------------------------------------------- /omnireduce-RDMA/frameworks_integration/horovod_patch/test_hvd_tensorflow.py: -------------------------------------------------------------------------------- 1 | 2 | from __future__ import absolute_import, division, print_function 3 | 4 | import argparse 5 | import os, sys 6 | import numpy as np 7 | import timeit 8 | 9 | import horovod.tensorflow as hvd 10 | from tensorflow.keras import applications 11 | import tensorflow as tf 12 | 13 | # Benchmark settings 14 | parser = argparse.ArgumentParser(description='TensorFlow Synthetic Benchmark', 15 | formatter_class=argparse.ArgumentDefaultsHelpFormatter) 16 | parser.add_argument('--fp16-pushpull', action='store_true', default=False, 17 | help='use fp16 compression during pushpull') 18 | 19 | parser.add_argument('--model', type=str, default='ResNet50', 20 | help='model to benchmark') 21 | parser.add_argument('--batch-size', type=int, default=32, 22 | help='input batch size') 23 | 24 | parser.add_argument('--num-warmup-batches', type=int, default=10, 25 | help='number of warm-up batches that don\'t count towards benchmark') 26 | parser.add_argument('--num-batches-per-iter', type=int, default=10, 27 | help='number of batches per benchmark iteration') 28 | parser.add_argument('--num-iters', type=int, default=10, 29 | help='number of benchmark iterations') 30 | 31 | parser.add_argument('--eager', action='store_true', default=False, 32 | help='enables eager execution') 33 | parser.add_argument('--no-cuda', action='store_true', default=False, 34 | help='disables CUDA training') 35 | 36 | args = parser.parse_args() 37 | args.cuda = not args.no_cuda 38 | 39 | hvd.init() 40 | 41 | # Horovod: pin GPU to be used to process local rank (one GPU per process) 42 | config = tf.ConfigProto() 43 | if args.cuda: 44 | 
config.gpu_options.allow_growth = True 45 | config.gpu_options.visible_device_list = str(hvd.local_rank()) 46 | else: 47 | os.environ["CUDA_VISIBLE_DEVICES"] = "-1" 48 | config.gpu_options.allow_growth = False 49 | config.gpu_options.visible_device_list = '' 50 | 51 | if args.eager: 52 | tf.enable_eager_execution(config) 53 | 54 | # Set up standard model. 55 | # Check https://github.com/keras-team/keras-applications for all supported models, e.g., ResNet50, VGG16 56 | model = getattr(applications, args.model)(weights=None) 57 | 58 | opt = tf.train.GradientDescentOptimizer(0.01) 59 | 60 | # Horovod: (optional) compression algorithm. 61 | compression = hvd.Compression.fp16 if args.fp16_pushpull else hvd.Compression.none 62 | 63 | # Horovod: wrap optimizer with DistributedOptimizer. 64 | opt = hvd.DistributedOptimizer(opt, compression=compression) 65 | 66 | init = tf.global_variables_initializer() 67 | bcast_op = hvd.broadcast_global_variables(0) 68 | 69 | data = tf.random_uniform([args.batch_size, 224, 224, 3]) 70 | target = tf.random_uniform([args.batch_size, 1], minval=0, maxval=999, dtype=tf.int64) 71 | 72 | 73 | def loss_function(): 74 | logits = model(data, training=True) 75 | return tf.losses.sparse_softmax_cross_entropy(target, logits) 76 | 77 | 78 | def log(s, nl=True): 79 | if hvd.rank() != 0: 80 | return 81 | print(s, end='\n' if nl else '') 82 | sys.stdout.flush() 83 | 84 | log('Model: %s' % args.model) 85 | log('Batch size: %d' % args.batch_size) 86 | device = 'GPU' if args.cuda else 'CPU' 87 | log('Number of %ss: %d' % (device, hvd.size())) 88 | 89 | 90 | def run(benchmark_step): 91 | # Warm-up 92 | log('Running warmup...') 93 | timeit.timeit(benchmark_step, number=args.num_warmup_batches) 94 | 95 | # Benchmark 96 | log('Running benchmark...') 97 | img_secs = [] 98 | for x in range(args.num_iters): 99 | time = timeit.timeit(benchmark_step, number=args.num_batches_per_iter) 100 | img_sec = args.batch_size * args.num_batches_per_iter / time 101 | log('Iter 
#%d: %.1f img/sec per %s' % (x, img_sec, device)) 102 | img_secs.append(img_sec) 103 | 104 | # Results 105 | img_sec_mean = np.mean(img_secs) 106 | img_sec_conf = 1.96 * np.std(img_secs) 107 | log('Img/sec per %s: %.1f +-%.1f' % (device, img_sec_mean, img_sec_conf)) 108 | log('Total img/sec on %d %s(s): %.1f +-%.1f' % 109 | (hvd.size(), device, hvd.size() * img_sec_mean, hvd.size() * img_sec_conf)) 110 | 111 | 112 | if tf.executing_eagerly(): 113 | with tf.device(device): 114 | run(lambda: opt.minimize(loss_function, var_list=model.trainable_variables)) 115 | else: 116 | with tf.Session(config=config) as session: 117 | init.run() 118 | bcast_op.run() 119 | 120 | loss = loss_function() 121 | train_opt = opt.minimize(loss) 122 | run(lambda: session.run(train_opt)) 123 | -------------------------------------------------------------------------------- /omnireduce-RDMA/frameworks_integration/horovod_patch/test_hvd_torch.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import argparse 4 | import torch.backends.cudnn as cudnn 5 | import torch.nn.functional as F 6 | import torch.optim as optim 7 | import torch.utils.data.distributed 8 | from torchvision import models 9 | import horovod.torch as hvd 10 | import timeit 11 | import numpy as np 12 | import os, sys 13 | 14 | # Benchmark settings 15 | parser = argparse.ArgumentParser(description='PyTorch Synthetic Benchmark', 16 | formatter_class=argparse.ArgumentDefaultsHelpFormatter) 17 | parser.add_argument('--fp16-pushpull', action='store_true', default=False, 18 | help='use fp16 compression during byteps pushpull') 19 | 20 | parser.add_argument('--model', type=str, default='resnet50', 21 | help='model to benchmark') 22 | parser.add_argument('--batch-size', type=int, default=64, 23 | help='input batch size') 24 | 25 | parser.add_argument('--num-warmup-batches', type=int, default=1, 26 | help='number of warm-up batches that don\'t count 
towards benchmark') 27 | parser.add_argument('--num-batches-per-iter', type=int, default=10, 28 | help='number of batches per benchmark iteration') 29 | parser.add_argument('--num-iters', type=int, default=10, 30 | help='number of benchmark iterations') 31 | parser.add_argument('--num-classes', type=int, default=1000, 32 | help='number of classes') 33 | 34 | parser.add_argument('--no-cuda', action='store_true', default=False, 35 | help='disables CUDA training') 36 | parser.add_argument('--profiler', action='store_true', default=False, 37 | help='disables profiler') 38 | parser.add_argument('--partition', type=int, default=None, 39 | help='partition size') 40 | 41 | 42 | args = parser.parse_args() 43 | args.cuda = not args.no_cuda and torch.cuda.is_available() 44 | 45 | hvd.init() 46 | 47 | if args.cuda: 48 | # Horovod: pin GPU to local rank. 49 | cuda_device = torch.device('cuda', hvd.local_rank() % hvd.size()) 50 | torch.cuda.set_device(cuda_device) 51 | 52 | cudnn.benchmark = True 53 | 54 | # Set up standard model. 55 | model = getattr(models, args.model)(num_classes=args.num_classes) 56 | 57 | if args.cuda: 58 | # Move model to GPU. 59 | model.cuda() 60 | 61 | optimizer = optim.SGD(model.parameters(), lr=0.01) 62 | 63 | # Horovod: (optional) compression algorithm. 64 | compression = hvd.Compression.fp16 if args.fp16_pushpull else hvd.Compression.none 65 | 66 | # Horovod: wrap optimizer with DistributedOptimizer. 67 | optimizer = hvd.DistributedOptimizer(optimizer, 68 | named_parameters=model.named_parameters()) 69 | 70 | # Horovod: broadcast parameters & optimizer state. 
71 | hvd.broadcast_parameters(model.state_dict(), root_rank=0) 72 | hvd.broadcast_optimizer_state(optimizer, root_rank=0) 73 | 74 | data_index = 0 75 | 76 | def benchmark_step(): 77 | global data_index 78 | global cuda_device 79 | 80 | #data = datasets[data_index%len(datasets)] 81 | data = torch.rand(args.batch_size, 3, 224, 224, device=cuda_device) 82 | target = torch.randint(0, 1000, (args.batch_size,), device=cuda_device) 83 | data_index += 1 84 | optimizer.zero_grad() 85 | output = model(data) 86 | loss = F.cross_entropy(output, target) 87 | loss.backward() 88 | optimizer.step() 89 | 90 | 91 | def log(s, nl=True): 92 | if hvd.rank() != 0: 93 | return 94 | print(s, end='\n' if nl else '') 95 | sys.stdout.flush() 96 | 97 | 98 | log('Model: %s' % args.model) 99 | log('Batch size: %d' % args.batch_size) 100 | device = 'GPU' if args.cuda else 'CPU' 101 | log('Number of %ss: %d' % (device, hvd.size())) 102 | 103 | # Warm-up 104 | log('Running warmup...') 105 | timeit.timeit(benchmark_step, number=args.num_warmup_batches) 106 | 107 | # Benchmark 108 | log('Running benchmark...') 109 | img_secs = [] 110 | 111 | for x in range(args.num_iters): 112 | time = timeit.timeit(benchmark_step, number=args.num_batches_per_iter) 113 | img_sec = args.batch_size * args.num_batches_per_iter / time 114 | log('Iter #%d: %.1f img/sec per %s' % (x, img_sec, device)) 115 | img_secs.append(img_sec) 116 | 117 | 118 | # Results 119 | img_sec_mean = np.mean(img_secs) 120 | img_sec_conf = 1.96 * np.std(img_secs) 121 | log('Img/sec per %s: %.1f +-%.1f' % (device, img_sec_mean, img_sec_conf)) 122 | log('Total img/sec on %d %s(s): %.1f +-%.1f' % 123 | (hvd.size(), device, hvd.size() * img_sec_mean, hvd.size() * img_sec_conf)) 124 | 125 | -------------------------------------------------------------------------------- /omnireduce-RDMA/frameworks_integration/pytorch_patch/README.md: -------------------------------------------------------------------------------- 1 | # Frameworks Integration 2 | By 
changing a few lines of code in PyTorch we are able to delegate allreduce SUM operations to OmniReduce. 3 | 4 | We take advantage of PyTorch's Gloo backend and customize it so that it uses OmniReduce instead of Gloo for operations and data types that OmniReduce supports. 5 | If a job is not supported by OmniReduce then PyTorch automatically falls back to using Gloo. 6 | 7 | For OmniReduce to take over from Gloo the following conditions must be met: 8 | - The allreduce operation must be a summation 9 | - The data type must be float or int32 10 | - Each node/host produces 1 tensor or, in other words, each node/host uses 1 GPU. 11 | 12 | ## 1. Install OmniReduce 13 | Build OmniReduce and copy the omnireduce `dynamic library` and `header files` in the `build` folder to the system `library` and `include` paths. 14 | 15 | cp ./build/libomnireduce.so SYSTEM_LIBRARY_PATH 16 | cp -r ./build/include/omnireduce SYSTEM_INCLUDE_PATH 17 | 18 | ## 2. Download PyTorch 19 | The PyTorch patch applies to a specific commit, which we must check out. 20 | 21 | 22 | git clone https://github.com/pytorch/pytorch.git 23 | cd pytorch 24 | git checkout 57bffc3 # The 1.7.1 version 25 | git submodule sync 26 | git submodule update --init --recursive 27 | 28 | This will also take a good while to clone and checkout all submodules. 29 | ## 3. Apply patch to PyTorch 30 | 31 | cd pytorch 32 | git apply omnireduce-pytorch.patch 33 | 34 | ## 4. Build PyTorch 35 | Install the Boost C++ library with the command below: 36 | 37 | apt-get install -y libboost-all-dev=1.65.1.0ubuntu1 38 | 39 | Install PyTorch dependencies and build PyTorch according to the official [repository](https://github.com/pytorch/pytorch#installation).
-------------------------------------------------------------------------------- /omnireduce-RDMA/frameworks_integration/pytorch_patch/omnireduce-pytorch.patch: -------------------------------------------------------------------------------- 1 | From be02e214e79f0a4326404c6de30c0fcfda252a8d Mon Sep 17 00:00:00 2001 2 | From: Phlix1 <819108840@qq.com> 3 | Date: Fri, 19 Mar 2021 18:18:15 +0000 4 | Subject: [PATCH] Add OmniReduce Support 5 | 6 | --- 7 | torch/csrc/distributed/c10d/init.cpp | 2 +- 8 | torch/lib/c10d/CMakeLists.txt | 5 ++ 9 | torch/lib/c10d/ProcessGroupGloo.cpp | 84 ++++++++++++++++++++-------- 10 | torch/lib/c10d/ProcessGroupGloo.hpp | 2 + 11 | 4 files changed, 70 insertions(+), 23 deletions(-) 12 | 13 | diff --git a/torch/csrc/distributed/c10d/init.cpp b/torch/csrc/distributed/c10d/init.cpp 14 | index 715403ac57..147fd305d0 100644 15 | --- a/torch/csrc/distributed/c10d/init.cpp 16 | +++ b/torch/csrc/distributed/c10d/init.cpp 17 | @@ -913,7 +913,7 @@ Arguments: 18 | } 19 | 20 | options.timeout = timeout; 21 | - options.threads = options.devices.size() * 2; 22 | + options.threads = 1;//options.devices.size() * 2; 23 | return std::make_shared<::c10d::ProcessGroupGloo>( 24 | store, rank, size, options); 25 | }), 26 | diff --git a/torch/lib/c10d/CMakeLists.txt b/torch/lib/c10d/CMakeLists.txt 27 | index 4b206f3801..b6db9afa9f 100644 28 | --- a/torch/lib/c10d/CMakeLists.txt 29 | +++ b/torch/lib/c10d/CMakeLists.txt 30 | @@ -76,6 +76,7 @@ if(USE_C10D_GLOO) 31 | endif() 32 | 33 | add_library(c10d STATIC ${C10D_SRCS}) 34 | +target_link_libraries(c10d PUBLIC boost_system boost_thread boost_chrono boost_program_options omnireduce) 35 | set_property(TARGET c10d PROPERTY POSITION_INDEPENDENT_CODE ON) 36 | set_property(TARGET c10d PROPERTY CXX_STANDARD 14) 37 | 38 | diff --git a/torch/lib/c10d/ProcessGroupGloo.cpp b/torch/lib/c10d/ProcessGroupGloo.cpp 39 | index c139ac7a34..d16a5cdab9 100644 40 | --- a/torch/lib/c10d/ProcessGroupGloo.cpp 41 | +++ 
b/torch/lib/c10d/ProcessGroupGloo.cpp 42 | @@ -1185,16 +1185,29 @@ class AsyncAllreduceCUDAWork : public AsyncAllreduceWork { 43 | const std::shared_ptr& context, 44 | std::vector& inputs, 45 | ReduceOp reduceOp, 46 | - uint32_t tag) 47 | - : AsyncAllreduceWork(context, inputs, reduceOp, tag) { 48 | - initializeStreamsEvents(inputs, streams, events); 49 | + uint32_t tag, 50 | + omnireduce::OmniContext& omniContext) 51 | + : AsyncAllreduceWork(context, inputs, reduceOp, tag), omniContext(omniContext) { 52 | + const auto& scalarType = inputs[0].scalar_type(); 53 | + if (reduceOp == ReduceOp::SUM && (scalarType == ::at::ScalarType::Float)) { 54 | + initializeStreamsEvents(inputs, streams, events); 55 | + at::cuda::OptionalCUDAStreamGuard guard; 56 | + for (size_t i = 0; i < inputs.size(); i++) { 57 | + guard.reset_stream(streams[i]); 58 | + } 59 | + use_omnireduce=true; 60 | + } 61 | + else { 62 | + initializeStreamsEvents(inputs, streams, events); 63 | 64 | - // Kick off copy from CUDA tensors to pinned CPU tensors. 65 | - tmp.reserve(inputs.size()); 66 | - at::cuda::OptionalCUDAStreamGuard guard; 67 | - for (size_t i = 0; i < inputs.size(); i++) { 68 | - guard.reset_stream(streams[i]); 69 | - tmp.push_back(pinnedLike(inputs[i]).copy_(inputs[i], true)); 70 | + // Kick off copy from CUDA tensors to pinned CPU tensors. 71 | + tmp.reserve(inputs.size()); 72 | + at::cuda::OptionalCUDAStreamGuard guard; 73 | + for (size_t i = 0; i < inputs.size(); i++) { 74 | + guard.reset_stream(streams[i]); 75 | + tmp.push_back(pinnedLike(inputs[i]).copy_(inputs[i], true)); 76 | + } 77 | + use_omnireduce=false; 78 | } 79 | } 80 | 81 | @@ -1207,13 +1220,33 @@ class AsyncAllreduceCUDAWork : public AsyncAllreduceWork { 82 | } 83 | 84 | // Run allreduce on host side tensors. 
85 | - allreduce(tmp); 86 | - 87 | - at::cuda::OptionalCUDAStreamGuard stream_guard; 88 | - for (size_t i = 0; i < inputs.size(); i++) { 89 | - stream_guard.reset_stream(streams[i]); 90 | - inputs[i].copy_(tmp[i], /* non_blocking */ true); 91 | - events[i].record(streams[i]); 92 | + if (use_omnireduce) { 93 | + const auto& scalarType = inputs[0].scalar_type(); 94 | + switch (scalarType) { 95 | + case ::at::ScalarType::Float: 96 | + //omniContext.AllReduce(getDataPointer(inputs[0]), int(inputs[0].numel()), streams[0].stream(), inputs[0].device().index(), true, false); 97 | + omniContext.AllReduce(getDataPointer(inputs[0]), int(inputs[0].numel()), streams[0].stream(), inputs[0].device().index()); 98 | + break; 99 | + case ::at::ScalarType::Int: 100 | + //omniContext.AllReduce(getDataPointer(inputs[0]), int(inputs[0].numel()), streams[0].stream(), inputs[0].device().index(), true, false); 101 | + omniContext.AllReduce(getDataPointer(inputs[0]), int(inputs[0].numel()), streams[0].stream(), inputs[0].device().index()); 102 | + break; 103 | + default: 104 | + std::cerr<<"Data type error"< tmp; 142 | std::vector streams; 143 | std::vector events; 144 | + std::vector events_bitmap; 145 | + std::vector bitmaps; 146 | + std::vector tmp_bitmap; 147 | + std::vector streams_bitmap; 148 | }; 149 | 150 | class AsyncSparseAllreduceCUDAWork : public AsyncSparseAllreduceWork { 151 | @@ -1344,7 +1384,7 @@ std::shared_ptr ProcessGroupGloo::allreduce( 152 | } else if (device.type() == at::kCUDA) { 153 | if (layout == c10::kStrided) { 154 | work = std::make_shared( 155 | - std::move(context), inputs, opts.reduceOp, tag); 156 | + std::move(context), inputs, opts.reduceOp, tag, omniContext); 157 | } else if (layout == c10::kSparse) { 158 | work = std::make_shared( 159 | std::move(context), inputs, tag); 160 | diff --git a/torch/lib/c10d/ProcessGroupGloo.hpp b/torch/lib/c10d/ProcessGroupGloo.hpp 161 | index dfae068de2..ac3c677dbb 100644 162 | --- a/torch/lib/c10d/ProcessGroupGloo.hpp 163 | 
+++ b/torch/lib/c10d/ProcessGroupGloo.hpp 164 | @@ -12,6 +12,7 @@ 165 | #include 166 | #include 167 | #include 168 | +#include 169 | 170 | #include 171 | 172 | @@ -235,6 +236,7 @@ class ProcessGroupGloo : public ProcessGroup { 173 | // In order to use more than one device (or allow for parallelism on 174 | // a single device), you need multiple contexts. 175 | std::vector> contexts_; 176 | + omnireduce::OmniContext& omniContext = omnireduce::OmniContext::getInstance(); 177 | std::vector threads_; 178 | bool stop_; 179 | 180 | -- 181 | 2.17.1 182 | 183 | -------------------------------------------------------------------------------- /omnireduce-RDMA/omnireduce/aggcontext.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "omnireduce/common.hpp" 3 | 4 | namespace omnireduce { 5 | class AggContext { 6 | public: 7 | static AggContext& getInstance() { 8 | static AggContext instance; 9 | return instance; 10 | } 11 | AggContext(AggContext const&) = delete; 12 | void operator=(AggContext const&) = delete; 13 | uint32_t num_server_threads; 14 | int ret; 15 | int serverId; 16 | int tensor_size; 17 | TensorUpdateType typecode; 18 | uint32_t element_size; 19 | int *socks; 20 | void *comm_buf; 21 | struct ibv_context *ib_ctx; 22 | struct ibv_port_attr port_attr; 23 | struct ibv_pd *pd; 24 | struct ibv_cq **cq; 25 | struct ibv_qp **qp; 26 | struct ibv_cq *cq_address; 27 | struct ibv_qp **qp_address; 28 | struct ibv_mr *mr; 29 | uint32_t **srcs_; 30 | struct ibv_mr **mrs_; 31 | uint32_t **current_offset_thread; 32 | struct remote_con_data_t *remote_props_array; 33 | std::atomic_uint_fast32_t threadid; 34 | AggContext(); 35 | ~AggContext(); 36 | void init(); 37 | void StartMaster(); 38 | void StopMaster(); 39 | int post_receive_address(uint32_t); 40 | int post_send_ready(uint32_t); 41 | void wait_master_ready(); 42 | void set_master_ready(); 43 | pthread_t aggmasterThread; 44 | boost::mutex master_ready_mutex; 45 | 
boost::condition_variable master_ready_event; 46 | uint32_t master_ready; 47 | }; 48 | } 49 | -------------------------------------------------------------------------------- /omnireduce-RDMA/omnireduce/aggregator.hpp: -------------------------------------------------------------------------------- 1 | /** 2 | * OmniReduce project 3 | * author: jiawei.fei@kaust.edu.sa 4 | */ 5 | 6 | #pragma once 7 | 8 | namespace omnireduce { 9 | void *aggregator(void*); 10 | void *dr_aggregator(void*); 11 | } -------------------------------------------------------------------------------- /omnireduce-RDMA/omnireduce/common.cpp: -------------------------------------------------------------------------------- 1 | #include "omnireduce/common.hpp" 2 | 3 | namespace omnireduce { 4 | 5 | volatile bool force_quit; 6 | } 7 | -------------------------------------------------------------------------------- /omnireduce-RDMA/omnireduce/common.hpp: -------------------------------------------------------------------------------- 1 | /** 2 | * OmniReduce project 3 | * author: jiawei.fei@kaust.edu.sa 4 | */ 5 | #pragma once 6 | 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include "omnireduce/params.hpp" 17 | 18 | #define likely(x) __builtin_expect(!!(x), 1) 19 | #define unlikely(x) __builtin_expect(!!(x), 0) 20 | 21 | #if __BYTE_ORDER == __LITTLE_ENDIAN 22 | static inline uint64_t htonll(uint64_t x) { return bswap_64(x); } 23 | static inline uint64_t ntohll(uint64_t x) { return bswap_64(x); } 24 | #elif __BYTE_ORDER == __BIG_ENDIAN 25 | static inline uint64_t htonll(uint64_t x) { return x; } 26 | static inline uint64_t ntohll(uint64_t x) { return x; } 27 | #else 28 | #error __BYTE_ORDER is neither __LITTLE_ENDIAN nor __BIG_ENDIAN 29 | #endif 30 | 31 | namespace omnireduce { 32 | 33 | struct remote_con_data_t 34 | { 35 | int remoteId; 36 | uint64_t addr; /* Buffer address */ 37 | uint32_t rkey; /* Remote key */ 38 
| uint32_t qp_num[MAX_NUM_AGGS*MAX_NUM_QPS*MAX_NUM_THREADS+1]; /* QP number */ 39 | uint16_t lid; /* LID of the IB port */ 40 | uint8_t gid[16]; /* gid */ 41 | }; 42 | 43 | struct cm_con_data_t 44 | { 45 | int remoteId; 46 | uint32_t num_peers; 47 | uint64_t addr; /* Buffer address */ 48 | uint32_t rkey; /* Remote key */ 49 | uint32_t qp_num[MAX_NUM_AGGS*MAX_NUM_QPS*MAX_NUM_THREADS+1]; /* QP number */ 50 | uint16_t lid; /* LID of the IB port */ 51 | uint8_t gid[16]; /* gid */ 52 | } __attribute__((packed)); 53 | 54 | enum TensorUpdateType { 55 | NONE = 0, INT32 = 1, FLOAT32 = 2, FLOAT16 = 3 56 | }; 57 | enum OpType { 58 | NOP = 0, ALLREDUCE = 1, BROADCAST = 2, ACK = 3 59 | }; 60 | struct TensorUpdate { 61 | void* ptr; 62 | uint32_t count; 63 | uint32_t start_idx; 64 | int32_t id; 65 | uint32_t root; 66 | TensorUpdateType type; 67 | OpType op; 68 | uint8_t* bitmap_ptr; 69 | uint32_t block_count; 70 | int32_t devId; 71 | bool async; 72 | bool bitmap_async; 73 | }; 74 | extern volatile bool force_quit; 75 | } 76 | -------------------------------------------------------------------------------- /omnireduce-RDMA/omnireduce/context.hpp: -------------------------------------------------------------------------------- 1 | /** 2 | * OmniReduce project 3 | * author: jiawei.fei@kaust.edu.sa 4 | */ 5 | 6 | #pragma once 7 | 8 | #include "omnireduce/common.hpp" 9 | 10 | #ifdef USE_CUDA 11 | #include 12 | #endif 13 | 14 | namespace omnireduce { 15 | void *OmniMaster(void *ctx); 16 | 17 | class OmniContext { 18 | public: 19 | static OmniContext& getInstance() { 20 | static OmniContext instance; 21 | return instance; 22 | } 23 | 24 | OmniContext(OmniContext const&) = delete; 25 | void operator=(OmniContext const&) = delete; 26 | void wait_master_ready(); 27 | void set_master_ready(); 28 | void set_num_worker_threads(uint32_t); 29 | uint32_t get_num_worker_threads(); 30 | void set_block_size(uint32_t); 31 | 32 | void receive_result(const int32_t); 33 | bool send_result(const 
int32_t); 34 | void send_tensor(TensorUpdate*); 35 | bool receive_tensor(TensorUpdate&, uint32_t); 36 | 37 | void init(); 38 | void StartMaster(); 39 | void StopMaster(); 40 | void send_address(int, TensorUpdateType); 41 | 42 | void AllReduce(float*, int, uint8_t*, int); 43 | void AllReduce(int32_t*, int, uint8_t*, int); 44 | #ifdef USE_CUDA 45 | void AllReduce(float*, int, uint8_t*, int, cudaStream_t, int); 46 | void AllReduce(int32_t*, int, uint8_t*, int, cudaStream_t, int); 47 | void AllReduce(float*, int, uint8_t*, int, cudaStream_t, int, bool); 48 | void AllReduce(int32_t*, int, uint8_t*, int, cudaStream_t, int, bool); 49 | void AllReduce_NGDR(float*, int, cudaStream_t, int, bool, bool); 50 | void AllReduce_NGDR(int32_t*, int, cudaStream_t, int, bool, bool); 51 | void AllReduce_GDR(float*, int, cudaStream_t, int); 52 | void AllReduce_GDR(int32_t*, int, cudaStream_t, int); 53 | void AllReduce(float*, int, cudaStream_t, int); 54 | void AllReduce(int32_t*, int, cudaStream_t, int); 55 | void *host_tensor; 56 | uint8_t *bitmap; 57 | #endif 58 | int workerId; 59 | int *socks; 60 | void *comm_buf; 61 | void *cuda_comm_buf; 62 | struct ibv_context *ib_ctx; 63 | struct ibv_port_attr port_attr; 64 | struct ibv_pd *pd; 65 | struct ibv_cq **cq; 66 | struct ibv_qp **qp; 67 | struct ibv_cq *cq_address; 68 | struct ibv_qp **qp_address; 69 | struct ibv_mr *mr; 70 | uint32_t *src_; 71 | struct ibv_mr *mr_; 72 | struct remote_con_data_t *remote_props_array; 73 | std::atomic_uint_fast32_t threadid; 74 | int ret; 75 | 76 | private: 77 | OmniContext(); 78 | virtual ~OmniContext(); 79 | 80 | pthread_t masterThread; 81 | 82 | std::atomic_uint_fast32_t tid_counter; 83 | boost::mutex master_ready_mutex, data_ready_mutex, result_mutex; 84 | boost::condition_variable master_ready_event, data_push_event, data_pop_event, result_push_event, result_pop_event; 85 | uint32_t num_worker_threads; 86 | 87 | uint32_t master_ready; 88 | uint32_t data_ready; 89 | uint32_t results; 90 | 
TensorUpdate* tensor_update_ptr; 91 | int32_t result_id; 92 | 93 | 94 | boost::chrono::milliseconds one_msec; 95 | boost::chrono::microseconds one_microsec; 96 | }; 97 | } 98 | -------------------------------------------------------------------------------- /omnireduce-RDMA/omnireduce/cuda_utils.cu: -------------------------------------------------------------------------------- 1 | #include "cuda_utils.hpp" 2 | 3 | template 4 | __global__ void bitmap_cuda_kernel(scalar_t* input, uint8_t* bitmap, int64_t len, scalar_t threshold) { 5 | const auto index = blockIdx.x * blockDim.x + threadIdx.x; 6 | __shared__ bool zero_block; 7 | if (threadIdx.x == 0) zero_block = true; 8 | __syncthreads(); 9 | if(index < len) { 10 | if(std::abs(input[index]) > threshold) zero_block=false; 11 | } 12 | __syncthreads(); 13 | if(index < len) { 14 | if(zero_block) { 15 | input[index]=0.0; 16 | bitmap[blockIdx.x]=1; 17 | } 18 | else { 19 | bitmap[blockIdx.x]=0; 20 | } 21 | } 22 | __syncthreads(); 23 | } 24 | 25 | void compute_bitmap(float* d_tensor, uint8_t* d_bitmap, int64_t tensor_size, uint32_t block_size, cudaStream_t stream, float threshold) { 26 | uint32_t block_num = tensor_size/block_size; 27 | if (tensor_size%block_size!=0) 28 | block_num += 1; 29 | bitmap_cuda_kernel<<>>(d_tensor, d_bitmap, tensor_size, threshold); 30 | } 31 | 32 | void compute_bitmap(int* d_tensor, uint8_t* d_bitmap, int64_t tensor_size, uint32_t block_size, cudaStream_t stream, int threshold) { 33 | uint32_t block_num = tensor_size/block_size; 34 | if (tensor_size%block_size!=0) 35 | block_num += 1; 36 | bitmap_cuda_kernel<<>>(d_tensor, d_bitmap, tensor_size, threshold); 37 | } -------------------------------------------------------------------------------- /omnireduce-RDMA/omnireduce/cuda_utils.hpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | void compute_bitmap(float* d_tensor, uint8_t* d_bitmap, int64_t tensor_size, uint32_t 
block_size, cudaStream_t stream, float threshold); 6 | void compute_bitmap(int* d_tensor, uint8_t* d_bitmap, int64_t tensor_size, uint32_t block_size, cudaStream_t stream, int threshold); -------------------------------------------------------------------------------- /omnireduce-RDMA/omnireduce/omnireduce.hpp: -------------------------------------------------------------------------------- 1 | /** 2 | * OmniReduce project 3 | * author: jiawei.fei@kaust.edu.sa 4 | */ 5 | 6 | #pragma once 7 | 8 | #include "omnireduce/context.hpp" 9 | #include "omnireduce/aggcontext.hpp" 10 | 11 | namespace omnireduce { 12 | int master(OmniContext* dctx); 13 | int aggmaster(AggContext* dctx); 14 | } 15 | -------------------------------------------------------------------------------- /omnireduce-RDMA/omnireduce/params.cpp: -------------------------------------------------------------------------------- 1 | #include "omnireduce/params.hpp" 2 | #include 3 | 4 | namespace po = boost::program_options; 5 | 6 | namespace omnireduce { 7 | std::unordered_map qp_num_revert {}; 8 | std::unordered_map qp_num_to_peerid {}; 9 | omnireduce_params omnireduce_par; 10 | 11 | void parse_parameters() 12 | { 13 | std::string config_file; 14 | std::ifstream ifs; 15 | uint32_t num_workers, num_aggregators, num_threads, buffer_size, chunk_size, bitmap_chunk_size, message_size, block_size, direct_memory, adaptive_blocksize, gpu_devId, tcp_port; 16 | int ib_port, gid_idx, sl; 17 | float threshold; 18 | std::string worker_ip_str, aggregator_ips_str, worker_cores, aggregator_cores, ib_hca; 19 | po::options_description omnireduce_options("OmniReduce options"); 20 | po::options_description config_file_options; 21 | omnireduce_options.add_options() 22 | ("omnireduce.num_workers", po::value(&num_workers)->default_value(1), "Number of workers") 23 | ("omnireduce.num_aggregators", po::value(&num_aggregators)->default_value(1), "Number of workers") 24 | ("omnireduce.num_threads", 
po::value(&num_threads)->default_value(1), "Number of threads") 25 | ("omnireduce.worker_cores", po::value(&worker_cores)->default_value("none"), "core id for each thread") 26 | ("omnireduce.aggregator_cores", po::value(&aggregator_cores)->default_value("none"), "core id for each thread") 27 | ("omnireduce.buffer_size", po::value(&buffer_size)->default_value(1024), "Buffer size(MB)") 28 | ("omnireduce.chunk_size", po::value(&chunk_size)->default_value(4194304), "Chunk size") 29 | ("omnireduce.bitmap_chunk_size", po::value(&bitmap_chunk_size)->default_value(4194304), "Bitmap chunk size") 30 | ("omnireduce.message_size", po::value(&message_size)->default_value(1024), "Message size") 31 | ("omnireduce.block_size", po::value(&block_size)->default_value(1024), "Block size") 32 | ("omnireduce.ib_port", po::value(&ib_port)->default_value(1), "IB port") 33 | ("omnireduce.gid_idx", po::value(&gid_idx)->default_value(2), "GID") 34 | ("omnireduce.sl", po::value(&sl)->default_value(2), "Service level") 35 | ("omnireduce.gpu_devId", po::value(&gpu_devId)->default_value(0), "GPU device ID") 36 | ("omnireduce.direct_memory", po::value(&direct_memory)->default_value(0), "Use direct memory") 37 | ("omnireduce.adaptive_blocksize", po::value(&adaptive_blocksize)->default_value(0), "Use adaptive block size") 38 | ("omnireduce.tcp_port", po::value(&tcp_port)->default_value(19875), "TCP PORT") 39 | ("omnireduce.worker_ips", po::value(&worker_ip_str)->default_value("10.0.0.1"), "Ip addresses of workers") 40 | ("omnireduce.aggregator_ips", po::value(&aggregator_ips_str)->default_value("10.0.0.1"), "Ip addresses of aggregators") 41 | ("omnireduce.threshold", po::value(&threshold)->default_value(0.0), "Threshold for bitmap calculation") 42 | ("omnireduce.ib_hca", po::value(&ib_hca)->default_value("mlx5_0"), "eth name"); 43 | config_file_options.add(omnireduce_options); 44 | config_file = "/etc/omnireduce.cfg"; 45 | ifs.open(config_file.c_str()); 46 | if(!ifs.good()){ 47 | ifs.close(); 48 | 
config_file = "omnireduce.cfg"; 49 | ifs.open(config_file.c_str()); 50 | if(!ifs.good()){ 51 | ifs.close(); 52 | std::cerr<<"No config file found!"< 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | 16 | #define MAX_NUM_QPS 2 17 | #define MAX_NUM_THREADS 8 18 | #define MAX_NUM_AGGS 8 19 | #define MAX_CONCURRENT_WRITES 4096 20 | #define QUEUE_DEPTH_DEFAULT 4096 21 | #define QPNUM_FACTOR 8 22 | 23 | namespace omnireduce { 24 | void parse_parameters(); 25 | extern std::unordered_map qp_num_revert; 26 | extern std::unordered_map qp_num_to_peerid; 27 | 28 | class omnireduce_params { 29 | private: 30 | uint32_t buff_unit_size; 31 | uint32_t num_worker_threads; 32 | uint32_t num_workers; 33 | uint32_t num_aggregators; 34 | uint32_t num_qps_per_aggregator_per_thread; 35 | uint32_t num_slots_per_thread; 36 | uint32_t buffer_size; 37 | uint32_t chunk_size; 38 | uint32_t bitmap_chunk_size; 39 | uint32_t message_size; 40 | uint32_t block_size; 41 | uint32_t num_comm_buff; 42 | uint32_t prepost_recv_num; 43 | uint32_t *inf_offset; 44 | uint32_t direct_memory; 45 | uint32_t adaptive_blocksize; 46 | uint32_t gpu_devId; 47 | uint32_t tcp_port; 48 | float threshold; 49 | char *ib_hca; 50 | int ib_port; 51 | int gid_idx; 52 | int sl; 53 | char **aggregator_ipaddr; 54 | char **worker_ipaddr; 55 | int *worker_cores; 56 | int *aggregator_cores; 57 | public: 58 | omnireduce_params(); 59 | ~omnireduce_params(); 60 | void setIbPort(int p) { 61 | ib_port=p; 62 | } 63 | void setGidIdx(int g) { 64 | gid_idx=g; 65 | } 66 | void setServiceLevel(int s) { 67 | sl=s; 68 | } 69 | void setInfOffset(uint32_t num_blocks_per_thread) { 70 | inf_offset = (uint32_t *)malloc(num_blocks_per_thread*sizeof(uint32_t)); 71 | for (uint32_t i=0; i ips; 112 | boost::split(ips, workerIps, boost::is_any_of(",")); 113 | if (num_workers!=ips.size()) 114 | { 115 | std::cerr<<"Worker number error!"< ips; 127 | boost::split(ips, aggregatorIps, boost::is_any_of(",")); 
128 | if (num_aggregators!=ips.size()) 129 | { 130 | std::cerr<<"Aggregator number error!"< coreids; 142 | boost::split(coreids, cores_str, boost::is_any_of(",")); 143 | if(num_worker_threads!=coreids.size()) 144 | { 145 | std::cerr<<"core id set error!"< coreids; 156 | boost::split(coreids, cores_str, boost::is_any_of(",")); 157 | if(num_worker_threads!=coreids.size()) 158 | { 159 | std::cerr<<"core id set error!"<